// Rounding mode reported by _MM_GET_ROUNDING_MODE() at the moment this static is
// initialized; sse_restore_initial_rounding_mode() writes it back.
// NOTE(review): being `static`, every translation unit that sees this line gets
// its own copy -- confirm that is intended.
49 static const unsigned int INITIAL_ROUNDING_MODE = _MM_GET_ROUNDING_MODE();
// Mode saved by the helpers below before they change the rounding mode.
// Starts out equal to the initial mode.
50 static unsigned int PREVIOUS_ROUNDING_MODE = INITIAL_ROUNDING_MODE;
// Sets the SSE rounding mode back to INITIAL_ROUNDING_MODE.
// The visible lines do not update PREVIOUS_ROUNDING_MODE.
52 inline void sse_restore_initial_rounding_mode() {
53 _MM_SET_ROUNDING_MODE(INITIAL_ROUNDING_MODE);
// Swaps the current rounding mode with PREVIOUS_ROUNDING_MODE: the saved mode
// becomes active and the mode that was active is stored in its place, so two
// consecutive calls return to the starting state.
56 inline void sse_restore_previous_rounding_mode() {
// Read the active mode first; it becomes the new "previous" value below.
57 const unsigned int mode = _MM_GET_ROUNDING_MODE();
58 _MM_SET_ROUNDING_MODE(PREVIOUS_ROUNDING_MODE);
59 PREVIOUS_ROUNDING_MODE = mode;
// Activates the rounding mode `mode` after saving the currently active one in
// PREVIOUS_ROUNDING_MODE.
// NOTE(review): `mode` is passed straight to _MM_SET_ROUNDING_MODE; presumably
// one of the _MM_ROUND_* constants is expected -- confirm against callers.
62 inline void sse_set_rounding_mode(
const unsigned int mode) {
63 PREVIOUS_ROUNDING_MODE = _MM_GET_ROUNDING_MODE();
64 _MM_SET_ROUNDING_MODE(mode);
// Returns 1 when the low four bits of the address held in ptr are zero
// (address is a multiple of 16), otherwise 0.
// NOTE(review): T is presumably a template parameter declared on a line not
// visible here.
73 inline int sse_is_16byte_aligned(
const T *ptr) {
74 return !(((uintptr_t)ptr) & 15);
// Returns the low four bits of the address held in ptr: 0 when the address is a
// multiple of 16, otherwise the offset (1..15) past the last 16-byte boundary.
// The result is not normalized to 1; treat it as a truth value.
// NOTE(review): T is presumably a template parameter declared on a line not
// visible here.
78 inline int sse_is_not_16byte_aligned(
const T *ptr) {
79 return (((uintptr_t)ptr) & 15);
// Returns 1 when the address held in ptr has no bits in common with (bytes - 1),
// otherwise 0.
// bytes - 1 is used as a bit mask, so this is a real alignment test only when
// bytes is a power of two.
// NOTE(review): T is presumably a template parameter declared on a line not
// visible here.
83 inline int sse_is_aligned(
const T *ptr,
const unsigned int bytes) {
84 return !(((uintptr_t)ptr) & (bytes-1));
// Returns the address held in ptr masked with (bytes - 1), converted to int:
// 0 when aligned, non-zero otherwise (the value is not normalized to 1).
// bytes - 1 is used as a bit mask, so this is a real alignment test only when
// bytes is a power of two.
// NOTE(review): T is presumably a template parameter declared on a line not
// visible here.
88 inline int sse_is_not_aligned(
const T *ptr,
const unsigned int bytes) {
89 return (((uintptr_t)ptr) & (bytes-1));
// Takes a condition value vIf and a value v0 (both by const reference) and
// returns a T by value.
// NOTE(review): the body is not visible here; presumably it keeps v0 where vIf
// is set and yields zero elsewhere -- confirm against the full source.
98 inline T sse_if(
const T &vIf,
const T &v0) {
// Takes a condition value vIf and two values v0 and v1 (all by const
// reference) and returns a T by value.
// NOTE(review): only one body line is visible; presumably `ret` first receives
// the vIf-selected part of v0 and the line below adds the part of v1 where vIf
// is not set -- confirm, including the argument order expected by andnot().
104 inline T sse_ifelse(
const T &vIf,
const T &v0,
const T &v1) {
106 ret += andnot(v1, vIf);
// sse_for (1-D, single step): 1 source pointer -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// NOTE(review): dst0, dstEnd, `step` and the pointer advance after subSSEMethod
// are on lines not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
119 template<
class S,
class D>
120 inline void sse_for(
const S *src0,
122 void (*subMethod)(
const S*, D*),
123 void (*subSSEMethod)(
const S*, D*),
125 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
127 for (; dst0<dstSSEEnd;) {
129 (*subSSEMethod)(src0, dst0);
// Remainder: handled element by element by subMethod.
136 for (; dst0<dstEnd; ++src0, ++dst0) {
138 (*subMethod)(src0, dst0);
// sse_for (1-D, single step): 1 source pointer -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; dst1 just moves in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
142 template<
class S,
class D>
143 inline void sse_for(
const S *src0,
144 D *dst0, D *dst1, D *dstEnd,
145 void (*subMethod)(
const S*, D*, D*),
146 void (*subSSEMethod)(
const S*, D*, D*),
148 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
150 for (; dst0<dstSSEEnd;) {
152 (*subSSEMethod)(src0, dst0, dst1);
// Remainder: handled element by element by subMethod.
160 for (; dst0<dstEnd; ++src0, ++dst0, ++dst1) {
162 (*subMethod)(src0, dst0, dst1);
// sse_for (1-D, single step): 1 source pointer -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; dst1/dst2 move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
166 template<
class S,
class D>
167 inline void sse_for(
const S *src0,
168 D *dst0, D *dst1, D *dst2, D *dstEnd,
169 void (*subMethod)(
const S*, D*, D*, D*),
170 void (*subSSEMethod)(
const S*, D*, D*, D*),
172 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
174 for (; dst0<dstSSEEnd;) {
176 (*subSSEMethod)(src0, dst0, dst1, dst2);
// Remainder: handled element by element by subMethod.
185 for (; dst0<dstEnd; ++src0, ++dst0, ++dst1, ++dst2) {
187 (*subMethod)(src0, dst0, dst1, dst2);
// sse_for (1-D, single step): 1 source pointer -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; dst1..dst3 move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
191 template<
class S,
class D>
192 inline void sse_for(
const S *src0,
193 D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd,
194 void (*subMethod)(
const S*, D*, D*, D*, D*),
195 void (*subSSEMethod)(
const S*, D*, D*, D*, D*),
197 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
199 for (; dst0<dstSSEEnd;) {
201 (*subSSEMethod)(src0, dst0, dst1, dst2, dst3);
// Remainder: handled element by element by subMethod.
211 for (; dst0<dstEnd; ++src0, ++dst0, ++dst1, ++dst2, ++dst3) {
213 (*subMethod)(src0, dst0, dst1, dst2, dst3);
// sse_for (1-D, single step): 2 source pointers -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// NOTE(review): dst0, dstEnd, `step` and the pointer advance after subSSEMethod
// are on lines not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
217 template<
class S,
class D>
218 inline void sse_for(
const S *src0,
const S *src1,
220 void (*subMethod)(
const S*,
const S*, D*),
221 void (*subSSEMethod)(
const S*,
const S*, D*),
223 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
225 for (; dst0<dstSSEEnd;) {
227 (*subSSEMethod)(src0, src1, dst0);
// Remainder: handled element by element by subMethod.
235 for (; dst0<dstEnd; ++src0, ++src1, ++dst0) {
237 (*subMethod)(src0, src1, dst0);
// sse_for (1-D, single step): 2 source pointers -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
241 template<
class S,
class D>
242 inline void sse_for(
const S *src0,
const S *src1,
243 D *dst0, D *dst1, D *dstEnd,
244 void (*subMethod)(
const S*,
const S*, D*, D*),
245 void (*subSSEMethod)(
const S*,
const S*, D*, D*),
247 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
249 for (; dst0<dstSSEEnd;) {
251 (*subSSEMethod)(src0, src1, dst0, dst1);
// Remainder: handled element by element by subMethod.
260 for (; dst0<dstEnd; ++src0, ++src1, ++dst0, ++dst1) {
262 (*subMethod)(src0, src1, dst0, dst1);
// sse_for (1-D, single step): 2 source pointers -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
266 template<
class S,
class D>
267 inline void sse_for(
const S *src0,
const S *src1,
268 D *dst0, D *dst1, D *dst2, D *dstEnd,
269 void (*subMethod)(
const S*,
const S*, D*, D*, D*),
270 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*),
272 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
274 for (; dst0<dstSSEEnd;) {
276 (*subSSEMethod)(src0, src1, dst0, dst1, dst2);
// Remainder: handled element by element by subMethod.
286 for (; dst0<dstEnd; ++src0, ++src1, ++dst0, ++dst1, ++dst2) {
288 (*subMethod)(src0, src1, dst0, dst1, dst2);
// sse_for (1-D, single step): 2 source pointers -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
292 template<
class S,
class D>
293 inline void sse_for(
const S *src0,
const S *src1,
294 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
295 void (*subMethod)(
const S*,
const S*, D*, D*, D*, D*),
296 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*, D*),
298 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
300 for (; dst0<dstSSEEnd;) {
302 (*subSSEMethod)(src0, src1, dst0, dst1, dst2, dst3);
// Remainder: handled element by element by subMethod.
313 for (; dst0<dstEnd; ++src0, ++src1, ++dst0, ++dst1, ++dst2, ++dst3) {
315 (*subMethod)(src0, src1, dst0, dst1, dst2, dst3);
// sse_for (1-D, single step): 3 source pointers -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// NOTE(review): dst0, dstEnd, `step` and the pointer advance after subSSEMethod
// are on lines not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
319 template<
class S,
class D>
320 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
322 void (*subMethod)(
const S*,
const S*,
const S*, D*),
323 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*),
325 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
327 for (; dst0<dstSSEEnd;) {
329 (*subSSEMethod)(src0, src1, src2, dst0);
// Remainder: handled element by element by subMethod.
338 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++dst0) {
340 (*subMethod)(src0, src1, src2, dst0);
// sse_for (1-D, single step): 3 source pointers -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
344 template<
class S,
class D>
345 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
346 D *dst0, D *dst1, D *dstEnd,
347 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*),
348 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*),
350 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
352 for (; dst0<dstSSEEnd;) {
354 (*subSSEMethod)(src0, src1, src2, dst0, dst1);
// Remainder: handled element by element by subMethod.
364 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++dst0, ++dst1) {
366 (*subMethod)(src0, src1, src2, dst0, dst1);
// sse_for (1-D, single step): 3 source pointers -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
370 template<
class S,
class D>
371 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
372 D *dst0, D *dst1, D *dst2, D *dstEnd,
373 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
374 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
376 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
378 for (; dst0<dstSSEEnd;) {
380 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2);
// Remainder: handled element by element by subMethod.
391 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++dst0, ++dst1, ++dst2) {
393 (*subMethod)(src0, src1, src2, dst0, dst1, dst2);
// sse_for (1-D, single step): 3 source pointers -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
397 template<
class S,
class D>
398 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
399 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
400 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
401 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
403 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
405 for (; dst0<dstSSEEnd;) {
407 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// Remainder: handled element by element by subMethod.
419 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++dst0, ++dst1, ++dst2, ++dst3) {
421 (*subMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// sse_for (1-D, single step): 4 source pointers -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// NOTE(review): dst0, dstEnd, `step` and the pointer advance after subSSEMethod
// are on lines not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
425 template<
class S,
class D>
426 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
428 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*),
429 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*),
431 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
433 for (; dst0<dstSSEEnd;) {
435 (*subSSEMethod)(src0, src1, src2, src3, dst0);
// Remainder: handled element by element by subMethod.
445 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++src3, ++dst0) {
447 (*subMethod)(src0, src1, src2, src3, dst0);
// sse_for (1-D, single step): 4 source pointers -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
451 template<
class S,
class D>
452 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
453 D *dst0, D *dst1, D *dstEnd,
454 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
455 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
457 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
459 for (; dst0<dstSSEEnd;) {
461 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1);
// Remainder: handled element by element by subMethod.
472 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++src3, ++dst0, ++dst1) {
474 (*subMethod)(src0, src1, src2, src3, dst0, dst1);
// sse_for (1-D, single step): 4 source pointers -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
478 template<
class S,
class D>
479 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
480 D *dst0, D *dst1, D *dst2, D *dstEnd,
481 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
482 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
484 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
486 for (; dst0<dstSSEEnd;) {
488 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// Remainder: handled element by element by subMethod.
500 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++src3, ++dst0, ++dst1, ++dst2) {
502 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// sse_for (1-D, single step): 4 source pointers -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (step - 1)), then
// subMethod for each remaining element up to dstEnd, advancing every pointer by one.
// Only dst0 is compared against the end markers; the others move in lockstep.
// NOTE(review): `step` and the pointer advance after subSSEMethod are on lines
// not visible here -- presumably all pointers move by `step`; confirm.
// NOTE(review): for ranges shorter than step - 1, dstSSEEnd points before dst0.
506 template<
class S,
class D>
507 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
508 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
509 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
510 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
512 D *dstSSEEnd = dstEnd - (step - 1);
// Block part: handled by subSSEMethod.
514 for (; dst0<dstSSEEnd;) {
516 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// Remainder: handled element by element by subMethod.
529 for (; dst0<dstEnd; ++src0, ++src1, ++src2, ++src3, ++dst0, ++dst1, ++dst2, ++dst3) {
531 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// sse_for (1-D, separate srcStep/dstStep): 1 source pointer -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): dst0, dstEnd, the declarations of sStep/dStep and the pointer
// advance after subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
539 template<
class S,
class D>
540 inline void sse_for(
const S *src0,
542 void (*subMethod)(
const S*, D*),
543 void (*subSSEMethod)(
const S*, D*),
544 long srcStep,
long dstStep) {
545 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
548 if (srcStep < dstStep) {
549 dStep = dstStep / srcStep;
552 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
556 for (; dst0<dstSSEEnd;) {
558 (*subSSEMethod)(src0, dst0);
// Remainder: handled by subMethod with the derived strides.
565 for (; dst0<dstEnd; src0 += sStep, dst0 += dStep) {
567 (*subMethod)(src0, dst0);
// sse_for (1-D, separate srcStep/dstStep): 1 source pointer -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
571 template<
class S,
class D>
572 inline void sse_for(
const S *src0,
573 D *dst0, D *dst1, D *dstEnd,
574 void (*subMethod)(
const S*, D*, D*),
575 void (*subSSEMethod)(
const S*, D*, D*),
576 long srcStep,
long dstStep) {
577 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
580 if (srcStep < dstStep) {
581 dStep = dstStep / srcStep;
584 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
588 for (; dst0<dstSSEEnd;) {
590 (*subSSEMethod)(src0, dst0, dst1);
// Remainder: handled by subMethod with the derived strides.
598 for (; dst0<dstEnd; src0 += sStep, dst0 += dStep, dst1 += dStep) {
600 (*subMethod)(src0, dst0, dst1);
// sse_for (1-D, separate srcStep/dstStep): 1 source pointer -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
604 template<
class S,
class D>
605 inline void sse_for(
const S *src0,
606 D *dst0, D *dst1, D *dst2, D *dstEnd,
607 void (*subMethod)(
const S*, D*, D*, D*),
608 void (*subSSEMethod)(
const S*, D*, D*, D*),
609 long srcStep,
long dstStep) {
610 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
613 if (srcStep < dstStep) {
614 dStep = dstStep / srcStep;
617 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
621 for (; dst0<dstSSEEnd;) {
623 (*subSSEMethod)(src0, dst0, dst1, dst2);
// Remainder: handled by subMethod with the derived strides.
632 for (; dst0<dstEnd; src0 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
634 (*subMethod)(src0, dst0, dst1, dst2);
// sse_for (1-D, separate srcStep/dstStep): 1 source pointer -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
638 template<
class S,
class D>
639 inline void sse_for(
const S *src0,
640 D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd,
641 void (*subMethod)(
const S*, D*, D*, D*, D*),
642 void (*subSSEMethod)(
const S*, D*, D*, D*, D*),
643 long srcStep,
long dstStep) {
644 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
647 if (srcStep < dstStep) {
648 dStep = dstStep / srcStep;
651 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
655 for (; dst0<dstSSEEnd;) {
657 (*subSSEMethod)(src0, dst0, dst1, dst2, dst3);
// Remainder: handled by subMethod with the derived strides.
667 for (; dst0<dstEnd; src0 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
669 (*subMethod)(src0, dst0, dst1, dst2, dst3);
// sse_for (1-D, separate srcStep/dstStep): 2 source pointers -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): dst0, dstEnd, the declarations of sStep/dStep and the pointer
// advance after subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
673 template<
class S,
class D>
674 inline void sse_for(
const S *src0,
const S *src1,
676 void (*subMethod)(
const S*,
const S*, D*),
677 void (*subSSEMethod)(
const S*,
const S*, D*),
678 long srcStep,
long dstStep) {
679 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
682 if (srcStep < dstStep) {
683 dStep = dstStep / srcStep;
686 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
690 for (; dst0<dstSSEEnd;) {
692 (*subSSEMethod)(src0, src1, dst0);
// Remainder: handled by subMethod with the derived strides.
700 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, dst0 += dStep) {
702 (*subMethod)(src0, src1, dst0);
// sse_for (1-D, separate srcStep/dstStep): 2 source pointers -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
706 template<
class S,
class D>
707 inline void sse_for(
const S *src0,
const S *src1,
708 D *dst0, D *dst1, D *dstEnd,
709 void (*subMethod)(
const S*,
const S*, D*, D*),
710 void (*subSSEMethod)(
const S*,
const S*, D*, D*),
711 long srcStep,
long dstStep) {
712 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
715 if (srcStep < dstStep) {
716 dStep = dstStep / srcStep;
719 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
723 for (; dst0<dstSSEEnd;) {
725 (*subSSEMethod)(src0, src1, dst0, dst1);
// Remainder: handled by subMethod with the derived strides.
734 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, dst0 += dStep, dst1 += dStep) {
736 (*subMethod)(src0, src1, dst0, dst1);
// sse_for (1-D, separate srcStep/dstStep): 2 source pointers -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
740 template<
class S,
class D>
741 inline void sse_for(
const S *src0,
const S *src1,
742 D *dst0, D *dst1, D *dst2, D *dstEnd,
743 void (*subMethod)(
const S*,
const S*, D*, D*, D*),
744 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*),
745 long srcStep,
long dstStep) {
746 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
749 if (srcStep < dstStep) {
750 dStep = dstStep / srcStep;
753 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
757 for (; dst0<dstSSEEnd;) {
759 (*subSSEMethod)(src0, src1, dst0, dst1, dst2);
// Remainder: handled by subMethod with the derived strides.
769 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
771 (*subMethod)(src0, src1, dst0, dst1, dst2);
// sse_for (1-D, separate srcStep/dstStep): 2 source pointers -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
775 template<
class S,
class D>
776 inline void sse_for(
const S *src0,
const S *src1,
777 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
778 void (*subMethod)(
const S*,
const S*, D*, D*, D*, D*),
779 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*, D*),
780 long srcStep,
long dstStep) {
781 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
784 if (srcStep < dstStep) {
785 dStep = dstStep / srcStep;
788 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
792 for (; dst0<dstSSEEnd;) {
794 (*subSSEMethod)(src0, src1, dst0, dst1, dst2, dst3);
// Remainder: handled by subMethod with the derived strides.
805 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
807 (*subMethod)(src0, src1, dst0, dst1, dst2, dst3);
// sse_for (1-D, separate srcStep/dstStep): 3 source pointers -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): dst0, dstEnd, the declarations of sStep/dStep and the pointer
// advance after subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
811 template<
class S,
class D>
812 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
814 void (*subMethod)(
const S*,
const S*,
const S*, D*),
815 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*),
816 long srcStep,
long dstStep) {
817 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
820 if (srcStep < dstStep) {
821 dStep = dstStep / srcStep;
824 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
828 for (; dst0<dstSSEEnd;) {
830 (*subSSEMethod)(src0, src1, src2, dst0);
// Remainder: handled by subMethod with the derived strides.
839 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep) {
841 (*subMethod)(src0, src1, src2, dst0);
// sse_for (1-D, separate srcStep/dstStep): 3 source pointers -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
845 template<
class S,
class D>
846 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
847 D *dst0, D *dst1, D *dstEnd,
848 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*),
849 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*),
850 long srcStep,
long dstStep) {
851 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
854 if (srcStep < dstStep) {
855 dStep = dstStep / srcStep;
858 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
862 for (; dst0<dstSSEEnd;) {
864 (*subSSEMethod)(src0, src1, src2, dst0, dst1);
// Remainder: handled by subMethod with the derived strides.
874 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep, dst1 += dStep) {
876 (*subMethod)(src0, src1, src2, dst0, dst1);
// sse_for (1-D, separate srcStep/dstStep): 3 source pointers -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
880 template<
class S,
class D>
881 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
882 D *dst0, D *dst1, D *dst2, D *dstEnd,
883 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
884 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
885 long srcStep,
long dstStep) {
886 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
889 if (srcStep < dstStep) {
890 dStep = dstStep / srcStep;
893 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
897 for (; dst0<dstSSEEnd;) {
899 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2);
// Remainder: handled by subMethod with the derived strides.
910 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
912 (*subMethod)(src0, src1, src2, dst0, dst1, dst2);
// sse_for (1-D, separate srcStep/dstStep): 3 source pointers -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
916 template<
class S,
class D>
917 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
918 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
919 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
920 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
921 long srcStep,
long dstStep) {
922 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
925 if (srcStep < dstStep) {
926 dStep = dstStep / srcStep;
929 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
933 for (; dst0<dstSSEEnd;) {
935 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// Remainder: handled by subMethod with the derived strides.
947 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
949 (*subMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// sse_for (1-D, separate srcStep/dstStep): 4 source pointers -> 1 destination pointer.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): dst0, dstEnd, the declarations of sStep/dStep and the pointer
// advance after subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
953 template<
class S,
class D>
954 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
956 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*),
957 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*),
958 long srcStep,
long dstStep) {
959 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
962 if (srcStep < dstStep) {
963 dStep = dstStep / srcStep;
966 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
970 for (; dst0<dstSSEEnd;) {
972 (*subSSEMethod)(src0, src1, src2, src3, dst0);
// Remainder: handled by subMethod with the derived strides.
982 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep) {
984 (*subMethod)(src0, src1, src2, src3, dst0);
// sse_for (1-D, separate srcStep/dstStep): 4 source pointers -> 2 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
988 template<
class S,
class D>
989 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
990 D *dst0, D *dst1, D *dstEnd,
991 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
992 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
993 long srcStep,
long dstStep) {
994 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
997 if (srcStep < dstStep) {
998 dStep = dstStep / srcStep;
1001 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
1005 for (; dst0<dstSSEEnd;) {
1007 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1);
// Remainder: handled by subMethod with the derived strides.
1018 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep, dst1 += dStep) {
1020 (*subMethod)(src0, src1, src2, src3, dst0, dst1);
// sse_for (1-D, separate srcStep/dstStep): 4 source pointers -> 3 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
1024 template<
class S,
class D>
1025 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
1026 D *dst0, D *dst1, D *dst2, D *dstEnd,
1027 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
1028 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
1029 long srcStep,
long dstStep) {
1030 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
1033 if (srcStep < dstStep) {
1034 dStep = dstStep / srcStep;
1037 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
1041 for (; dst0<dstSSEEnd;) {
1043 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// Remainder: handled by subMethod with the derived strides.
1055 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
1057 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// sse_for (1-D, separate srcStep/dstStep): 4 source pointers -> 4 destination pointers.
// Calls subSSEMethod while dst0 < dstSSEEnd (= dstEnd - (dstStep - 1)), then
// subMethod for the remainder, moving sources by sStep and destinations by dStep.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep and the pointer advance after
// subSSEMethod are on lines not visible here; confirm.
// NOTE(review): integer division -- a zero step would divide by zero.
1061 template<
class S,
class D>
1062 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
1063 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
1064 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
1065 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
1066 long srcStep,
long dstStep) {
1067 D *dstSSEEnd = dstEnd - (dstStep - 1);
// Derive the per-call strides of the scalar remainder from the step ratio.
1070 if (srcStep < dstStep) {
1071 dStep = dstStep / srcStep;
1074 sStep = srcStep / dstStep;
// Block part: handled by subSSEMethod.
1078 for (; dst0<dstSSEEnd;) {
1080 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// Remainder: handled by subMethod with the derived strides.
1093 for (; dst0<dstEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
1095 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// sse_for (2-D, single step): 1 source pointer -> 1 destination pointer, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): dst0, dstEnd, `step`, the pointer advance after subSSEMethod and
// the use of srcOffset/dstOffset are on lines not visible here; confirm.
1108 template<
class S,
class D>
1109 inline void sse_for(
const S *src0,
1111 long srcWidth,
long dstWidth,
long lineWidth,
1112 void (*subMethod)(
const S*, D*),
1113 void (*subSSEMethod)(
const S*, D*),
1115 D *dstLEnd = dst0 + lineWidth;
1116 D *dstSSEEnd = dstLEnd - (step - 1);
1117 long srcOffset = srcWidth - lineWidth;
1118 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1120 for (; dst0<dstEnd;) {
1121 if (dst0<dstSSEEnd) {
1123 (*subSSEMethod)(src0, dst0);
// Rest of the current line, one element per subMethod call.
1129 for (; dst0<dstLEnd; ++src0, ++dst0) {
1131 (*subMethod)(src0, dst0);
// Move both line markers to the next line.
1135 dstLEnd += dstWidth;
1136 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 1 source pointer -> 2 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1143 template<
class S,
class D>
1144 inline void sse_for(
const S *src0,
1145 D *dst0, D *dst1, D *dstEnd,
1146 long srcWidth,
long dstWidth,
long lineWidth,
1147 void (*subMethod)(
const S*, D*, D*),
1148 void (*subSSEMethod)(
const S*, D*, D*),
1150 D *dstLEnd = dst0 + lineWidth;
1151 D *dstSSEEnd = dstLEnd - (step - 1);
1152 long srcOffset = srcWidth - lineWidth;
1153 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1155 for (; dst0<dstEnd;) {
1156 if (dst0<dstSSEEnd) {
1158 (*subSSEMethod)(src0, dst0, dst1);
// Rest of the current line, one element per subMethod call.
1165 for (; dst0<dstLEnd; ++src0, ++dst0, ++dst1) {
1167 (*subMethod)(src0, dst0, dst1);
// Move both line markers to the next line.
1171 dstLEnd += dstWidth;
1172 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 1 source pointer -> 3 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1180 template<
class S,
class D>
1181 inline void sse_for(
const S *src0,
1182 D *dst0, D *dst1, D *dst2, D *dstEnd,
1183 long srcWidth,
long dstWidth,
long lineWidth,
1184 void (*subMethod)(
const S*, D*, D*, D*),
1185 void (*subSSEMethod)(
const S*, D*, D*, D*),
1187 D *dstLEnd = dst0 + lineWidth;
1188 D *dstSSEEnd = dstLEnd - (step - 1);
1189 long srcOffset = srcWidth - lineWidth;
1190 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1192 for (; dst0<dstEnd;) {
1193 if (dst0<dstSSEEnd) {
1195 (*subSSEMethod)(src0, dst0, dst1, dst2);
// Rest of the current line, one element per subMethod call.
1203 for (; dst0<dstLEnd; ++src0, ++dst0, ++dst1, ++dst2) {
1205 (*subMethod)(src0, dst0, dst1, dst2);
// Move both line markers to the next line.
1209 dstLEnd += dstWidth;
1210 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 1 source pointer -> 4 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1219 template<
class S,
class D>
1220 inline void sse_for(
const S *src0,
1221 D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd,
1222 long srcWidth,
long dstWidth,
long lineWidth,
1223 void (*subMethod)(
const S*, D*, D*, D*, D*),
1224 void (*subSSEMethod)(
const S*, D*, D*, D*, D*),
1226 D *dstLEnd = dst0 + lineWidth;
1227 D *dstSSEEnd = dstLEnd - (step - 1);
1228 long srcOffset = srcWidth - lineWidth;
1229 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1231 for (; dst0<dstEnd;) {
1232 if (dst0<dstSSEEnd) {
1234 (*subSSEMethod)(src0, dst0, dst1, dst2, dst3);
// Rest of the current line, one element per subMethod call.
1243 for (; dst0<dstLEnd; ++src0, ++dst0, ++dst1, ++dst2, ++dst3) {
1245 (*subMethod)(src0, dst0, dst1, dst2, dst3);
// Move both line markers to the next line.
1249 dstLEnd += dstWidth;
1250 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 2 source pointers -> 2 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1260 template<
class S,
class D>
1261 inline void sse_for(
const S *src0,
const S *src1,
1262 D *dst0, D *dst1, D *dstEnd,
1263 long srcWidth,
long dstWidth,
long lineWidth,
1264 void (*subMethod)(
const S*,
const S*, D*, D*),
1265 void (*subSSEMethod)(
const S*,
const S*, D*, D*),
1267 D *dstLEnd = dst0 + lineWidth;
1268 D *dstSSEEnd = dstLEnd - (step - 1);
1269 long srcOffset = srcWidth - lineWidth;
1270 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1272 for (; dst0<dstEnd;) {
1273 if (dst0<dstSSEEnd) {
1275 (*subSSEMethod)(src0, src1, dst0, dst1);
// Rest of the current line, one element per subMethod call.
1283 for (; dst0<dstLEnd; ++src0, ++src1, ++dst0, ++dst1) {
1285 (*subMethod)(src0, src1, dst0, dst1);
// Move both line markers to the next line.
1289 dstLEnd += dstWidth;
1290 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 2 source pointers -> 3 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1299 template<
class S,
class D>
1300 inline void sse_for(
const S *src0,
const S *src1,
1301 D *dst0, D *dst1, D *dst2, D *dstEnd,
1302 long srcWidth,
long dstWidth,
long lineWidth,
1303 void (*subMethod)(
const S*,
const S*, D*, D*, D*),
1304 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*),
1306 D *dstLEnd = dst0 + lineWidth;
1307 D *dstSSEEnd = dstLEnd - (step - 1);
1308 long srcOffset = srcWidth - lineWidth;
1309 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1311 for (; dst0<dstEnd;) {
1312 if (dst0<dstSSEEnd) {
1314 (*subSSEMethod)(src0, src1, dst0, dst1, dst2);
// Rest of the current line, one element per subMethod call.
1323 for (; dst0<dstLEnd; ++src0, ++src1, ++dst0, ++dst1, ++dst2) {
1325 (*subMethod)(src0, src1, dst0, dst1, dst2);
// Move both line markers to the next line.
1329 dstLEnd += dstWidth;
1330 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 2 source pointers -> 4 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1340 template<
class S,
class D>
1341 inline void sse_for(
const S *src0,
const S *src1,
1342 D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd,
1343 long srcWidth,
long dstWidth,
long lineWidth,
1344 void (*subMethod)(
const S*,
const S*, D*, D*, D*, D*),
1345 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*, D*),
1347 D *dstLEnd = dst0 + lineWidth;
1348 D *dstSSEEnd = dstLEnd - (step - 1);
1349 long srcOffset = srcWidth - lineWidth;
1350 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1352 for (; dst0<dstEnd;) {
1353 if (dst0<dstSSEEnd) {
1355 (*subSSEMethod)(src0, src1, dst0, dst1, dst2, dst3);
// Rest of the current line, one element per subMethod call.
1365 for (; dst0<dstLEnd; ++src0, ++src1, ++dst0, ++dst1, ++dst2, ++dst3) {
1367 (*subMethod)(src0, src1, dst0, dst1, dst2, dst3);
// Move both line markers to the next line.
1371 dstLEnd += dstWidth;
1372 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 3 source pointers -> 1 destination pointer, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): dst0, dstEnd, `step`, the pointer advance after subSSEMethod and
// the use of srcOffset/dstOffset are on lines not visible here; confirm.
1383 template<
class S,
class D>
1384 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
1386 long srcWidth,
long dstWidth,
long lineWidth,
1387 void (*subMethod)(
const S*,
const S*,
const S*, D*),
1388 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*),
1390 D *dstLEnd = dst0 + lineWidth;
1391 D *dstSSEEnd = dstLEnd - (step - 1);
1392 long srcOffset = srcWidth - lineWidth;
1393 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1395 for (; dst0<dstEnd;) {
1396 if (dst0<dstSSEEnd) {
1398 (*subSSEMethod)(src0, src1, src2, dst0);
// Rest of the current line, one element per subMethod call.
1406 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++dst0) {
1408 (*subMethod)(src0, src1, src2, dst0);
// Move both line markers to the next line.
1412 dstLEnd += dstWidth;
1413 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 3 source pointers -> 2 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1422 template<
class S,
class D>
1423 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
1424 D *dst0, D *dst1, D *dstEnd,
1425 long srcWidth,
long dstWidth,
long lineWidth,
1426 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*),
1427 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*),
1429 D *dstLEnd = dst0 + lineWidth;
1430 D *dstSSEEnd = dstLEnd - (step - 1);
1431 long srcOffset = srcWidth - lineWidth;
1432 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1434 for (; dst0<dstEnd;) {
1435 if (dst0<dstSSEEnd) {
1437 (*subSSEMethod)(src0, src1, src2, dst0, dst1);
// Rest of the current line, one element per subMethod call.
1446 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++dst0, ++dst1) {
1448 (*subMethod)(src0, src1, src2, dst0, dst1);
// Move both line markers to the next line.
1452 dstLEnd += dstWidth;
1453 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 3 source pointers -> 3 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1463 template<
class S,
class D>
1464 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
1465 D *dst0, D *dst1, D *dst2, D *dstEnd,
1466 long srcWidth,
long dstWidth,
long lineWidth,
1467 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
1468 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
1470 D *dstLEnd = dst0 + lineWidth;
1471 D *dstSSEEnd = dstLEnd - (step - 1);
1472 long srcOffset = srcWidth - lineWidth;
1473 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1475 for (; dst0<dstEnd;) {
1476 if (dst0<dstSSEEnd) {
1478 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2);
// Rest of the current line, one element per subMethod call.
1488 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++dst0, ++dst1, ++dst2) {
1490 (*subMethod)(src0, src1, src2, dst0, dst1, dst2);
// Move both line markers to the next line.
1494 dstLEnd += dstWidth;
1495 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 3 source pointers -> 4 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1506 template<
class S,
class D>
1507 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
1508 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
1509 long srcWidth,
long dstWidth,
long lineWidth,
1510 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
1511 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
1513 D *dstLEnd = dst0 + lineWidth;
1514 D *dstSSEEnd = dstLEnd - (step - 1);
1515 long srcOffset = srcWidth - lineWidth;
1516 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1518 for (; dst0<dstEnd;) {
1519 if (dst0<dstSSEEnd) {
1521 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// Rest of the current line, one element per subMethod call.
1532 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++dst0, ++dst1, ++dst2, ++dst3) {
1534 (*subMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// Move both line markers to the next line.
1538 dstLEnd += dstWidth;
1539 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 4 source pointers -> 1 destination pointer, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): dst0, dstEnd, `step`, the pointer advance after subSSEMethod and
// the use of srcOffset/dstOffset are on lines not visible here; confirm.
1551 template<
class S,
class D>
1552 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
1554 long srcWidth,
long dstWidth,
long lineWidth,
1555 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*),
1556 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*),
1558 D *dstLEnd = dst0 + lineWidth;
1559 D *dstSSEEnd = dstLEnd - (step - 1);
1560 long srcOffset = srcWidth - lineWidth;
1561 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1563 for (; dst0<dstEnd;) {
1564 if (dst0<dstSSEEnd) {
1566 (*subSSEMethod)(src0, src1, src2, src3, dst0);
// Rest of the current line, one element per subMethod call.
1575 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++src3, ++dst0) {
1577 (*subMethod)(src0, src1, src2, src3, dst0);
// Move both line markers to the next line.
1581 dstLEnd += dstWidth;
1582 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 4 source pointers -> 2 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1592 template<
class S,
class D>
1593 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
1594 D *dst0, D *dst1, D *dstEnd,
1595 long srcWidth,
long dstWidth,
long lineWidth,
1596 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
1597 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
1599 D *dstLEnd = dst0 + lineWidth;
1600 D *dstSSEEnd = dstLEnd - (step - 1);
1601 long srcOffset = srcWidth - lineWidth;
1602 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1604 for (; dst0<dstEnd;) {
1605 if (dst0<dstSSEEnd) {
1607 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1);
// Rest of the current line, one element per subMethod call.
1617 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++src3, ++dst0, ++dst1) {
1619 (*subMethod)(src0, src1, src2, src3, dst0, dst1);
// Move both line markers to the next line.
1623 dstLEnd += dstWidth;
1624 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 4 source pointers -> 3 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1635 template<
class S,
class D>
1636 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
1637 D *dst0, D *dst1, D *dst2, D *dstEnd,
1638 long srcWidth,
long dstWidth,
long lineWidth,
1639 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
1640 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
1642 D *dstLEnd = dst0 + lineWidth;
1643 D *dstSSEEnd = dstLEnd - (step - 1);
1644 long srcOffset = srcWidth - lineWidth;
1645 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1647 for (; dst0<dstEnd;) {
1648 if (dst0<dstSSEEnd) {
1650 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// Rest of the current line, one element per subMethod call.
1661 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++src3, ++dst0, ++dst1, ++dst2) {
1663 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// Move both line markers to the next line.
1667 dstLEnd += dstWidth;
1668 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 4 source pointers -> 4 destination pointers, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): `step`, the pointer advance after subSSEMethod and the use of
// srcOffset/dstOffset are on lines not visible here; confirm.
1680 template<
class S,
class D>
1681 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
1682 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
1683 long srcWidth,
long dstWidth,
long lineWidth,
1684 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
1685 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
1687 D *dstLEnd = dst0 + lineWidth;
1688 D *dstSSEEnd = dstLEnd - (step - 1);
1689 long srcOffset = srcWidth - lineWidth;
1690 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1692 for (; dst0<dstEnd;) {
1693 if (dst0<dstSSEEnd) {
1695 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// Rest of the current line, one element per subMethod call.
1707 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++src3, ++dst0, ++dst1, ++dst2, ++dst3) {
1709 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// Move both line markers to the next line.
1713 dstLEnd += dstWidth;
1714 dstSSEEnd += dstWidth;
// sse_for (2-D, single step): 5 source pointers -> 1 destination pointer, line by line.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// step - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line element by element, and both
// markers are moved on by dstWidth. srcOffset/dstOffset are row width minus lineWidth.
// NOTE(review): dst0, dstEnd, `step`, the pointer advance after subSSEMethod and
// the use of srcOffset/dstOffset are on lines not visible here; confirm.
1727 template<
class S,
class D>
1728 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
const S *src4,
1730 long srcWidth,
long dstWidth,
long lineWidth,
1731 void (*subMethod)(
const S*,
const S*,
const S*,
const S*,
const S*, D*),
1732 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*,
const S*, D*),
1734 D *dstLEnd = dst0 + lineWidth;
1735 D *dstSSEEnd = dstLEnd - (step - 1);
1736 long srcOffset = srcWidth - lineWidth;
1737 long dstOffset = dstWidth - lineWidth;
// Outer loop: runs until the destination pointer reaches dstEnd.
1739 for (; dst0<dstEnd;) {
1740 if (dst0<dstSSEEnd) {
1742 (*subSSEMethod)(src0, src1, src2, src3, src4, dst0);
// Rest of the current line, one element per subMethod call.
1752 for (; dst0<dstLEnd; ++src0, ++src1, ++src2, ++src3, ++src4, ++dst0) {
1754 (*subMethod)(src0, src1, src2, src3, src4, dst0);
// Move both line markers to the next line.
1758 dstLEnd += dstWidth;
1759 dstSSEEnd += dstWidth;
// sse_for (2-D, separate srcStep/dstStep): 1 source pointer -> 1 destination pointer.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// dstStep - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line (sources move by sStep,
// destinations by dStep), and both markers are moved on by dstWidth.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): dst0, dstEnd, the declarations of sStep/dStep, the pointer advance
// after subSSEMethod and the use of srcOffset/dstOffset are not visible here.
// NOTE(review): integer division -- a zero step would divide by zero.
1774 template<
class S,
class D>
1775 inline void sse_for(
const S *src0,
1777 long srcWidth,
long dstWidth,
long lineWidth,
1778 void (*subMethod)(
const S*, D*),
1779 void (*subSSEMethod)(
const S*, D*),
1780 long srcStep,
long dstStep) {
1781 D *dstLEnd = dst0 + lineWidth;
1782 D *dstSSEEnd = dstLEnd - (dstStep - 1);
1783 long srcOffset = srcWidth - lineWidth;
1784 long dstOffset = dstWidth - lineWidth;
// Derive the per-call strides of the scalar remainder from the step ratio.
1787 if (srcStep < dstStep) {
1788 dStep = dstStep / srcStep;
1791 sStep = srcStep / dstStep;
// Outer loop: runs until the destination pointer reaches dstEnd.
1795 for (; dst0<dstEnd;) {
1796 if (dst0<dstSSEEnd) {
1798 (*subSSEMethod)(src0, dst0);
// Rest of the current line, handled by subMethod with the derived strides.
1804 for (; dst0<dstLEnd; src0 += sStep, dst0 += dStep) {
1806 (*subMethod)(src0, dst0);
// Move both line markers to the next line.
1810 dstLEnd += dstWidth;
1811 dstSSEEnd += dstWidth;
// sse_for (2-D, separate srcStep/dstStep): 1 source pointer -> 2 destination pointers.
// dstLEnd is the end of the current line (dst0 + lineWidth) and dstSSEEnd lies
// dstStep - 1 elements before it. Until dst0 reaches dstEnd, subSSEMethod is called
// when dst0 < dstSSEEnd, subMethod finishes the line (sources move by sStep,
// destinations by dStep), and both markers are moved on by dstWidth.
// dStep = dstStep / srcStep when srcStep < dstStep; sStep = srcStep / dstStep is
// presumably assigned in the other branch.
// NOTE(review): the declarations of sStep/dStep, the pointer advance after
// subSSEMethod and the use of srcOffset/dstOffset are not visible here.
// NOTE(review): integer division -- a zero step would divide by zero.
1818 template<
class S,
class D>
1819 inline void sse_for(
const S *src0,
1820 D *dst0, D *dst1, D *dstEnd,
1821 long srcWidth,
long dstWidth,
long lineWidth,
1822 void (*subMethod)(
const S*, D*, D*),
1823 void (*subSSEMethod)(
const S*, D*, D*),
1824 long srcStep,
long dstStep) {
1825 D *dstLEnd = dst0 + lineWidth;
1826 D *dstSSEEnd = dstLEnd - (dstStep - 1);
1827 long srcOffset = srcWidth - lineWidth;
1828 long dstOffset = dstWidth - lineWidth;
// Derive the per-call strides of the scalar remainder from the step ratio.
1831 if (srcStep < dstStep) {
1832 dStep = dstStep / srcStep;
1835 sStep = srcStep / dstStep;
// Outer loop: runs until the destination pointer reaches dstEnd.
1839 for (; dst0<dstEnd;) {
1840 if (dst0<dstSSEEnd) {
1842 (*subSSEMethod)(src0, dst0, dst1);
// Rest of the current line, handled by subMethod with the derived strides.
1849 for (; dst0<dstLEnd; src0 += sStep, dst0 += dStep, dst1 += dStep) {
1851 (*subMethod)(src0, dst0, dst1);
// Move both line markers to the next line.
1855 dstLEnd += dstWidth;
1856 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: one source pointer, three destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
1864 template<
class S,
class D>
1865 inline void sse_for(
const S *src0,
1866 D *dst0, D *dst1, D *dst2, D *dstEnd,
1867 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
1868 void (*subMethod)(
const S*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
1869 void (*subSSEMethod)(
const S*, D*, D*, D*),
1870 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
1871 D *dstLEnd = dst0 + lineWidth;
1872 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
1873 long srcOffset = srcWidth - lineWidth;
1874 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
1877 if (srcStep < dstStep) {
1878 dStep = dstStep / srcStep;
1881 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
1885 for (; dst0<dstEnd;) {
1886 if (dst0<dstSSEEnd) {
1888 (*subSSEMethod)(src0, dst0, dst1, dst2);
// Remainder of the line: source advances by sStep, all destinations by dStep.
1896 for (; dst0<dstLEnd; src0 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
1898 (*subMethod)(src0, dst0, dst1, dst2);
// Move both line-end markers forward by one destination width (next line).
1902 dstLEnd += dstWidth;
1903 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: one source pointer, four destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
1912 template<
class S,
class D>
1913 inline void sse_for(
const S *src0,
1914 D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd,
1915 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
1916 void (*subMethod)(
const S*, D*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
1917 void (*subSSEMethod)(
const S*, D*, D*, D*, D*),
1918 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
1919 D *dstLEnd = dst0 + lineWidth;
1920 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
1921 long srcOffset = srcWidth - lineWidth;
1922 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
1925 if (srcStep < dstStep) {
1926 dStep = dstStep / srcStep;
1929 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
1933 for (; dst0<dstEnd;) {
1934 if (dst0<dstSSEEnd) {
1936 (*subSSEMethod)(src0, dst0, dst1, dst2, dst3);
// Remainder of the line: source advances by sStep, all destinations by dStep.
1945 for (; dst0<dstLEnd; src0 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
1947 (*subMethod)(src0, dst0, dst1, dst2, dst3);
// Move both line-end markers forward by one destination width (next line).
1951 dstLEnd += dstWidth;
1952 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: two source pointers, two destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
1962 template<
class S,
class D>
1963 inline void sse_for(
const S *src0,
const S *src1,
1964 D *dst0, D *dst1, D *dstEnd,
1965 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
1966 void (*subMethod)(
const S*,
const S*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
1967 void (*subSSEMethod)(
const S*,
const S*, D*, D*),
1968 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
1969 D *dstLEnd = dst0 + lineWidth;
1970 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
1971 long srcOffset = srcWidth - lineWidth;
1972 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
1975 if (srcStep < dstStep) {
1976 dStep = dstStep / srcStep;
1979 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
1983 for (; dst0<dstEnd;) {
1984 if (dst0<dstSSEEnd) {
1986 (*subSSEMethod)(src0, src1, dst0, dst1);
// Remainder of the line: sources advance by sStep, destinations by dStep.
1994 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, dst0 += dStep, dst1 += dStep) {
1996 (*subMethod)(src0, src1, dst0, dst1);
// Move both line-end markers forward by one destination width (next line).
2000 dstLEnd += dstWidth;
2001 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: two source pointers, three destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2010 template<
class S,
class D>
2011 inline void sse_for(
const S *src0,
const S *src1,
2012 D *dst0, D *dst1, D *dst2, D *dstEnd,
2013 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2014 void (*subMethod)(
const S*,
const S*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2015 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*),
2016 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2017 D *dstLEnd = dst0 + lineWidth;
2018 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2019 long srcOffset = srcWidth - lineWidth;
2020 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2023 if (srcStep < dstStep) {
2024 dStep = dstStep / srcStep;
2027 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2031 for (; dst0<dstEnd;) {
2032 if (dst0<dstSSEEnd) {
2034 (*subSSEMethod)(src0, src1, dst0, dst1, dst2);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2043 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
2045 (*subMethod)(src0, src1, dst0, dst1, dst2);
// Move both line-end markers forward by one destination width (next line).
2049 dstLEnd += dstWidth;
2050 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: two source pointers, four destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2060 template<
class S,
class D>
2061 inline void sse_for(
const S *src0,
const S *src1,
2062 D *dst0, D *dst1, D *dst2, D *dst3, D *dstEnd,
2063 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2064 void (*subMethod)(
const S*,
const S*, D*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2065 void (*subSSEMethod)(
const S*,
const S*, D*, D*, D*, D*),
2066 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2067 D *dstLEnd = dst0 + lineWidth;
2068 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2069 long srcOffset = srcWidth - lineWidth;
2070 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2073 if (srcStep < dstStep) {
2074 dStep = dstStep / srcStep;
2077 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2081 for (; dst0<dstEnd;) {
2082 if (dst0<dstSSEEnd) {
2084 (*subSSEMethod)(src0, src1, dst0, dst1, dst2, dst3);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2094 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
2096 (*subMethod)(src0, src1, dst0, dst1, dst2, dst3);
// Move both line-end markers forward by one destination width (next line).
2100 dstLEnd += dstWidth;
2101 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: three source pointers, one destination pointer,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the dst0/dstEnd parameter line,
// the sStep/dStep declarations, closing braces, and whatever follows the subSSEMethod
// call), so the comments below describe only the statements that are shown.
2112 template<
class S,
class D>
2113 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
2115 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2116 void (*subMethod)(
const S*,
const S*,
const S*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2117 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*),
2118 long srcStep,
long dstStep) {
// dstLEnd marks the end of the current line; dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2119 D *dstLEnd = dst0 + lineWidth;
2120 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2121 long srcOffset = srcWidth - lineWidth;
2122 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2125 if (srcStep < dstStep) {
2126 dStep = dstStep / srcStep;
2129 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2133 for (; dst0<dstEnd;) {
2134 if (dst0<dstSSEEnd) {
2136 (*subSSEMethod)(src0, src1, src2, dst0);
// Remainder of the line: sources advance by sStep, destination by dStep.
2144 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep) {
2146 (*subMethod)(src0, src1, src2, dst0);
// Move both line-end markers forward by one destination width (next line).
2150 dstLEnd += dstWidth;
2151 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: three source pointers, two destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2160 template<
class S,
class D>
2161 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
2162 D *dst0, D *dst1, D *dstEnd,
2163 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2164 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2165 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*),
2166 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2167 D *dstLEnd = dst0 + lineWidth;
2168 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2169 long srcOffset = srcWidth - lineWidth;
2170 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2173 if (srcStep < dstStep) {
2174 dStep = dstStep / srcStep;
2177 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2181 for (; dst0<dstEnd;) {
2182 if (dst0<dstSSEEnd) {
2184 (*subSSEMethod)(src0, src1, src2, dst0, dst1);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2193 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep, dst1 += dStep) {
2195 (*subMethod)(src0, src1, src2, dst0, dst1);
// Move both line-end markers forward by one destination width (next line).
2199 dstLEnd += dstWidth;
2200 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: three source pointers, three destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2210 template<
class S,
class D>
2211 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
2212 D *dst0, D *dst1, D *dst2, D *dstEnd,
2213 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2214 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2215 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*),
2216 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2217 D *dstLEnd = dst0 + lineWidth;
2218 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2219 long srcOffset = srcWidth - lineWidth;
2220 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2223 if (srcStep < dstStep) {
2224 dStep = dstStep / srcStep;
2227 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2231 for (; dst0<dstEnd;) {
2232 if (dst0<dstSSEEnd) {
2234 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2244 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
2246 (*subMethod)(src0, src1, src2, dst0, dst1, dst2);
// Move both line-end markers forward by one destination width (next line).
2250 dstLEnd += dstWidth;
2251 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: three source pointers, four destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2262 template<
class S,
class D>
2263 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
2264 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
2265 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2266 void (*subMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2267 void (*subSSEMethod)(
const S*,
const S*,
const S*, D*, D*, D*, D*),
2268 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2269 D *dstLEnd = dst0 + lineWidth;
2270 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2271 long srcOffset = srcWidth - lineWidth;
2272 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2275 if (srcStep < dstStep) {
2276 dStep = dstStep / srcStep;
2279 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2283 for (; dst0<dstEnd;) {
2284 if (dst0<dstSSEEnd) {
2286 (*subSSEMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2297 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
2299 (*subMethod)(src0, src1, src2, dst0, dst1, dst2, dst3);
// Move both line-end markers forward by one destination width (next line).
2303 dstLEnd += dstWidth;
2304 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: four source pointers, one destination pointer,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the dst0/dstEnd parameter line,
// the sStep/dStep declarations, closing braces, and whatever follows the subSSEMethod
// call), so the comments below describe only the statements that are shown.
2316 template<
class S,
class D>
2317 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
2319 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2320 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2321 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*),
2322 long srcStep,
long dstStep) {
// dstLEnd marks the end of the current line; dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2323 D *dstLEnd = dst0 + lineWidth;
2324 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2325 long srcOffset = srcWidth - lineWidth;
2326 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2329 if (srcStep < dstStep) {
2330 dStep = dstStep / srcStep;
2333 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2337 for (; dst0<dstEnd;) {
2338 if (dst0<dstSSEEnd) {
2340 (*subSSEMethod)(src0, src1, src2, src3, dst0);
// Remainder of the line: sources advance by sStep, destination by dStep.
2349 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep) {
2351 (*subMethod)(src0, src1, src2, src3, dst0);
// Move both line-end markers forward by one destination width (next line).
2355 dstLEnd += dstWidth;
2356 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: four source pointers, two destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2366 template<
class S,
class D>
2367 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
2368 D *dst0, D *dst1, D *dstEnd,
2369 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2370 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2371 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*),
2372 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2373 D *dstLEnd = dst0 + lineWidth;
2374 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2375 long srcOffset = srcWidth - lineWidth;
2376 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2379 if (srcStep < dstStep) {
2380 dStep = dstStep / srcStep;
2383 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2387 for (; dst0<dstEnd;) {
2388 if (dst0<dstSSEEnd) {
2390 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2400 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep, dst1 += dStep) {
2402 (*subMethod)(src0, src1, src2, src3, dst0, dst1);
// Move both line-end markers forward by one destination width (next line).
2406 dstLEnd += dstWidth;
2407 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: four source pointers, three destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2418 template<
class S,
class D>
2419 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
2420 D *dst0, D *dst1, D *dst2, D *dstEnd,
2421 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2422 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2423 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*),
2424 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2425 D *dstLEnd = dst0 + lineWidth;
2426 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2427 long srcOffset = srcWidth - lineWidth;
2428 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2431 if (srcStep < dstStep) {
2432 dStep = dstStep / srcStep;
2435 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2439 for (; dst0<dstEnd;) {
2440 if (dst0<dstSSEEnd) {
2442 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2453 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep) {
2455 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2);
// Move both line-end markers forward by one destination width (next line).
2459 dstLEnd += dstWidth;
2460 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: four source pointers, four destination pointers,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the sStep/dStep declarations,
// closing braces, and whatever follows the subSSEMethod call), so the comments below
// describe only the statements that are shown.
2472 template<
class S,
class D>
2473 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
2474 D *dst0, D *dst1, D *dst2, D* dst3, D *dstEnd,
2475 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2476 void (*subMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2477 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*, D*, D*, D*, D*),
2478 long srcStep,
long dstStep) {
// Line bounds are tracked on dst0 only. dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2479 D *dstLEnd = dst0 + lineWidth;
2480 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2481 long srcOffset = srcWidth - lineWidth;
2482 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2485 if (srcStep < dstStep) {
2486 dStep = dstStep / srcStep;
2489 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2493 for (; dst0<dstEnd;) {
2494 if (dst0<dstSSEEnd) {
2496 (*subSSEMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// Remainder of the line: sources advance by sStep, destinations by dStep.
2508 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, dst0 += dStep, dst1 += dStep, dst2 += dStep, dst3 += dStep) {
2510 (*subMethod)(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
// Move both line-end markers forward by one destination width (next line).
2514 dstLEnd += dstWidth;
2515 dstSSEEnd += dstWidth;
// Line-by-line dispatcher: five source pointers, one destination pointer,
// with separate source and destination step sizes.
// NOTE(review): this listing omits some source lines (the dst0/dstEnd parameter line,
// the sStep/dStep declarations, closing braces, and whatever follows the subSSEMethod
// call), so the comments below describe only the statements that are shown.
2528 template<
class S,
class D>
2529 inline void sse_for(
const S *src0,
const S *src1,
const S *src2,
const S *src3,
const S *src4,
2531 long srcWidth,
long dstWidth,
long lineWidth,
// Callback invoked once per iteration of the remainder loop below.
2532 void (*subMethod)(
const S*,
const S*,
const S*,
const S*,
const S*, D*),
// Callback invoked while dst0 is still below dstSSEEnd.
2533 void (*subSSEMethod)(
const S*,
const S*,
const S*,
const S*,
const S*, D*),
2534 long srcStep,
long dstStep) {
// dstLEnd marks the end of the current line; dst0 < dstSSEEnd is equivalent to
// dst0 + dstStep <= dstLEnd, i.e. a block of dstStep elements still fits in the line.
2535 D *dstLEnd = dst0 + lineWidth;
2536 D *dstSSEEnd = dstLEnd - (dstStep - 1);
// Full width minus processed line width; the statements consuming these offsets
// are not visible in this excerpt.
2537 long srcOffset = srcWidth - lineWidth;
2538 long dstOffset = dstWidth - lineWidth;
// Remainder-loop increments come from the integer ratio of the two step sizes.
// The branch structure around the sStep assignment is not visible here.
2541 if (srcStep < dstStep) {
2542 dStep = dstStep / srcStep;
2545 sStep = srcStep / dstStep;
// Outer loop runs until dst0 reaches dstEnd; it has no increment expression of its own.
2549 for (; dst0<dstEnd;) {
2550 if (dst0<dstSSEEnd) {
2552 (*subSSEMethod)(src0, src1, src2, src3, src4, dst0);
// Remainder of the line: sources advance by sStep, destination by dStep.
2562 for (; dst0<dstLEnd; src0 += sStep, src1 += sStep, src2 += sStep, src3 += sStep, src4 += sStep, dst0 += dStep) {
2564 (*subMethod)(src0, src1, src2, src3, src4, dst0);
// Move both line-end markers forward by one destination width (next line).
2568 dstLEnd += dstWidth;
2569 dstSSEEnd += dstWidth;
Uncomment this line if you encounter any issues!
Definition: Any.h:37