33 #if defined ICL_USE_SSE2 && (__SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)) 34 #include "emmintrin.h" 36 #if defined ICL_USE_SSE3 && (__SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)) 37 #include "pmmintrin.h" 39 #if defined ICL_USE_SSSE3 && (__SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)) 40 #include "tmmintrin.h" 41 #define ICL_HAVE_SSSE3 211 inline Icl128i(
const Icl128i &v) {
215 inline Icl128i(
const __m128i &v) {
219 inline Icl128i(
const __m128i *v) {
220 v0 = _mm_loadu_si128(v);
223 inline Icl128i& operator=(
const Icl128i &v) {
228 inline operator __m128i ()
const {
232 inline Icl128i& operator&=(
const Icl128i &v) {
233 v0 = _mm_and_si128(v0, v.v0);
237 inline Icl128i& operator|=(
const Icl128i &v) {
238 v0 = _mm_or_si128(v0, v.v0);
242 inline Icl128i& operator^=(
const Icl128i &v) {
243 v0 = _mm_xor_si128(v0, v.v0);
247 inline Icl128i& andnot(
const Icl128i &v) {
248 v0 = _mm_andnot_si128(v.v0, v0);
252 inline void store(__m128i *v)
const {
253 _mm_store_si128(v, v0);
256 inline void storeu(__m128i *v)
const {
257 _mm_storeu_si128(v, v0);
266 __m128 v0; __m128 v1;
270 __m128i v0; __m128i v1;
275 inline Icl256i(
const Icl256i &v) {
280 inline Icl256i(
const __m128i &vl,
const __m128i &vh) {
285 inline Icl256i(
const __m128i *v) {
290 inline Icl256i& operator=(
const Icl256i &v) {
296 inline Icl256i& operator&=(
const Icl256i &v) {
297 v0 = _mm_and_si128(v0, v.v0);
298 v1 = _mm_and_si128(v1, v.v1);
302 inline Icl256i& operator|=(
const Icl256i &v) {
303 v0 = _mm_or_si128(v0, v.v0);
304 v1 = _mm_or_si128(v1, v.v1);
308 inline Icl256i& operator^=(
const Icl256i &v) {
309 v0 = _mm_xor_si128(v0, v.v0);
310 v1 = _mm_xor_si128(v1, v.v1);
314 inline Icl256i& andnot(
const Icl256i &v) {
315 v0 = _mm_andnot_si128(v.v0, v0);
316 v1 = _mm_andnot_si128(v.v1, v1);
320 inline void store(__m128i *v)
const {
321 _mm_store_si128(v, v0);
322 _mm_store_si128(v + 1, v1);
325 inline void storeu(__m128i *v)
const {
326 _mm_storeu_si128(v, v0);
327 _mm_storeu_si128(v + 1, v1);
332 __m128d v0; __m128d v1;
336 __m128 v0; __m128 v1; __m128 v2; __m128 v3;
340 __m128i v0; __m128i v1; __m128i v2; __m128i v3;
345 inline Icl512i(
const Icl512i &v) {
352 inline Icl512i(
const __m128i &vll,
const __m128i &vlh,
353 const __m128i &vhl,
const __m128i &vhh) {
360 inline Icl512i(
const __m128i *v) {
367 inline Icl512i& operator=(
const Icl512i &v) {
375 inline Icl512i& operator&=(
const Icl512i &v) {
376 v0 = _mm_and_si128(v0, v.v0);
377 v1 = _mm_and_si128(v1, v.v1);
378 v2 = _mm_and_si128(v2, v.v2);
379 v3 = _mm_and_si128(v3, v.v3);
383 inline Icl512i& operator|=(
const Icl512i &v) {
384 v0 = _mm_or_si128(v0, v.v0);
385 v1 = _mm_or_si128(v1, v.v1);
386 v2 = _mm_or_si128(v2, v.v2);
387 v3 = _mm_or_si128(v3, v.v3);
391 inline Icl512i& operator^=(
const Icl512i &v) {
392 v0 = _mm_xor_si128(v0, v.v0);
393 v1 = _mm_xor_si128(v1, v.v1);
394 v2 = _mm_xor_si128(v2, v.v2);
395 v3 = _mm_xor_si128(v3, v.v3);
399 inline Icl512i& andnot(
const Icl512i &v) {
400 v0 = _mm_andnot_si128(v.v0, v0);
401 v1 = _mm_andnot_si128(v.v1, v1);
402 v2 = _mm_andnot_si128(v.v2, v2);
403 v3 = _mm_andnot_si128(v.v3, v3);
407 inline void store(__m128i *v)
const {
408 _mm_store_si128(v, v0);
409 _mm_store_si128(v + 1, v1);
410 _mm_store_si128(v + 2, v2);
411 _mm_store_si128(v + 3, v3);
414 inline void storeu(__m128i *v)
const {
415 _mm_storeu_si128(v, v0);
416 _mm_storeu_si128(v + 1, v1);
417 _mm_storeu_si128(v + 2, v2);
418 _mm_storeu_si128(v + 3, v3);
423 __m128d v0; __m128d v1; __m128d v2; __m128d v3;
427 __m128d v0; __m128d v1; __m128d v2; __m128d v3;
428 __m128d v4; __m128d v5; __m128d v6; __m128d v7;
437 struct icl128 : Icl128 {
441 inline icl128(
const icl128 &v) {
445 inline icl128(
const __m128 &v) {
449 inline icl128(
const icl32f *v) {
450 v0 = _mm_loadu_ps(v);
453 inline icl128(
const icl32f v) {
457 inline icl128(
const Icl128 &v) {
461 inline icl128(
const Icl128i &v) {
462 v0 = _mm_cvtepi32_ps(v.v0);
480 inline icl128& operator=(
const icl128 &v) {
485 inline icl128& operator=(
const Icl128 &v) {
495 inline operator __m128 ()
const {
499 inline icl128& operator+=(
const Icl128 &v) {
500 v0 = _mm_add_ps(v0, v.v0);
504 inline icl128& operator-=(
const Icl128 &v) {
505 v0 = _mm_sub_ps(v0, v.v0);
510 v0 = _mm_mul_ps(v0, v.v0);
514 inline icl128& operator/=(
const Icl128 &v) {
515 v0 = _mm_div_ps(v0, v.v0);
519 inline icl128& operator&=(
const Icl128 &v) {
520 v0 = _mm_and_ps(v0, v.v0);
524 inline icl128& operator|=(
const Icl128 &v) {
525 v0 = _mm_or_ps(v0, v.v0);
529 inline icl128& operator^=(
const Icl128 &v) {
530 v0 = _mm_xor_ps(v0, v.v0);
534 inline icl128& andnot(
const Icl128 &v) {
535 v0 = _mm_andnot_ps(v.v0, v0);
539 inline icl128& rcp() {
544 inline void store(
icl32f *v)
const {
548 inline void storeu(
icl32f *v)
const {
549 _mm_storeu_ps(v, v0);
554 struct icl256 : Icl256 {
558 inline icl256(
const icl256 &v) {
563 inline icl256(
const __m128 &vl,
const __m128 &vh) {
568 inline icl256(
const __m128 *v) {
573 inline icl256(
const icl32f v) {
578 inline icl256(
const Icl256 &v) {
583 inline icl256(
const Icl256i &v) {
584 v0 = _mm_cvtepi32_ps(v.v0);
585 v1 = _mm_cvtepi32_ps(v.v1);
600 inline icl256& operator=(
const icl256 &v) {
606 inline icl256& operator=(
const Icl256 &v) {
618 inline icl256& operator+=(
const Icl256 &v) {
619 v0 = _mm_add_ps(v0, v.v0);
620 v1 = _mm_add_ps(v1, v.v1);
624 inline icl256& operator-=(
const Icl256 &v) {
625 v0 = _mm_sub_ps(v0, v.v0);
626 v1 = _mm_sub_ps(v1, v.v1);
631 v0 = _mm_mul_ps(v0, v.v0);
632 v1 = _mm_mul_ps(v1, v.v1);
636 inline icl256& operator/=(
const Icl256 &v) {
637 v0 = _mm_div_ps(v0, v.v0);
638 v1 = _mm_div_ps(v1, v.v1);
642 inline icl256& operator&=(
const Icl256 &v) {
643 v0 = _mm_and_ps(v0, v.v0);
644 v1 = _mm_and_ps(v1, v.v1);
648 inline icl256& operator|=(
const Icl256 &v) {
649 v0 = _mm_or_ps(v0, v.v0);
650 v1 = _mm_or_ps(v1, v.v1);
654 inline icl256& operator^=(
const Icl256 &v) {
655 v0 = _mm_xor_ps(v0, v.v0);
656 v1 = _mm_xor_ps(v1, v.v1);
660 inline icl256& andnot(
const Icl256 &v) {
661 v0 = _mm_andnot_ps(v.v0, v0);
662 v1 = _mm_andnot_ps(v.v1, v1);
666 inline icl256& rcp() {
672 inline void store(
icl32f *v)
const {
674 _mm_store_ps(v + 4, v1);
677 inline void storeu(
icl32f *v)
const {
678 _mm_storeu_ps(v, v0);
679 _mm_storeu_ps(v + 4, v1);
684 struct icl512 : Icl512 {
688 inline icl512(
const icl512 &v) {
695 inline icl512(
const __m128 &vll,
const __m128 &vlh,
696 const __m128 &vhl,
const __m128 &vhh) {
703 inline icl512(
const __m128 *v) {
710 inline icl512(
const icl8u *v) {
711 const __m128i vk0 = _mm_setzero_si128();
712 __m128i vt0, vt1, vt2, vt3;
714 vt3 = _mm_loadu_si128((__m128i*)v);
716 vt1 = _mm_unpacklo_epi8(vt3, vk0);
717 vt3 = _mm_unpackhi_epi8(vt3, vk0);
719 vt0 = _mm_unpacklo_epi16(vt1, vk0);
720 vt1 = _mm_unpackhi_epi16(vt1, vk0);
721 vt2 = _mm_unpacklo_epi16(vt3, vk0);
722 vt3 = _mm_unpackhi_epi16(vt3, vk0);
724 v0 = _mm_cvtepi32_ps(vt0);
725 v1 = _mm_cvtepi32_ps(vt1);
726 v2 = _mm_cvtepi32_ps(vt2);
727 v3 = _mm_cvtepi32_ps(vt3);
730 inline icl512(
const icl32f *v) {
731 v0 = _mm_loadu_ps(v);
732 v1 = _mm_loadu_ps(v + 4);
733 v2 = _mm_loadu_ps(v + 8);
734 v3 = _mm_loadu_ps(v + 12);
737 inline icl512(
const Icl512 &v) {
744 inline icl512(
const Icl512i &v) {
745 v0 = _mm_cvtepi32_ps(v.v0);
746 v1 = _mm_cvtepi32_ps(v.v1);
747 v2 = _mm_cvtepi32_ps(v.v2);
748 v3 = _mm_cvtepi32_ps(v.v3);
751 inline icl512(
const icl32f v) {
774 inline icl512& operator=(
const icl512 &v) {
782 inline icl512& operator=(
const Icl512 &v) {
806 inline icl512& operator+=(
const Icl512 &v) {
807 v0 = _mm_add_ps(v0, v.v0);
808 v1 = _mm_add_ps(v1, v.v1);
809 v2 = _mm_add_ps(v2, v.v2);
810 v3 = _mm_add_ps(v3, v.v3);
814 inline icl512& operator-=(
const Icl512 &v) {
815 v0 = _mm_sub_ps(v0, v.v0);
816 v1 = _mm_sub_ps(v1, v.v1);
817 v2 = _mm_sub_ps(v2, v.v2);
818 v3 = _mm_sub_ps(v3, v.v3);
823 v0 = _mm_mul_ps(v0, v.v0);
824 v1 = _mm_mul_ps(v1, v.v1);
825 v2 = _mm_mul_ps(v2, v.v2);
826 v3 = _mm_mul_ps(v3, v.v3);
830 inline icl512& operator/=(
const Icl512 &v) {
831 v0 = _mm_div_ps(v0, v.v0);
832 v1 = _mm_div_ps(v1, v.v1);
833 v2 = _mm_div_ps(v2, v.v2);
834 v3 = _mm_div_ps(v3, v.v3);
838 inline icl512& operator&=(
const Icl512 &v) {
839 v0 = _mm_and_ps(v0, v.v0);
840 v1 = _mm_and_ps(v1, v.v1);
841 v2 = _mm_and_ps(v2, v.v2);
842 v3 = _mm_and_ps(v3, v.v3);
846 inline icl512& operator|=(
const Icl512 &v) {
847 v0 = _mm_or_ps(v0, v.v0);
848 v1 = _mm_or_ps(v1, v.v1);
849 v2 = _mm_or_ps(v2, v.v2);
850 v3 = _mm_or_ps(v3, v.v3);
854 inline icl512& operator^=(
const Icl512 &v) {
855 v0 = _mm_xor_ps(v0, v.v0);
856 v1 = _mm_xor_ps(v1, v.v1);
857 v2 = _mm_xor_ps(v2, v.v2);
858 v3 = _mm_xor_ps(v3, v.v3);
862 inline icl512& andnot(
const Icl512 &v) {
863 v0 = _mm_andnot_ps(v.v0, v0);
864 v1 = _mm_andnot_ps(v.v1, v1);
865 v2 = _mm_andnot_ps(v.v2, v2);
866 v3 = _mm_andnot_ps(v.v3, v3);
870 inline icl512& rcp() {
878 inline void store(
icl8u *v)
const {
885 __m128i vt0 = _mm_cvtps_epi32(v0);
886 __m128i vt1 = _mm_cvtps_epi32(v1);
887 __m128i vt2 = _mm_cvtps_epi32(v2);
888 __m128i vt3 = _mm_cvtps_epi32(v3);
890 vt0 = _mm_packus_epi16(_mm_packs_epi32(vt0, vt1), _mm_packs_epi32(vt2, vt3));
891 _mm_store_si128((__m128i*)v, vt0);
894 inline void storeu(
icl8u *v)
const {
901 __m128i vt0 = _mm_cvtps_epi32(v0);
902 __m128i vt1 = _mm_cvtps_epi32(v1);
903 __m128i vt2 = _mm_cvtps_epi32(v2);
904 __m128i vt3 = _mm_cvtps_epi32(v3);
906 vt0 = _mm_packus_epi16(_mm_packs_epi32(vt0, vt1), _mm_packs_epi32(vt2, vt3));
907 _mm_storeu_si128((__m128i*)v, vt0);
910 inline void store(
icl32f *v)
const {
912 _mm_store_ps(v + 4, v1);
913 _mm_store_ps(v + 8, v2);
914 _mm_store_ps(v + 12, v3);
917 inline void storeu(
icl32f *v)
const {
918 _mm_storeu_ps(v, v0);
919 _mm_storeu_ps(v + 4, v1);
920 _mm_storeu_ps(v + 8, v2);
921 _mm_storeu_ps(v + 12, v3);
926 struct icl128i8u : Icl128i {
930 inline icl128i8u(
const Icl128i &v) {
934 inline icl128i8u(
const icl128i8u &v) {
938 inline icl128i8u(
const __m128i &v) {
942 inline icl128i8u(
const __m128i *v) {
943 v0 = _mm_loadu_si128(v);
946 inline icl128i8u(
const icl8s *v) {
947 v0 = _mm_loadu_si128((__m128i*)v);
950 inline icl128i8u(
const icl8u *v) {
951 v0 = _mm_loadu_si128((__m128i*)v);
954 inline icl128i8u(
const icl8s v) {
955 v0 = _mm_set1_epi8(v);
958 inline icl128i8u(
const Icl256i &v) {
960 v0 = _mm_packus_epi16(v.v0, v.v1);
963 inline icl128i8u(
const Icl512i &v) {
965 v0 = _mm_packus_epi16(_mm_packs_epi32(v.v0, v.v1), _mm_packs_epi32(v.v2, v.v3));
968 inline operator Icl128i ()
const {
972 inline icl128i8u& operator=(
const icl128i8u &v) {
977 inline icl128i8u& operator=(
const Icl128i &v) {
982 inline icl128i8u& operator+=(
const icl128i8u &v) {
983 v0 = _mm_add_epi8(v0, v.v0);
987 inline icl128i8u& operator-=(
const icl128i8u &v) {
988 v0 = _mm_sub_epi8(v0, v.v0);
992 inline void store(__m128i *v)
const {
993 _mm_store_si128(v, v0);
996 inline void storeu(__m128i *v)
const {
997 _mm_storeu_si128(v, v0);
1000 inline void store(
icl8s *v)
const {
1001 _mm_store_si128((__m128i*)v, v0);
1004 inline void storeu(
icl8s *v)
const {
1005 _mm_storeu_si128((__m128i*)v, v0);
1008 inline void store(
icl8u *v)
const {
1009 _mm_store_si128((__m128i*)v, v0);
1012 inline void storeu(
icl8u *v)
const {
1013 _mm_storeu_si128((__m128i*)v, v0);
1018 struct icl128i16s : Icl128i {
1019 inline icl128i16s() {
1022 inline icl128i16s(
const icl128i16s &v) {
1026 inline icl128i16s(
const Icl128i &v) {
1030 inline icl128i16s(
const __m128i &v) {
1034 inline icl128i16s(
const __m128i *v) {
1035 v0 = _mm_loadu_si128(v);
1038 inline icl128i16s(
const icl16s *v) {
1039 v0 = _mm_loadu_si128((__m128i*)v);
1042 inline icl128i16s(
const icl16u *v) {
1043 v0 = _mm_loadu_si128((__m128i*)v);
1046 inline icl128i16s(
const icl16s v) {
1047 v0 = _mm_set1_epi16(v);
1050 inline icl128i16s(
const Icl256i &v) {
1051 v0 = _mm_packs_epi32(v.v0, v.v1);
1054 inline operator Icl128i ()
const {
1058 inline icl128i16s& operator=(
const icl128i16s &v) {
1063 inline icl128i16s& operator=(
const Icl128i &v) {
1068 inline icl128i16s& operator+=(
const icl128i16s &v) {
1069 v0 = _mm_add_epi16(v0, v.v0);
1073 inline icl128i16s& operator-=(
const icl128i16s &v) {
1074 v0 = _mm_sub_epi16(v0, v.v0);
1078 inline void store(__m128i *v)
const {
1079 _mm_store_si128(v, v0);
1082 inline void storeu(__m128i *v)
const {
1083 _mm_storeu_si128(v, v0);
1086 inline void store(
icl16s *v)
const {
1087 _mm_store_si128((__m128i*)v, v0);
1090 inline void storeu(
icl16s *v)
const {
1091 _mm_storeu_si128((__m128i*)v, v0);
1094 inline void store(
icl16u *v)
const {
1095 _mm_store_si128((__m128i*)v, v0);
1098 inline void storeu(
icl16u *v)
const {
1099 _mm_storeu_si128((__m128i*)v, v0);
1104 struct icl128i32s : Icl128i {
1105 inline icl128i32s() {
1108 inline icl128i32s(
const icl128i32s &v) {
1112 inline icl128i32s(
const Icl128i &v) {
1116 inline icl128i32s(
const __m128i &v) {
1120 inline icl128i32s(
const __m128i *v) {
1121 v0 = _mm_loadu_si128(v);
1124 inline icl128i32s(
const icl32s *v) {
1125 v0 = _mm_loadu_si128((__m128i*)v);
1128 inline icl128i32s(
const icl32u *v) {
1129 v0 = _mm_loadu_si128((__m128i*)v);
1133 v0 = _mm_set_epi32(i3, i2, i1, i0);
1136 inline icl128i32s(
const icl32s v) {
1137 v0 = _mm_set1_epi32(v);
1140 inline icl128i32s(
const Icl128 &v) {
1144 v0 = _mm_cvtps_epi32(v.v0);
1147 inline operator Icl128i ()
const {
1152 inline icl128i32s& operator=(
const icl128i32s &v) {
1157 inline icl128i32s& operator=(
const Icl128i &v) {
1162 inline icl128i32s& operator+=(
const icl128i32s &v) {
1163 v0 = _mm_add_epi32(v0, v.v0);
1167 inline icl128i32s& operator-=(
const icl128i32s &v) {
1168 v0 = _mm_sub_epi32(v0, v.v0);
1172 inline void store(__m128i *v)
const {
1173 _mm_store_si128(v, v0);
1176 inline void storeu(__m128i *v)
const {
1177 _mm_storeu_si128(v, v0);
1180 inline void store(
icl32s *v)
const {
1181 _mm_store_si128((__m128i*)v, v0);
1184 inline void storeu(
icl32s *v)
const {
1185 _mm_storeu_si128((__m128i*)v, v0);
1188 inline void store(
icl32u *v)
const {
1189 _mm_store_si128((__m128i*)v, v0);
1192 inline void storeu(
icl32u *v)
const {
1193 _mm_storeu_si128((__m128i*)v, v0);
1198 struct icl256i16s : Icl256i {
1199 inline icl256i16s() {
1202 inline icl256i16s(
const icl256i16s &v) {
1207 inline icl256i16s(
const Icl256i &v) {
1212 inline icl256i16s(
const __m128i &vl,
const __m128i &vh) {
1217 inline icl256i16s(
const __m128i *v) {
1222 inline icl256i16s(
const icl16s *v) {
1223 v0 = _mm_loadu_si128((__m128i*)v);
1224 v1 = _mm_loadu_si128((__m128i*)(v + 8));
1227 inline icl256i16s(
const icl16s v) {
1228 v0 = _mm_set1_epi16(v);
1229 v1 = _mm_set1_epi16(v);
1232 inline icl256i16s(
const icl128i8u &v) {
1233 const __m128i vk0 = _mm_setzero_si128();
1234 v0 = _mm_unpacklo_epi8(v.v0, vk0);
1235 v1 = _mm_unpackhi_epi8(v.v0, vk0);
1238 inline icl256i16s(
const Icl512i &v) {
1239 v0 = _mm_packs_epi32(v.v0, v.v1);
1240 v1 = _mm_packs_epi32(v.v2, v.v3);
1243 inline operator Icl256i ()
const {
1247 inline icl256i16s& operator=(
const icl256i16s &v) {
1253 inline icl256i16s& operator=(
const Icl256i &v) {
1259 inline icl256i16s& operator+=(
const icl256i16s &v) {
1260 v0 = _mm_add_epi16(v0, v.v0);
1261 v1 = _mm_add_epi16(v1, v.v1);
1265 inline icl256i16s& operator-=(
const icl256i16s &v) {
1266 v0 = _mm_sub_epi16(v0, v.v0);
1267 v1 = _mm_sub_epi16(v1, v.v1);
1271 inline void store(__m128i *v)
const {
1272 _mm_store_si128(v, v0);
1273 _mm_store_si128(v + 1, v1);
1276 inline void storeu(__m128i *v)
const {
1277 _mm_storeu_si128(v, v0);
1278 _mm_storeu_si128(v + 1, v1);
1281 inline void store(
icl16s *v)
const {
1282 _mm_store_si128((__m128i*)v, v0);
1283 _mm_store_si128((__m128i*)(v + 8), v1);
1286 inline void storeu(
icl16s *v)
const {
1287 _mm_storeu_si128((__m128i*)v, v0);
1288 _mm_storeu_si128((__m128i*)(v + 8), v1);
1291 inline void store(
icl16u *v)
const {
1292 _mm_store_si128((__m128i*)v, v0);
1293 _mm_store_si128((__m128i*)(v + 8), v1);
1296 inline void storeu(
icl16u *v)
const {
1297 _mm_storeu_si128((__m128i*)v, v0);
1298 _mm_storeu_si128((__m128i*)(v + 8), v1);
1303 struct icl256i32s : Icl256i {
1305 inline icl256i32s() {
1308 inline icl256i32s(
const icl256i32s &v) {
1313 inline icl256i32s(
const Icl256i &v) {
1318 inline icl256i32s(
const __m128i &vl,
const __m128i &vh) {
1323 inline icl256i32s(
const __m128i *v) {
1328 inline icl256i32s(
const icl32s *v) {
1329 v0 = _mm_loadu_si128((__m128i*)v);
1330 v1 = _mm_loadu_si128((__m128i*)(v + 4));
1333 inline icl256i32s(
const icl32s v) {
1334 v0 = _mm_set1_epi32(v);
1335 v1 = _mm_set1_epi32(v);
1338 inline icl256i32s(
const Icl256 &v) {
1343 v0 = _mm_cvtps_epi32(v.v0);
1344 v1 = _mm_cvtps_epi32(v.v1);
1347 inline icl256i32s& operator=(
const icl256i32s &v) {
1353 inline icl256i32s& operator=(
const Icl256i &v) {
1359 inline icl256i32s& operator+=(
const icl256i32s &v) {
1360 v0 = _mm_add_epi16(v0, v.v0);
1361 v1 = _mm_add_epi16(v1, v.v1);
1365 inline icl256i32s& operator-=(
const icl256i32s &v) {
1366 v0 = _mm_sub_epi16(v0, v.v0);
1367 v1 = _mm_sub_epi16(v1, v.v1);
1371 inline void store(__m128i *v)
const {
1372 _mm_store_si128(v, v0);
1373 _mm_store_si128(v + 1, v1);
1376 inline void storeu(__m128i *v)
const {
1377 _mm_storeu_si128(v, v0);
1378 _mm_storeu_si128(v + 1, v1);
1381 inline void store(
icl32s *v)
const {
1382 _mm_store_si128((__m128i*)v, v0);
1383 _mm_store_si128((__m128i*)(v + 4), v1);
1386 inline void storeu(
icl32s *v)
const {
1387 _mm_storeu_si128((__m128i*)v, v0);
1388 _mm_storeu_si128((__m128i*)(v + 4), v1);
1391 inline void store(
icl32u *v)
const {
1392 _mm_store_si128((__m128i*)v, v0);
1393 _mm_store_si128((__m128i*)(v + 4), v1);
1396 inline void storeu(
icl32u *v)
const {
1397 _mm_storeu_si128((__m128i*)v, v0);
1398 _mm_storeu_si128((__m128i*)(v + 4), v1);
1403 struct icl512i32s : Icl512i {
1404 inline icl512i32s() {
1407 inline icl512i32s(
const icl512i32s &v) {
1414 inline icl512i32s(
const Icl512i &v) {
1421 inline icl512i32s(
const __m128i &vll,
const __m128i &vlh,
1422 const __m128i &vhl,
const __m128i &vhh) {
1429 inline icl512i32s(
const icl32s *v) {
1430 v0 = _mm_loadu_si128((__m128i*)v);
1431 v1 = _mm_loadu_si128((__m128i*)(v + 4));
1432 v2 = _mm_loadu_si128((__m128i*)(v + 8));
1433 v3 = _mm_loadu_si128((__m128i*)(v + 12));
1436 inline icl512i32s(
const Icl256i &v) {
1437 const __m128i vk0 = _mm_setzero_si128();
1438 v0 = _mm_unpacklo_epi16(v.v0, vk0);
1439 v1 = _mm_unpackhi_epi16(v.v0, vk0);
1440 v2 = _mm_unpacklo_epi16(v.v1, vk0);
1441 v3 = _mm_unpackhi_epi16(v.v1, vk0);
1444 inline icl512i32s(
const icl32s v) {
1445 v0 = _mm_set1_epi32(v);
1446 v1 = _mm_set1_epi32(v);
1447 v2 = _mm_set1_epi32(v);
1448 v3 = _mm_set1_epi32(v);
1451 inline icl512i32s(
const Icl512 &v) {
1458 v0 = _mm_cvtps_epi32(v.v0);
1459 v1 = _mm_cvtps_epi32(v.v1);
1460 v2 = _mm_cvtps_epi32(v.v2);
1461 v3 = _mm_cvtps_epi32(v.v3);
1464 inline icl512i32s& operator=(
const icl512i32s &v) {
1472 inline icl512i32s& operator=(
const Icl512i &v) {
1480 inline icl512i32s& operator+=(
const icl512i32s &v) {
1481 v0 = _mm_add_epi32(v0, v.v0);
1482 v1 = _mm_add_epi32(v1, v.v1);
1483 v2 = _mm_add_epi32(v2, v.v2);
1484 v3 = _mm_add_epi32(v3, v.v3);
1488 inline icl512i32s& operator-=(
const icl512i32s &v) {
1489 v0 = _mm_sub_epi32(v0, v.v0);
1490 v1 = _mm_sub_epi32(v1, v.v1);
1491 v2 = _mm_sub_epi32(v2, v.v2);
1492 v3 = _mm_sub_epi32(v3, v.v3);
1496 inline void store(
icl32s *v)
const {
1497 _mm_store_si128((__m128i*)v, v0);
1498 _mm_store_si128((__m128i*)(v + 4), v1);
1499 _mm_store_si128((__m128i*)(v + 8), v1);
1500 _mm_store_si128((__m128i*)(v + 12), v1);
1503 inline void storeu(
icl32s *v)
const {
1504 _mm_storeu_si128((__m128i*)v, v0);
1505 _mm_storeu_si128((__m128i*)(v + 4), v1);
1506 _mm_storeu_si128((__m128i*)(v + 8), v1);
1507 _mm_storeu_si128((__m128i*)(v + 12), v1);
1510 inline void store(
icl32u *v)
const {
1511 _mm_store_si128((__m128i*)v, v0);
1512 _mm_store_si128((__m128i*)(v + 4), v1);
1513 _mm_store_si128((__m128i*)(v + 8), v1);
1514 _mm_store_si128((__m128i*)(v + 12), v1);
1517 inline void storeu(
icl32u *v)
const {
1518 _mm_storeu_si128((__m128i*)v, v0);
1519 _mm_storeu_si128((__m128i*)(v + 4), v1);
1520 _mm_storeu_si128((__m128i*)(v + 8), v1);
1521 _mm_storeu_si128((__m128i*)(v + 12), v1);
1526 struct icl128d : Icl128d {
1530 inline icl128d(
const __m128d &v) {
1534 inline icl128d(
const icl64f *v) {
1535 v0 = _mm_loadu_pd(v);
1538 inline icl128d(
const icl64f v) {
1539 v0 = _mm_set1_pd(v);
1542 inline icl128d(
const icl128d &v) {
1546 inline icl128d& operator=(
const __m128d &v) {
1551 inline icl128d& operator=(
const icl64f *v) {
1552 v0 = _mm_loadu_pd(v);
1556 inline icl128d& operator=(
const icl128d &v) {
1561 inline operator __m128d ()
const {
1565 inline icl128d& operator+=(
const Icl128d &v) {
1566 v0 = _mm_add_pd(v0, v.v0);
1570 inline icl128d& operator-=(
const Icl128d &v) {
1571 v0 = _mm_sub_pd(v0, v.v0);
1575 inline icl128d&
operator*=(
const Icl128d &v) {
1576 v0 = _mm_mul_pd(v0, v.v0);
1580 inline icl128d& operator/=(
const Icl128d &v) {
1581 v0 = _mm_div_pd(v0, v.v0);
1585 inline icl128d& operator&=(
const Icl128d &v) {
1586 v0 = _mm_and_pd(v0, v.v0);
1590 inline icl128d& operator|=(
const Icl128d &v) {
1591 v0 = _mm_or_pd(v0, v.v0);
1595 inline icl128d& operator^=(
const Icl128d &v) {
1596 v0 = _mm_xor_pd(v0, v.v0);
1600 inline icl128d& andnot(
const Icl128d &v) {
1601 v0 = _mm_andnot_pd(v.v0, v0);
1605 inline void store(
icl64f *v)
const {
1606 _mm_store_pd(v, v0);
1609 inline void storeu(
icl64f *v)
const {
1610 _mm_storeu_pd(v, v0);
1615 struct icl256d : Icl512d {
1620 struct icl512d : Icl512d {
1625 struct icl1024d : Icl1024d {
1636 inline icl128
operator+(
const icl128 &lv,
const icl128 &rv) {
1641 inline icl128
operator-(
const icl128 &lv,
const icl128 &rv) {
1646 inline icl128
operator*(
const icl128 &lv,
const icl128 &rv) {
1652 inline icl128
operator/(
const icl128 &lv,
const icl128 &rv) {
1658 inline icl256
operator+(
const icl256 &lv,
const icl256 &rv) {
1663 inline icl256
operator-(
const icl256 &lv,
const icl256 &rv) {
1668 inline icl256
operator*(
const icl256 &lv,
const icl256 &rv) {
1673 inline icl256
operator/(
const icl256 &lv,
const icl256 &rv) {
1678 inline icl512
operator+(
const icl512 &lv,
const icl512 &rv) {
1683 inline icl512
operator-(
const icl512 &lv,
const icl512 &rv) {
1688 inline icl512
operator*(
const icl512 &lv,
const icl512 &rv) {
1693 inline icl512
operator/(
const icl512 &lv,
const icl512 &rv) {
1702 inline icl128 operator==(
const icl128 &lv,
const icl128 &rv) {
1704 ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0);
1708 inline icl128 operator!=(
const icl128 &lv,
const icl128 &rv) {
1710 ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0);
1714 inline icl128 operator<(
const icl128 &lv,
const icl128 &rv) {
1716 ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0);
1720 inline icl128 operator>(
const icl128 &lv,
const icl128 &rv) {
1722 ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0);
1726 inline icl128 operator<=(
const icl128 &lv,
const icl128 &rv) {
1728 ret.v0 = _mm_cmple_ps(lv.v0, rv.v0);
1732 inline icl128 operator>=(
const icl128 &lv,
const icl128 &rv) {
1734 ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0);
1738 inline icl256 operator==(
const icl256 &lv,
const icl256 &rv) {
1740 ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0);
1741 ret.v1 = _mm_cmpeq_ps(lv.v1, rv.v1);
1745 inline icl256 operator!=(
const icl256 &lv,
const icl256 &rv) {
1747 ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0);
1748 ret.v1 = _mm_cmpneq_ps(lv.v1, rv.v1);
1752 inline icl256 operator<(
const icl256 &lv,
const icl256 &rv) {
1754 ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0);
1755 ret.v1 = _mm_cmplt_ps(lv.v1, rv.v1);
1759 inline icl256 operator>(
const icl256 &lv,
const icl256 &rv) {
1761 ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0);
1762 ret.v1 = _mm_cmpgt_ps(lv.v1, rv.v1);
1766 inline icl256 operator<=(
const icl256 &lv,
const icl256 &rv) {
1768 ret.v0 = _mm_cmple_ps(lv.v0, rv.v0);
1769 ret.v1 = _mm_cmple_ps(lv.v1, rv.v1);
1773 inline icl256 operator>=(
const icl256 &lv,
const icl256 &rv) {
1775 ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0);
1776 ret.v1 = _mm_cmpge_ps(lv.v1, rv.v1);
1780 inline icl512 operator==(
const icl512 &lv,
const icl512 &rv) {
1782 ret.v0 = _mm_cmpeq_ps(lv.v0, rv.v0);
1783 ret.v1 = _mm_cmpeq_ps(lv.v1, rv.v1);
1784 ret.v2 = _mm_cmpeq_ps(lv.v2, rv.v2);
1785 ret.v3 = _mm_cmpeq_ps(lv.v3, rv.v3);
1789 inline icl512 operator!=(
const icl512 &lv,
const icl512 &rv) {
1791 ret.v0 = _mm_cmpneq_ps(lv.v0, rv.v0);
1792 ret.v1 = _mm_cmpneq_ps(lv.v1, rv.v1);
1793 ret.v2 = _mm_cmpneq_ps(lv.v2, rv.v2);
1794 ret.v3 = _mm_cmpneq_ps(lv.v3, rv.v3);
1798 inline icl512 operator<(
const icl512 &lv,
const icl512 &rv) {
1800 ret.v0 = _mm_cmplt_ps(lv.v0, rv.v0);
1801 ret.v1 = _mm_cmplt_ps(lv.v1, rv.v1);
1802 ret.v2 = _mm_cmplt_ps(lv.v2, rv.v2);
1803 ret.v3 = _mm_cmplt_ps(lv.v3, rv.v3);
1807 inline icl512 operator>(
const icl512 &lv,
const icl512 &rv) {
1809 ret.v0 = _mm_cmpgt_ps(lv.v0, rv.v0);
1810 ret.v1 = _mm_cmpgt_ps(lv.v1, rv.v1);
1811 ret.v2 = _mm_cmpgt_ps(lv.v2, rv.v2);
1812 ret.v3 = _mm_cmpgt_ps(lv.v3, rv.v3);
1816 inline icl512 operator<=(
const icl512 &lv,
const icl512 &rv) {
1818 ret.v0 = _mm_cmple_ps(lv.v0, rv.v0);
1819 ret.v1 = _mm_cmple_ps(lv.v1, rv.v1);
1820 ret.v2 = _mm_cmple_ps(lv.v2, rv.v2);
1821 ret.v3 = _mm_cmple_ps(lv.v3, rv.v3);
1825 inline icl512 operator>=(
const icl512 &lv,
const icl512 &rv) {
1827 ret.v0 = _mm_cmpge_ps(lv.v0, rv.v0);
1828 ret.v1 = _mm_cmpge_ps(lv.v1, rv.v1);
1829 ret.v2 = _mm_cmpge_ps(lv.v2, rv.v2);
1830 ret.v3 = _mm_cmpge_ps(lv.v3, rv.v3);
1838 inline icl128 operator&(
const icl128 &lv,
const icl128 &rv) {
1840 ret.v0 = _mm_and_ps(lv.v0, rv.v0);
1844 inline icl128
operator|(
const icl128 &lv,
const icl128 &rv) {
1846 ret.v0 = _mm_or_ps(lv.v0, rv.v0);
1850 inline icl128 operator^(
const icl128 &lv,
const icl128 &rv) {
1852 ret.v0 = _mm_xor_ps(lv.v0, rv.v0);
1856 inline icl128 andnot(
const icl128 &lv,
const icl128 &rv) {
1858 ret.v0 = _mm_andnot_ps(rv.v0, lv.v0);
1862 inline Icl128i operator&(
const Icl128i &lv,
const Icl128i &rv) {
1864 ret.v0 = _mm_and_si128(lv.v0, rv.v0);
1868 inline Icl128i
operator|(
const Icl128i &lv,
const Icl128i &rv) {
1870 ret.v0 = _mm_or_si128(lv.v0, rv.v0);
1874 inline Icl128i operator^(
const Icl128i &lv,
const Icl128i &rv) {
1876 ret.v0 = _mm_xor_si128(lv.v0, rv.v0);
1880 inline Icl128i andnot(
const Icl128i &lv,
const Icl128i &rv) {
1882 ret.v0 = _mm_andnot_si128(rv.v0, lv.v0);
1886 inline icl256 operator&(
const icl256 &lv,
const icl256 &rv) {
1888 ret.v0 = _mm_and_ps(lv.v0, rv.v0);
1889 ret.v1 = _mm_and_ps(lv.v1, rv.v1);
1893 inline icl256
operator|(
const icl256 &lv,
const icl256 &rv) {
1895 ret.v0 = _mm_or_ps(lv.v0, rv.v0);
1896 ret.v1 = _mm_or_ps(lv.v1, rv.v1);
1900 inline icl256 operator^(
const icl256 &lv,
const icl256 &rv) {
1902 ret.v0 = _mm_xor_ps(lv.v0, rv.v0);
1903 ret.v1 = _mm_xor_ps(lv.v1, rv.v1);
1907 inline icl256 andnot(
const icl256 &lv,
const icl256 &rv) {
1909 ret.v0 = _mm_andnot_ps(rv.v0, lv.v0);
1910 ret.v1 = _mm_andnot_ps(rv.v1, lv.v1);
1914 inline Icl256i operator&(
const Icl256i &lv,
const Icl256i &rv) {
1916 ret.v0 = _mm_and_si128(lv.v0, rv.v0);
1917 ret.v1 = _mm_and_si128(lv.v1, rv.v1);
1921 inline Icl256i
operator|(
const Icl256i &lv,
const Icl256i &rv) {
1923 ret.v0 = _mm_or_si128(lv.v0, rv.v0);
1924 ret.v1 = _mm_or_si128(lv.v1, rv.v1);
1928 inline Icl256i operator^(
const Icl256i &lv,
const Icl256i &rv) {
1930 ret.v0 = _mm_xor_si128(lv.v0, rv.v0);
1931 ret.v1 = _mm_xor_si128(lv.v1, rv.v1);
1935 inline Icl256i andnot(
const Icl256i &lv,
const Icl256i &rv) {
1937 ret.v0 = _mm_andnot_si128(rv.v0, lv.v0);
1938 ret.v1 = _mm_andnot_si128(rv.v1, lv.v1);
1942 inline icl512 operator&(
const icl512 &lv,
const icl512 &rv) {
1944 ret.v0 = _mm_and_ps(lv.v0, rv.v0);
1945 ret.v1 = _mm_and_ps(lv.v1, rv.v1);
1946 ret.v2 = _mm_and_ps(lv.v2, rv.v2);
1947 ret.v3 = _mm_and_ps(lv.v3, rv.v3);
1951 inline icl512
operator|(
const icl512 &lv,
const icl512 &rv) {
1953 ret.v0 = _mm_or_ps(lv.v0, rv.v0);
1954 ret.v1 = _mm_or_ps(lv.v1, rv.v1);
1955 ret.v2 = _mm_or_ps(lv.v2, rv.v2);
1956 ret.v3 = _mm_or_ps(lv.v3, rv.v3);
1960 inline icl512 operator^(
const icl512 &lv,
const icl512 &rv) {
1962 ret.v0 = _mm_xor_ps(lv.v0, rv.v0);
1963 ret.v1 = _mm_xor_ps(lv.v1, rv.v1);
1964 ret.v2 = _mm_xor_ps(lv.v2, rv.v2);
1965 ret.v3 = _mm_xor_ps(lv.v3, rv.v3);
1969 inline icl512 andnot(
const icl512 &lv,
const icl512 &rv) {
1971 ret.v0 = _mm_andnot_ps(rv.v0, lv.v0);
1972 ret.v1 = _mm_andnot_ps(rv.v1, lv.v1);
1973 ret.v2 = _mm_andnot_ps(rv.v2, lv.v2);
1974 ret.v3 = _mm_andnot_ps(rv.v3, lv.v3);
1978 inline Icl512i operator&(
const Icl512i &lv,
const Icl512i &rv) {
1980 ret.v0 = _mm_and_si128(lv.v0, rv.v0);
1981 ret.v1 = _mm_and_si128(lv.v1, rv.v1);
1982 ret.v2 = _mm_and_si128(lv.v2, rv.v2);
1983 ret.v3 = _mm_and_si128(lv.v3, rv.v3);
1987 inline Icl512i
operator|(
const Icl512i &lv,
const Icl512i &rv) {
1989 ret.v0 = _mm_or_si128(lv.v0, rv.v0);
1990 ret.v1 = _mm_or_si128(lv.v1, rv.v1);
1991 ret.v2 = _mm_or_si128(lv.v2, rv.v2);
1992 ret.v3 = _mm_or_si128(lv.v3, rv.v3);
1996 inline Icl512i operator^(
const Icl512i &lv,
const Icl512i &rv) {
1998 ret.v0 = _mm_xor_si128(lv.v0, rv.v0);
1999 ret.v1 = _mm_xor_si128(lv.v1, rv.v1);
2000 ret.v2 = _mm_xor_si128(lv.v2, rv.v2);
2001 ret.v3 = _mm_xor_si128(lv.v3, rv.v3);
2005 inline Icl512i andnot(
const Icl512i &lv,
const Icl512i &rv) {
2007 ret.v0 = _mm_andnot_si128(rv.v0, lv.v0);
2008 ret.v1 = _mm_andnot_si128(rv.v1, lv.v1);
2009 ret.v2 = _mm_andnot_si128(rv.v2, lv.v2);
2010 ret.v3 = _mm_andnot_si128(rv.v3, lv.v3);
2018 inline Icl128i&
operator<<(Icl128i &v,
const int i) {
2019 v.v0 = _mm_slli_epi32(v.v0, i);
2023 inline Icl128i&
operator>>(Icl128i &v,
const int i) {
2024 v.v0 = _mm_srai_epi32(v.v0, i);
2028 inline Icl256i&
operator<<(Icl256i &v,
const int i) {
2029 v.v0 = _mm_slli_epi32(v.v0, i);
2030 v.v1 = _mm_slli_epi32(v.v1, i);
2034 inline Icl256i&
operator>>(Icl256i &v,
const int i) {
2035 v.v0 = _mm_srai_epi32(v.v0, i);
2036 v.v1 = _mm_srai_epi32(v.v1, i);
2040 inline Icl512i&
operator<<(Icl512i &v,
const int i) {
2041 v.v0 = _mm_slli_epi32(v.v0, i);
2042 v.v1 = _mm_slli_epi32(v.v1, i);
2043 v.v2 = _mm_slli_epi32(v.v2, i);
2044 v.v3 = _mm_slli_epi32(v.v3, i);
2048 inline Icl512i&
operator>>(Icl512i &v,
const int i) {
2049 v.v0 = _mm_srai_epi32(v.v0, i);
2050 v.v1 = _mm_srai_epi32(v.v1, i);
2051 v.v2 = _mm_srai_epi32(v.v2, i);
2052 v.v3 = _mm_srai_epi32(v.v3, i);
2060 inline icl128i8u min(
const icl128i8u &lv,
const icl128i8u &rv) {
2062 ret.v0 = _mm_min_epu8(lv.v0, rv.v0);
2066 inline icl128i8u max(
const icl128i8u &lv,
const icl128i8u &rv) {
2068 ret.v0 = _mm_max_epu8(lv.v0, rv.v0);
2072 inline icl128i16s min(
const icl128i16s &lv,
const icl128i16s &rv) {
2074 ret.v0 = _mm_min_epi16(lv.v0, rv.v0);
2078 inline icl128i16s max(
const icl128i16s &lv,
const icl128i16s &rv) {
2080 ret.v0 = _mm_max_epi16(lv.v0, rv.v0);
2084 inline icl256i16s min(
const icl256i16s &lv,
const icl256i16s &rv) {
2086 ret.v0 = _mm_min_epi16(lv.v0, rv.v0);
2087 ret.v1 = _mm_min_epi16(lv.v1, rv.v1);
2091 inline icl256i16s max(
const icl256i16s &lv,
const icl256i16s &rv) {
2093 ret.v0 = _mm_max_epi16(lv.v0, rv.v0);
2094 ret.v1 = _mm_max_epi16(lv.v1, rv.v1);
2098 inline icl128 min(
const icl128 &lv,
const icl128 &rv) {
2100 ret.v0 = _mm_min_ps(lv.v0, rv.v0);
2104 inline icl128 max(
const icl128 &lv,
const icl128 &rv) {
2106 ret.v0 = _mm_max_ps(lv.v0, rv.v0);
2110 inline icl256 min(
const icl256 &lv,
const icl256 &rv) {
2112 ret.v0 = _mm_min_ps(lv.v0, rv.v0);
2113 ret.v1 = _mm_min_ps(lv.v1, rv.v1);
2117 inline icl256 max(
const icl256 &lv,
const icl256 &rv) {
2119 ret.v0 = _mm_max_ps(lv.v0, rv.v0);
2120 ret.v1 = _mm_max_ps(lv.v1, rv.v1);
2124 inline icl512 min(
const icl512 &lv,
const icl512 &rv) {
2126 ret.v0 = _mm_min_ps(lv.v0, rv.v0);
2127 ret.v1 = _mm_min_ps(lv.v1, rv.v1);
2128 ret.v2 = _mm_min_ps(lv.v2, rv.v2);
2129 ret.v3 = _mm_min_ps(lv.v3, rv.v3);
2133 inline icl512 max(
const icl512 &lv,
const icl512 &rv) {
2135 ret.v0 = _mm_max_ps(lv.v0, rv.v0);
2136 ret.v1 = _mm_max_ps(lv.v1, rv.v1);
2137 ret.v2 = _mm_max_ps(lv.v2, rv.v2);
2138 ret.v3 = _mm_max_ps(lv.v3, rv.v3);
2147 #ifdef ICL_HAVE_SSE3 2148 inline icl128i8u
abs(
const icl128i8u &v) {
2150 ret.v0 = _mm_abs_epi8(v.v0);
2154 inline icl128i16s
abs(
const icl128i16s &v) {
2156 ret.v0 = _mm_abs_epi16(v.v0);
2160 inline icl128i32s
abs(
const icl128i32s &v) {
2162 ret.v0 = _mm_abs_epi32(v.v0);
2166 inline icl256i16s
abs(
const icl256i16s &v) {
2168 ret.v0 = _mm_abs_epi16(v.v0);
2169 ret.v1 = _mm_abs_epi16(v.v1);
2173 inline icl256i32s
abs(
const icl256i32s &v) {
2175 ret.v0 = _mm_abs_epi32(v.v0);
2176 ret.v1 = _mm_abs_epi32(v.v1);
2180 inline icl512i32s
abs(
const icl512i32s &v) {
2182 ret.v0 = _mm_abs_epi32(v.v0);
2183 ret.v1 = _mm_abs_epi32(v.v1);
2184 ret.v2 = _mm_abs_epi32(v.v2);
2185 ret.v3 = _mm_abs_epi32(v.v3);
2192 inline icl128
abs(
const icl128 &v) {
2194 ret.v0 = _mm_andnot_ps(icl128(-0.0f), v.v0);
2198 inline icl256
abs(
const icl256 &v) {
2201 ret.v0 = _mm_andnot_ps(tmp.v0, v.v0);
2202 ret.v1 = _mm_andnot_ps(tmp.v0, v.v1);
2206 inline icl512
abs(
const icl512 &v) {
2209 ret.v0 = _mm_andnot_ps(tmp.v0, v.v0);
2210 ret.v1 = _mm_andnot_ps(tmp.v0, v.v1);
2211 ret.v2 = _mm_andnot_ps(tmp.v0, v.v2);
2212 ret.v3 = _mm_andnot_ps(tmp.v0, v.v3);
2221 inline icl128
sqrt(
const icl128 &v) {
2223 r.v0 = _mm_sqrt_ps(v.v0);
2227 inline icl256
sqrt(
const icl256 &v) {
2229 r.v0 = _mm_sqrt_ps(v.v0);
2230 r.v1 = _mm_sqrt_ps(v.v1);
2234 inline icl512
sqrt(
const icl512 &v) {
2236 r.v0 = _mm_sqrt_ps(v.v0);
2237 r.v1 = _mm_sqrt_ps(v.v1);
2238 r.v2 = _mm_sqrt_ps(v.v2);
2239 r.v3 = _mm_sqrt_ps(v.v3);
2243 inline icl128d
sqrt(
const icl128d &v) {
2245 r.v0 = _mm_sqrt_pd(v.v0);
2249 inline icl256d
sqrt(
const icl256d &v) {
2251 r.v0 = _mm_sqrt_pd(v.v0);
2252 r.v1 = _mm_sqrt_pd(v.v1);
2256 inline icl512d
sqrt(
const icl512d &v) {
2258 r.v0 = _mm_sqrt_pd(v.v0);
2259 r.v1 = _mm_sqrt_pd(v.v1);
2260 r.v2 = _mm_sqrt_pd(v.v2);
2261 r.v3 = _mm_sqrt_pd(v.v3);
2265 inline icl1024d
sqrt(
const icl1024d &v) {
2267 r.v0 = _mm_sqrt_pd(v.v0);
2268 r.v1 = _mm_sqrt_pd(v.v1);
2269 r.v2 = _mm_sqrt_pd(v.v2);
2270 r.v3 = _mm_sqrt_pd(v.v3);
2271 r.v4 = _mm_sqrt_pd(v.v4);
2272 r.v5 = _mm_sqrt_pd(v.v5);
2273 r.v6 = _mm_sqrt_pd(v.v6);
2274 r.v7 = _mm_sqrt_pd(v.v7);
2283 inline icl128 cbrt(
const icl128 &v) {
2284 icl128i32s tmp = icl128i32s(_mm_castps_si128(v));
2285 tmp = tmp / icl128i32s(3) + icl128i32s(709921077);
2286 icl128 a = icl128(_mm_castsi128_ps(tmp));
2287 icl128 a3 = a * a * a;
2288 return a * (a3 + v + v) * (a3 + a3 + v).rcp();
2291 inline icl256 cbrt(
const icl256 &v) {
2292 __m128i t0 = _mm_castps_si128(v.v0);
2293 __m128i t1 = _mm_castps_si128(v.v1);
2294 icl256i32s tmp = icl256i32s(t0, t1);
2295 tmp = tmp / icl256i32s(3) + icl256i32s(709921077);
2296 icl256 a = icl256(_mm_castsi128_ps(tmp.v0),
2297 _mm_castsi128_ps(tmp.v1));
2298 icl256 a3 = a * a * a;
2299 return a * (a3 + v + v) * (a3 + a3 + v).rcp();
2302 inline icl512 cbrt(
const icl512 &v) {
2303 __m128i t0 = _mm_castps_si128(v.v0);
2304 __m128i t1 = _mm_castps_si128(v.v1);
2305 __m128i t2 = _mm_castps_si128(v.v2);
2306 __m128i t3 = _mm_castps_si128(v.v3);
2307 icl512i32s tmp = icl512i32s(t0, t1, t2, t3);
2308 tmp = tmp / icl512i32s(3) + icl512i32s(709921077);
2309 icl512 a = icl512(_mm_castsi128_ps(tmp.v0),
2310 _mm_castsi128_ps(tmp.v1),
2311 _mm_castsi128_ps(tmp.v2),
2312 _mm_castsi128_ps(tmp.v3));
2313 icl512 a3 = a * a * a;
2314 return a * (a3 + v + v) * (a3 + a3 + v).rcp();
2321 typedef icl128 icl32fx4;
2322 typedef icl256 icl32fx8;
2323 typedef icl512 icl32fx16;
2324 typedef icl128i8u icl8ux16;
2325 typedef icl128i16s icl16sx8;
2326 typedef icl128i32s icl32sx4;
2327 typedef icl256i16s icl16sx16;
2328 typedef icl256i32s icl32sx8;
2329 typedef icl512i32s icl32sx16;
2330 typedef icl128d icl64fx2;
2331 typedef icl256d icl64fx4;
2332 typedef icl512d icl64fx8;
2333 typedef icl1024d icl64fx16;
ICLQt_API ImgQ sqrt(const ImgQ &image)
calls sqrt (each pixel)
comment out this line if you encounter any issues!
Definition: Any.h:37
Ipp8u icl8u
8Bit unsigned integer type for the ICL
Definition: BasicTypes.h:64
ICLQt_API ImgQ operator/(const ImgQ &a, const ImgQ &b)
divides two images pixel-wise
ICLQt_API ImgQ operator|(const ImgQ &a, const ImgQ &b)
channel concatenation of images
ICLQt_API ImgQ operator-(const ImgQ &a, const ImgQ &b)
subtracts two images pixel-wise
Ipp32s icl32s
32bit signed integer type for the ICL
Definition: BasicTypes.h:58
FixedMatrix< T, V_COLS, M_ROWS_AND_COLS > & operator *=(FixedMatrix< T, V_COLS, M_ROWS_AND_COLS > &v, const FixedMatrix< T, M_ROWS_AND_COLS, M_ROWS_AND_COLS > &m)
Matrix multiplication (inplace)
Definition: FixedMatrix.h:959
Ipp32f icl32f
32Bit floating point type for the ICL
Definition: BasicTypes.h:55
Ipp64f icl64f
64Bit floating point type for the ICL
Definition: BasicTypes.h:52
ICLQt_API ImgQ abs(const ImgQ &image)
calls abs (each pixel)
ICLUtils_API std::ostream & operator<<(std::ostream &s, const ConfigFile &cf)
Default ostream operator to put a ConfigFile into a stream.
ICLUtils_API std::istream & operator>>(std::istream &s, Point &p)
istream operator
uint32_t icl32u
32bit unsigned integer type for the ICL
Definition: BasicTypes.h:88
ICLQt_API ImgQ operator *(const ImgQ &a, const ImgQ &b)
multiplies two images pixel-wise
int8_t icl8s
8bit signed integer
Definition: BasicTypes.h:85
ICLQt_API ImgQ operator+(const ImgQ &a, const ImgQ &b)
adds two images pixel-wise
uint16_t icl16u
16bit unsigned integer type for the ICL
Definition: BasicTypes.h:91
Ipp16s icl16s
16bit signed integer type for the ICL (range [-32768, 32767])
Definition: BasicTypes.h:61