simd.h (33818B)
#pragma once

/* Note: This is a work in progress. Bindings for instructions should be
 * added as needed. We use GCC builtin's instead of the portable Intel
 * intrinsics so that we get type-safety and because including x86intrin.h
 * adds like 1s to compile times. Also, when working with intrinsics, one
 * will probably accidently depend on compiler details anyway. */

#ifndef __GNUC__
#error "rcx/simd.h requires GCC extensions"
#endif

#include "def.h" /* presumably provides i8/u8 .. i64/u64 and uint -- confirm */

/* TODO: MMX, AVX-512 */
/* TODO: Unaligned 128 bit typedefs, and corresponding load/store intrinsics */

/* 128 bit vector types, named v<lane count><signedness><lane bits>. */
typedef i8 v16i8 __attribute__((vector_size(16)));
typedef u8 v16u8 __attribute__((vector_size(16)));
typedef i16 v8i16 __attribute__((vector_size(16)));
typedef u16 v8u16 __attribute__((vector_size(16)));
typedef i32 v4i32 __attribute__((vector_size(16)));
typedef u32 v4u32 __attribute__((vector_size(16)));
typedef i64 v2i64 __attribute__((vector_size(16)));
typedef u64 v2u64 __attribute__((vector_size(16)));
/* These are for casting inputs/output of the GCC builtins. */
typedef char r_v16qi_ __attribute__((vector_size(16)));
typedef short r_v8hi_ __attribute__((vector_size(16)));
typedef int r_v4si_ __attribute__((vector_size(16)));
typedef long long r_v2di_ __attribute__((vector_size(16)));
/* Short aliases matching GCC's builtin parameter type names; #undef'd at the
 * bottom of this header so they do not leak to includers. */
#define v16qi r_v16qi_
#define v8hi r_v8hi_
#define v4si r_v4si_
#define v2di r_v2di_

/* 256 bit vector types. */
typedef i8 v32i8 __attribute__((vector_size(32)));
typedef u8 v32u8 __attribute__((vector_size(32)));
typedef i16 v16i16 __attribute__((vector_size(32)));
typedef u16 v16u16 __attribute__((vector_size(32)));
typedef i32 v8i32 __attribute__((vector_size(32)));
typedef u32 v8u32 __attribute__((vector_size(32)));
typedef i64 v4i64 __attribute__((vector_size(32)));
typedef u64 v4u64 __attribute__((vector_size(32)));
/* "a1" = byte-aligned variants; dereferencing these performs an unaligned
 * access, used by the *_loadu/*_storeu wrappers below. */
typedef i8 v32i8a1 __attribute__((vector_size(32), aligned(1)));
typedef u8 v32u8a1 __attribute__((vector_size(32), aligned(1)));
typedef i16 v16i16a1 __attribute__((vector_size(32), aligned(1)));
typedef u16 v16u16a1 __attribute__((vector_size(32), aligned(1)));
typedef i32 v8i32a1 __attribute__((vector_size(32), aligned(1)));
typedef u32 v8u32a1 __attribute__((vector_size(32), aligned(1)));
typedef i64 v4i64a1 __attribute__((vector_size(32), aligned(1)));
typedef u64 v4u64a1 __attribute__((vector_size(32), aligned(1)));
/* These are for casting inputs/output of the GCC builtins. */
typedef char r_v32qi_ __attribute__((vector_size(32)));
typedef short r_v16hi_ __attribute__((vector_size(32)));
typedef int r_v8si_ __attribute__((vector_size(32)));
typedef long long r_v4di_ __attribute__((vector_size(32)));
#define v32qi r_v32qi_
#define v16hi r_v16hi_
#define v8si r_v8si_
#define v4di r_v4di_

#ifdef R_HAVE_SSE2
/* Element-wise constructors.
 * NOTE(review): the FIRST argument (x15/x7/...) initializes lane 0 and the
 * last argument (x00) the highest lane -- this is the opposite of Intel's
 * _mm_set_* argument convention; confirm this ordering is intended. */
static inline v16i8 v16i8_set(
	i8 x15, i8 x14, i8 x13, i8 x12, i8 x11, i8 x10, i8 x09, i8 x08,
	i8 x07, i8 x06, i8 x05, i8 x04, i8 x03, i8 x02, i8 x01, i8 x00
) { return (v16i8){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v16u8 v16u8_set(
	u8 x15, u8 x14, u8 x13, u8 x12, u8 x11, u8 x10, u8 x09, u8 x08,
	u8 x07, u8 x06, u8 x05, u8 x04, u8 x03, u8 x02, u8 x01, u8 x00
) { return (v16u8){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v8i16 v8i16_set(i16 x7, i16 x6, i16 x5, i16 x4, i16 x3, i16 x2, i16 x1, i16 x0)
{ return (v8i16){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v8u16 v8u16_set(u16 x7, u16 x6, u16 x5, u16 x4, u16 x3, u16 x2, u16 x1, u16 x0)
{ return (v8u16){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v4i32 v4i32_set(i32 x3, i32 x2, i32 x1, i32 x0) { return (v4i32){ x3, x2, x1, x0 }; }
static inline v4u32 v4u32_set(u32 x3, u32 x2, u32 x1, u32 x0) { return (v4u32){ x3, x2, x1, x0 }; }
static inline v2i64 v2i64_set(i64 x1, i64 x0) { return (v2i64){ x1, x0 }; }
static inline v2u64 v2u64_set(u64 x1, u64 x0) { return (v2u64){ x1, x0 }; }

/* Broadcast a single scalar to every lane. */
static inline v16i8 v16i8_fill(i8 x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v16u8 v16u8_fill(u8 x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v8i16 v8i16_fill(i16 x) { return v8i16_set(x, x, x, x, x, x, x, x); }
static inline v8u16 v8u16_fill(u16 x) { return v8u16_set(x, x, x, x, x, x, x, x); }
static inline v4i32 v4i32_fill(i32 x) { return v4i32_set(x, x, x, x); }
static inline v4u32 v4u32_fill(u32 x) { return v4u32_set(x, x, x, x); }
static inline v2i64 v2i64_fill(i64 x) { return v2i64_set(x, x); }
static inline v2u64 v2u64_fill(u64 x) { return v2u64_set(x, x); }

/* Lane-wise addition (padd*: wraps on overflow, so one binding serves both
 * the signed and unsigned type). */
static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_add(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_add(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_add(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
static inline v2i64 v2i64_add(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_add(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }

/* Lane-wise subtraction (psub*: wrapping, x - y). */
static inline v16i8 v16i8_sub(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_sub(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_sub(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_sub(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_sub(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_sub(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
static inline v2i64 v2i64_sub(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_sub(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }

/* Bitwise AND (pand: bit operations are width-agnostic, so every lane type
 * is routed through the v2di builtin). */
static inline v16i8 v16i8_and(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_and(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_and(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_and(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_and(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_and(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_and(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_and(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pand128((v2di)x, (v2di)y); }

/* Bitwise AND-NOT.
 * NOTE(review): pandn complements its FIRST operand, i.e. this computes
 * (~x) & y -- easy to misread as x & ~y; confirm callers expect this. */
static inline v16i8 v16i8_andnot(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_andnot(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_andnot(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_andnot(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_andnot(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_andnot(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_andnot(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_andnot(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }

/* Bitwise OR. */
static inline v16i8 v16i8_or(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_or(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_or(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_or(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_or(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_or(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_or(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_or(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_por128((v2di)x, (v2di)y); }

/* Bitwise XOR. */
static inline v16i8 v16i8_xor(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_xor(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_xor(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_xor(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_xor(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_xor(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_xor(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_xor(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }

/* Logical shift left by a vector count (psll*).
 * NOTE(review): psll/psrl take a single shift count from the low 64 bits of
 * y -- all lanes are shifted by the same amount, this is not a per-lane
 * shift; confirm callers are aware. */
static inline v8u16 v8u16_sl(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psllw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4u32_sl(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pslld128((v4si)x, (v4si)y); }
static inline v2u64 v2u64_sl(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psllq128((v2di)x, (v2di)y); }

/* Logical shift right by a vector count (psrl*). */
static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psrlw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); }
static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); }

/* Arithmetic (sign-propagating) shift right; no 64-bit form exists pre
 * AVX-512, hence no v2i64_sr. */
static inline v8i16 v8i16_sr(v8i16 x, v8u16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_sr(v4i32 x, v4u32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); }

/* Shift left by an immediate count. */
static inline v8u16 v8u16_sli(v8u16 x, uint c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); }
static inline v4u32 v4u32_sli(v4u32 x, uint c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); }
static inline v2u64 v2u64_sli(v2u64 x, uint c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); }

/* Logical shift right by an immediate count. */
static inline v8u16 v8u16_sri(v8u16 x, uint c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); }
static inline v4u32 v4u32_sri(v4u32 x, uint c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); }
static inline v2u64 v2u64_sri(v2u64 x, uint c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); }

/* Arithmetic shift right by an immediate count. */
static inline v8i16 v8i16_sri(v8i16 x, uint c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); }
static inline v4i32 v4i32_sri(v4i32 x, uint c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); }

/* Lane-wise equality; returns an all-ones/all-zeros mask per lane, hence the
 * unsigned return type. */
static inline v16u8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
static inline v8u16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }

/* Lane-wise greater-than mask.
 * NOTE(review): pcmpgt* compares lanes as SIGNED values; the v*u*_cmpgt
 * wrappers therefore use signed ordering on unsigned data (e.g. 0x80 > 0x01
 * is false for u8 lanes here) -- confirm this is intended by callers. */
static inline v16u8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
static inline v8u16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }

/* Gather the most significant bit of each byte lane into a 16-bit scalar
 * mask (pmovmskb). */
static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); }
#endif

#ifdef R_HAVE_SSSE3
/* Byte shuffle: each lane of y selects a byte of x (pshufb). */
static inline v16u8 v16u8_shuf(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pshufb128((v16qi)x, (v16qi)y); }

/* psign*: negate/zero/keep each lane of x according to the sign of the
 * corresponding lane of y. */
static inline v16i8 v16i8_sign(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psignb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_sign(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psignw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psignd128((v4si)x, (v4si)y); }
#endif

#ifdef R_HAVE_SSE4_1
/* 64-bit lane equality (pcmpeqq) -- only available from SSE4.1. */
static inline v2u64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }

/* Sign-extend the low lanes of x to a wider lane type (pmovsx*). */
static inline v8i16 v16i8_ext_v8i16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); }
static inline v4i32 v16i8_ext_v4i32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); }
static inline v2i64 v16i8_ext_v2i64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); }
static inline v4i32 v8i16_ext_v4i32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); }
static inline v2i64 v8i16_ext_v2i64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); }
static inline v2i64 v4i32_ext_v2i64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); }

/* Zero-extend the low lanes of x to a wider lane type (pmovzx*). */
static inline v8u16 v16u8_ext_v8u16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); }
static inline v4u32 v16u8_ext_v4u32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); }
static inline v2u64 v16u8_ext_v2u64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); }
static inline v4u32 v8u16_ext_v4u32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); }
static inline v2u64 v8u16_ext_v2u64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); }
static inline v2u64 v4u32_ext_v2u64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); }
#endif

#ifdef R_HAVE_SSE4_2
/* 64-bit lane greater-than (pcmpgtq); signed comparison -- see the cmpgt
 * note above regarding the unsigned wrapper. */
static inline v2u64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
#endif

#ifdef R_HAVE_AVX2
/* 256-bit element-wise constructors; same lane-ordering caveat as the
 * 128-bit _set functions (first argument -> lane 0). */
static inline v32i8 v32i8_set(
	i8 x31, i8 x30, i8 x29, i8 x28, i8 x27, i8 x26, i8 x25, i8 x24,
	i8 x23, i8 x22, i8 x21, i8 x20, i8 x19, i8 x18, i8 x17, i8 x16,
	i8 x15, i8 x14, i8 x13, i8 x12, i8 x11, i8 x10, i8 x09, i8 x08,
	i8 x07, i8 x06, i8 x05, i8 x04, i8 x03, i8 x02, i8 x01, i8 x00
) { return (v32i8){
	x31, x30, x29, x28, x27, x26, x25, x24,
	x23, x22, x21, x20, x19, x18, x17, x16,
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v32u8 v32u8_set(
	u8 x31, u8 x30, u8 x29, u8 x28, u8 x27, u8 x26, u8 x25, u8 x24,
	u8 x23, u8 x22, u8 x21, u8 x20, u8 x19, u8 x18, u8 x17, u8 x16,
	u8 x15, u8 x14, u8 x13, u8 x12, u8 x11, u8 x10, u8 x09, u8 x08,
	u8 x07, u8 x06, u8 x05, u8 x04, u8 x03, u8 x02, u8 x01, u8 x00
) { return (v32u8){
	x31, x30, x29, x28, x27, x26, x25, x24,
	x23, x22, x21, x20, x19, x18, x17, x16,
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v16i16 v16i16_set(
	i16 x15, i16 x14, i16 x13, i16 x12, i16 x11, i16 x10, i16 x09, i16 x08,
	i16 x07, i16 x06, i16 x05, i16 x04, i16 x03, i16 x02, i16 x01, i16 x00
) { return (v16i16){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v16u16 v16u16_set(
	u16 x15, u16 x14, u16 x13, u16 x12, u16 x11, u16 x10, u16 x09, u16 x08,
	u16 x07, u16 x06, u16 x05, u16 x04, u16 x03, u16 x02, u16 x01, u16 x00
) { return (v16u16){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v8i32 v8i32_set(i32 x7, i32 x6, i32 x5, i32 x4, i32 x3, i32 x2, i32 x1, i32 x0)
{ return (v8i32){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v8u32 v8u32_set(u32 x7, u32 x6, u32 x5, u32 x4, u32 x3, u32 x2, u32 x1, u32 x0)
{ return (v8u32){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v4i64 v4i64_set(i64 x3, i64 x2, i64 x1, i64 x0) { return (v4i64){ x3, x2, x1, x0 }; }
static inline v4u64 v4u64_set(u64 x3, u64 x2, u64 x1, u64 x0) { return (v4u64){ x3, x2, x1, x0 }; }

/* Broadcast a scalar to every lane. */
static inline v32i8 v32i8_fill(i8 x) { return v32i8_set(
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
); }
static inline v32u8 v32u8_fill(u8 x) { return v32u8_set(
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
); }
static inline v16i16 v16i16_fill(i16 x) { return v16i16_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v16u16 v16u16_fill(u16 x) { return v16u16_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v8i32 v8i32_fill(i32 x) { return v8i32_set(x, x, x, x, x, x, x, x); }
static inline v8u32 v8u32_fill(u32 x) { return v8u32_set(x, x, x, x, x, x, x, x); }
static inline v4i64 v4i64_fill(i64 x) { return v4i64_set(x, x, x, x); }
static inline v4u64 v4u64_fill(u64 x) { return v4u64_set(x, x, x, x); }

/* Aligned 256-bit loads; p must be 32-byte aligned. */
static inline v32i8 v32i8_load(v32i8 *p) { return *p; }
static inline v32u8 v32u8_load(v32u8 *p) { return *p; }
static inline v16i16 v16i16_load(v16i16 *p) { return *p; }
static inline v16u16 v16u16_load(v16u16 *p) { return *p; }
static inline v8i32 v8i32_load(v8i32 *p) { return *p; }
static inline v8u32 v8u32_load(v8u32 *p) { return *p; }
static inline v4i64 v4i64_load(v4i64 *p) { return *p; }
static inline v4u64 v4u64_load(v4u64 *p) { return *p; }

/* Unaligned 256-bit loads (via the aligned(1) pointer types). */
static inline v32i8 v32i8_loadu(v32i8a1 *p) { return *p; }
static inline v32u8 v32u8_loadu(v32u8a1 *p) { return *p; }
static inline v16i16 v16i16_loadu(v16i16a1 *p) { return *p; }
static inline v16u16 v16u16_loadu(v16u16a1 *p) { return *p; }
static inline v8i32 v8i32_loadu(v8i32a1 *p) { return *p; }
static inline v8u32 v8u32_loadu(v8u32a1 *p) { return *p; }
static inline v4i64 v4i64_loadu(v4i64a1 *p) { return *p; }
static inline v4u64 v4u64_loadu(v4u64a1 *p) { return *p; }

/* Aligned 256-bit stores; p must be 32-byte aligned. */
static inline void v32i8_store(v32i8 *p, v32i8 x) { *p = x; }
static inline void v32u8_store(v32u8 *p, v32u8 x) { *p = x; }
static inline void v16i16_store(v16i16 *p, v16i16 x) { *p = x; }
static inline void v16u16_store(v16u16 *p, v16u16 x) { *p = x; }
static inline void v8i32_store(v8i32 *p, v8i32 x) { *p = x; }
static inline void v8u32_store(v8u32 *p, v8u32 x) { *p = x; }
static inline void v4i64_store(v4i64 *p, v4i64 x) { *p = x; }
static inline void v4u64_store(v4u64 *p, v4u64 x) { *p = x; }

/* Unaligned 256-bit stores. */
static inline void v32i8_storeu(v32i8a1 *p, v32i8 x) { *p = x; }
static inline void v32u8_storeu(v32u8a1 *p, v32u8 x) { *p = x; }
static inline void v16i16_storeu(v16i16a1 *p, v16i16 x) { *p = x; }
static inline void v16u16_storeu(v16u16a1 *p, v16u16 x) { *p = x; }
static inline void v8i32_storeu(v8i32a1 *p, v8i32 x) { *p = x; }
static inline void v8u32_storeu(v8u32a1 *p, v8u32 x) { *p = x; }
static inline void v4i64_storeu(v4i64a1 *p, v4i64 x) { *p = x; }
static inline void v4u64_storeu(v4u64a1 *p, v4u64 x) { *p = x; }

/* Lane-wise wrapping addition, 256-bit. */
static inline v32i8 v32i8_add(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
static inline v32u8 v32u8_add(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
static inline v16i16 v16i16_add(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_add(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
static inline v8i32 v8i32_add(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_paddd256((v8si)x, (v8si)y); }
static inline v8u32 v8u32_add(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_paddd256((v8si)x, (v8si)y); }
static inline v4i64 v4i64_add(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_paddq256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_add(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_paddq256((v4di)x, (v4di)y); }

/* Lane-wise wrapping subtraction, 256-bit. */
static inline v32i8 v32i8_sub(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_psubb256((v32qi)x, (v32qi)y); }
static inline v32u8 v32u8_sub(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_psubb256((v32qi)x, (v32qi)y); }
static inline v16i16 v16i16_sub(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_psubw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_sub(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_psubw256((v16hi)x, (v16hi)y); }
static inline v8i32 v8i32_sub(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_psubd256((v8si)x, (v8si)y); }
static inline v8u32 v8u32_sub(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_psubd256((v8si)x, (v8si)y); }
static inline v4i64 v4i64_sub(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_psubq256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_sub(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_psubq256((v4di)x, (v4di)y); }

/* Bitwise AND, 256-bit (width-agnostic, routed through the v4di builtin). */
static inline v32i8 v32i8_and(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v32u8 v32u8_and(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_and(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_and(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v8i32 v8i32_and(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v8u32 v8u32_and(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v4i64 v4i64_and(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_and(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_andsi256((v4di)x, (v4di)y); }

/* Bitwise AND-NOT, 256-bit.
 * NOTE(review): like pandn, this computes (~x) & y -- the first operand is
 * the complemented one. */
static inline v32i8 v32i8_andnot(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v32u8 v32u8_andnot(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_andnot(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_andnot(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v8i32 v8i32_andnot(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v8u32 v8u32_andnot(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v4i64 v4i64_andnot(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_andnot(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }

/* Bitwise OR, 256-bit. */
static inline v32i8 v32i8_or(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v32u8 v32u8_or(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_or(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_or(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v8i32 v8i32_or(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v8u32 v8u32_or(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v4i64 v4i64_or(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_or(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_por256((v4di)x, (v4di)y); }

/* Bitwise XOR, 256-bit. */
static inline v32i8 v32i8_xor(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v32u8 v32u8_xor(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_xor(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_xor(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v8i32 v8i32_xor(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v8u32 v8u32_xor(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v4i64 v4i64_xor(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_xor(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_pxor256((v4di)x, (v4di)y); }

/* Logical shift left, 256-bit; the count operand y is a 128-bit vector and,
 * per psll semantics, a single count from its low 64 bits is applied to
 * every lane. */
static inline v16u16 v16u16_sl(v16u16 x, v8u16 y) { return (v16u16)__builtin_ia32_psllw256((v16hi)x, (v8hi)y); }
static inline v8u32 v8u32_sl(v8u32 x, v4u32 y) { return (v8u32)__builtin_ia32_pslld256((v8si)x, (v4si)y); }
static inline v4u64 v4u64_sl(v4u64 x, v2u64 y) { return (v4u64)__builtin_ia32_psllq256((v4di)x, (v2di)y); }

/* Logical shift right, 256-bit. */
static inline v16u16 v16u16_sr(v16u16 x, v8u16 y) { return (v16u16)__builtin_ia32_psrlw256((v16hi)x, (v8hi)y); }
static inline v8u32 v8u32_sr(v8u32 x, v4u32 y) { return (v8u32)__builtin_ia32_psrld256((v8si)x, (v4si)y); }
static inline v4u64 v4u64_sr(v4u64 x, v2u64 y) { return (v4u64)__builtin_ia32_psrlq256((v4di)x, (v2di)y); }

/* Arithmetic shift right, 256-bit (no 64-bit lane form in AVX2). */
static inline v16i16 v16i16_sr(v16i16 x, v8u16 y) { return (v16i16)__builtin_ia32_psraw256((v16hi)x, (v8hi)y); }
static inline v8i32 v8i32_sr(v8i32 x, v4u32 y) { return (v8i32)__builtin_ia32_psrad256((v8si)x, (v4si)y); }

/* Shift left by an immediate count, 256-bit. */
static inline v16u16 v16u16_sli(v16u16 x, uint c) { return (v16u16)__builtin_ia32_psllwi256((v16hi)x, c); }
static inline v8u32 v8u32_sli(v8u32 x, uint c) { return (v8u32)__builtin_ia32_pslldi256((v8si)x, c); }
static inline v4u64 v4u64_sli(v4u64 x, uint c) { return (v4u64)__builtin_ia32_psllqi256((v4di)x, c); }

/* Logical shift right by an immediate count, 256-bit. */
static inline v16u16 v16u16_sri(v16u16 x, uint c) { return (v16u16)__builtin_ia32_psrlwi256((v16hi)x, c); }
static inline v8u32 v8u32_sri(v8u32 x, uint c) { return (v8u32)__builtin_ia32_psrldi256((v8si)x, c); }
static inline v4u64 v4u64_sri(v4u64 x, uint c) { return (v4u64)__builtin_ia32_psrlqi256((v4di)x, c); }

/* Arithmetic shift right by an immediate count, 256-bit. */
static inline v16i16 v16i16_sri(v16i16 x, uint c) { return (v16i16)__builtin_ia32_psrawi256((v16hi)x, c); }
static inline v8i32 v8i32_sri(v8i32 x, uint c) { return (v8i32)__builtin_ia32_psradi256((v8si)x, c); }

/* Lane-wise equality mask, 256-bit. */
static inline v32u8 v32i8_cmpeq(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
static inline v32u8 v32u8_cmpeq(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
static inline v16u16 v16i16_cmpeq(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_cmpeq(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
static inline v8u32 v8i32_cmpeq(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
static inline v8u32 v8u32_cmpeq(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
static inline v4u64 v4i64_cmpeq(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_cmpeq(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }

/* Lane-wise greater-than mask, 256-bit.
 * NOTE(review): pcmpgt* is a SIGNED comparison; the v*u*_cmpgt wrappers
 * apply signed ordering to unsigned lane types -- confirm intended. */
static inline v32u8 v32i8_cmpgt(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
static inline v32u8 v32u8_cmpgt(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
static inline v16u16 v16i16_cmpgt(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_cmpgt(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
static inline v8u32 v8i32_cmpgt(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
static inline v8u32 v8u32_cmpgt(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
static inline v4u64 v4i64_cmpgt(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
static inline v4u64 v4u64_cmpgt(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }

/* vptest CF flag: nonzero iff (~x & y) is all zero. */
static inline int v32i8_testc(v32i8 x, v32i8 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v32u8_testc(v32u8 x, v32u8 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v16i16_testc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v16u16_testc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v8i32_testc(v8i32 x, v8i32 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v8u32_testc(v8u32 x, v8u32 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v4i64_testc(v4i64 x, v4i64 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
static inline int v4u64_testc(v4u64 x, v4u64 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }

/* vptest ZF flag: nonzero iff (x & y) is all zero. */
static inline int v32i8_testz(v32i8 x, v32i8 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v32u8_testz(v32u8 x, v32u8 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v16i16_testz(v16i16 x, v16i16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v16u16_testz(v16u16 x, v16u16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v8i32_testz(v8i32 x, v8i32 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v8u32_testz(v8u32 x, v8u32 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v4i64_testz(v4i64 x, v4i64 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
static inline int v4u64_testz(v4u64 x, v4u64 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }

/* vptest "not zero and not carry": nonzero iff both (x & y) and (~x & y)
 * have at least one set bit. */
static inline int v32i8_testnzc(v32i8 x, v32i8 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v32u8_testnzc(v32u8 x, v32u8 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v16i16_testnzc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v16u16_testnzc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v8i32_testnzc(v8i32 x, v8i32 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v8u32_testnzc(v8u32 x, v8u32 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v4i64_testnzc(v4i64 x, v4i64 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline int v4u64_testnzc(v4u64 x, v4u64 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }

/* Gather the most significant bit of each byte lane into a 32-bit scalar
 * mask (vpmovmskb). */
static inline uint v32u8_msb(v32u8 x) { return __builtin_ia32_pmovmskb256((v32qi)x); }

/* Sign-extend the low lanes of a 128-bit vector to 256 bits (vpmovsx*). */
static inline v16i16 v16i8_ext_v16i16(v16i8 x) { return (v16i16)__builtin_ia32_pmovsxbw256((v16qi)x); }
static inline v8i32 v16i8_ext_v8i32(v16i8 x) { return (v8i32)__builtin_ia32_pmovsxbd256((v16qi)x); }
static inline v4i64 v16i8_ext_v4i64(v16i8 x) { return (v4i64)__builtin_ia32_pmovsxbq256((v16qi)x); }
static inline v8i32 v8i16_ext_v8i32(v8i16 x) { return (v8i32)__builtin_ia32_pmovsxwd256((v8hi)x); }
static inline v4i64 v8i16_ext_v4i64(v8i16 x) { return (v4i64)__builtin_ia32_pmovsxwq256((v8hi)x); }
static inline v4i64 v4i32_ext_v4i64(v4i32 x) { return (v4i64)__builtin_ia32_pmovsxdq256((v4si)x); }

/* Zero-extend the low lanes of a 128-bit vector to 256 bits (vpmovzx*). */
static inline v16u16 v16u8_ext_v16u16(v16u8 x) { return (v16u16)__builtin_ia32_pmovzxbw256((v16qi)x); }
static inline v8u32 v16u8_ext_v8u32(v16u8 x) { return (v8u32)__builtin_ia32_pmovzxbd256((v16qi)x); }
static inline v4u64 v16u8_ext_v4u64(v16u8 x) { return (v4u64)__builtin_ia32_pmovzxbq256((v16qi)x); }
static inline v8u32 v8u16_ext_v8u32(v8u16 x) { return (v8u32)__builtin_ia32_pmovzxwd256((v8hi)x); }
static inline v4u64 v8u16_ext_v4u64(v8u16 x) { return (v4u64)__builtin_ia32_pmovzxwq256((v8hi)x); }
static inline v4u64 v4u32_ext_v4u64(v4u32 x) { return (v4u64)__builtin_ia32_pmovzxdq256((v4si)x); }

/* Byte shuffle (vpshufb).
 * NOTE(review): vpshufb shuffles within each 128-bit half independently; it
 * is not a full 32-byte permute -- confirm callers expect that. */
static inline v32u8 v32u8_shuf(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pshufb256((v32qi)x, (v32qi)y); }

/* vpsign*: negate/zero/keep each lane of x by the sign of the matching lane
 * of y. */
static inline v32i8 v32i8_sign(v32i8 x, v32i8 y) { return (v32i8)__builtin_ia32_psignb256((v32qi)x, (v32qi)y); }
static inline v16i16 v16i16_sign(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_psignw256((v16hi)x, (v16hi)y); }
static inline v8i32 v8i32_sign(v8i32 x, v8i32 y) { return (v8i32)__builtin_ia32_psignd256((v8si)x, (v8si)y); }
#endif

/* Drop the builtin-cast aliases so they don't pollute includers. */
#undef v2di
#undef v4si
#undef v8hi
#undef v16qi
#undef v4di
#undef v8si
#undef v16hi
#undef v32qi