rcx

miscellaneous C library
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

simd.h (33818B)


      1 #pragma once
      2 
/* Note: This is a work in progress. Bindings for instructions should be
 * added as needed. We use GCC builtins instead of the portable Intel
 * intrinsics so that we get type-safety and because including x86intrin.h
 * adds about a second to compile times. Also, when working with intrinsics,
 * one will probably accidentally depend on compiler details anyway. */
      8 
      9 #ifndef __GNUC__
     10 #error "rcx/simd.h requires GCC extensions"
     11 #endif
     12 
     13 #include "def.h"
     14 
     15 /* TODO: MMX, AVX-512 */
     16 /* TODO: Unaligned 128 bit typedefs, and corresponding load/store intrinsics */
     17 
/* 128 bit */
/* Naturally-aligned 128-bit vector types built from the fixed-width element
 * types in def.h. vector_size(16) gives them 16-byte alignment by default
 * (contrast the explicitly aligned(1) variants declared for 256-bit below). */
typedef i8  v16i8 __attribute__((vector_size(16)));
typedef u8  v16u8 __attribute__((vector_size(16)));
typedef i16 v8i16 __attribute__((vector_size(16)));
typedef u16 v8u16 __attribute__((vector_size(16)));
typedef i32 v4i32 __attribute__((vector_size(16)));
typedef u32 v4u32 __attribute__((vector_size(16)));
typedef i64 v2i64 __attribute__((vector_size(16)));
typedef u64 v2u64 __attribute__((vector_size(16)));
/* These are for casting inputs/output of the GCC builtins. */
typedef char      r_v16qi_ __attribute__((vector_size(16)));
typedef short     r_v8hi_  __attribute__((vector_size(16)));
typedef int       r_v4si_  __attribute__((vector_size(16)));
typedef long long r_v2di_  __attribute__((vector_size(16)));
/* Short aliases matching GCC's internal builtin type names (v16qi, v8hi, ...).
 * NOTE(review): these are plain #defines with no #undef, so they leak into
 * every translation unit that includes this header — confirm intended. */
#define v16qi r_v16qi_
#define v8hi  r_v8hi_
#define v4si  r_v4si_
#define v2di  r_v2di_
     36 
/* 256 bit */
/* Naturally-aligned (32-byte) 256-bit vector types. */
typedef i8  v32i8    __attribute__((vector_size(32)));
typedef u8  v32u8    __attribute__((vector_size(32)));
typedef i16 v16i16   __attribute__((vector_size(32)));
typedef u16 v16u16   __attribute__((vector_size(32)));
typedef i32 v8i32    __attribute__((vector_size(32)));
typedef u32 v8u32    __attribute__((vector_size(32)));
typedef i64 v4i64    __attribute__((vector_size(32)));
typedef u64 v4u64    __attribute__((vector_size(32)));
/* "a1" variants: identical layout but alignment 1, for pointing at
 * arbitrarily-aligned memory — used by the *_loadu/*_storeu helpers below. */
typedef i8  v32i8a1  __attribute__((vector_size(32), aligned(1)));
typedef u8  v32u8a1  __attribute__((vector_size(32), aligned(1)));
typedef i16 v16i16a1 __attribute__((vector_size(32), aligned(1)));
typedef u16 v16u16a1 __attribute__((vector_size(32), aligned(1)));
typedef i32 v8i32a1  __attribute__((vector_size(32), aligned(1)));
typedef u32 v8u32a1  __attribute__((vector_size(32), aligned(1)));
typedef i64 v4i64a1  __attribute__((vector_size(32), aligned(1)));
typedef u64 v4u64a1  __attribute__((vector_size(32), aligned(1)));
/* These are for casting inputs/output of the GCC builtins. */
typedef char      r_v32qi_ __attribute__((vector_size(32)));
typedef short     r_v16hi_ __attribute__((vector_size(32)));
typedef int       r_v8si_  __attribute__((vector_size(32)));
typedef long long r_v4di_  __attribute__((vector_size(32)));
/* Short aliases matching GCC's internal builtin type names; like the 128-bit
 * ones above, these #defines are never #undef'd. */
#define v32qi r_v32qi_
#define v16hi r_v16hi_
#define v8si  r_v8si_
#define v4di  r_v4di_
     63 
     64 #ifdef R_HAVE_SSE2
/* Lane-wise constructors. GCC vector initializers fill element [0] first, so
 * the FIRST argument (x15 / x7 / x3 / x1) ends up in lane 0 and x00 ends up
 * in the highest lane.
 * NOTE(review): this is the reverse of Intel's _mm_set_* convention, where
 * the first-listed (highest-numbered) argument is the HIGHEST lane — confirm
 * the parameter naming is intended. */
static inline v16i8 v16i8_set(
	i8 x15, i8 x14, i8 x13, i8 x12, i8 x11, i8 x10, i8 x09, i8 x08,
	i8 x07, i8 x06, i8 x05, i8 x04, i8 x03, i8 x02, i8 x01, i8 x00
) { return (v16i8){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v16u8 v16u8_set(
	u8 x15, u8 x14, u8 x13, u8 x12, u8 x11, u8 x10, u8 x09, u8 x08,
	u8 x07, u8 x06, u8 x05, u8 x04, u8 x03, u8 x02, u8 x01, u8 x00
) { return (v16u8){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v8i16 v8i16_set(i16 x7, i16 x6, i16 x5, i16 x4, i16 x3, i16 x2, i16 x1, i16 x0)
{ return (v8i16){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v8u16 v8u16_set(u16 x7, u16 x6, u16 x5, u16 x4, u16 x3, u16 x2, u16 x1, u16 x0)
{ return (v8u16){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v4i32 v4i32_set(i32 x3, i32 x2, i32 x1, i32 x0) { return (v4i32){ x3, x2, x1, x0 }; }
static inline v4u32 v4u32_set(u32 x3, u32 x2, u32 x1, u32 x0) { return (v4u32){ x3, x2, x1, x0 }; }
static inline v2i64 v2i64_set(i64 x1, i64 x0) { return (v2i64){ x1, x0 }; }
static inline v2u64 v2u64_set(u64 x1, u64 x0) { return (v2u64){ x1, x0 }; }
     87 
     88 static inline v16i8 v16i8_fill(i8  x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
     89 static inline v16u8 v16u8_fill(u8  x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
     90 static inline v8i16 v8i16_fill(i16 x) { return v8i16_set(x, x, x, x, x, x, x, x); }
     91 static inline v8u16 v8u16_fill(u16 x) { return v8u16_set(x, x, x, x, x, x, x, x); }
     92 static inline v4i32 v4i32_fill(i32 x) { return v4i32_set(x, x, x, x); }
     93 static inline v4u32 v4u32_fill(u32 x) { return v4u32_set(x, x, x, x); }
     94 static inline v2i64 v2i64_fill(i64 x) { return v2i64_set(x, x); }
     95 static inline v2u64 v2u64_fill(u64 x) { return v2u64_set(x, x); }
     96 
/* Lane-wise wrapping addition (PADDB/PADDW/PADDD/PADDQ). Two's-complement
 * addition is sign-agnostic, so signed and unsigned variants share the same
 * instruction. */
static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_add(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_add(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_add(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
static inline v2i64 v2i64_add(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_add(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }

/* Lane-wise wrapping subtraction (PSUBB/PSUBW/PSUBD/PSUBQ). */
static inline v16i8 v16i8_sub(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_sub(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_sub(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_sub(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_sub(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_sub(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
static inline v2i64 v2i64_sub(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_sub(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }

/* Bitwise AND (PAND). Element width is irrelevant for bitwise ops, so every
 * variant casts through v2di. */
static inline v16i8 v16i8_and(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_and(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_and(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_and(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_and(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_and(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_and(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pand128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_and(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pand128((v2di)x, (v2di)y); }

/* Bitwise AND-NOT (PANDN): computes (~x) & y — the complement applies to the
 * FIRST operand, matching the instruction, not the second. */
static inline v16i8 v16i8_andnot(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_andnot(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_andnot(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_andnot(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_andnot(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_andnot(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_andnot(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_andnot(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }

/* Bitwise OR (POR). */
static inline v16i8 v16i8_or(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_or(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_or(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_or(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_or(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_or(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_or(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_por128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_or(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_por128((v2di)x, (v2di)y); }

/* Bitwise XOR (PXOR). */
static inline v16i8 v16i8_xor(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v16u8 v16u8_xor(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v8i16 v8i16_xor(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v8u16 v8u16_xor(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v4i32 v4i32_xor(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v4u32 v4u32_xor(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v2i64 v2i64_xor(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
static inline v2u64 v2u64_xor(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
    150 
/* Shift left logical by a register count (PSLLW/PSLLD/PSLLQ). The count is
 * taken from the low 64 bits of y — it is NOT per-lane — and counts greater
 * than or equal to the element width zero the whole result. */
static inline v8u16 v8u16_sl(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psllw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4u32_sl(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pslld128((v4si)x, (v4si)y); }
static inline v2u64 v2u64_sl(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psllq128((v2di)x, (v2di)y); }

/* Shift right logical (PSRLW/PSRLD/PSRLQ); same count semantics as above. */
static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psrlw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); }
static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); }

/* Shift right arithmetic, i.e. sign-extending (PSRAW/PSRAD). Note there is no
 * 64-bit arithmetic shift in SSE2 (it arrives with AVX-512). */
static inline v8i16 v8i16_sr(v8i16 x, v8u16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_sr(v4i32 x, v4u32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); }

/* Immediate-count variants of the shifts above. */
static inline v8u16 v8u16_sli(v8u16 x, uint c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); }
static inline v4u32 v4u32_sli(v4u32 x, uint c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); }
static inline v2u64 v2u64_sli(v2u64 x, uint c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); }

static inline v8u16 v8u16_sri(v8u16 x, uint c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); }
static inline v4u32 v4u32_sri(v4u32 x, uint c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); }
static inline v2u64 v2u64_sri(v2u64 x, uint c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); }

static inline v8i16 v8i16_sri(v8i16 x, uint c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); }
static inline v4i32 v4i32_sri(v4i32 x, uint c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); }
    172 
/* Lane-wise compare-equal (PCMPEQB/W/D): a lane becomes all-ones where equal,
 * zero otherwise. Masks are returned as the unsigned vector type. */
static inline v16u8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
static inline v8u16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }

/* Lane-wise compare-greater-than (PCMPGTB/W/D). WARNING: the instruction
 * performs a SIGNED comparison, so the unsigned (v16u8/v8u16/v4u32) variants
 * do NOT implement an unsigned greater-than — e.g. 0x80 > 0x01 yields false
 * for v16u8_cmpgt. Callers needing a true unsigned compare must bias the
 * operands (xor the sign bit) first. */
static inline v16u8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
static inline v8u16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
static inline v4u32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }

/* Gather the most significant bit of each of the 16 bytes into the low 16
 * bits of the result (PMOVMSKB). */
static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); }
    188 #endif
    189 
    190 #ifdef R_HAVE_SSSE3
/* Byte shuffle (PSHUFB): result byte i is x[y[i] & 15], or zero when the
 * selector byte y[i] has its high bit set. */
static inline v16u8 v16u8_shuf(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pshufb128((v16qi)x, (v16qi)y); }

/* Conditional negate (PSIGNB/W/D): per lane, the result is x, -x, or 0
 * according to whether the corresponding lane of y is positive, negative,
 * or zero. */
static inline v16i8 v16i8_sign(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psignb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_sign(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psignw128((v8hi)x, (v8hi)y); }
static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psignd128((v4si)x, (v4si)y); }
    196 #endif
    197 
    198 #ifdef R_HAVE_SSE4_1
/* 64-bit lane compare-equal (PCMPEQQ); all-ones mask where equal. */
static inline v2u64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }

/* Sign-extend the low lanes of x to a wider element type (PMOVSX*). Only as
 * many low lanes as fit the result are consumed (e.g. the low 8 bytes for
 * v16i8 -> v8i16). */
static inline v8i16 v16i8_ext_v8i16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); }
static inline v4i32 v16i8_ext_v4i32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); }
static inline v2i64 v16i8_ext_v2i64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); }
static inline v4i32 v8i16_ext_v4i32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); }
static inline v2i64 v8i16_ext_v2i64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); }
static inline v2i64 v4i32_ext_v2i64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); }

/* Zero-extend the low lanes of x to a wider element type (PMOVZX*). */
static inline v8u16 v16u8_ext_v8u16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); }
static inline v4u32 v16u8_ext_v4u32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); }
static inline v2u64 v16u8_ext_v2u64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); }
static inline v4u32 v8u16_ext_v4u32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); }
static inline v2u64 v8u16_ext_v2u64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); }
static inline v2u64 v4u32_ext_v2u64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); }
    215 #endif
    216 
    217 #ifdef R_HAVE_SSE4_2
/* 64-bit lane compare-greater-than (PCMPGTQ). WARNING: like the narrower
 * PCMPGT forms, this is a SIGNED comparison — v2u64_cmpgt does not implement
 * an unsigned greater-than. */
static inline v2u64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
    220 #endif
    221 
    222 #ifdef R_HAVE_AVX2
/* 256-bit lane-wise constructors. As with the 128-bit *_set functions, the
 * FIRST argument becomes lane 0 (GCC initializer order), so x00 ends up in
 * the highest lane.
 * NOTE(review): reversed relative to Intel's _mm256_set_* naming — confirm
 * the parameter naming is intended. */
static inline v32i8 v32i8_set(
	i8 x31, i8 x30, i8 x29, i8 x28, i8 x27, i8 x26, i8 x25, i8 x24,
	i8 x23, i8 x22, i8 x21, i8 x20, i8 x19, i8 x18, i8 x17, i8 x16,
	i8 x15, i8 x14, i8 x13, i8 x12, i8 x11, i8 x10, i8 x09, i8 x08,
	i8 x07, i8 x06, i8 x05, i8 x04, i8 x03, i8 x02, i8 x01, i8 x00
) { return (v32i8){
	x31, x30, x29, x28, x27, x26, x25, x24,
	x23, x22, x21, x20, x19, x18, x17, x16,
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v32u8 v32u8_set(
	u8 x31, u8 x30, u8 x29, u8 x28, u8 x27, u8 x26, u8 x25, u8 x24,
	u8 x23, u8 x22, u8 x21, u8 x20, u8 x19, u8 x18, u8 x17, u8 x16,
	u8 x15, u8 x14, u8 x13, u8 x12, u8 x11, u8 x10, u8 x09, u8 x08,
	u8 x07, u8 x06, u8 x05, u8 x04, u8 x03, u8 x02, u8 x01, u8 x00
) { return (v32u8){
	x31, x30, x29, x28, x27, x26, x25, x24,
	x23, x22, x21, x20, x19, x18, x17, x16,
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v16i16 v16i16_set(
	i16 x15, i16 x14, i16 x13, i16 x12, i16 x11, i16 x10, i16 x09, i16 x08,
	i16 x07, i16 x06, i16 x05, i16 x04, i16 x03, i16 x02, i16 x01, i16 x00
) { return (v16i16){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v16u16 v16u16_set(
	u16 x15, u16 x14, u16 x13, u16 x12, u16 x11, u16 x10, u16 x09, u16 x08,
	u16 x07, u16 x06, u16 x05, u16 x04, u16 x03, u16 x02, u16 x01, u16 x00
) { return (v16u16){
	x15, x14, x13, x12, x11, x10, x09, x08,
	x07, x06, x05, x04, x03, x02, x01, x00
}; }
static inline v8i32 v8i32_set(i32 x7, i32 x6, i32 x5, i32 x4, i32 x3, i32 x2, i32 x1, i32 x0)
{ return (v8i32){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v8u32 v8u32_set(u32 x7, u32 x6, u32 x5, u32 x4, u32 x3, u32 x2, u32 x1, u32 x0)
{ return (v8u32){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
static inline v4i64 v4i64_set(i64 x3, i64 x2, i64 x1, i64 x0) { return (v4i64){ x3, x2, x1, x0 }; }
static inline v4u64 v4u64_set(u64 x3, u64 x2, u64 x1, u64 x0) { return (v4u64){ x3, x2, x1, x0 }; }
    265 
/* Broadcast a single scalar into every lane of a 256-bit vector. */
static inline v32i8 v32i8_fill(i8 x) { return v32i8_set(
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
); }
static inline v32u8 v32u8_fill(u8 x) { return v32u8_set(
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
	x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
); }
static inline v16i16 v16i16_fill(i16 x) { return v16i16_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v16u16 v16u16_fill(u16 x) { return v16u16_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v8i32   v8i32_fill(i32 x) { return  v8i32_set(x, x, x, x, x, x, x, x); }
static inline v8u32   v8u32_fill(u32 x) { return  v8u32_set(x, x, x, x, x, x, x, x); }
static inline v4i64   v4i64_fill(i64 x) { return  v4i64_set(x, x, x, x); }
static inline v4u64   v4u64_fill(u64 x) { return  v4u64_set(x, x, x, x); }
    280 
/* Aligned 256-bit loads: p must be 32-byte aligned (the pointee type carries
 * that alignment, so the compiler is free to use aligned moves). */
static inline v32i8   v32i8_load(v32i8  *p) { return *p; }
static inline v32u8   v32u8_load(v32u8  *p) { return *p; }
static inline v16i16 v16i16_load(v16i16 *p) { return *p; }
static inline v16u16 v16u16_load(v16u16 *p) { return *p; }
static inline v8i32   v8i32_load(v8i32  *p) { return *p; }
static inline v8u32   v8u32_load(v8u32  *p) { return *p; }
static inline v4i64   v4i64_load(v4i64  *p) { return *p; }
static inline v4u64   v4u64_load(v4u64  *p) { return *p; }

/* Unaligned 256-bit loads: implemented via the alignment-1 typedefs, so any
 * pointer value is acceptable. */
static inline v32i8   v32i8_loadu(v32i8a1  *p) { return *p; }
static inline v32u8   v32u8_loadu(v32u8a1  *p) { return *p; }
static inline v16i16 v16i16_loadu(v16i16a1 *p) { return *p; }
static inline v16u16 v16u16_loadu(v16u16a1 *p) { return *p; }
static inline v8i32   v8i32_loadu(v8i32a1  *p) { return *p; }
static inline v8u32   v8u32_loadu(v8u32a1  *p) { return *p; }
static inline v4i64   v4i64_loadu(v4i64a1  *p) { return *p; }
static inline v4u64   v4u64_loadu(v4u64a1  *p) { return *p; }

/* Aligned 256-bit stores: p must be 32-byte aligned. */
static inline void  v32i8_store(v32i8  *p, v32i8  x) { *p = x; }
static inline void  v32u8_store(v32u8  *p, v32u8  x) { *p = x; }
static inline void v16i16_store(v16i16 *p, v16i16 x) { *p = x; }
static inline void v16u16_store(v16u16 *p, v16u16 x) { *p = x; }
static inline void  v8i32_store(v8i32  *p, v8i32  x) { *p = x; }
static inline void  v8u32_store(v8u32  *p, v8u32  x) { *p = x; }
static inline void  v4i64_store(v4i64  *p, v4i64  x) { *p = x; }
static inline void  v4u64_store(v4u64  *p, v4u64  x) { *p = x; }

/* Unaligned 256-bit stores. */
static inline void  v32i8_storeu(v32i8a1  *p, v32i8  x) { *p = x; }
static inline void  v32u8_storeu(v32u8a1  *p, v32u8  x) { *p = x; }
static inline void v16i16_storeu(v16i16a1 *p, v16i16 x) { *p = x; }
static inline void v16u16_storeu(v16u16a1 *p, v16u16 x) { *p = x; }
static inline void  v8i32_storeu(v8i32a1  *p, v8i32  x) { *p = x; }
static inline void  v8u32_storeu(v8u32a1  *p, v8u32  x) { *p = x; }
static inline void  v4i64_storeu(v4i64a1  *p, v4i64  x) { *p = x; }
static inline void  v4u64_storeu(v4u64a1  *p, v4u64  x) { *p = x; }
    316 
/* 256-bit lane-wise wrapping addition (VPADDB/W/D/Q); sign-agnostic, so the
 * signed and unsigned variants share one instruction. */
static inline v32i8   v32i8_add(v32i8  x, v32i8  y) { return (v32i8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
static inline v32u8   v32u8_add(v32u8  x, v32u8  y) { return (v32u8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
static inline v16i16 v16i16_add(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_add(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
static inline v8i32   v8i32_add(v8i32  x, v8i32  y) { return (v8i32) __builtin_ia32_paddd256((v8si)x, (v8si)y); }
static inline v8u32   v8u32_add(v8u32  x, v8u32  y) { return (v8u32) __builtin_ia32_paddd256((v8si)x, (v8si)y); }
static inline v4i64   v4i64_add(v4i64  x, v4i64  y) { return (v4i64) __builtin_ia32_paddq256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_add(v4u64  x, v4u64  y) { return (v4u64) __builtin_ia32_paddq256((v4di)x, (v4di)y); }

/* 256-bit lane-wise wrapping subtraction (VPSUBB/W/D/Q). */
static inline v32i8   v32i8_sub(v32i8  x, v32i8  y) { return (v32i8) __builtin_ia32_psubb256((v32qi)x, (v32qi)y); }
static inline v32u8   v32u8_sub(v32u8  x, v32u8  y) { return (v32u8) __builtin_ia32_psubb256((v32qi)x, (v32qi)y); }
static inline v16i16 v16i16_sub(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_psubw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_sub(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_psubw256((v16hi)x, (v16hi)y); }
static inline v8i32   v8i32_sub(v8i32  x, v8i32  y) { return (v8i32) __builtin_ia32_psubd256((v8si)x, (v8si)y); }
static inline v8u32   v8u32_sub(v8u32  x, v8u32  y) { return (v8u32) __builtin_ia32_psubd256((v8si)x, (v8si)y); }
static inline v4i64   v4i64_sub(v4i64  x, v4i64  y) { return (v4i64) __builtin_ia32_psubq256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_sub(v4u64  x, v4u64  y) { return (v4u64) __builtin_ia32_psubq256((v4di)x, (v4di)y); }

/* Bitwise AND (VPAND); element width is irrelevant, all variants go through
 * v4di. */
static inline v32i8   v32i8_and(v32i8  x, v32i8  y) { return (v32i8) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v32u8   v32u8_and(v32u8  x, v32u8  y) { return (v32u8) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_and(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_and(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v8i32   v8i32_and(v8i32  x, v8i32  y) { return (v8i32) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v8u32   v8u32_and(v8u32  x, v8u32  y) { return (v8u32) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v4i64   v4i64_and(v4i64  x, v4i64  y) { return (v4i64) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_and(v4u64  x, v4u64  y) { return (v4u64) __builtin_ia32_andsi256((v4di)x, (v4di)y); }

/* Bitwise AND-NOT (VPANDN): computes (~x) & y — the complement applies to
 * the FIRST operand. */
static inline v32i8   v32i8_andnot(v32i8  x, v32i8  y) { return (v32i8) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v32u8   v32u8_andnot(v32u8  x, v32u8  y) { return (v32u8) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_andnot(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_andnot(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v8i32   v8i32_andnot(v8i32  x, v8i32  y) { return (v8i32) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v8u32   v8u32_andnot(v8u32  x, v8u32  y) { return (v8u32) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v4i64   v4i64_andnot(v4i64  x, v4i64  y) { return (v4i64) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_andnot(v4u64  x, v4u64  y) { return (v4u64) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }

/* Bitwise OR (VPOR). */
static inline v32i8   v32i8_or(v32i8  x, v32i8  y) { return (v32i8) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v32u8   v32u8_or(v32u8  x, v32u8  y) { return (v32u8) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_or(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_or(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v8i32   v8i32_or(v8i32  x, v8i32  y) { return (v8i32) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v8u32   v8u32_or(v8u32  x, v8u32  y) { return (v8u32) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v4i64   v4i64_or(v4i64  x, v4i64  y) { return (v4i64) __builtin_ia32_por256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_or(v4u64  x, v4u64  y) { return (v4u64) __builtin_ia32_por256((v4di)x, (v4di)y); }

/* Bitwise XOR (VPXOR). */
static inline v32i8   v32i8_xor(v32i8  x, v32i8  y) { return (v32i8) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v32u8   v32u8_xor(v32u8  x, v32u8  y) { return (v32u8) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v16i16 v16i16_xor(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v16u16 v16u16_xor(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v8i32   v8i32_xor(v8i32  x, v8i32  y) { return (v8i32) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v8u32   v8u32_xor(v8u32  x, v8u32  y) { return (v8u32) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v4i64   v4i64_xor(v4i64  x, v4i64  y) { return (v4i64) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_xor(v4u64  x, v4u64  y) { return (v4u64) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
    370 
/* 256-bit shift left logical (VPSLLW/D/Q). The count comes from the low
 * 64 bits of the 128-bit second operand — it is NOT per-lane — and counts
 * >= the element width zero the result. */
static inline v16u16 v16u16_sl(v16u16 x, v8u16 y) { return (v16u16)__builtin_ia32_psllw256((v16hi)x, (v8hi)y); }
static inline v8u32   v8u32_sl(v8u32  x, v4u32 y) { return  (v8u32)__builtin_ia32_pslld256((v8si)x, (v4si)y); }
static inline v4u64   v4u64_sl(v4u64  x, v2u64 y) { return  (v4u64)__builtin_ia32_psllq256((v4di)x, (v2di)y); }

/* 256-bit shift right logical (VPSRLW/D/Q); same count semantics. */
static inline v16u16 v16u16_sr(v16u16 x, v8u16 y) { return (v16u16)__builtin_ia32_psrlw256((v16hi)x, (v8hi)y); }
static inline v8u32   v8u32_sr(v8u32  x, v4u32 y) { return  (v8u32)__builtin_ia32_psrld256((v8si)x, (v4si)y); }
static inline v4u64   v4u64_sr(v4u64  x, v2u64 y) { return  (v4u64)__builtin_ia32_psrlq256((v4di)x, (v2di)y); }

/* 256-bit shift right arithmetic (VPSRAW/D), sign-extending; no 64-bit form
 * in AVX2. */
static inline v16i16 v16i16_sr(v16i16 x, v8u16 y) { return (v16i16)__builtin_ia32_psraw256((v16hi)x, (v8hi)y); }
static inline v8i32   v8i32_sr(v8i32  x, v4u32 y) { return  (v8i32)__builtin_ia32_psrad256((v8si)x, (v4si)y); }

/* Immediate-count variants of the shifts above. */
static inline v16u16 v16u16_sli(v16u16 x, uint c) { return (v16u16)__builtin_ia32_psllwi256((v16hi)x, c); }
static inline v8u32   v8u32_sli(v8u32  x, uint c) { return  (v8u32)__builtin_ia32_pslldi256((v8si)x, c); }
static inline v4u64   v4u64_sli(v4u64  x, uint c) { return  (v4u64)__builtin_ia32_psllqi256((v4di)x, c); }

static inline v16u16 v16u16_sri(v16u16 x, uint c) { return (v16u16)__builtin_ia32_psrlwi256((v16hi)x, c); }
static inline v8u32   v8u32_sri(v8u32  x, uint c) { return  (v8u32)__builtin_ia32_psrldi256((v8si)x, c); }
static inline v4u64   v4u64_sri(v4u64  x, uint c) { return  (v4u64)__builtin_ia32_psrlqi256((v4di)x, c); }

static inline v16i16 v16i16_sri(v16i16 x, uint c) { return (v16i16)__builtin_ia32_psrawi256((v16hi)x, c); }
static inline v8i32   v8i32_sri(v8i32  x, uint c) { return  (v8i32)__builtin_ia32_psradi256((v8si)x, c); }
    392 
/* 256-bit lane-wise compare-equal (VPCMPEQB/W/D/Q): all-ones in a lane where
 * equal, zero otherwise; masks returned as the unsigned type. */
static inline v32u8   v32i8_cmpeq(v32i8  x, v32i8  y) { return  (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
static inline v32u8   v32u8_cmpeq(v32u8  x, v32u8  y) { return  (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
static inline v16u16 v16i16_cmpeq(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_cmpeq(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
static inline v8u32   v8i32_cmpeq(v8i32  x, v8i32  y) { return  (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
static inline v8u32   v8u32_cmpeq(v8u32  x, v8u32  y) { return  (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
static inline v4u64   v4i64_cmpeq(v4i64  x, v4i64  y) { return  (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_cmpeq(v4u64  x, v4u64  y) { return  (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }

/* 256-bit lane-wise compare-greater-than (VPCMPGTB/W/D/Q). WARNING: SIGNED
 * comparison even in the unsigned variants — see the 128-bit cmpgt note. */
static inline v32u8   v32i8_cmpgt(v32i8  x, v32i8  y) { return  (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
static inline v32u8   v32u8_cmpgt(v32u8  x, v32u8  y) { return  (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
static inline v16u16 v16i16_cmpgt(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_cmpgt(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
static inline v8u32   v8i32_cmpgt(v8i32  x, v8i32  y) { return  (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
static inline v8u32   v8u32_cmpgt(v8u32  x, v8u32  y) { return  (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
static inline v4u64   v4i64_cmpgt(v4i64  x, v4i64  y) { return  (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
static inline v4u64   v4u64_cmpgt(v4u64  x, v4u64  y) { return  (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
    410 
    411 static inline int  v32i8_testc(v32i8  x, v32i8  y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    412 static inline int  v32u8_testc(v32u8  x, v32u8  y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    413 static inline int v16i16_testc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    414 static inline int v16u16_testc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    415 static inline int  v8i32_testc(v8i32  x, v8i32  y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    416 static inline int  v8u32_testc(v8u32  x, v8u32  y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    417 static inline int  v4i64_testc(v4i64  x, v4i64  y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    418 static inline int  v4u64_testc(v4u64  x, v4u64  y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
    419 
    420 static inline int  v32i8_testz(v32i8  x, v32i8  y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    421 static inline int  v32u8_testz(v32u8  x, v32u8  y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    422 static inline int v16i16_testz(v16i16 x, v16i16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    423 static inline int v16u16_testz(v16u16 x, v16u16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    424 static inline int  v8i32_testz(v8i32  x, v8i32  y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    425 static inline int  v8u32_testz(v8u32  x, v8u32  y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    426 static inline int  v4i64_testz(v4i64  x, v4i64  y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    427 static inline int  v4u64_testz(v4u64  x, v4u64  y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
    428 
    429 static inline int  v32i8_testnzc(v32i8  x, v32i8  y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    430 static inline int  v32u8_testnzc(v32u8  x, v32u8  y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    431 static inline int v16i16_testnzc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    432 static inline int v16u16_testnzc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    433 static inline int  v8i32_testnzc(v8i32  x, v8i32  y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    434 static inline int  v8u32_testnzc(v8u32  x, v8u32  y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    435 static inline int  v4i64_testnzc(v4i64  x, v4i64  y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    436 static inline int  v4u64_testnzc(v4u64  x, v4u64  y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
    437 
    438 static inline uint v32u8_msb(v32u8 x) { return __builtin_ia32_pmovmskb256((v32qi)x); }
    439 
    440 static inline v16i16 v16i8_ext_v16i16(v16i8 x) { return (v16i16)__builtin_ia32_pmovsxbw256((v16qi)x); }
    441 static inline v8i32  v16i8_ext_v8i32(v16i8 x)  { return  (v8i32)__builtin_ia32_pmovsxbd256((v16qi)x); }
    442 static inline v4i64  v16i8_ext_v4i64(v16i8 x)  { return  (v4i64)__builtin_ia32_pmovsxbq256((v16qi)x); }
    443 static inline v8i32  v8i16_ext_v8i32(v8i16 x)  { return  (v8i32)__builtin_ia32_pmovsxwd256((v8hi)x); }
    444 static inline v4i64  v8i16_ext_v4i64(v8i16 x)  { return  (v4i64)__builtin_ia32_pmovsxwq256((v8hi)x); }
    445 static inline v4i64  v4i32_ext_v4i64(v4i32 x)  { return  (v4i64)__builtin_ia32_pmovsxdq256((v4si)x); }
    446 
    447 static inline v16u16 v16u8_ext_v16u16(v16u8 x) { return (v16u16)__builtin_ia32_pmovzxbw256((v16qi)x); }
    448 static inline v8u32  v16u8_ext_v8u32(v16u8 x)  { return  (v8u32)__builtin_ia32_pmovzxbd256((v16qi)x); }
    449 static inline v4u64  v16u8_ext_v4u64(v16u8 x)  { return  (v4u64)__builtin_ia32_pmovzxbq256((v16qi)x); }
    450 static inline v8u32  v8u16_ext_v8u32(v8u16 x)  { return  (v8u32)__builtin_ia32_pmovzxwd256((v8hi)x); }
    451 static inline v4u64  v8u16_ext_v4u64(v8u16 x)  { return  (v4u64)__builtin_ia32_pmovzxwq256((v8hi)x); }
    452 static inline v4u64  v4u32_ext_v4u64(v4u32 x)  { return  (v4u64)__builtin_ia32_pmovzxdq256((v4si)x); }
    453 
    454 static inline v32u8 v32u8_shuf(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pshufb256((v32qi)x, (v32qi)y); }
    455 
    456 static inline v32i8   v32i8_sign(v32i8 x,  v32i8 y)  { return  (v32i8)__builtin_ia32_psignb256((v32qi)x, (v32qi)y); }
    457 static inline v16i16 v16i16_sign(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_psignw256((v16hi)x, (v16hi)y); }
    458 static inline v8i32   v8i32_sign(v8i32 x,  v8i32 y)  { return  (v8i32)__builtin_ia32_psignd256((v8si)x, (v8si)y); }
    459 #endif
    460 
    461 #undef v2di
    462 #undef v4si
    463 #undef v8hi
    464 #undef v16qi
    465 #undef v4di
    466 #undef v8si
    467 #undef v16hi
    468 #undef v32qi