commit 29fd9da20d5abe0f67784b3882fd917cdb90c8e9
parent b21c6abd41671c31da3cae3cd576027d15444be0
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Fri, 2 Jun 2023 18:56:21 -0700
Add basic AVX2 bindings
Diffstat:
| M | inc/simd.h | | | 270 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- |
1 file changed, 239 insertions(+), 31 deletions(-)
diff --git a/inc/simd.h b/inc/simd.h
@@ -87,6 +87,38 @@ typedef long long r_v4di_ __attribute__((vector_size(32)));
#define v4di r_v4di_
#ifdef R_HAVE_SSE2
+static inline v16i8 v16i8_set( /* vNxM_set family: assemble a 128-bit vector from N scalar arguments using a vector compound literal. */
+ i8 x15, i8 x14, i8 x13, i8 x12, i8 x11, i8 x10, i8 x09, i8 x08,
+ i8 x07, i8 x06, i8 x05, i8 x04, i8 x03, i8 x02, i8 x01, i8 x00
+) { return (v16i8){ /* Initializers follow argument order, so the first argument (x15) is listed first. NOTE(review): if xNN is meant to name lane NN (as in _mm_set_epi8) this order looks reversed -- confirm against callers. */
+ x15, x14, x13, x12, x11, x10, x09, x08,
+ x07, x06, x05, x04, x03, x02, x01, x00
+}; }
+static inline v16u8 v16u8_set(
+ u8 x15, u8 x14, u8 x13, u8 x12, u8 x11, u8 x10, u8 x09, u8 x08,
+ u8 x07, u8 x06, u8 x05, u8 x04, u8 x03, u8 x02, u8 x01, u8 x00
+) { return (v16u8){
+ x15, x14, x13, x12, x11, x10, x09, x08,
+ x07, x06, x05, x04, x03, x02, x01, x00
+}; }
+static inline v8i16 v8i16_set(i16 x7, i16 x6, i16 x5, i16 x4, i16 x3, i16 x2, i16 x1, i16 x0)
+{ return (v8i16){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
+static inline v8u16 v8u16_set(u16 x7, u16 x6, u16 x5, u16 x4, u16 x3, u16 x2, u16 x1, u16 x0)
+{ return (v8u16){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
+static inline v4i32 v4i32_set(i32 x3, i32 x2, i32 x1, i32 x0) { return (v4i32){ x3, x2, x1, x0 }; }
+static inline v4u32 v4u32_set(u32 x3, u32 x2, u32 x1, u32 x0) { return (v4u32){ x3, x2, x1, x0 }; }
+static inline v2i64 v2i64_set(i64 x1, i64 x0) { return (v2i64){ x1, x0 }; }
+static inline v2u64 v2u64_set(u64 x1, u64 x0) { return (v2u64){ x1, x0 }; }
+/* vNxM_fill family: broadcast one scalar to every element by passing x for each argument of the matching vNxM_set. */
+static inline v16i8 v16i8_fill(i8 x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v16u8 v16u8_fill(u8 x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v8i16 v8i16_fill(i16 x) { return v8i16_set(x, x, x, x, x, x, x, x); }
+static inline v8u16 v8u16_fill(u16 x) { return v8u16_set(x, x, x, x, x, x, x, x); }
+static inline v4i32 v4i32_fill(i32 x) { return v4i32_set(x, x, x, x); }
+static inline v4u32 v4u32_fill(u32 x) { return v4u32_set(x, x, x, x); }
+static inline v2i64 v2i64_fill(i64 x) { return v2i64_set(x, x); }
+static inline v2u64 v2u64_fill(u64 x) { return v2u64_set(x, x); }
+
static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
@@ -149,32 +181,32 @@ static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_ps
static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); }
static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); }
-static inline v8i16 v8i16_sr(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); }
-static inline v4i32 v4i32_sr(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); }
+static inline v8i16 v8i16_sr(v8i16 x, v8u16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); } /* Signed *_sr: wraps the psra* (arithmetic right shift) builtins; the count vector y is unsigned-typed and cast to the builtin's operand type. */
+static inline v4i32 v4i32_sr(v4i32 x, v4u32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); }
-static inline v8u16 v8u16_sli(v8u16 x, int c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); }
-static inline v4u32 v4u32_sli(v4u32 x, int c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); }
-static inline v2u64 v2u64_sli(v2u64 x, int c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); }
+static inline v8u16 v8u16_sli(v8u16 x, uint c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); } /* *_sli: left shift of x by the scalar count c, forwarded unchanged to the psll*i builtins. */
+static inline v4u32 v4u32_sli(v4u32 x, uint c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); }
+static inline v2u64 v2u64_sli(v2u64 x, uint c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); }
-static inline v8u16 v8u16_sri(v8u16 x, int c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); }
-static inline v4u32 v4u32_sri(v4u32 x, int c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); }
-static inline v2u64 v2u64_sri(v2u64 x, int c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); }
+static inline v8u16 v8u16_sri(v8u16 x, uint c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); } /* Unsigned *_sri: right shift by the scalar count c via the psrl*i (logical shift) builtins. */
+static inline v4u32 v4u32_sri(v4u32 x, uint c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); }
+static inline v2u64 v2u64_sri(v2u64 x, uint c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); }
-static inline v8i16 v8i16_sri(v8i16 x, int c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); }
-static inline v4i32 v4i32_sri(v4i32 x, int c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); }
+static inline v8i16 v8i16_sri(v8i16 x, uint c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); } /* Signed *_sri: right shift by the scalar count c via the psra*i (arithmetic shift) builtins. */
+static inline v4i32 v4i32_sri(v4i32 x, uint c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); }
-static inline v16i8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); } /* *_cmpeq: lane equality via the pcmpeq* builtins; signed inputs return the unsigned vector type of the same lane width. */
static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
-static inline v8i16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
-static inline v4i32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
+static inline v4u32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
-static inline v16i8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); } /* *_cmpgt: lane greater-than via the pcmpgt* builtins, returned as an unsigned vector type. NOTE(review): the u8/u16/u32 variants below call the same builtins, which the Intel SDM documents as signed comparisons -- confirm behaviour for operands with the top bit set. */
static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
-static inline v8i16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
-static inline v4i32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
+static inline v4u32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); }
@@ -189,29 +221,205 @@ static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_
#endif
#ifdef R_HAVE_SSE4_1
-static inline v2i64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
+static inline v2u64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); } /* 64-bit lane equality via pcmpeqq; the signed variant returns v2u64 like the unsigned one. */
static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
-static inline v8i16 v16i8_ext16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); }
-static inline v4i32 v16i8_ext32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); }
-static inline v2i64 v16i8_ext64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); }
-static inline v4i32 v8i16_ext32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); }
-static inline v2i64 v8i16_ext64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); }
-static inline v2i64 v4i32_ext64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); }
-
-static inline v8u16 v16u8_ext16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); }
-static inline v4u32 v16u8_ext32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); }
-static inline v2u64 v16u8_ext64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); }
-static inline v4u32 v8u16_ext32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); }
-static inline v2u64 v8u16_ext64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); }
-static inline v2u64 v4u32_ext64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); }
+static inline v8i16 v16i8_ext_v8i16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); } /* <src>_ext_<dst>: widen signed lanes via the pmovsx* (sign-extending) builtins; presumably only the low lanes of x are consumed -- see PMOVSX in the Intel SDM. */
+static inline v4i32 v16i8_ext_v4i32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); }
+static inline v2i64 v16i8_ext_v2i64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); }
+static inline v4i32 v8i16_ext_v4i32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); }
+static inline v2i64 v8i16_ext_v2i64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); }
+static inline v2i64 v4i32_ext_v2i64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); }
+/* <src>_ext_<dst>, unsigned: widen lanes via the pmovzx* (zero-extending) builtins; presumably only the low lanes of x are consumed -- see PMOVZX in the Intel SDM. */
+static inline v8u16 v16u8_ext_v8u16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); }
+static inline v4u32 v16u8_ext_v4u32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); }
+static inline v2u64 v16u8_ext_v2u64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); }
+static inline v4u32 v8u16_ext_v4u32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); }
+static inline v2u64 v8u16_ext_v2u64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); }
+static inline v2u64 v4u32_ext_v2u64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); }
#endif
#ifdef R_HAVE_SSE4_2
-static inline v2i64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
+static inline v2u64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); } /* 64-bit lane greater-than via pcmpgtq, returned as v2u64. NOTE(review): the v2u64 variant below uses the same builtin, documented as a signed comparison -- confirm. */
static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
#endif
+#ifdef R_HAVE_AVX2
+static inline v32i8 v32i8_set( /* 256-bit vNxM_set family: assemble a vector from N scalar arguments using a vector compound literal. */
+ i8 x31, i8 x30, i8 x29, i8 x28, i8 x27, i8 x26, i8 x25, i8 x24,
+ i8 x23, i8 x22, i8 x21, i8 x20, i8 x19, i8 x18, i8 x17, i8 x16,
+ i8 x15, i8 x14, i8 x13, i8 x12, i8 x11, i8 x10, i8 x09, i8 x08,
+ i8 x07, i8 x06, i8 x05, i8 x04, i8 x03, i8 x02, i8 x01, i8 x00
+) { return (v32i8){ /* Initializers follow argument order, so the first argument (x31) is listed first. NOTE(review): if xNN is meant to name lane NN (as in _mm256_set_epi8) this order looks reversed -- confirm against callers. */
+ x31, x30, x29, x28, x27, x26, x25, x24,
+ x23, x22, x21, x20, x19, x18, x17, x16,
+ x15, x14, x13, x12, x11, x10, x09, x08,
+ x07, x06, x05, x04, x03, x02, x01, x00
+}; }
+static inline v32u8 v32u8_set(
+ u8 x31, u8 x30, u8 x29, u8 x28, u8 x27, u8 x26, u8 x25, u8 x24,
+ u8 x23, u8 x22, u8 x21, u8 x20, u8 x19, u8 x18, u8 x17, u8 x16,
+ u8 x15, u8 x14, u8 x13, u8 x12, u8 x11, u8 x10, u8 x09, u8 x08,
+ u8 x07, u8 x06, u8 x05, u8 x04, u8 x03, u8 x02, u8 x01, u8 x00
+) { return (v32u8){
+ x31, x30, x29, x28, x27, x26, x25, x24,
+ x23, x22, x21, x20, x19, x18, x17, x16,
+ x15, x14, x13, x12, x11, x10, x09, x08,
+ x07, x06, x05, x04, x03, x02, x01, x00
+}; }
+static inline v16i16 v16i16_set(
+ i16 x15, i16 x14, i16 x13, i16 x12, i16 x11, i16 x10, i16 x09, i16 x08,
+ i16 x07, i16 x06, i16 x05, i16 x04, i16 x03, i16 x02, i16 x01, i16 x00
+) { return (v16i16){
+ x15, x14, x13, x12, x11, x10, x09, x08,
+ x07, x06, x05, x04, x03, x02, x01, x00
+}; }
+static inline v16u16 v16u16_set(
+ u16 x15, u16 x14, u16 x13, u16 x12, u16 x11, u16 x10, u16 x09, u16 x08,
+ u16 x07, u16 x06, u16 x05, u16 x04, u16 x03, u16 x02, u16 x01, u16 x00
+) { return (v16u16){
+ x15, x14, x13, x12, x11, x10, x09, x08,
+ x07, x06, x05, x04, x03, x02, x01, x00
+}; }
+static inline v8i32 v8i32_set(i32 x7, i32 x6, i32 x5, i32 x4, i32 x3, i32 x2, i32 x1, i32 x0)
+{ return (v8i32){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
+static inline v8u32 v8u32_set(u32 x7, u32 x6, u32 x5, u32 x4, u32 x3, u32 x2, u32 x1, u32 x0)
+{ return (v8u32){ x7, x6, x5, x4, x3, x2, x1, x0 }; }
+static inline v4i64 v4i64_set(i64 x3, i64 x2, i64 x1, i64 x0) { return (v4i64){ x3, x2, x1, x0 }; }
+static inline v4u64 v4u64_set(u64 x3, u64 x2, u64 x1, u64 x0) { return (v4u64){ x3, x2, x1, x0 }; }
+/* 256-bit vNxM_fill family: broadcast one scalar to every element by passing x for each argument of the matching vNxM_set. */
+static inline v32i8 v32i8_fill(i8 x) { return v32i8_set(
+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
+); }
+static inline v32u8 v32u8_fill(u8 x) { return v32u8_set(
+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x,
+ x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x
+); }
+static inline v16i16 v16i16_fill(i16 x) { return v16i16_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v16u16 v16u16_fill(u16 x) { return v16u16_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v8i32 v8i32_fill(i32 x) { return v8i32_set(x, x, x, x, x, x, x, x); }
+static inline v8u32 v8u32_fill(u32 x) { return v8u32_set(x, x, x, x, x, x, x, x); }
+static inline v4i64 v4i64_fill(i64 x) { return v4i64_set(x, x, x, x); }
+static inline v4u64 v4u64_fill(u64 x) { return v4u64_set(x, x, x, x); }
+/* *_add: lane-wise addition via padd{b,w,d,q}256; signed and unsigned variants of a lane width share one builtin and differ only in the casts. */
+static inline v32i8 v32i8_add(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_add(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
+static inline v16i16 v16i16_add(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
+static inline v16u16 v16u16_add(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
+static inline v8i32 v8i32_add(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_paddd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_add(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_paddd256((v8si)x, (v8si)y); }
+static inline v4i64 v4i64_add(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_paddq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_add(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_paddq256((v4di)x, (v4di)y); }
+/* *_sub: lane-wise subtraction (x, y passed in that order) via psub{b,w,d,q}256; signed and unsigned variants share one builtin per lane width. */
+static inline v32i8 v32i8_sub(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_psubb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_sub(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_psubb256((v32qi)x, (v32qi)y); }
+static inline v16i16 v16i16_sub(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_psubw256((v16hi)x, (v16hi)y); }
+static inline v16u16 v16u16_sub(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_psubw256((v16hi)x, (v16hi)y); }
+static inline v8i32 v8i32_sub(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_psubd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_sub(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_psubd256((v8si)x, (v8si)y); }
+static inline v4i64 v4i64_sub(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_psubq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_sub(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_psubq256((v4di)x, (v4di)y); }
+/* *_and: bitwise AND through the single andsi256 builtin; every lane layout is cast to v4di for the call and back to its own type afterwards. */
+static inline v32i8 v32i8_and(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v32u8 v32u8_and(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v16i16 v16i16_and(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v16u16 v16u16_and(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v8i32 v8i32_and(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v8u32 v8u32_and(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v4i64 v4i64_and(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_and(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_andsi256((v4di)x, (v4di)y); }
+/* *_andnot: wrappers over andnotsi256 with operands cast to v4di. NOTE(review): VPANDN is documented as (~first) & second, so x is presumably the operand that gets inverted -- confirm. */
+static inline v32i8 v32i8_andnot(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v32u8 v32u8_andnot(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v16i16 v16i16_andnot(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v16u16 v16u16_andnot(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v8i32 v8i32_andnot(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v8u32 v8u32_andnot(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v4i64 v4i64_andnot(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_andnot(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_andnotsi256((v4di)x, (v4di)y); }
+/* *_por: bitwise OR through por256 with operands cast to v4di. NOTE(review): these carry the instruction's p- prefix while the AND wrappers are named *_and/*_andnot -- confirm the naming is intended. */
+static inline v32i8 v32i8_por(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v32u8 v32u8_por(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v16i16 v16i16_por(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v16u16 v16u16_por(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v8i32 v8i32_por(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v8u32 v8u32_por(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v4i64 v4i64_por(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_por256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_por(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_por256((v4di)x, (v4di)y); }
+/* *_pxor: bitwise XOR through pxor256 with operands cast to v4di. NOTE(review): these carry the instruction's p- prefix while the AND wrappers are named *_and/*_andnot -- confirm the naming is intended. */
+static inline v32i8 v32i8_pxor(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v32u8 v32u8_pxor(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v16i16 v16i16_pxor(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v16u16 v16u16_pxor(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v8i32 v8i32_pxor(v8i32 x, v8i32 y) { return (v8i32) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v8u32 v8u32_pxor(v8u32 x, v8u32 y) { return (v8u32) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v4i64 v4i64_pxor(v4i64 x, v4i64 y) { return (v4i64) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_pxor(v4u64 x, v4u64 y) { return (v4u64) __builtin_ia32_pxor256((v4di)x, (v4di)y); }
+/* *_sl: left shift of the 256-bit x via psll{w,d,q}256; note the count y is a 128-bit vector (v8u16/v4u32/v2u64), not a 256-bit one. */
+static inline v16u16 v16u16_sl(v16u16 x, v8u16 y) { return (v16u16)__builtin_ia32_psllw256((v16hi)x, (v8hi)y); }
+static inline v8u32 v8u32_sl(v8u32 x, v4u32 y) { return (v8u32)__builtin_ia32_pslld256((v8si)x, (v4si)y); }
+static inline v4u64 v4u64_sl(v4u64 x, v2u64 y) { return (v4u64)__builtin_ia32_psllq256((v4di)x, (v2di)y); }
+/* Unsigned *_sr: right shift of the 256-bit x via the psrl{w,d,q}256 (logical shift) builtins; the count y is a 128-bit vector. */
+static inline v16u16 v16u16_sr(v16u16 x, v8u16 y) { return (v16u16)__builtin_ia32_psrlw256((v16hi)x, (v8hi)y); }
+static inline v8u32 v8u32_sr(v8u32 x, v4u32 y) { return (v8u32)__builtin_ia32_psrld256((v8si)x, (v4si)y); }
+static inline v4u64 v4u64_sr(v4u64 x, v2u64 y) { return (v4u64)__builtin_ia32_psrlq256((v4di)x, (v2di)y); }
+/* Signed *_sr: right shift via the psra{w,d}256 (arithmetic shift) builtins; the count y is an unsigned-typed 128-bit vector. No 64-bit lane variant is defined here. */
+static inline v16i16 v16i16_sr(v16i16 x, v8u16 y) { return (v16i16)__builtin_ia32_psraw256((v16hi)x, (v8hi)y); }
+static inline v8i32 v8i32_sr(v8i32 x, v4u32 y) { return (v8i32)__builtin_ia32_psrad256((v8si)x, (v4si)y); }
+/* *_sli: left shift of x by the scalar count c, forwarded unchanged to the psll{w,d,q}i256 builtins. */
+static inline v16u16 v16u16_sli(v16u16 x, uint c) { return (v16u16)__builtin_ia32_psllwi256((v16hi)x, c); }
+static inline v8u32 v8u32_sli(v8u32 x, uint c) { return (v8u32)__builtin_ia32_pslldi256((v8si)x, c); }
+static inline v4u64 v4u64_sli(v4u64 x, uint c) { return (v4u64)__builtin_ia32_psllqi256((v4di)x, c); }
+/* Unsigned *_sri: right shift of x by the scalar count c via the psrl{w,d,q}i256 (logical shift) builtins. */
+static inline v16u16 v16u16_sri(v16u16 x, uint c) { return (v16u16)__builtin_ia32_psrlwi256((v16hi)x, c); }
+static inline v8u32 v8u32_sri(v8u32 x, uint c) { return (v8u32)__builtin_ia32_psrldi256((v8si)x, c); }
+static inline v4u64 v4u64_sri(v4u64 x, uint c) { return (v4u64)__builtin_ia32_psrlqi256((v4di)x, c); }
+/* Signed *_sri: right shift of x by the scalar count c via the psra{w,d}i256 (arithmetic shift) builtins. */
+static inline v16i16 v16i16_sri(v16i16 x, uint c) { return (v16i16)__builtin_ia32_psrawi256((v16hi)x, c); }
+static inline v8i32 v8i32_sri(v8i32 x, uint c) { return (v8i32)__builtin_ia32_psradi256((v8si)x, c); }
+/* *_cmpeq: lane equality via pcmpeq{b,w,d,q}256; both signed and unsigned inputs return the unsigned vector type of the same lane width. */
+static inline v32u8 v32i8_cmpeq(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_cmpeq(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
+static inline v16u16 v16i16_cmpeq(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
+static inline v16u16 v16u16_cmpeq(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
+static inline v8u32 v8i32_cmpeq(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_cmpeq(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
+static inline v4u64 v4i64_cmpeq(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_cmpeq(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
+/* *_cmpgt: lane-wise x > y returned as an unsigned mask-typed vector (all-ones lane where true, zero otherwise). Signed inputs wrap pcmpgt{b,w,d,q}256; unsigned inputs use the vector-extension > operator, because pcmpgt* orders lanes as signed and would misreport operands whose top bit is set (e.g. 200 > 100 in u8). */
+static inline v32u8 v32i8_cmpgt(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_cmpgt(v32u8 x, v32u8 y) { return (v32u8)(x > y); }
+static inline v16u16 v16i16_cmpgt(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
+static inline v16u16 v16u16_cmpgt(v16u16 x, v16u16 y) { return (v16u16)(x > y); }
+static inline v8u32 v8i32_cmpgt(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_cmpgt(v8u32 x, v8u32 y) { return (v8u32)(x > y); }
+static inline v4u64 v4i64_cmpgt(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_cmpgt(v4u64 x, v4u64 y) { return (v4u64)(x > y); }
+/* v32u8_msb: returns the pmovmskb256 result as a uint; presumably one bit per byte lane taken from each byte's most significant bit -- see PMOVMSKB in the Intel SDM. */
+static inline uint v32u8_msb(v32u8 x) { return __builtin_ia32_pmovmskb256((v32qi)x); }
+/* <src>_ext_<dst>, signed: widen a 128-bit source into a 256-bit result via the pmovsx*256 (sign-extending) builtins; presumably only as many low lanes of x as fit the destination are consumed -- see VPMOVSX in the Intel SDM. */
+static inline v16i16 v16i8_ext_v16i16(v16i8 x) { return (v16i16)__builtin_ia32_pmovsxbw256((v16qi)x); }
+static inline v8i32 v16i8_ext_v8i32(v16i8 x) { return (v8i32)__builtin_ia32_pmovsxbd256((v16qi)x); }
+static inline v4i64 v16i8_ext_v4i64(v16i8 x) { return (v4i64)__builtin_ia32_pmovsxbq256((v16qi)x); }
+static inline v8i32 v8i16_ext_v8i32(v8i16 x) { return (v8i32)__builtin_ia32_pmovsxwd256((v8hi)x); }
+static inline v4i64 v8i16_ext_v4i64(v8i16 x) { return (v4i64)__builtin_ia32_pmovsxwq256((v8hi)x); }
+static inline v4i64 v4i32_ext_v4i64(v4i32 x) { return (v4i64)__builtin_ia32_pmovsxdq256((v4si)x); }
+/* <src>_ext_<dst>, unsigned: widen a 128-bit source into a 256-bit result via the pmovzx*256 (zero-extending) builtins; presumably only as many low lanes of x as fit the destination are consumed -- see VPMOVZX in the Intel SDM. */
+static inline v16u16 v16u8_ext_v16u16(v16u8 x) { return (v16u16)__builtin_ia32_pmovzxbw256((v16qi)x); }
+static inline v8u32 v16u8_ext_v8u32(v16u8 x) { return (v8u32)__builtin_ia32_pmovzxbd256((v16qi)x); }
+static inline v4u64 v16u8_ext_v4u64(v16u8 x) { return (v4u64)__builtin_ia32_pmovzxbq256((v16qi)x); }
+static inline v8u32 v8u16_ext_v8u32(v8u16 x) { return (v8u32)__builtin_ia32_pmovzxwd256((v8hi)x); }
+static inline v4u64 v8u16_ext_v4u64(v8u16 x) { return (v4u64)__builtin_ia32_pmovzxwq256((v8hi)x); }
+static inline v4u64 v4u32_ext_v4u64(v4u32 x) { return (v4u64)__builtin_ia32_pmovzxdq256((v4si)x); }
+/* v32u8_shuf: byte shuffle of x controlled by y via pshufb256. NOTE(review): VPSHUFB is documented as shuffling within each 128-bit half independently, so y presumably cannot index across halves -- confirm before relying on cross-lane selection. */
+static inline v32u8 v32u8_shuf(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pshufb256((v32qi)x, (v32qi)y); }
+/* *_sign: wrappers over psign{b,w,d}256 with (x, y) passed in that order; presumably each lane of x is negated, kept or zeroed according to the sign of the matching lane of y -- see VPSIGN in the Intel SDM. */
+static inline v32i8 v32i8_sign(v32i8 x, v32i8 y) { return (v32i8)__builtin_ia32_psignb256((v32qi)x, (v32qi)y); }
+static inline v16i16 v16i16_sign(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_psignw256((v16hi)x, (v16hi)y); }
+static inline v8i32 v8i32_sign(v8i32 x, v8i32 y) { return (v8i32)__builtin_ia32_psignd256((v8si)x, (v8si)y); }
+#endif
+
#undef v2di
#undef v4si
#undef v8hi