commit 67b68bf991b3688a3657dfe382a92074afc454c4
parent a4fbab1bc49397ad153cd0d75db52315cd210628
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Sat, 6 Jul 2024 16:25:23 -0700
Expand SIMD
Diffstat:
| M | inc/simd.h | | | 142 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------- |
1 file changed, 107 insertions(+), 35 deletions(-)
diff --git a/inc/simd.h b/inc/simd.h
@@ -11,6 +11,7 @@
#endif
/* TODO: MMX, AVX-512 */
+/* TODO: Unaligned 128 bit typedefs, and corresponding load/store intrinsics */
/* 128 bit */
typedef i8 v16i8 __attribute__((vector_size(16)));
@@ -22,33 +23,41 @@ typedef u32 v4u32 __attribute__((vector_size(16)));
typedef i64 v2i64 __attribute__((vector_size(16)));
typedef u64 v2u64 __attribute__((vector_size(16)));
/* These are for casting inputs/output of the GCC builtins. */
-typedef char r_v16qi_ __attribute__((vector_size(16)));
-typedef short r_v8hi_ __attribute__((vector_size(16)));
-typedef int r_v4si_ __attribute__((vector_size(16)));
-typedef long long r_v2di_ __attribute__((vector_size(16)));
+typedef char r_v16qi_ __attribute__((vector_size(16)));
+typedef short r_v8hi_ __attribute__((vector_size(16)));
+typedef int r_v4si_ __attribute__((vector_size(16)));
+typedef long long r_v2di_ __attribute__((vector_size(16)));
#define v16qi r_v16qi_
-#define v8hi r_v8hi_
-#define v4si r_v4si_
-#define v2di r_v2di_
+#define v8hi r_v8hi_
+#define v4si r_v4si_
+#define v2di r_v2di_
/* 256 bit */
-typedef i8 v32i8 __attribute__((vector_size(32)));
-typedef u8 v32u8 __attribute__((vector_size(32)));
-typedef i16 v16i16 __attribute__((vector_size(32)));
-typedef u16 v16u16 __attribute__((vector_size(32)));
-typedef i32 v8i32 __attribute__((vector_size(32)));
-typedef u32 v8u32 __attribute__((vector_size(32)));
-typedef i64 v4i64 __attribute__((vector_size(32)));
-typedef u64 v4u64 __attribute__((vector_size(32)));
+typedef i8 v32i8 __attribute__((vector_size(32)));
+typedef u8 v32u8 __attribute__((vector_size(32)));
+typedef i16 v16i16 __attribute__((vector_size(32)));
+typedef u16 v16u16 __attribute__((vector_size(32)));
+typedef i32 v8i32 __attribute__((vector_size(32)));
+typedef u32 v8u32 __attribute__((vector_size(32)));
+typedef i64 v4i64 __attribute__((vector_size(32)));
+typedef u64 v4u64 __attribute__((vector_size(32)));
+typedef i8 v32i8a1 __attribute__((vector_size(32), aligned(1)));
+typedef u8 v32u8a1 __attribute__((vector_size(32), aligned(1)));
+typedef i16 v16i16a1 __attribute__((vector_size(32), aligned(1)));
+typedef u16 v16u16a1 __attribute__((vector_size(32), aligned(1)));
+typedef i32 v8i32a1 __attribute__((vector_size(32), aligned(1)));
+typedef u32 v8u32a1 __attribute__((vector_size(32), aligned(1)));
+typedef i64 v4i64a1 __attribute__((vector_size(32), aligned(1)));
+typedef u64 v4u64a1 __attribute__((vector_size(32), aligned(1)));
/* These are for casting inputs/output of the GCC builtins. */
-typedef char r_v32qi_ __attribute__((vector_size(32)));
-typedef short r_v16hi_ __attribute__((vector_size(32)));
-typedef int r_v8si_ __attribute__((vector_size(32)));
-typedef long long r_v4di_ __attribute__((vector_size(32)));
+typedef char r_v32qi_ __attribute__((vector_size(32)));
+typedef short r_v16hi_ __attribute__((vector_size(32)));
+typedef int r_v8si_ __attribute__((vector_size(32)));
+typedef long long r_v4di_ __attribute__((vector_size(32)));
#define v32qi r_v32qi_
#define v16hi r_v16hi_
-#define v8si r_v8si_
-#define v4di r_v4di_
+#define v8si r_v8si_
+#define v4di r_v4di_
#ifdef R_HAVE_SSE2
static inline v16i8 v16i8_set(
@@ -74,8 +83,8 @@ static inline v4u32 v4u32_set(u32 x3, u32 x2, u32 x1, u32 x0) { return (v4u32){
static inline v2i64 v2i64_set(i64 x1, i64 x0) { return (v2i64){ x1, x0 }; }
static inline v2u64 v2u64_set(u64 x1, u64 x0) { return (v2u64){ x1, x0 }; }
-static inline v16i8 v16i8_fill(i8 x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
-static inline v16u8 v16u8_fill(u8 x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v16i8 v16i8_fill(i8 x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v16u8 v16u8_fill(u8 x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
static inline v8i16 v8i16_fill(i16 x) { return v8i16_set(x, x, x, x, x, x, x, x); }
static inline v8u16 v8u16_fill(u16 x) { return v8u16_set(x, x, x, x, x, x, x, x); }
static inline v4i32 v4i32_fill(i32 x) { return v4i32_set(x, x, x, x); }
@@ -267,6 +276,42 @@ static inline v8u32 v8u32_fill(u32 x) { return v8u32_set(x, x, x, x, x, x, x,
static inline v4i64 v4i64_fill(i64 x) { return v4i64_set(x, x, x, x); }
static inline v4u64 v4u64_fill(u64 x) { return v4u64_set(x, x, x, x); }
+static inline v32i8 v32i8_load(const v32i8 *p) { return *p; } /* aligned loads: p is only read, so take it as const */
+static inline v32u8 v32u8_load(const v32u8 *p) { return *p; }
+static inline v16i16 v16i16_load(const v16i16 *p) { return *p; }
+static inline v16u16 v16u16_load(const v16u16 *p) { return *p; }
+static inline v8i32 v8i32_load(const v8i32 *p) { return *p; }
+static inline v8u32 v8u32_load(const v8u32 *p) { return *p; }
+static inline v4i64 v4i64_load(const v4i64 *p) { return *p; }
+static inline v4u64 v4u64_load(const v4u64 *p) { return *p; }
+
+static inline v32i8 v32i8_loadu(const v32i8a1 *p) { return *p; } /* unaligned loads via the aligned(1) typedefs; p is only read */
+static inline v32u8 v32u8_loadu(const v32u8a1 *p) { return *p; }
+static inline v16i16 v16i16_loadu(const v16i16a1 *p) { return *p; }
+static inline v16u16 v16u16_loadu(const v16u16a1 *p) { return *p; }
+static inline v8i32 v8i32_loadu(const v8i32a1 *p) { return *p; }
+static inline v8u32 v8u32_loadu(const v8u32a1 *p) { return *p; }
+static inline v4i64 v4i64_loadu(const v4i64a1 *p) { return *p; }
+static inline v4u64 v4u64_loadu(const v4u64a1 *p) { return *p; }
+
+static inline void v32i8_store(v32i8 *p, v32i8 x) { *p = x; }
+static inline void v32u8_store(v32u8 *p, v32u8 x) { *p = x; }
+static inline void v16i16_store(v16i16 *p, v16i16 x) { *p = x; }
+static inline void v16u16_store(v16u16 *p, v16u16 x) { *p = x; }
+static inline void v8i32_store(v8i32 *p, v8i32 x) { *p = x; }
+static inline void v8u32_store(v8u32 *p, v8u32 x) { *p = x; }
+static inline void v4i64_store(v4i64 *p, v4i64 x) { *p = x; }
+static inline void v4u64_store(v4u64 *p, v4u64 x) { *p = x; }
+
+static inline void v32i8_storeu(v32i8a1 *p, v32i8 x) { *p = x; }
+static inline void v32u8_storeu(v32u8a1 *p, v32u8 x) { *p = x; }
+static inline void v16i16_storeu(v16i16a1 *p, v16i16 x) { *p = x; }
+static inline void v16u16_storeu(v16u16a1 *p, v16u16 x) { *p = x; }
+static inline void v8i32_storeu(v8i32a1 *p, v8i32 x) { *p = x; }
+static inline void v8u32_storeu(v8u32a1 *p, v8u32 x) { *p = x; }
+static inline void v4i64_storeu(v4i64a1 *p, v4i64 x) { *p = x; }
+static inline void v4u64_storeu(v4u64a1 *p, v4u64 x) { *p = x; }
+
static inline v32i8 v32i8_add(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
static inline v32u8 v32u8_add(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
static inline v16i16 v16i16_add(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
@@ -343,23 +388,50 @@ static inline v4u64 v4u64_sri(v4u64 x, uint c) { return (v4u64)__builtin_ia3
static inline v16i16 v16i16_sri(v16i16 x, uint c) { return (v16i16)__builtin_ia32_psrawi256((v16hi)x, c); }
static inline v8i32 v8i32_sri(v8i32 x, uint c) { return (v8i32)__builtin_ia32_psradi256((v8si)x, c); }
-static inline v32u8 v32i8_cmpeq(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
-static inline v32u8 v32u8_cmpeq(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32i8_cmpeq(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_cmpeq(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
static inline v16u16 v16i16_cmpeq(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_cmpeq(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
-static inline v8u32 v8i32_cmpeq(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
-static inline v8u32 v8u32_cmpeq(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
-static inline v4u64 v4i64_cmpeq(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
-static inline v4u64 v4u64_cmpeq(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
+static inline v8u32 v8i32_cmpeq(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_cmpeq(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
+static inline v4u64 v4i64_cmpeq(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_cmpeq(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
-static inline v32u8 v32i8_cmpgt(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
-static inline v32u8 v32u8_cmpgt(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32i8_cmpgt(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_cmpgt(v32u8 x, v32u8 y) { return (v32u8)(x > y); } /* typed vector compare: pcmpgtb256 is signed-only and misorders lanes >= 0x80 */
static inline v16u16 v16i16_cmpgt(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
static inline v16u16 v16u16_cmpgt(v16u16 x, v16u16 y) { return (v16u16)(x > y); } /* typed vector compare: pcmpgtw256 is signed-only and misorders lanes >= 0x8000 */
-static inline v8u32 v8i32_cmpgt(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
-static inline v8u32 v8u32_cmpgt(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
-static inline v4u64 v4i64_cmpgt(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
-static inline v4u64 v4u64_cmpgt(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
+static inline v8u32 v8i32_cmpgt(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_cmpgt(v8u32 x, v8u32 y) { return (v8u32)(x > y); } /* typed vector compare: pcmpgtd256 is signed-only and misorders lanes with the top bit set */
+static inline v4u64 v4i64_cmpgt(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_cmpgt(v4u64 x, v4u64 y) { return (v4u64)(x > y); } /* typed vector compare: pcmpgtq256 is signed-only and misorders lanes with the top bit set */
+
+static inline int v32i8_testc(v32i8 x, v32i8 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v32u8_testc(v32u8 x, v32u8 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v16i16_testc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v16u16_testc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v8i32_testc(v8i32 x, v8i32 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v8u32_testc(v8u32 x, v8u32 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v4i64_testc(v4i64 x, v4i64 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v4u64_testc(v4u64 x, v4u64 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+
+static inline int v32i8_testz(v32i8 x, v32i8 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v32u8_testz(v32u8 x, v32u8 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v16i16_testz(v16i16 x, v16i16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v16u16_testz(v16u16 x, v16u16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v8i32_testz(v8i32 x, v8i32 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v8u32_testz(v8u32 x, v8u32 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v4i64_testz(v4i64 x, v4i64 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v4u64_testz(v4u64 x, v4u64 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+
+static inline int v32i8_testnzc(v32i8 x, v32i8 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v32u8_testnzc(v32u8 x, v32u8 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v16i16_testnzc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v16u16_testnzc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v8i32_testnzc(v8i32 x, v8i32 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v8u32_testnzc(v8u32 x, v8u32 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v4i64_testnzc(v4i64 x, v4i64 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v4u64_testnzc(v4u64 x, v4u64 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
static inline uint v32u8_msb(v32u8 x) { return __builtin_ia32_pmovmskb256((v32qi)x); }