commit b3b7fcc9d17a0a71eadd20f9d24790515da6d2b7
Author: Robert Russell <robert@rr3.xyz>
Date: Mon, 22 Dec 2025 11:55:28 -0800
Initial commit
Diffstat:
 A .gitignore |  3 +++
 A Makefile   |  8 ++++++++
 A func.c     | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 A func.h     |  7 +++++++
 A main.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 A whoa.nasm  | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 215 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+func.o
+whoa.o
+main
diff --git a/Makefile b/Makefile
@@ -0,0 +1,8 @@
+main: main.c func.c func.h whoa.nasm
+ nasm -f elf64 whoa.nasm
+	gcc -c -Wall -O3 -march=native func.c
+ gcc -o main -Wall -O3 -march=native main.c func.o whoa.o -lrcx
+
+.PHONY: clean
+clean:
+	rm -f func.o whoa.o main
diff --git a/func.c b/func.c
@@ -0,0 +1,77 @@
+#include <rcx/all.h>
+
+// TODO: Try also with 128-bit partial sums.
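+// Each scan_* computes the inclusive prefix sum y[i] = x[0] + ... + x[i]
+// over 16 (1024-bit) or 32 (2048-bit) u64 inputs and returns the total.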
+
+u64
+scan_scalar1024(u64 *y, u64 *x) {
+ u64 s = 0;
+ for (usize i = 0; i < 16; i++) {
+ s += x[i];
+ y[i] = s;
+ }
+ return s;
+}
+
+u64
+scan_scalar2048(u64 *y, u64 *x) {
+ u64 s = 0;
+ for (usize i = 0; i < 32; i++) {
+ s += x[i];
+ y[i] = s;
+ }
+ return s;
+}
+
+u64
+scan_scalar2048_unrolled(u64 *y, u64 *x) {
+ u64 s = 0;
+	for (usize i = 0; i < 32; i += 4) { // step by the unroll factor
+ s += x[i+0]; y[i+0] = s;
+ s += x[i+1]; y[i+1] = s;
+ s += x[i+2]; y[i+2] = s;
+ s += x[i+3]; y[i+3] = s;
+ }
+ return s;
+}
+
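+// Partially vectorized: each block of four is scanned serially in scalar
+// registers; the running total, kept broadcast in s, is added in one vector add.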
+u64
+scan_avx1024_serial(u64 *y, u64 *x) {
+ v4u64 s = { 0, 0, 0, 0 };
+ for (usize i = 0; i < 4; i++) {
+		u64 x0 = x[4*i+0]; // block i covers x[4i .. 4i+3]
+		u64 x1 = x0 + x[4*i+1];
+		u64 x2 = x1 + x[4*i+2];
+		u64 x3 = x2 + x[4*i+3];
+		v4u64 v = { x0, x1, x2, x3 };
+		v += s; // add the running total, broadcast in s
+		memcpy(&y[4*i], &v, 32);
+ s = __builtin_shufflevector(v, v, 3, 3, 3, 3);
+ }
+ return s[0];
+}
+
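+// Fully vectorized: an in-register Hillis-Steele scan. Two shift-and-add
+// steps (shuffle in zeros, then add) build each block's prefix sums; the
+// running total is carried between iterations as a broadcast in s.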
+u64
+scan_avx1024(u64 *y, u64 *x) {
+ v4u64 z = { 0, 0, 0, 0 };
+
+ v4u64 s = z;
+ for (usize i = 0; i < 4; i++) {
+ v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+(4u*i), 32);
+ v4u64 x_z_0_1_2 = __builtin_shufflevector(z, x_0_1_2_3, 0, 4, 5, 6);
+ v4u64 x_0_01_12_23 = x_0_1_2_3 + x_z_0_1_2;
+ v4u64 x_z_z_0_01 = __builtin_shufflevector(z, x_0_01_12_23, 0, 0, 4, 5);
+ v4u64 x_0_01_012_0123 = x_0_01_12_23 + x_z_z_0_01;
+ v4u64 x_s0_s01_s012_s0123 = s + x_0_01_012_0123;
+ memcpy(y+(4u*i), &x_s0_s01_s012_s0123, 32);
+ s = __builtin_shufflevector(x_s0_s01_s012_s0123, z, 3, 3, 3, 3);
+ }
+
+ return s[3];
+}
diff --git a/func.h b/func.h
@@ -0,0 +1,7 @@
+#include <rcx/def.h>
+
+u64 scan_scalar1024(u64 *y, u64 *x);
+u64 scan_scalar2048(u64 *y, u64 *x);
+u64 scan_scalar2048_unrolled(u64 *y, u64 *x);
+u64 scan_avx1024_serial(u64 *y, u64 *x);
+u64 scan_avx1024(u64 *y, u64 *x);
diff --git a/main.c b/main.c
@@ -0,0 +1,48 @@
+#include <rcx/all.h>
+#include <rcx/bench.h>
+#include <stdio.h>
+
+#include "func.h"
+
+extern u64 scan_avx1024_whoa(u64 *y, u64 *x);
+
+u64 in[32], out[32];
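+// Volatile sink: each benchmark iteration must store its result here,
+// so the compiler cannot optimize the benchmarked calls away.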
+volatile u64 out_total;
+
+#define BENCHMARK(f) \
+ void \
+ benchmark_##f(u64 N) { \
+ r_bench_start(); \
+ for (u64 n = 0; n < N; n++) \
+ out_total = f(out, in); \
+ r_bench_stop(); \
+ }
+
+BENCHMARK(scan_scalar1024)
+BENCHMARK(scan_scalar2048)
+BENCHMARK(scan_scalar2048_unrolled)
+BENCHMARK(scan_avx1024_serial)
+BENCHMARK(scan_avx1024)
+BENCHMARK(scan_avx1024_whoa)
+
+int
+main(void) {
+ printf("%lu\n", scan_avx1024_whoa(out, (u64[16]){
+ 0x0, 0x1, 0x2, 0x3,
+ 0x4, 0x5, 0x6, 0x7,
+ 0x8, 0x9, 0xa, 0xb,
+ 0xc, 0xd, 0xe, 0xf,
+ }));
+ printf("%lu %lu %lu %lu\n", out[0], out[1], out[2], out[3]);
+ printf("%lu %lu %lu %lu\n", out[4], out[5], out[6], out[7]);
+ printf("%lu %lu %lu %lu\n", out[8], out[9], out[10], out[11]);
+ printf("%lu %lu %lu %lu\n", out[12], out[13], out[14], out[15]);
+ r_bench(benchmark_scan_scalar1024, 3000);
+ r_bench(benchmark_scan_scalar2048, 3000);
+ r_bench(benchmark_scan_scalar2048_unrolled, 3000);
+ r_bench(benchmark_scan_avx1024_serial, 3000);
+ r_bench(benchmark_scan_avx1024, 3000);
+ r_bench(benchmark_scan_avx1024_whoa, 3000);
+}
diff --git a/whoa.nasm b/whoa.nasm
@@ -0,0 +1,72 @@
+default rel
+
+section .text
+
+; RAX RDI RSI
+; u64 scan_avx1024_whoa(u64 *y, u64 *x);
+global scan_avx1024_whoa
+scan_avx1024_whoa:
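+	; Plan: transpose the 16 input qwords as a 4x4 matrix so each register
+	; holds one element of every 4-element block, scan vertically across
+	; registers, scan the block sums, add them back, transpose back. (AVX2)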
+ ; 1. Load
+ vmovdqu ymm0, [rsi+00]
+ vmovdqu ymm1, [rsi+32]
+ vmovdqu ymm2, [rsi+64]
+ vmovdqu ymm3, [rsi+96]
+
+ ; 2. Transpose
+ vpunpcklqdq ymm4, ymm0, ymm1
+ vpunpckhqdq ymm5, ymm0, ymm1
+ vpunpcklqdq ymm6, ymm2, ymm3
+ vpunpckhqdq ymm7, ymm2, ymm3
+ vperm2i128 ymm0, ymm4, ymm6, 0x20
+ vperm2i128 ymm1, ymm5, ymm7, 0x20
+ vperm2i128 ymm2, ymm4, ymm6, 0x31
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
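+	; ymmK now holds element K of each block:
+	; ymm0 = {x0,x4,x8,x12} .. ymm3 = {x3,x7,x11,x15}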
+
+ ; 3. Scan blocks
+ vpaddq ymm1, ymm1, ymm0
+ vpaddq ymm2, ymm2, ymm1
+ vpaddq ymm3, ymm3, ymm2
+ ; ymm3 now contains block sums
+
+ ; 4. Scan sums (intra-register exclusive scan)
+	vpslldq ymm4, ymm3, 8 ; per 128-bit lane: ymm4 = { 0, s0, 0, s2 }
+	vpaddq ymm4, ymm3, ymm4 ; ymm4 = { s0, s0+s1, s2, s2+s3 }
+	vpermq ymm5, ymm4, 0x50 ; ymm5 = { s0, s0, s0+s1, s0+s1 }
+	vpxor ymm6, ymm6, ymm6 ; ymm6 = 0
+	vblendpd ymm5, ymm5, ymm6, 3 ; zero low half: ymm5 = { 0, 0, s0+s1, s0+s1 }
+	vpaddq ymm4, ymm4, ymm5 ; inclusive: { s0, s0+s1, s0+s1+s2, s0+s1+s2+s3 }
+	vpsubq ymm4, ymm4, ymm3 ; exclusive: { 0, s0, s0+s1, s0+s1+s2 }
+
+ ; 5. Add in scanned block sums
+ vpaddq ymm0, ymm0, ymm4
+ vpaddq ymm1, ymm1, ymm4
+ vpaddq ymm2, ymm2, ymm4
+ vpaddq ymm3, ymm3, ymm4
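+	; Every lane now holds its final inclusive prefix sum, still transposed.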
+
+ ; 6. Transpose
+ vpunpcklqdq ymm4, ymm0, ymm1
+ vpunpckhqdq ymm5, ymm0, ymm1
+ vpunpcklqdq ymm6, ymm2, ymm3
+ vpunpckhqdq ymm7, ymm2, ymm3
+ vperm2i128 ymm0, ymm4, ymm6, 0x20
+ vperm2i128 ymm1, ymm5, ymm7, 0x20
+ vperm2i128 ymm2, ymm4, ymm6, 0x31
+ vperm2i128 ymm3, ymm5, ymm7, 0x31
+
+ ; 7. Store
+ vmovdqu [rdi+00], ymm0
+ vmovdqu [rdi+32], ymm1
+ vmovdqu [rdi+64], ymm2
+ vmovdqu [rdi+96], ymm3
+
+	; 8. Return the total: after step 6, ymm3 = { y12, y13, y14, y15 }
+	vextracti128 xmm4, ymm3, 1 ; xmm4 = { y14, y15 }
+	vpextrq rax, xmm4, 1 ; rax = y15, the overall sum
+	vzeroupper ; avoid AVX->SSE transition stalls in the caller
+ ret