simd-scan

SIMD scan implementation and benchmark
git clone git://git.rr3.xyz/simd-scan
Log | Files | Refs

func.c (1458B)


      1 #include <rcx/all.h>
      2 
      3 // TODO: Try also with 128 bit partial sums.
      4 
      5 u64
      6 scan_scalar1024(u64 *y, u64 *x) {
      7 	u64 s = 0;
      8 	for (usize i = 0; i < 16; i++) {
      9 		s += x[i];
     10 		y[i] = s;
     11 	}
     12 	return s;
     13 }
     14 
     15 u64
     16 scan_scalar2048(u64 *y, u64 *x) {
     17 	u64 s = 0;
     18 	for (usize i = 0; i < 32; i++) {
     19 		s += x[i];
     20 		y[i] = s;
     21 	}
     22 	return s;
     23 }
     24 
     25 u64
     26 scan_scalar2048_unrolled(u64 *y, u64 *x) {
     27 	u64 s = 0;
     28 	for (usize i = 0; i < 8; i++) {
     29 		s += x[i+0]; y[i+0] = s;
     30 		s += x[i+1]; y[i+1] = s;
     31 		s += x[i+2]; y[i+2] = s;
     32 		s += x[i+3]; y[i+3] = s;
     33 	}
     34 	return s;
     35 }
     36 
     37 u64
     38 scan_avx1024_serial(u64 *y, u64 *x) {
     39 	v4u64 s = { 0, 0, 0, 0 };
     40 	for (usize i = 0; i < 4; i++) {
     41 		u64 x0 = x[i+0];
     42 		u64 x1 = x0 + x[i+1];
     43 		u64 x2 = x1 + x[i+2];
     44 		u64 x3 = x2 + x[i+3];
     45 		v4u64 v = { x0, x1, x2, x3 };
     46 		v += s;
     47 		memcpy(&y[i], &v, 32);
     48 		s = __builtin_shufflevector(v, v, 3, 3, 3, 3);
     49 	}
     50 	return s[0];
     51 }
     52 
     53 u64
     54 scan_avx1024(u64 *y, u64 *x) {
     55 	v4u64 z = { 0, 0, 0, 0 };
     56 
     57 	v4u64 s = z;
     58 	for (usize i = 0; i < 4; i++) {
     59 		v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+(4u*i), 32);
     60 		v4u64 x_z_0_1_2 = __builtin_shufflevector(z, x_0_1_2_3, 0, 4, 5, 6);
     61 		v4u64 x_0_01_12_23 = x_0_1_2_3 + x_z_0_1_2;
     62 		v4u64 x_z_z_0_01 = __builtin_shufflevector(z, x_0_01_12_23, 0, 0, 4, 5);
     63 		v4u64 x_0_01_012_0123 = x_0_01_12_23 + x_z_z_0_01;
     64 		v4u64 x_s0_s01_s012_s0123 = s + x_0_01_012_0123;
     65 		memcpy(y+(4u*i), &x_s0_s01_s012_s0123, 32);
     66 		s = __builtin_shufflevector(x_s0_s01_s012_s0123, z, 3, 3, 3, 3);
     67 	}
     68 
     69 	return s[3];
     70 }