func.c (1458B)
1 #include <rcx/all.h> 2 3 // TODO: Try also with 128 bit partial sums. 4 5 u64 6 scan_scalar1024(u64 *y, u64 *x) { 7 u64 s = 0; 8 for (usize i = 0; i < 16; i++) { 9 s += x[i]; 10 y[i] = s; 11 } 12 return s; 13 } 14 15 u64 16 scan_scalar2048(u64 *y, u64 *x) { 17 u64 s = 0; 18 for (usize i = 0; i < 32; i++) { 19 s += x[i]; 20 y[i] = s; 21 } 22 return s; 23 } 24 25 u64 26 scan_scalar2048_unrolled(u64 *y, u64 *x) { 27 u64 s = 0; 28 for (usize i = 0; i < 8; i++) { 29 s += x[i+0]; y[i+0] = s; 30 s += x[i+1]; y[i+1] = s; 31 s += x[i+2]; y[i+2] = s; 32 s += x[i+3]; y[i+3] = s; 33 } 34 return s; 35 } 36 37 u64 38 scan_avx1024_serial(u64 *y, u64 *x) { 39 v4u64 s = { 0, 0, 0, 0 }; 40 for (usize i = 0; i < 4; i++) { 41 u64 x0 = x[i+0]; 42 u64 x1 = x0 + x[i+1]; 43 u64 x2 = x1 + x[i+2]; 44 u64 x3 = x2 + x[i+3]; 45 v4u64 v = { x0, x1, x2, x3 }; 46 v += s; 47 memcpy(&y[i], &v, 32); 48 s = __builtin_shufflevector(v, v, 3, 3, 3, 3); 49 } 50 return s[0]; 51 } 52 53 u64 54 scan_avx1024(u64 *y, u64 *x) { 55 v4u64 z = { 0, 0, 0, 0 }; 56 57 v4u64 s = z; 58 for (usize i = 0; i < 4; i++) { 59 v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+(4u*i), 32); 60 v4u64 x_z_0_1_2 = __builtin_shufflevector(z, x_0_1_2_3, 0, 4, 5, 6); 61 v4u64 x_0_01_12_23 = x_0_1_2_3 + x_z_0_1_2; 62 v4u64 x_z_z_0_01 = __builtin_shufflevector(z, x_0_01_12_23, 0, 0, 4, 5); 63 v4u64 x_0_01_012_0123 = x_0_01_12_23 + x_z_z_0_01; 64 v4u64 x_s0_s01_s012_s0123 = s + x_0_01_012_0123; 65 memcpy(y+(4u*i), &x_s0_s01_s012_s0123, 32); 66 s = __builtin_shufflevector(x_s0_s01_s012_s0123, z, 3, 3, 3, 3); 67 } 68 69 return s[3]; 70 }