simd-reduce

SIMD reduce implementation and benchmark
git clone git://git.rr3.xyz/simd-reduce

func.c (4888B)


#include <rcx/all.h>
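
// Assumed, not shown here: rcx/all.h is expected to provide memcpy (e.g.
// via <string.h>) plus fixed-width aliases and GCC vector-extension types
// along these lines:
//
//	typedef uint64_t          u64;
//	typedef unsigned __int128 u128;
//	typedef size_t            usize;
//	typedef u64 v4u64 __attribute__((vector_size(32)));
//	typedef u64 v2u64 __attribute__((vector_size(16)));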

// GCC unrolls this completely on -O3.
u128
reduce_scalar1024(u64 *x) {
	u128 s = 0;
	for (usize i = 0; i < 16; i++)
		s += (u128)x[i];
	return s;
}

// GCC does not unroll this on -O3.
u128
reduce_scalar2048(u64 *x) {
	u128 s = 0;
	for (usize i = 0; i < 32; i++)
		s += (u128)x[i];
	return s;
}
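
// The AVX variants below track carries by hand: for unsigned l = a + b,
// the addition overflowed iff l < a. A scalar sketch of the idiom
// (illustrative helper, not part of the original file):
static inline u128
addc64(u64 a, u64 b) {
	u64 l = a + b;         // wraps modulo 2^64 on overflow
	u64 h = (u64)(l < a);  // 0/1 carry out of the addition
	return ((u128)h << 64) | (u128)l;
}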

u128
reduce_avx1024(u64 *x) {
	// 1. Load input
	v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x0, 32);
	v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x4, 32);
	v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x8, 32);
	v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0xc, 32);

	// 2. Reduce to 2x256 bits
	// Lane naming: l_04_15_26_37 holds the low words of {x0+x4, x1+x5,
	// x2+x6, x3+x7}, h_* the matching carries. A vector compare yields a
	// signed 0/~0 mask per lane; casting to unsigned before the shift
	// makes it logical, so each carry lane becomes 0/1 (an arithmetic
	// shift of the signed mask would leave -1 there).
	v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7;
	v4u64 h_04_15_26_37 = (v4u64)(l_04_15_26_37 < x_0_1_2_3) >> 63;
	v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F;
	v4u64 h_8C_9D_AE_BF = (v4u64)(l_8C_9D_AE_BF < x_8_9_A_B) >> 63;
	v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF;
	v4u64 h_048C_159D_26AE_37BF = ((v4u64)(l_048C_159D_26AE_37BF < l_04_15_26_37) >> 63)
		+ h_04_15_26_37 + h_8C_9D_AE_BF;

	// 3. Split to 128-bit halves
	v2u64 l_048C_159D = { l_048C_159D_26AE_37BF[0], l_048C_159D_26AE_37BF[1] };
	v2u64 h_048C_159D = { h_048C_159D_26AE_37BF[0], h_048C_159D_26AE_37BF[1] };
	v2u64 l_26AE_37BF = { l_048C_159D_26AE_37BF[2], l_048C_159D_26AE_37BF[3] };
	v2u64 h_26AE_37BF = { h_048C_159D_26AE_37BF[2], h_048C_159D_26AE_37BF[3] };

	// 4. Reduce to 2x128 bits
	v2u64 l_02468ACE_13579BDF = l_048C_159D + l_26AE_37BF;
	v2u64 h_02468ACE_13579BDF = ((v2u64)(l_02468ACE_13579BDF < l_048C_159D) >> 63)
		+ h_048C_159D + h_26AE_37BF;

	// 5. Split to 64-bit halves
	u64 l_02468ACE = l_02468ACE_13579BDF[0];
	u64 h_02468ACE = h_02468ACE_13579BDF[0];
	u64 l_13579BDF = l_02468ACE_13579BDF[1];
	u64 h_13579BDF = h_02468ACE_13579BDF[1];

	// 6. Reduce to 2x64 bits
	u64 l_0123456789ABCDEF = l_02468ACE + l_13579BDF;
	u64 h_0123456789ABCDEF = (l_0123456789ABCDEF < l_02468ACE)
		+ h_02468ACE + h_13579BDF;

	return ((u128)h_0123456789ABCDEF << 64) | (u128)l_0123456789ABCDEF;
}

// Same reduction tree as reduce_avx1024, with one extra 256-bit level
// for the 32 inputs; carries use the same cast-and-shift as above.
u128
reduce_avx2048(u64 *x) {
	// 1. Load input
	v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x00, 32);
	v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x04, 32);
	v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x08, 32);
	v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0x0c, 32);
	v4u64 x_G_H_I_J; memcpy(&x_G_H_I_J, x+0x10, 32);
	v4u64 x_K_L_M_N; memcpy(&x_K_L_M_N, x+0x14, 32);
	v4u64 x_O_P_Q_R; memcpy(&x_O_P_Q_R, x+0x18, 32);
	v4u64 x_S_T_U_V; memcpy(&x_S_T_U_V, x+0x1c, 32);

	// 2. Reduce to 2x256 bits
	v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7;
	v4u64 h_04_15_26_37 = (v4u64)(l_04_15_26_37 < x_0_1_2_3) >> 63;
	v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F;
	v4u64 h_8C_9D_AE_BF = (v4u64)(l_8C_9D_AE_BF < x_8_9_A_B) >> 63;
	v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF;
	v4u64 h_048C_159D_26AE_37BF = ((v4u64)(l_048C_159D_26AE_37BF < l_04_15_26_37) >> 63)
		+ h_04_15_26_37 + h_8C_9D_AE_BF;
	v4u64 l_GK_HL_IM_JN = x_G_H_I_J + x_K_L_M_N;
	v4u64 h_GK_HL_IM_JN = (v4u64)(l_GK_HL_IM_JN < x_G_H_I_J) >> 63;
	v4u64 l_OS_PT_QU_RV = x_O_P_Q_R + x_S_T_U_V;
	v4u64 h_OS_PT_QU_RV = (v4u64)(l_OS_PT_QU_RV < x_O_P_Q_R) >> 63;
	v4u64 l_GKOS_HLPT_IMQU_JNRV = l_GK_HL_IM_JN + l_OS_PT_QU_RV;
	v4u64 h_GKOS_HLPT_IMQU_JNRV = ((v4u64)(l_GKOS_HLPT_IMQU_JNRV < l_GK_HL_IM_JN) >> 63)
		+ h_GK_HL_IM_JN + h_OS_PT_QU_RV;
	v4u64 l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = l_048C_159D_26AE_37BF + l_GKOS_HLPT_IMQU_JNRV;
	v4u64 h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = ((v4u64)(l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV < l_048C_159D_26AE_37BF) >> 63)
		+ h_048C_159D_26AE_37BF + h_GKOS_HLPT_IMQU_JNRV;

	// 3. Split to 128-bit halves
	v2u64 l_048CGKOS_159DHLPT = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] };
	v2u64 h_048CGKOS_159DHLPT = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] };
	v2u64 l_26AEIMQU_37BFJNRV = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] };
	v2u64 h_26AEIMQU_37BFJNRV = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] };

	// 4. Reduce to 2x128 bits
	v2u64 l_02468ACEGIKMOQSU_13579BDFHJLNPRTV = l_048CGKOS_159DHLPT + l_26AEIMQU_37BFJNRV;
	v2u64 h_02468ACEGIKMOQSU_13579BDFHJLNPRTV = ((v2u64)(l_02468ACEGIKMOQSU_13579BDFHJLNPRTV < l_048CGKOS_159DHLPT) >> 63)
		+ h_048CGKOS_159DHLPT + h_26AEIMQU_37BFJNRV;

	// 5. Split to 64-bit halves
	u64 l_02468ACEGIKMOQSU = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0];
	u64 h_02468ACEGIKMOQSU = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0];
	u64 l_13579BDFHJLNPRTV = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1];
	u64 h_13579BDFHJLNPRTV = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1];

	// 6. Reduce to 2x64 bits
	u64 l_0123456789ABCDEFGHIJKLMNOPQRSTUV = l_02468ACEGIKMOQSU + l_13579BDFHJLNPRTV;
	u64 h_0123456789ABCDEFGHIJKLMNOPQRSTUV = (l_0123456789ABCDEFGHIJKLMNOPQRSTUV < l_02468ACEGIKMOQSU)
		+ h_02468ACEGIKMOQSU + h_13579BDFHJLNPRTV;

	return ((u128)h_0123456789ABCDEFGHIJKLMNOPQRSTUV << 64) | (u128)l_0123456789ABCDEFGHIJKLMNOPQRSTUV;
}
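
// Illustrative self-check, not part of the original file: fill the input
// with values large enough to force carries at every level, then compare
// the scalar and AVX reductions (u128 compares directly with ==).
#include <stdio.h>

int
main(void) {
	u64 x[32];
	for (usize i = 0; i < 32; i++)
		x[i] = ~(u64)0 - (u64)i;
	int ok = reduce_scalar1024(x) == reduce_avx1024(x)
		&& reduce_scalar2048(x) == reduce_avx2048(x);
	puts(ok ? "ok" : "MISMATCH");
	return !ok;
}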