func.c (4888B)
1 #include <rcx/all.h> 2 3 // GCC unrolls this completely on -O3. 4 u128 5 reduce_scalar1024(u64 *x) { 6 u128 s = 0; 7 for (usize i = 0; i < 16; i++) 8 s += (u128)x[i]; 9 return s; 10 } 11 12 // GCC does not unroll this on -O3. 13 u128 14 reduce_scalar2048(u64 *x) { 15 u128 s = 0; 16 for (usize i = 0; i < 32; i++) 17 s += (u128)x[i]; 18 return s; 19 } 20 21 u128 22 reduce_avx1024(u64 *x) { 23 // 1. Load input 24 v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x0, 32); 25 v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x4, 32); 26 v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x8, 32); 27 v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0xc, 32); 28 29 // 2. Reduce to 2x256 bits 30 v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7; 31 v4u64 h_04_15_26_37 = (l_04_15_26_37 < x_0_1_2_3) >> 63; 32 v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F; 33 v4u64 h_8C_9D_AE_BF = (l_8C_9D_AE_BF < x_8_9_A_B) >> 63; 34 v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF; 35 v4u64 h_048C_159D_26AE_37BF = ((l_048C_159D_26AE_37BF < l_04_15_26_37) >> 63) 36 + h_04_15_26_37 + h_8C_9D_AE_BF; 37 38 // 3. Split to 128-bit halves 39 v2u64 l_048C_159D = { l_048C_159D_26AE_37BF[0], l_048C_159D_26AE_37BF[1] }; 40 v2u64 h_048C_159D = { h_048C_159D_26AE_37BF[0], h_048C_159D_26AE_37BF[1] }; 41 v2u64 l_26AE_37BF = { l_048C_159D_26AE_37BF[2], l_048C_159D_26AE_37BF[3] }; 42 v2u64 h_26AE_37BF = { h_048C_159D_26AE_37BF[2], h_048C_159D_26AE_37BF[3] }; 43 44 // 4. Reduce to 2x128 bits 45 v2u64 l_02468ACE_13579BDF = l_048C_159D + l_26AE_37BF; 46 v2u64 h_02468ACE_13579BDF = ((l_02468ACE_13579BDF < l_048C_159D) >> 63) 47 + h_048C_159D + h_26AE_37BF; 48 49 // 5. Split to 64-bits halves 50 u64 l_02468ACE = l_02468ACE_13579BDF[0]; 51 u64 h_02468ACE = h_02468ACE_13579BDF[0]; 52 u64 l_13579BDF = l_02468ACE_13579BDF[1]; 53 u64 h_13579BDF = h_02468ACE_13579BDF[1]; 54 55 // 6. Reduce to 2x64 bits 56 u64 l_0123456789ABCDEF = l_02468ACE + l_13579BDF; 57 u64 h_0123456789ABCDEF = (l_0123456789ABCDEF < l_02468ACE) 58 + h_02468ACE + h_13579BDF; 59 60 return ((u128)h_0123456789ABCDEF << 64) | (u128)l_0123456789ABCDEF; 61 } 62 63 u128 64 reduce_avx2048(u64 *x) { 65 // 1. Load input 66 v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x00, 32); 67 v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x04, 32); 68 v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x08, 32); 69 v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0x0c, 32); 70 v4u64 x_G_H_I_J; memcpy(&x_G_H_I_J, x+0x10, 32); 71 v4u64 x_K_L_M_N; memcpy(&x_K_L_M_N, x+0x14, 32); 72 v4u64 x_O_P_Q_R; memcpy(&x_O_P_Q_R, x+0x18, 32); 73 v4u64 x_S_T_U_V; memcpy(&x_S_T_U_V, x+0x1c, 32); 74 75 // 2. Reduce to 2x256 bits 76 v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7; 77 v4u64 h_04_15_26_37 = (l_04_15_26_37 < x_0_1_2_3) >> 63; 78 v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F; 79 v4u64 h_8C_9D_AE_BF = (l_8C_9D_AE_BF < x_8_9_A_B) >> 63; 80 v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF; 81 v4u64 h_048C_159D_26AE_37BF = ((l_048C_159D_26AE_37BF < l_04_15_26_37) >> 63) 82 + h_04_15_26_37 + h_8C_9D_AE_BF; 83 v4u64 l_GK_HL_IM_JN = x_G_H_I_J + x_K_L_M_N; 84 v4u64 h_GK_HL_IM_JN = (l_GK_HL_IM_JN < x_G_H_I_J) >> 63; 85 v4u64 l_OS_PT_QU_RV = x_O_P_Q_R + x_S_T_U_V; 86 v4u64 h_OS_PT_QU_RV = (l_OS_PT_QU_RV < x_O_P_Q_R) >> 63; 87 v4u64 l_GKOS_HLPT_IMQU_JNRV = l_GK_HL_IM_JN + l_OS_PT_QU_RV; 88 v4u64 h_GKOS_HLPT_IMQU_JNRV = ((l_GKOS_HLPT_IMQU_JNRV < l_GK_HL_IM_JN) >> 63) 89 + h_GK_HL_IM_JN + h_OS_PT_QU_RV; 90 v4u64 l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = l_048C_159D_26AE_37BF + l_GKOS_HLPT_IMQU_JNRV; 91 v4u64 h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = ((l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV < l_048C_159D_26AE_37BF) >> 63) 92 + h_048C_159D_26AE_37BF + h_GKOS_HLPT_IMQU_JNRV; 93 94 // 3. Split to 128-bit halves 95 v2u64 l_048CGKOS_159DHLPT = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] }; 96 v2u64 h_048CGKOS_159DHLPT = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] }; 97 v2u64 l_26AEIMQU_37BFJNRV = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] }; 98 v2u64 h_26AEIMQU_37BFJNRV = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] }; 99 100 // 4. Reduce to 2x128 bits 101 v2u64 l_02468ACEGIKMOQSU_13579BDFHJLNPRTV = l_048CGKOS_159DHLPT + l_26AEIMQU_37BFJNRV; 102 v2u64 h_02468ACEGIKMOQSU_13579BDFHJLNPRTV = ((l_02468ACEGIKMOQSU_13579BDFHJLNPRTV < l_048CGKOS_159DHLPT) >> 63) 103 + h_048CGKOS_159DHLPT + h_26AEIMQU_37BFJNRV; 104 105 // 5. Split to 64-bits halves 106 u64 l_02468ACEGIKMOQSU = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0]; 107 u64 h_02468ACEGIKMOQSU = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0]; 108 u64 l_13579BDFHJLNPRTV = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1]; 109 u64 h_13579BDFHJLNPRTV = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1]; 110 111 // 6. Reduce to 2x64 bits 112 u64 l_0123456789ABCDEFGHIJKLMNOPQRSTUV = l_02468ACEGIKMOQSU + l_13579BDFHJLNPRTV; 113 u64 h_0123456789ABCDEFGHIJKLMNOPQRSTUV = (l_0123456789ABCDEFGHIJKLMNOPQRSTUV < l_02468ACEGIKMOQSU) 114 + h_02468ACEGIKMOQSU + h_13579BDFHJLNPRTV; 115 116 return ((u128)h_0123456789ABCDEFGHIJKLMNOPQRSTUV << 64) | (u128)l_0123456789ABCDEFGHIJKLMNOPQRSTUV; 117 }