rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit 85a6abca80562cee013da614b4fc6c6ec1b67fc3
parent ae8e306e4c19015767a8b50cb3ceafffab4d3f2d
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sun, 28 May 2023 13:34:38 -0700

Optimize popcnt functions

Diffstat:
Minc/bits.h | 54++++++++++++++++++++++++++++++++----------------------
Minc/def.h | 2+-
2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/inc/bits.h b/inc/bits.h @@ -4,51 +4,60 @@ #include "def.h" -/* The popcnt functions are implemented with a divide and conquer strategy. - * See Henry Warren, Hacker's Delight, 2 ed., sec. 5.1. */ +/* The GCC builtin's do not use hardware popcnt unless an appropriate -march + * is specified, in which case __POPCNT__ is defined. Otherwise, GCC emits + * calls to libgcc which are slower than the below custom implementions. */ +#if defined(__GNUC__) && defined(__POPCNT__) + +static inline int r_popcnt8 (u8 n) { return __builtin_popcount(n); } +static inline int r_popcnt16(u16 n) { return __builtin_popcount(n); } +static inline int r_popcnt32(u32 n) { return __builtin_popcountl(n); } +static inline int r_popcnt64(u64 n) { return __builtin_popcountll(n); } + +#else + +/* See Henry Warren, "Hacker's Delight", 2 ed., sec. 5.1. */ static inline int r_popcnt8(u8 n) { - n = (n & U8_C(0x55)) + ((n>>1) & U8_C(0x55)); - n = (n & U8_C(0x33)) + ((n>>2) & U8_C(0x33)); - n = (n & U8_C(0x0f)) + ((n>>4) & U8_C(0x0f)); - return n; + static u8 popcnt4[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + return popcnt4[n&0x0f] + popcnt4[n>>4]; } static inline int r_popcnt16(u16 n) { - n = (n & U16_C(0x5555)) + ((n>>1) & U16_C(0x5555)); - n = (n & U16_C(0x3333)) + ((n>>2) & U16_C(0x3333)); - n = (n & U16_C(0x0f0f)) + ((n>>4) & U16_C(0x0f0f)); - n = (n & U16_C(0x00ff)) + ((n>>8) & U16_C(0x00ff)); - return n; + return r_popcnt8(n&0xff) + r_popcnt8(n>>8); } static inline int r_popcnt32(u32 n) { - n = (n & U32_C(0x55555555)) + ((n>>1) & U32_C(0x55555555)); - n = (n & U32_C(0x33333333)) + ((n>>2) & U32_C(0x33333333)); - n = (n & U32_C(0x0f0f0f0f)) + ((n>>4) & U32_C(0x0f0f0f0f)); - n = (n & U32_C(0x00ff00ff)) + ((n>>8) & U32_C(0x00ff00ff)); - n = (n & U32_C(0x0000ffff)) + ((n>>16) & U32_C(0x0000ffff)); + n = n - ((n>>1) & U32_C(0x55555555)); + n = (n & U32_C(0x33333333)) + ((n>>2) & U32_C(0x33333333)); + n = (n + (n>>4)) & U32_C(0x0f0f0f0f); + n = (u32)(n * U32_C(0x01010101)) >> 24; return n; } static inline int r_popcnt64(u64 n) { - n = (n & U64_C(0x5555555555555555)) + ((n>>1) & U64_C(0x5555555555555555)); - n = (n & U64_C(0x3333333333333333)) + ((n>>2) & U64_C(0x3333333333333333)); - n = (n & U64_C(0x0f0f0f0f0f0f0f0f)) + ((n>>4) & U64_C(0x0f0f0f0f0f0f0f0f)); - n = (n & U64_C(0x00ff00ff00ff00ff)) + ((n>>8) & U64_C(0x00ff00ff00ff00ff)); - n = (n & U64_C(0x0000ffff0000ffff)) + ((n>>16) & U64_C(0x0000ffff0000ffff)); - n = (n & U64_C(0x00000000ffffffff)) + ((n>>32) & U64_C(0x00000000ffffffff)); + n = n - ((n>>1) & U64_C(0x5555555555555555)); + n = (n & U64_C(0x3333333333333333)) + ((n>>2) & U64_C(0x3333333333333333)); + n = (n + (n>>4)) & U64_C(0x0f0f0f0f0f0f0f0f); + n = (u64)(n * U64_C(0x0101010101010101)) >> 56; return n; } +#endif + /* Perform a full-width multiply of x and y, storing the upper (resp., lower) * 64 bits of the product in *h (resp., *l). */ static inline void r_mul64(u64 *h, u64 *l, u64 x, u64 y) { +#ifdef R_HAVE_128 + u128 xy = (u128)x * (u128)y; + *l = xy; + *h = xy >> 64; +#else const u64 m = (U64_C(1)<<32) - 1; u64 x0 = x & m; @@ -65,6 +74,7 @@ r_mul64(u64 *h, u64 *l, u64 x, u64 y) { *l = x0y0 + (x0y1<<32) + (x1y0<<32); *h = x1y1 + (x0y1>>32) + (x1y0>>32) + c; +#endif } static inline u16 r_read16b(u8 *p) { return ((u16)p[0] << 8) | ((u16)p[1] << 0); } diff --git a/inc/def.h b/inc/def.h @@ -57,7 +57,7 @@ #error "Expected CHAR_BIT == 8" #endif -#if !defined(__STRICT_ANSI__) && defined(__SIZEOF_INT128__) +#if defined(__SIZEOF_INT128__) #define R_HAVE_128 1 #endif