commit 85a6abca80562cee013da614b4fc6c6ec1b67fc3
parent ae8e306e4c19015767a8b50cb3ceafffab4d3f2d
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Sun, 28 May 2023 13:34:38 -0700
Optimize popcnt functions
Diffstat:
2 files changed, 33 insertions(+), 23 deletions(-)
diff --git a/inc/bits.h b/inc/bits.h
@@ -4,51 +4,60 @@
#include "def.h"
-/* The popcnt functions are implemented with a divide and conquer strategy.
- * See Henry Warren, Hacker's Delight, 2 ed., sec. 5.1. */
+/* The GCC builtin's do not use hardware popcnt unless an appropriate -march
+ * is specified, in which case __POPCNT__ is defined. Otherwise, GCC emits
+ * calls to libgcc which are slower than the below custom implementions. */
+#if defined(__GNUC__) && defined(__POPCNT__)
+
+static inline int r_popcnt8 (u8 n) { return __builtin_popcount(n); }
+static inline int r_popcnt16(u16 n) { return __builtin_popcount(n); }
+static inline int r_popcnt32(u32 n) { return __builtin_popcountl(n); }
+static inline int r_popcnt64(u64 n) { return __builtin_popcountll(n); }
+
+#else
+
+/* See Henry Warren, "Hacker's Delight", 2 ed., sec. 5.1. */
static inline int
r_popcnt8(u8 n) {
- n = (n & U8_C(0x55)) + ((n>>1) & U8_C(0x55));
- n = (n & U8_C(0x33)) + ((n>>2) & U8_C(0x33));
- n = (n & U8_C(0x0f)) + ((n>>4) & U8_C(0x0f));
- return n;
+ static u8 popcnt4[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+ return popcnt4[n&0x0f] + popcnt4[n>>4];
}
static inline int
r_popcnt16(u16 n) {
- n = (n & U16_C(0x5555)) + ((n>>1) & U16_C(0x5555));
- n = (n & U16_C(0x3333)) + ((n>>2) & U16_C(0x3333));
- n = (n & U16_C(0x0f0f)) + ((n>>4) & U16_C(0x0f0f));
- n = (n & U16_C(0x00ff)) + ((n>>8) & U16_C(0x00ff));
- return n;
+ return r_popcnt8(n&0xff) + r_popcnt8(n>>8);
}
static inline int
r_popcnt32(u32 n) {
- n = (n & U32_C(0x55555555)) + ((n>>1) & U32_C(0x55555555));
- n = (n & U32_C(0x33333333)) + ((n>>2) & U32_C(0x33333333));
- n = (n & U32_C(0x0f0f0f0f)) + ((n>>4) & U32_C(0x0f0f0f0f));
- n = (n & U32_C(0x00ff00ff)) + ((n>>8) & U32_C(0x00ff00ff));
- n = (n & U32_C(0x0000ffff)) + ((n>>16) & U32_C(0x0000ffff));
+ n = n - ((n>>1) & U32_C(0x55555555));
+ n = (n & U32_C(0x33333333)) + ((n>>2) & U32_C(0x33333333));
+ n = (n + (n>>4)) & U32_C(0x0f0f0f0f);
+ n = (u32)(n * U32_C(0x01010101)) >> 24;
return n;
}
static inline int
r_popcnt64(u64 n) {
- n = (n & U64_C(0x5555555555555555)) + ((n>>1) & U64_C(0x5555555555555555));
- n = (n & U64_C(0x3333333333333333)) + ((n>>2) & U64_C(0x3333333333333333));
- n = (n & U64_C(0x0f0f0f0f0f0f0f0f)) + ((n>>4) & U64_C(0x0f0f0f0f0f0f0f0f));
- n = (n & U64_C(0x00ff00ff00ff00ff)) + ((n>>8) & U64_C(0x00ff00ff00ff00ff));
- n = (n & U64_C(0x0000ffff0000ffff)) + ((n>>16) & U64_C(0x0000ffff0000ffff));
- n = (n & U64_C(0x00000000ffffffff)) + ((n>>32) & U64_C(0x00000000ffffffff));
+ n = n - ((n>>1) & U64_C(0x5555555555555555));
+ n = (n & U64_C(0x3333333333333333)) + ((n>>2) & U64_C(0x3333333333333333));
+ n = (n + (n>>4)) & U64_C(0x0f0f0f0f0f0f0f0f);
+ n = (u64)(n * U64_C(0x0101010101010101)) >> 56;
return n;
}
+#endif
+
/* Perform a full-width multiply of x and y, storing the upper (resp., lower)
* 64 bits of the product in *h (resp., *l). */
static inline void
r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
+#ifdef R_HAVE_128
+ u128 xy = (u128)x * (u128)y;
+ *l = xy;
+ *h = xy >> 64;
+#else
const u64 m = (U64_C(1)<<32) - 1;
u64 x0 = x & m;
@@ -65,6 +74,7 @@ r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
*l = x0y0 + (x0y1<<32) + (x1y0<<32);
*h = x1y1 + (x0y1>>32) + (x1y0>>32) + c;
+#endif
}
static inline u16 r_read16b(u8 *p) { return ((u16)p[0] << 8) | ((u16)p[1] << 0); }
diff --git a/inc/def.h b/inc/def.h
@@ -57,7 +57,7 @@
#error "Expected CHAR_BIT == 8"
#endif
-#if !defined(__STRICT_ANSI__) && defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__)
#define R_HAVE_128 1
#endif