Optimize popcnt functions - rcx - library of miscellaneous bits of C code

commit 85a6abca80562cee013da614b4fc6c6ec1b67fc3
parent ae8e306e4c19015767a8b50cb3ceafffab4d3f2d
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sun, 28 May 2023 13:34:38 -0700

Optimize popcnt functions

Diffstat:
M inc/bits.h  | 54 ++++++++++++++++++++++++++++++++----------------------
M inc/def.h  | 2 +-

2 files changed, 33 insertions(+), 23 deletions(-)
diff --git a/inc/bits.h b/inc/bits.h
@@ -4,51 +4,60 @@
 
 #include "def.h"
 
-/* The popcnt functions are implemented with a divide and conquer strategy.
- * See Henry Warren, Hacker's Delight, 2 ed., sec. 5.1. */
+/* The GCC builtin's do not use hardware popcnt unless an appropriate -march
+ * is specified, in which case __POPCNT__ is defined. Otherwise, GCC emits
+ * calls to libgcc which are slower than the below custom implementions. */
+#if defined(__GNUC__) && defined(__POPCNT__)
+
+static inline int r_popcnt8 (u8  n) { return __builtin_popcount(n); }
+static inline int r_popcnt16(u16 n) { return __builtin_popcount(n); }
+static inline int r_popcnt32(u32 n) { return __builtin_popcountl(n); }
+static inline int r_popcnt64(u64 n) { return __builtin_popcountll(n); }
+
+#else
+
+/* See Henry Warren, "Hacker's Delight", 2 ed., sec. 5.1. */
 
 static inline int
 r_popcnt8(u8 n) {
-	n = (n & U8_C(0x55)) + ((n>>1) & U8_C(0x55));
-	n = (n & U8_C(0x33)) + ((n>>2) & U8_C(0x33));
-	n = (n & U8_C(0x0f)) + ((n>>4) & U8_C(0x0f));
-	return n;
+	static u8 popcnt4[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+	return popcnt4[n&0x0f] + popcnt4[n>>4];
 }
 
 static inline int
 r_popcnt16(u16 n) {
-	n = (n & U16_C(0x5555)) + ((n>>1) & U16_C(0x5555));
-	n = (n & U16_C(0x3333)) + ((n>>2) & U16_C(0x3333));
-	n = (n & U16_C(0x0f0f)) + ((n>>4) & U16_C(0x0f0f));
-	n = (n & U16_C(0x00ff)) + ((n>>8) & U16_C(0x00ff));
-	return n;
+	return r_popcnt8(n&0xff) + r_popcnt8(n>>8);
 }
 
 static inline int
 r_popcnt32(u32 n) {
-	n = (n & U32_C(0x55555555)) + ((n>>1)  & U32_C(0x55555555));
-	n = (n & U32_C(0x33333333)) + ((n>>2)  & U32_C(0x33333333));
-	n = (n & U32_C(0x0f0f0f0f)) + ((n>>4)  & U32_C(0x0f0f0f0f));
-	n = (n & U32_C(0x00ff00ff)) + ((n>>8)  & U32_C(0x00ff00ff));
-	n = (n & U32_C(0x0000ffff)) + ((n>>16) & U32_C(0x0000ffff));
+	n = n - ((n>>1) & U32_C(0x55555555));
+	n = (n & U32_C(0x33333333)) + ((n>>2) & U32_C(0x33333333));
+	n = (n + (n>>4)) & U32_C(0x0f0f0f0f);
+	n = (u32)(n * U32_C(0x01010101)) >> 24;
 	return n;
 }
 
 static inline int
 r_popcnt64(u64 n) {
-	n = (n & U64_C(0x5555555555555555)) + ((n>>1)  & U64_C(0x5555555555555555));
-	n = (n & U64_C(0x3333333333333333)) + ((n>>2)  & U64_C(0x3333333333333333));
-	n = (n & U64_C(0x0f0f0f0f0f0f0f0f)) + ((n>>4)  & U64_C(0x0f0f0f0f0f0f0f0f));
-	n = (n & U64_C(0x00ff00ff00ff00ff)) + ((n>>8)  & U64_C(0x00ff00ff00ff00ff));
-	n = (n & U64_C(0x0000ffff0000ffff)) + ((n>>16) & U64_C(0x0000ffff0000ffff));
-	n = (n & U64_C(0x00000000ffffffff)) + ((n>>32) & U64_C(0x00000000ffffffff));
+	n = n - ((n>>1) & U64_C(0x5555555555555555));
+	n = (n & U64_C(0x3333333333333333)) + ((n>>2) & U64_C(0x3333333333333333));
+	n = (n + (n>>4)) & U64_C(0x0f0f0f0f0f0f0f0f);
+	n = (u64)(n * U64_C(0x0101010101010101)) >> 56;
 	return n;
 }
 
+#endif
+
 /* Perform a full-width multiply of x and y, storing the upper (resp., lower)
  * 64 bits of the product in *h (resp., *l). */
 static inline void
 r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
+#ifdef R_HAVE_128
+	u128 xy = (u128)x * (u128)y;
+	*l = xy;
+	*h = xy >> 64;
+#else
 	const u64 m = (U64_C(1)<<32) - 1;
 
 	u64 x0 = x & m;
@@ -65,6 +74,7 @@ r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
 
 	*l = x0y0 + (x0y1<<32) + (x1y0<<32);
 	*h = x1y1 + (x0y1>>32) + (x1y0>>32) + c;
+#endif
 }
 
 static inline u16 r_read16b(u8 *p) { return ((u16)p[0] << 8) | ((u16)p[1] << 0); }
diff --git a/inc/def.h b/inc/def.h
@@ -57,7 +57,7 @@
 #error "Expected CHAR_BIT == 8"
 #endif
 
-#if !defined(__STRICT_ANSI__) && defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__)
 #define R_HAVE_128 1
 #endif

	rcx library of miscellaneous bits of C code
	git clone git://git.rr3.xyz/rcx
	Log \| Files \| Refs \| README \| LICENSE

M	inc/bits.h	\|	54	++++++++++++++++++++++++++++++++----------------------
M	inc/def.h	\|	2	+-