bits.h

#pragma once

#include <stdbool.h>
#include <string.h>

#include "def.h"


/* ----- Bit rotates ----- */

static inline u8 r_rotl8 (u8 n, uint c) { return (n << (c & 7u)) | (n >> (-c & 7u)); }
static inline u16 r_rotl16(u16 n, uint c) { return (n << (c & 15u)) | (n >> (-c & 15u)); }
static inline u32 r_rotl32(u32 n, uint c) { return (n << (c & 31u)) | (n >> (-c & 31u)); }
static inline u64 r_rotl64(u64 n, uint c) { return (n << (c & 63u)) | (n >> (-c & 63u)); }

static inline u8 r_rotr8 (u8 n, uint c) { return (n >> (c & 7u)) | (n << (-c & 7u)); }
static inline u16 r_rotr16(u16 n, uint c) { return (n >> (c & 15u)) | (n << (-c & 15u)); }
static inline u32 r_rotr32(u32 n, uint c) { return (n >> (c & 31u)) | (n << (-c & 31u)); }
static inline u64 r_rotr64(u64 n, uint c) { return (n >> (c & 63u)) | (n << (-c & 63u)); }


/* ----- Population count ----- */

/* The GCC builtins do not use hardware popcnt unless an appropriate -march
 * is specified, in which case __POPCNT__ is defined. Otherwise, GCC emits
 * calls to libgcc which are slower than the custom implementations below. */

#if defined(__GNUC__) && defined(__POPCNT__)

static inline int r_popcnt8 (u8 n) { return __builtin_popcount(n); }
static inline int r_popcnt16(u16 n) { return __builtin_popcount(n); }
static inline int r_popcnt32(u32 n) { return __builtin_popcountl(n); }
static inline int r_popcnt64(u64 n) { return __builtin_popcountll(n); }

#else

/* See Warren, "Hacker's Delight", 2 ed., sec. 5.1. */

static inline int
r_popcnt8(u8 n) {
	static u8 popcnt4[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
	return popcnt4[n&0x0f] + popcnt4[n>>4];
}

static inline int
r_popcnt16(u16 n) {
	return r_popcnt8(n&0xff) + r_popcnt8(n>>8);
}

static inline int
r_popcnt32(u32 n) {
	n = n - ((n>>1) & U32_C(0x55555555));
	n = (n & U32_C(0x33333333)) + ((n>>2) & U32_C(0x33333333));
	n = (n + (n>>4)) & U32_C(0x0f0f0f0f);
	n = (u32)(n * U32_C(0x01010101)) >> 24;
	return n;
}

static inline int
r_popcnt64(u64 n) {
	n = n - ((n>>1) & U64_C(0x5555555555555555));
	n = (n & U64_C(0x3333333333333333)) + ((n>>2) & U64_C(0x3333333333333333));
	n = (n + (n>>4)) & U64_C(0x0f0f0f0f0f0f0f0f);
	n = (u64)(n * U64_C(0x0101010101010101)) >> 56;
	return n;
}

#endif
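
/* A small usage sketch (illustrative only; r_hamming32 is not part of this
 * header): the Hamming distance between two words is the population count of
 * their XOR. */
static inline int
r_hamming32(u32 a, u32 b) {
	return r_popcnt32(a ^ b);
}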


/* ----- Count trailing zeros ----- */

/* The ctz functions are implemented with the following theory: Let n be a
 * b bit integer. We handle the case n == 0 separately, so assume n != 0. Then
 * ctzb(n) is the index of the least significant 1 bit in n. We can isolate
 * this bit as n & -n. Thus, we have reduced computing ctzb to determining the
 * index of a single bit. For this, we construct a minimal perfect hash
 * function H; i.e., given a b bit integer with a single bit set, H outputs an
 * integer in [0..b). We then apply a suitable permutation P to the output of
 * H to get the bit index.
 *
 * The hash function H is constructed with the notion of de Bruijn cycles: Let
 * A be an alphabet with k symbols and let l > 0. An (A,l)-de Bruijn cycle is a
 * length k^l cyclic string over A which has every length l string over A as a
 * substring. It is a fact that de Bruijn cycles exist for any A and l. For our
 * application, we take A = {0,1} and l = lg(b). For example, here is a
 * de Bruijn cycle with b = 8 (hence l = 3):
 *
 *   0 0 0 1 1 1 0 1 = 0x1d
 *   ---------------
 *   0 0 0
 *   0 0 1
 *   0 1 1
 *   1 1 1
 *   1 1 0
 *   1 0 1
 *   0 1 0 (wrap-around)
 *   1 0 0 (wrap-around)
 *
 * Clearly, when such a de Bruijn cycle is rotated left by 0,1,...,b-1, all
 * possible integers [0..b) appear on its top l bits. If the de Bruijn cycle
 * has l 0's on the left (as in the above example), then the top l bits are the
 * same whether we rotate left or shift left (because left shift brings in 0's
 * on the right). (Of course, any de Bruijn cycle can be rotated until l 0's
 * appear at the left.) But shifting left is equivalent to multiplying by an
 * integer with exactly 1 bit set. It is now clear how to define H:
 *
 *   H(n) = (c * n) >> (b - l)
 *
 * where c is any ({0,1},l)-de Bruijn cycle starting with l 0's. Once c is
 * fixed, the desired permutation P is easily computed. */

/* Somehow, the de Bruijn cycle method beats the code generated by
 * __builtin_ctz{,l,ll} (using the tzcnt instruction) unless an appropriate
 * -march is specified, in which case GCC recognizes that the de Bruijn code
 * (at least in the 32-bit and 64-bit cases) implements ctz and it regresses
 * to using the tzcnt instruction, thereby tying with the builtin. Ugh. */

static inline int
r_ctz8(u8 n) {
	static u8 cycle = U8_C(0x1d);
	static u8 perm[8] = {0, 1, 6, 2, 7, 5, 4, 3};
	if (n == 0) return 8;
	return perm[(u8)((n & -n) * cycle) >> (8 - 3)];
}

static inline int
r_ctz16(u16 n) {
	static u16 cycle = U16_C(0x0f65);
	static u8 perm[16] = {
		0, 1, 11, 2, 14, 12, 8, 3, 15, 10, 13, 7, 9, 6, 5, 4,
	};
	if (n == 0) return 16;
	return perm[(u16)((n & -n) * cycle) >> (16 - 4)];
}

static inline int
r_ctz32(u32 n) {
	/* cycle and perm are from Warren, "Hacker's Delight", 2 ed., p. 112. */
	static u32 cycle = U32_C(0x04d7651f);
	static u8 perm[32] = {
		0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26,
		31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27,
	};
	if (n == 0) return 32;
	return perm[(u32)((n & -n) * cycle) >> (32 - 5)];
}

static inline int
r_ctz64(u64 n) {
	/* cycle and perm are from Knuth, TAOCP, vol. 4A, p. 142. */
	static u64 cycle = U64_C(0x03f79d71b4ca8b09);
	static u8 perm[64] = {
		0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
		62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
		63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
		54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
	};
	if (n == 0) return 64;
	return perm[(u64)((n & -n) * cycle) >> (64 - 6)];
}
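
/* A minimal sketch of how the perm table can be derived once a cycle is
 * chosen, shown for the 32-bit case: hash each single-bit value 1 << i and
 * record i at that slot. Illustrative only; r_ctz32_make_perm is not part of
 * this header. */
static inline void
r_ctz32_make_perm(u32 cycle, u8 perm[32]) {
	for (uint i = 0; i < 32; i++)
		perm[(u32)((U32_C(1) << i) * cycle) >> (32 - 5)] = (u8)i;
}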


/* ----- Rightmost zero byte searching ----- */

/* See Warren, "Hacker's Delight", 2 ed., sec. 6.1. The basic idea is to map
 * the rightmost 0x00 byte to 0x80 and all bytes right of the rightmost 0x00
 * to 0x00, and then use ctz. When the argument n has no zero byte, rzb
 * returns sizeof n. By first XORing with a suitable value, these functions
 * can be used to find arbitrary bytes. Also, by replacing the masks within
 * the definition, this method adapts to searching for arbitrary length
 * sequences of 0 bits aligned in a particular way. */

static inline int
r_rzb16(u16 n) {
	n = (n - U16_C(0x0101)) & ~n & U16_C(0x8080);
	return r_ctz16(n) / 8u;
}

static inline int
r_rzb32(u32 n) {
	n = (n - U32_C(0x01010101)) & ~n & U32_C(0x80808080);
	return r_ctz32(n) / 8u;
}

static inline int
r_rzb64(u64 n) {
	n = (n - U64_C(0x0101010101010101)) & ~n & U64_C(0x8080808080808080);
	return r_ctz64(n) / 8u;
}
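
/* A minimal sketch of the XOR trick mentioned in the comment above
 * (illustrative only; r_rfb32 is not part of this header): to find the
 * rightmost byte of n equal to b, XOR n with b replicated into every byte,
 * which zeroes exactly the matching bytes, then search for the rightmost zero
 * byte. Returns 4 when no byte of n equals b. */
static inline int
r_rfb32(u32 n, u8 b) {
	return r_rzb32(n ^ ((u32)b * U32_C(0x01010101)));
}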


/* ----- Sign extension ----- */

static inline u8
r_sext8(u8 x, uint b) {
	uint c = 8 - b;
	return (u8)((i8)(x << c) >> c);
}

static inline u16
r_sext16(u16 x, uint b) {
	uint c = 16 - b;
	return (u16)((i16)(x << c) >> c);
}

static inline u32
r_sext32(u32 x, uint b) {
	uint c = 32 - b;
	return (u32)((i32)(x << c) >> c);
}

static inline u64
r_sext64(u64 x, uint b) {
	uint c = 64 - b;
	return (u64)((i64)(x << c) >> c);
}


/* ----- Ternary add and subtract ----- */

/* We implement ternary add/sub on arbitrary unsigned integers instead of with
 * the third argument constrained to be a 1 bit carry/borrow, because compilers
 * (or at least GCC) suck at generating good code with carries/borrows, so we
 * might as well take the extra flexibility. Actually, the GCC 14.2 code gen
 * for these functions is slightly better than the new __builtin_addc{,l,ll}
 * intrinsics on my machine (according to a microbenchmark). */

static inline void
r_add8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
	u16 hl = (u16)x + (u16)y + (u16)z;
	*h = hl >> 8;
	*l = hl;
}

static inline void
r_add16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
	u32 hl = (u32)x + (u32)y + (u32)z;
	*h = hl >> 16;
	*l = hl;
}

static inline void
r_add32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
	u64 hl = (u64)x + (u64)y + (u64)z;
	*h = hl >> 32;
	*l = hl;
}

static inline void
r_add64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
#ifdef R_HAVE_128
	u128 hl = (u128)x + (u128)y + (u128)z;
	*h = hl >> 64;
	*l = hl;
#else
	u64 s = x + y;
	bool c0 = s < x;
	u64 t = s + z;
	bool c1 = t < s;
	*h = c0 + c1;
	*l = t;
#endif
}

static inline void
r_sub8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
	u16 hl = (u16)x - (u16)y - (u16)z;
	*h = hl >> 8;
	*l = hl;
}

static inline void
r_sub16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
	u32 hl = (u32)x - (u32)y - (u32)z;
	*h = hl >> 16;
	*l = hl;
}

static inline void
r_sub32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
	u64 hl = (u64)x - (u64)y - (u64)z;
	*h = hl >> 32;
	*l = hl;
}

static inline void
r_sub64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
#ifdef R_HAVE_128
	u128 hl = (u128)x - (u128)y - (u128)z;
	*h = hl >> 64;
	*l = hl;
#else
	u64 s = x - y;
	bool c0 = s > x;
	u64 t = s - z;
	bool c1 = t > s;
	*h = -c0 - c1;
	*l = t;
#endif
}
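
/* A small usage sketch of the ternary form (illustrative only; r_add128 is
 * not part of this header): adding two 128-bit values held as (hi, lo) limb
 * pairs. The carry out of the low limbs feeds straight in as the third
 * operand of the high-limb add. */
static inline void
r_add128(u64 *rh, u64 *rl, u64 xh, u64 xl, u64 yh, u64 yl) {
	u64 c;
	r_add64(&c, rl, xl, yl, 0);  /* c receives the carry out of the low limbs */
	r_add64(&c, rh, xh, yh, c);  /* carry out of the high limbs is discarded */
}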


/* ----- Full width multiply ----- */

static inline void
r_mulu8(u8 *h, u8 *l, u8 x, u8 y) {
	u16 hl = (u16)x * (u16)y;
	*h = hl >> 8;
	*l = hl;
}

static inline void
r_mulu16(u16 *h, u16 *l, u16 x, u16 y) {
	u32 hl = (u32)x * (u32)y;
	*h = hl >> 16;
	*l = hl;
}

static inline void
r_mulu32(u32 *h, u32 *l, u32 x, u32 y) {
	u64 hl = (u64)x * (u64)y;
	*h = hl >> 32;
	*l = hl;
}

static inline void
r_mulu64(u64 *h, u64 *l, u64 x, u64 y) {
#ifdef R_HAVE_128
	u128 hl = (u128)x * (u128)y;
	*h = hl >> 64;
	*l = hl;
#else
	const u64 m = (U64_C(1) << 32) - 1;

	u64 x0 = x & m;
	u64 x1 = x >> 32;
	u64 y0 = y & m;
	u64 y1 = y >> 32;

	u64 x0y0 = x0 * y0;
	u64 x0y1 = x0 * y1;
	u64 x1y0 = x1 * y0;
	u64 x1y1 = x1 * y1;

	u64 c = ((x0y1 & m) + (x1y0 & m) + (x0y0 >> 32)) >> 32;

	*h = x1y1 + (x0y1 >> 32) + (x1y0 >> 32) + c;
	*l = x0y0 + (x0y1 << 32) + (x1y0 << 32);
#endif
}

static inline void
r_muls8(i8 *h, u8 *l, i8 x, i8 y) {
	i16 hl = (i16)x * (i16)y;
	*h = hl >> 8;
	*l = hl;
}

static inline void
r_muls16(i16 *h, u16 *l, i16 x, i16 y) {
	i32 hl = (i32)x * (i32)y;
	*h = hl >> 16;
	*l = hl;
}

static inline void
r_muls32(i32 *h, u32 *l, i32 x, i32 y) {
	i64 hl = (i64)x * (i64)y;
	*h = hl >> 32;
	*l = hl;
}

static inline void
r_muls64(i64 *h, u64 *l, i64 x, i64 y) {
#ifdef R_HAVE_128
	i128 hl = (i128)x * (i128)y;
	*h = hl >> 64;
	*l = hl;
#else
	const u64 m = (U64_C(1) << 32) - 1;

	u64 x0 = x & m;
	i64 x1 = x >> 32;
	u64 y0 = y & m;
	i64 y1 = y >> 32;

	u64 x0y0 = x0 * y0;
	i64 x0y1 = x0 * y1;
	i64 x1y0 = x1 * y0;
	i64 x1y1 = x1 * y1;

	u64 c = ((x0y1 & m) + (x1y0 & m) + (x0y0 >> 32)) >> 32;

	*h = x1y1 + (x0y1 >> 32) + (x1y0 >> 32) + c;
	*l = x0y0 + (x0y1 << 32) + (x1y0 << 32);
#endif
}


/* ----- Floor division ----- */

/* Floored division: q = floor(x/y) and r = x - q*y, so r is zero or has the
 * sign of y. */

static inline void
r_divs8(i8 *q, i8 *r, i8 x, i8 y) {
	i8 qq = x / y, rr = x % y;
	bool adj = (rr != 0) & ((x < 0) ^ (y < 0));
	*q = qq - adj;
	*r = adj ? rr + y : rr;
}

static inline void
r_divs16(i16 *q, i16 *r, i16 x, i16 y) {
	i16 qq = x / y, rr = x % y;
	bool adj = (rr != 0) & ((x < 0) ^ (y < 0));
	*q = qq - adj;
	*r = adj ? rr + y : rr;
}

static inline void
r_divs32(i32 *q, i32 *r, i32 x, i32 y) {
	i32 qq = x / y, rr = x % y;
	bool adj = (rr != 0) & ((x < 0) ^ (y < 0));
	*q = qq - adj;
	*r = adj ? rr + y : rr;
}

static inline void
r_divs64(i64 *q, i64 *r, i64 x, i64 y) {
	i64 qq = x / y, rr = x % y;
	bool adj = (rr != 0) & ((x < 0) ^ (y < 0));
	*q = qq - adj;
	*r = adj ? rr + y : rr;
}


/* ----- Byte swaps/reversals ----- */

#ifdef __GNUC__

static inline u16 r_swap16(u16 n) { return __builtin_bswap16(n); }
static inline u32 r_swap32(u32 n) { return __builtin_bswap32(n); }
static inline u64 r_swap64(u64 n) { return __builtin_bswap64(n); }

#else

/* See Warren, "Hacker's Delight", 2 ed., sec. 7.1. */

static inline u16
r_swap16(u16 n) {
	return (n << 8) | (n >> 8);
}

static inline u32
r_swap32(u32 n) {
	n = ((n & U32_C(0x00ff00ff)) << 8) | ((n >> 8) & U32_C(0x00ff00ff));
	return (n << 16) | (n >> 16);
}

static inline u64
r_swap64(u64 n) {
	n = ((n & U64_C(0x00ff00ff00ff00ff)) << 8) | ((n >> 8) & U64_C(0x00ff00ff00ff00ff));
	n = ((n & U64_C(0x0000ffff0000ffff)) << 16) | ((n >> 16) & U64_C(0x0000ffff0000ffff));
	return (n << 32) | (n >> 32);
}

#endif


/* ----- Endian conversions ----- */

/* There is 2x redundancy here (e.g., ltoh = htol), but this allows code using
 * these functions to make the intent clear. */

#if defined(R_LITTLE_ENDIAN)
static inline u16 r_ltoh16(u16 n) { return n; }
static inline u32 r_ltoh32(u32 n) { return n; }
static inline u64 r_ltoh64(u64 n) { return n; }
static inline u16 r_btoh16(u16 n) { return r_swap16(n); }
static inline u32 r_btoh32(u32 n) { return r_swap32(n); }
static inline u64 r_btoh64(u64 n) { return r_swap64(n); }
static inline u16 r_htol16(u16 n) { return n; }
static inline u32 r_htol32(u32 n) { return n; }
static inline u64 r_htol64(u64 n) { return n; }
static inline u16 r_htob16(u16 n) { return r_swap16(n); }
static inline u32 r_htob32(u32 n) { return r_swap32(n); }
static inline u64 r_htob64(u64 n) { return r_swap64(n); }
#elif defined(R_BIG_ENDIAN)
static inline u16 r_btoh16(u16 n) { return n; }
static inline u32 r_btoh32(u32 n) { return n; }
static inline u64 r_btoh64(u64 n) { return n; }
static inline u16 r_ltoh16(u16 n) { return r_swap16(n); }
static inline u32 r_ltoh32(u32 n) { return r_swap32(n); }
static inline u64 r_ltoh64(u64 n) { return r_swap64(n); }
static inline u16 r_htob16(u16 n) { return n; }
static inline u32 r_htob32(u32 n) { return n; }
static inline u64 r_htob64(u64 n) { return n; }
static inline u16 r_htol16(u16 n) { return r_swap16(n); }
static inline u32 r_htol32(u32 n) { return r_swap32(n); }
static inline u64 r_htol64(u64 n) { return r_swap64(n); }
#endif


/* ----- Binary decoding ----- */

#if defined(R_LITTLE_ENDIAN)
static inline u16 r_readl16(void *p) { u16 v; memcpy(&v, p, 2); return v; }
static inline u32 r_readl32(void *p) { u32 v; memcpy(&v, p, 4); return v; }
static inline u64 r_readl64(void *p) { u64 v; memcpy(&v, p, 8); return v; }
static inline u16 r_readb16(void *p) { return r_swap16(r_readl16(p)); }
static inline u32 r_readb32(void *p) { return r_swap32(r_readl32(p)); }
static inline u64 r_readb64(void *p) { return r_swap64(r_readl64(p)); }
static inline u16 r_readh16(void *p) { return r_readl16(p); }
static inline u32 r_readh32(void *p) { return r_readl32(p); }
static inline u64 r_readh64(void *p) { return r_readl64(p); }
#elif defined(R_BIG_ENDIAN)
static inline u16 r_readb16(void *p) { u16 v; memcpy(&v, p, 2); return v; }
static inline u32 r_readb32(void *p) { u32 v; memcpy(&v, p, 4); return v; }
static inline u64 r_readb64(void *p) { u64 v; memcpy(&v, p, 8); return v; }
static inline u16 r_readl16(void *p) { return r_swap16(r_readb16(p)); }
static inline u32 r_readl32(void *p) { return r_swap32(r_readb32(p)); }
static inline u64 r_readl64(void *p) { return r_swap64(r_readb64(p)); }
static inline u16 r_readh16(void *p) { return r_readb16(p); }
static inline u32 r_readh32(void *p) { return r_readb32(p); }
static inline u64 r_readh64(void *p) { return r_readb64(p); }
#endif


/* ----- Binary encoding ----- */

#if defined(R_LITTLE_ENDIAN)
static inline void r_writel16(void *p, u16 v) { memcpy(p, &v, 2); }
static inline void r_writel32(void *p, u32 v) { memcpy(p, &v, 4); }
static inline void r_writel64(void *p, u64 v) { memcpy(p, &v, 8); }
static inline void r_writeb16(void *p, u16 v) { r_writel16(p, r_swap16(v)); }
static inline void r_writeb32(void *p, u32 v) { r_writel32(p, r_swap32(v)); }
static inline void r_writeb64(void *p, u64 v) { r_writel64(p, r_swap64(v)); }
static inline void r_writeh16(void *p, u16 v) { r_writel16(p, v); }
static inline void r_writeh32(void *p, u32 v) { r_writel32(p, v); }
static inline void r_writeh64(void *p, u64 v) { r_writel64(p, v); }
#elif defined(R_BIG_ENDIAN)
static inline void r_writeb16(void *p, u16 v) { memcpy(p, &v, 2); }
static inline void r_writeb32(void *p, u32 v) { memcpy(p, &v, 4); }
static inline void r_writeb64(void *p, u64 v) { memcpy(p, &v, 8); }
static inline void r_writel16(void *p, u16 v) { r_writeb16(p, r_swap16(v)); }
static inline void r_writel32(void *p, u32 v) { r_writeb32(p, r_swap32(v)); }
static inline void r_writel64(void *p, u64 v) { r_writeb64(p, r_swap64(v)); }
static inline void r_writeh16(void *p, u16 v) { r_writeb16(p, v); }
static inline void r_writeh32(void *p, u32 v) { r_writeb32(p, v); }
static inline void r_writeh64(void *p, u64 v) { r_writeb64(p, v); }
#endif
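
/* A small usage sketch (illustrative only; r_example_roundtrip is not part of
 * this header): store a u32 in big-endian byte order at a possibly unaligned
 * offset in a byte buffer and read it back. The memcpy-based helpers keep
 * both accesses alignment-safe, and the b/l variants produce the same bytes
 * on any host. */
static inline u32
r_example_roundtrip(u8 buf[8], u32 v) {
	r_writeb32(buf + 1, v);    /* byte order is fixed regardless of host */
	return r_readb32(buf + 1); /* returns v */
}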