commit 4de55af2687bc2d3ade6b80219127ec049e605aa
parent 4095b9d1ea56b41cf7a2ee544b56d8084ae14298
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Tue, 30 May 2023 17:50:22 -0700
Add generic hash table implementation
...in the same style as vector.h and deque.h, except we make an
attemp here to reduce code bloat by having just one implemenation.
(Maybe vector.h and deque.h should be modified to use the same
approach.)
The performance is mediocre at the moment.
Diffstat:
| M | .gitignore | | | 1 | + |
| M | Makefile | | | 2 | ++ |
| M | inc/all.h | | | 1 | + |
| A | inc/dict.h | | | 106 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/dict.c | | | 296 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 406 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@
*.a
gen/*
tool/ucattab
+experiment
diff --git a/Makefile b/Makefile
@@ -7,6 +7,7 @@ SRC =\
src/bench.c\
src/buffer.c\
src/debug.c\
+ src/dict.c\
src/log.c\
src/rand.c\
src/str.c\
@@ -27,6 +28,7 @@ src/alloc.o: src/alloc.c inc/alloc.h inc/def.h inc/log.h inc/rcx.h inc/internal/
src/bench.o: src/bench.c inc/bench.h inc/def.h inc/log.h inc/rcx.h config.mk
src/buffer.o: src/buffer.c inc/alloc.h inc/buffer.h inc/debug.h inc/def.h inc/log.h inc/rcx.h inc/string.h config.mk
src/debug.o: src/debug.c inc/debug.h inc/def.h inc/rcx.h config.mk
+src/dict.o: src/dict.c inc/bits.h inc/debug.h inc/def.h inc/dict.h inc/internal/util.h inc/log.h inc/rand.h inc/rcx.h inc/string.h config.mk
src/log.o: src/log.c inc/def.h inc/log.h inc/rcx.h config.mk
src/opt.o: src/opt.c inc/def.h inc/opt.h inc/rcx.h config.mk
src/rand.o: src/rand.c inc/bits.h inc/def.h inc/rand.h inc/rcx.h inc/unix.h config.mk
diff --git a/inc/all.h b/inc/all.h
@@ -4,6 +4,7 @@
#include "buffer.h"
#include "debug.h"
#include "deque.h"
+#include "dict.h"
#include "error.h"
#include "log.h"
#include "opt.h"
diff --git a/inc/dict.h b/inc/dict.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <string.h>
+
+#include "debug.h"
+#include "def.h"
+#include "log.h"
+#include "rand.h"
+#include "string.h"
+
+typedef struct r_dict_spec RDictSpec;
+typedef bool (*RDictEqFunc)(void *a, void *b, RDictSpec *spec);
+typedef u64 (*RDictHashFunc)(void *k, u64 seed, RDictSpec *spec);
+typedef void *(*RDictAlloczFunc)(usize size);
+typedef void (*RDictFreeFunc)(void *p);
+
+struct r_dict_spec {
+ usize ksize;
+ usize vsize;
+ RDictEqFunc eq;
+ RDictHashFunc hash;
+ RDictAlloczFunc allocz;
+ RDictFreeFunc free;
+};
+
+struct r_dict; /* opaque */
+
+static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *spec) { return memcmp(a, b, spec->ksize) == 0; }
+static inline u64 r_dict_mem_hash(void *k, u64 seed, RDictSpec *spec) { return r_hash(k, spec->ksize, seed); }
+
+static inline bool r_dict_cstr_eq(char **a, char **b, RDictSpec *spec) { return strcmp(*a, *b) == 0; }
+static inline u64 r_dict_cstr_hash(char **k, u64 seed, RDictSpec *spec) { return r_hash(*k, strlen(*k), seed); }
+
+static inline bool r_dict_str_eq(Str *a, Str *b, RDictSpec *spec) { return str_eq(*a, *b); }
+static inline u64 r_dict_str_hash(Str *k, u64 seed, RDictSpec *spec) { return r_hash(str_bytes(*k), str_len(*k), seed); }
+
+/* TODO: add functions for shrinking dict and reserving space? */
+int r_dict_init(struct r_dict *d, usize hint, RDictSpec *spec);
+int r_dict_access(struct r_dict *d, void **v, void *k, u64 h, RDictSpec *spec);
+int r_dict_create(struct r_dict *d, void **v, void *k, u64 h, RDictSpec *spec);
+int r_dict_delete(struct r_dict *d, void **v, void *k, u64 h, RDictSpec *spec);
+void r_dict_clear(struct r_dict *d, RDictSpec *spec);
+
+/* Defaults */
+#define R_DICT_STATIC
+#define R_DICT_METHOD(name, prefix) JOIN(JOIN(prefix,_),name)
+#define R_DICT_EQ r_dict_mem_eq
+#define R_DICT_HASH r_dict_mem_hash
+#define R_DICT_ALLOCZ r_eallocz
+#define R_DICT_FREE free
+
+#define R_DICT_TYPEDEF(D, K, V)\
+typedef struct D { \
+ u8 b; \
+ u32 niter; \
+ u64 seed; \
+ usize count; \
+ usize ntombs; \
+ void *pages; \
+ V default_val; \
+} D;
+
+#define R_DICT_SPEC_(K,V) &(RDictSpec){ \
+ .ksize = sizeof(K), \
+ .vsize = sizeof(V), \
+ .eq = (RDictEqFunc)(R_DICT_EQ), \
+ .hash = (RDictHashFunc)(R_DICT_HASH), \
+ .allocz = (RDictAlloczFunc)(R_DICT_ALLOCZ), \
+ .free = (RDictFreeFunc)(R_DICT_FREE), \
+ }
+
+/* TODO: function for assigning without allocating */
+#define R_DICT_DECLARE(D, K, V, ...)\
+static inline UNUSED usize R_DICT_METHOD(len,##__VA_ARGS__)(D *d) { return d->count; } \
+static inline UNUSED void R_DICT_METHOD(set_default,##__VA_ARGS__)(D *d, V v) { d->default_val = v; } \
+static inline UNUSED V R_DICT_METHOD(get_default,##__VA_ARGS__)(D *d) { return d->default_val; } \
+static inline UNUSED int R_DICT_METHOD(init,##__VA_ARGS__)(D *d, usize hint) { \
+ memset(&d->default_val, 0, sizeof(V)); \
+ return r_dict_init((void *)d, hint, R_DICT_SPEC_(K,V)); \
+} \
+static inline UNUSED void R_DICT_METHOD(free,##__VA_ARGS__)(D *d) { \
+ if (d->niter > 0) \
+ r_warnf("dict freed with %"PRId32" iterators alive", d->niter); \
+ R_DICT_FREE(d->pages); \
+} \
+static inline UNUSED bool R_DICT_METHOD(get,##__VA_ARGS__)(D *d, V *v, K k) { \
+ u64 h = R_DICT_HASH(&k, d->seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
+ void *vp; int r = r_dict_access((void *)d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
+ if (v) *v = vp ? *(V *)vp : d->default_val; \
+ return r > 0; \
+} \
+static inline UNUSED int R_DICT_METHOD(set,##__VA_ARGS__)(D *d, K k, V v) { \
+ u64 h = R_DICT_HASH(&k, d->seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
+ void *vp; int r = r_dict_create((void *)d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
+ if (vp) *(V *)vp = v; \
+ return r; \
+} \
+static inline UNUSED bool R_DICT_METHOD(del,##__VA_ARGS__)(D *d, V *v, K k) { \
+ u64 h = R_DICT_HASH(&k, d->seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
+ void *vp; int r = r_dict_delete((void *)d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
+ if (v) *v = vp ? *(V *)vp : d->default_val; \
+ return r > 0; \
+} \
+static inline UNUSED void R_DICT_METHOD(clr,##__VA_ARGS__)(D *d) { r_dict_clear((void *)d, R_DICT_SPEC_(K,V)); }
+
+#define R_DICT_DEFINE(D, K, V, ...) /* No op. Eveything is static inline. */
diff --git a/src/dict.c b/src/dict.c
@@ -0,0 +1,296 @@
+#include <errno.h>
+#include <string.h>
+
+#include "bits.h"
+#include "dict.h"
+#include "internal/util.h"
+#include "rand.h"
+#include "rcx.h"
+
+/* PAGE_LEN should be a multiple of the machine word size (i.e., 8 on 64-bit
+ * arches) for optimal packing and to prevent alignment issues. There are
+ * places where we assume it is exactly 8, so don't change this without
+ * auditing the code. */
+#define PAGE_LEN 8u
+#define LF_NUM 3u
+#define LF_DEN 4u
+#define EMPTY 0u
+#define TOMBSTONE 1u
+#define MIN_HASH2 2u
+
+typedef struct r_dict_spec Spec;
+typedef struct r_dict Dict;
+typedef struct page Page;
+
+enum scan_mode {
+ ACCESS = 0,
+ CREATE = 1,
+ DELETE = 2,
+};
+
+struct r_dict {
+ u8 b;
+ u32 niter;
+ u64 seed;
+ usize count;
+ usize ntombs;
+ Page *pages;
+ /* V default_val; */
+};
+
+struct page {
+ u8 h2[PAGE_LEN];
+ /* K key[PAGE_LEN]; */
+ /* V val[PAGE_LEN]; */
+};
+
+static inline u64
+hash0(u64 h, u8 b) {
+ return h & ((U64_C(1) << b) - 1u);
+}
+
+static inline u64
+hash1(u64 h, u8 b) {
+ return (h >> b) & (PAGE_LEN - 1u);
+}
+
+static inline u8
+hash2(u64 h, u8 b) {
+ u64 t = h >> 56;
+ if (t < MIN_HASH2) t += MIN_HASH2;
+ return t;
+}
+
+static inline bool
+exceeds_lf(u64 n, u8 b) {
+ /* Note that the math here is 64-bit to avoid overflow. */
+ return LF_DEN * n > LF_NUM * PAGE_LEN * (U64_C(1) << b);
+}
+
+/* Compute Page size *without* overflow checks. */
+static inline usize
+psize_unsafe(Spec *spec) {
+ return sizeof(Page) + PAGE_LEN*spec->ksize + PAGE_LEN*spec->vsize;
+}
+
+/* Compute Page size *with* overflow checks. */
+static inline usize
+psize_safe(Spec *spec) {
+ usize psize = sizeof(Page);
+
+ usize ksize = spec->ksize;
+ assert(!r_mul_will_overflow_(PAGE_LEN, ksize)
+ && PAGE_LEN*ksize <= USIZE_MAX - psize);
+ psize += PAGE_LEN * ksize;
+
+ usize vsize = spec->vsize;
+ assert(!r_mul_will_overflow_(PAGE_LEN, vsize)
+ && PAGE_LEN*vsize <= USIZE_MAX - psize);
+ psize += PAGE_LEN * vsize;
+
+ return psize;
+}
+
+static inline void *
+key(Page *p, usize j, Spec *spec) {
+ return (u8 *)(p+1) + j*spec->ksize;
+}
+
+static inline void *
+val(Page *p, usize j, Spec *spec) {
+ return (u8 *)(p+1) + PAGE_LEN*spec->ksize + j*spec->vsize;
+}
+
+static void
+insert(Page *pages, u8 b, void *k, void *v, u64 h, Spec *spec) {
+ usize psize = psize_unsafe(spec);
+
+ u64 h0 = hash0(h, b);
+ u64 h1 = hash1(h, b);
+ u8 h2 = hash2(h, b);
+
+ usize pn = h0;
+ for (usize i = 1;; i++) {
+ Page *p = (void *)((u8 *)pages + pn*psize);
+
+ usize j = h1;
+ do {
+ if (p->h2[j] < MIN_HASH2) {
+ p->h2[j] = h2;
+ memcpy(key(p, j, spec), k, spec->ksize);
+ memcpy(val(p, j, spec), v, spec->vsize);
+ return;
+ }
+
+ j = (j + 1) & (PAGE_LEN - 1);
+ } while (j != h1);
+
+ pn = (pn + i) & ((USIZE_C(1) << b) - 1u);
+ }
+}
+
+static int
+grow(Dict *d, Spec *spec) {
+ usize psize = psize_unsafe(spec);
+
+ u8 oldb = d->b;
+ u8 newb = oldb + 1;
+ assert(newb < USIZE_BITS);
+
+ usize oldlen = USIZE_C(1) << oldb;
+ Page *old = d->pages;
+
+ usize newlen = USIZE_C(1) << newb;
+ if (r_mul_will_overflow_(newlen, psize)) {
+ errno = ENOMEM;
+ return -1;
+ }
+ Page *new = spec->allocz(newlen * psize);
+ if (!new) return -1;
+
+ for (usize i = 0; i < oldlen; i++) {
+ Page *p = (void *)((u8 *)old + i*psize);
+
+ for (usize j = 0; j < PAGE_LEN; j++) {
+ if (p->h2[j] < MIN_HASH2) continue;
+ void *k = key(p, j, spec);
+ void *v = val(p, j, spec);
+ u64 h = spec->hash(k, d->seed, spec);
+ insert(new, newb, k, v, h, spec);
+ }
+ }
+
+ d->b = newb;
+ d->pages = new;
+ d->ntombs = 0;
+ spec->free(old);
+
+ return 0;
+}
+
+static inline int
+scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) {
+ assert(mode != CREATE || d->niter == 0,
+ "can not create new dict entry while iterating");
+
+ *v = 0;
+
+ if (mode == CREATE && exceeds_lf(d->count + d->ntombs, d->b)) {
+ if (grow(d, spec) < 0)
+ return -1;
+ }
+
+ usize psize = psize_unsafe(spec);
+ u64 h0 = hash0(h, d->b);
+ u64 h1 = hash1(h, d->b);
+ u8 h2 = hash2(h, d->b);
+
+ usize pn = h0;
+ Page *createp = 0;
+ usize createj = 0; /* XXX: this does not need to be initialized, but GCC doesn't realize that */
+ for (usize i = 1;; i++) {
+ Page *p = (void *)((u8 *)d->pages + pn*psize);
+
+ bool seen_empty = false;
+ usize j = h1;
+ do {
+ if (p->h2[j] < MIN_HASH2) {
+ seen_empty = p->h2[j] == EMPTY;
+ if (!createp) {
+ createp = p;
+ createj = j;
+ }
+ } else if (p->h2[j] == h2 && spec->eq(key(p, j, spec), k, spec)) {
+ /* TODO: try branch predict hint true on eq */
+ if (mode == DELETE) {
+ if (seen_empty || r_r8search64(r_readh64(p->h2)) < 8) {
+ p->h2[j] = EMPTY;
+ } else {
+ p->h2[j] = TOMBSTONE;
+ d->ntombs += 1;
+ }
+ d->count -= 1;
+ }
+ *v = val(p, j, spec);
+ return 1;
+ }
+
+ j = (j + 1) & (PAGE_LEN - 1);
+ } while (j != h1);
+
+ if (seen_empty) break;
+
+ pn = (pn + i) & ((USIZE_C(1) << d->b) - 1u);
+ }
+
+ if (mode == CREATE) {
+ d->count += 1;
+ createp->h2[createj] = h2;
+ /* TODO: try returning a pointer to the key slot instead, so that the
+ * compiler can generate copy code when ksize is a constant */
+ memcpy(key(createp, createj, spec), k, spec->ksize);
+ *v = val(createp, createj, spec);
+ }
+
+ return 0;
+}
+
+int
+r_dict_init(Dict *d, usize hint, Spec *spec) {
+ memset(d, 0, sizeof *d);
+
+ /* Impose maximum on hint to prevent overflow and the following while
+ * loop from being infinite. See exceeds_lf. */
+ hint = MIN(hint, (U64_MAX / LF_DEN) >> 1);
+ u8 b = 1;
+ while (exceeds_lf(hint, b)) b += 1;
+ assert(b < USIZE_BITS);
+ d->b = b;
+
+ usize len = USIZE_C(1) << b;
+ usize psize = psize_safe(spec);
+ if (r_mul_will_overflow_(len, psize)) {
+ errno = ENOMEM;
+ return -1;
+ }
+ d->pages = spec->allocz(len * psize);
+ if (!d->pages) return -1;
+
+ d->seed = r_prand64();
+
+ return 0;
+}
+
+int
+r_dict_access(Dict *d, void **v, void *k, u64 h, Spec *spec) {
+ return scan(d, v, k, h, ACCESS, spec);
+}
+
+int
+r_dict_create(Dict *d, void **v, void *k, u64 h, Spec *spec) {
+ return scan(d, v, k, h, CREATE, spec);
+}
+
+int
+r_dict_delete(Dict *d, void **v, void *k, u64 h, Spec *spec) {
+ return scan(d, v, k, h, DELETE, spec);
+}
+
+void
+r_dict_clear(Dict *d, Spec *spec) {
+ usize psize = psize_unsafe(spec);
+
+ usize len = USIZE_C(1) << d->b;
+ for (usize i = 0; i < len; i++) {
+ Page *p = (void *)((u8 *)d->pages + i*psize);
+ memset(p->h2, EMPTY, sizeof p->h2);
+ }
+
+ d->count = 0;
+ d->ntombs = 0;
+
+ /* Re-seed to prevent some attacks. */
+ d->seed = r_prand64();
+}
+
+// TODO: iter