From 1515b91546c42d764e6c4195c8958ff714a02459 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Mon, 25 Mar 2013 02:39:48 +0600 Subject: [PATCH] update hat-trie C library to latest version --- hat-trie/Makefile.am | 2 + hat-trie/configure.ac | 7 +- hat-trie/src/Makefile.am | 4 +- hat-trie/src/ahtable.c | 247 +++++++-- hat-trie/src/ahtable.h | 7 +- hat-trie/src/hat-trie.c | 238 +++++---- hat-trie/src/hat-trie.h | 5 +- hat-trie/src/murmurhash3.h | 12 + hat-trie/src/pstdint.h | 800 ++++++++++++++++++++++++++++++ hat-trie/src/superfasthash.c | 97 ---- hat-trie/src/superfasthash.h | 41 -- hat-trie/test/Makefile.am | 6 +- hat-trie/test/bench_sorted_iter.c | 69 +++ hat-trie/test/check_ahtable.c | 88 +++- hat-trie/test/check_hattrie.c | 103 +++- hat-trie/test/str_map.c | 25 + hat-trie/test/str_map.h | 2 +- 17 files changed, 1438 insertions(+), 315 deletions(-) create mode 100644 hat-trie/src/murmurhash3.h create mode 100644 hat-trie/src/pstdint.h delete mode 100644 hat-trie/src/superfasthash.c delete mode 100644 hat-trie/src/superfasthash.h create mode 100644 hat-trie/test/bench_sorted_iter.c diff --git a/hat-trie/Makefile.am b/hat-trie/Makefile.am index 831d581..9df925f 100644 --- a/hat-trie/Makefile.am +++ b/hat-trie/Makefile.am @@ -6,3 +6,5 @@ EXTRA_DIST = README.md COPYING pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = hat-trie-0.1.pc +ACLOCAL_AMFLAGS=-I m4 + diff --git a/hat-trie/configure.ac b/hat-trie/configure.ac index aa66c24..870b786 100644 --- a/hat-trie/configure.ac +++ b/hat-trie/configure.ac @@ -1,8 +1,8 @@ AC_INIT([hat-trie], [0.1.0], [dcjones@cs.washington.edu]) AM_INIT_AUTOMAKE([foreign]) -AC_CONFIG_HEADERS([config.h]) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) +AC_CONFIG_MACRO_DIR([m4]) base_CFLAGS="-std=c99 -Wall -Wextra -pedantic" opt_CFLAGS="${base_CFLAGS} -O3" @@ -27,12 +27,7 @@ AC_DISABLE_SHARED AC_PROG_LIBTOOL AC_C_BIGENDIAN([AC_MSG_ERROR([Big-endian systems are not currently supported.])]) -AC_CHECK_HEADERS([stdint.h stdlib.h]) 
AC_HEADER_STDBOOL -AC_TYPE_SIZE_T -AC_TYPE_UINT16_T -AC_TYPE_UINT32_T -AC_TYPE_UINT8_T AC_CONFIG_FILES([hat-trie-0.1.pc Makefile src/Makefile test/Makefile]) AC_OUTPUT diff --git a/hat-trie/src/Makefile.am b/hat-trie/src/Makefile.am index b0ca614..942bc65 100644 --- a/hat-trie/src/Makefile.am +++ b/hat-trie/src/Makefile.am @@ -5,7 +5,7 @@ libhat_trie_la_SOURCES = common.h \ ahtable.h ahtable.c \ hat-trie.h hat-trie.c \ misc.h misc.c \ - superfasthash.h superfasthash.c + murmurhash3.h murmurhash3.c -pkginclude_HEADERS = hat-trie.h ahtable.h common.h +pkginclude_HEADERS = hat-trie.h ahtable.h common.h pstdint.h diff --git a/hat-trie/src/ahtable.c b/hat-trie/src/ahtable.c index a0a812c..01bb4a9 100644 --- a/hat-trie/src/ahtable.c +++ b/hat-trie/src/ahtable.c @@ -7,8 +7,7 @@ #include "ahtable.h" #include "misc.h" -#include "superfasthash.h" -// #include "config.h" +#include "murmurhash3.h" #include #include @@ -18,6 +17,15 @@ const double ahtable_max_load_factor = 100000.0; /* arbitrary large number => do const const size_t ahtable_initial_size = 4096; static const uint16_t LONG_KEYLEN_MASK = 0x7fff; +static size_t keylen(slot_t s) { + if (0x1 & *s) { + return (size_t) (*((uint16_t*) s) >> 1); + } + else { + return (size_t) (*s >> 1); + } +} + ahtable_t* ahtable_create() { @@ -46,6 +54,7 @@ ahtable_t* ahtable_create_n(size_t n) void ahtable_free(ahtable_t* T) { + if (T == NULL) return; size_t i; for (i = 0; i < T->n; ++i) free(T->slots[i]); free(T->slots); @@ -106,14 +115,15 @@ static void ahtable_expand(ahtable_t* T) * One little shortcut we can take on the memory allocation front is to * figure out how much memory each slot needs in advance. 
*/ + assert(T->n > 0); size_t new_n = 2 * T->n; size_t* slot_sizes = malloc_or_die(new_n * sizeof(size_t)); memset(slot_sizes, 0, new_n * sizeof(size_t)); const char* key; - size_t len; + size_t len = 0; size_t m = 0; - ahtable_iter_t* i = ahtable_iter_begin(T); + ahtable_iter_t* i = ahtable_iter_begin(T, false); while (!ahtable_iter_finished(i)) { key = ahtable_iter_key(i, &len); slot_sizes[hash(key, len) % new_n] += @@ -146,7 +156,7 @@ static void ahtable_expand(ahtable_t* T) m = 0; value_t* u; value_t* v; - i = ahtable_iter_begin(T); + i = ahtable_iter_begin(T, false); while (!ahtable_iter_finished(i)) { key = ahtable_iter_key(i, &len); @@ -193,16 +203,9 @@ static value_t* get_key(ahtable_t* T, const char* key, size_t len, bool insert_m /* search the array for our key */ s = T->slots[i]; while ((size_t) (s - T->slots[i]) < T->slot_sizes[i]) { - /* get the key length */ - if (0x1 & *s) { - k = (size_t) (*((uint16_t*) s) >> 1); - s += 2; - } - else { - k = (size_t) (*s >> 1); - s += 1; - } + k = keylen(s); + s += k < 128 ? 1 : 2; /* skip keys that are longer than ours */ if (k != len) { @@ -253,7 +256,7 @@ value_t* ahtable_tryget(ahtable_t* T, const char* key, size_t len ) } -void ahtable_del(ahtable_t* T, const char* key, size_t len) +int ahtable_del(ahtable_t* T, const char* key, size_t len) { uint32_t i = hash(key, len) % T->n; size_t k; @@ -262,16 +265,9 @@ void ahtable_del(ahtable_t* T, const char* key, size_t len) /* search the array for our key */ s = T->slots[i]; while ((size_t) (s - T->slots[i]) < T->slot_sizes[i]) { - /* get the key length */ - if (0x1 & *s) { - k = (size_t) (*((uint16_t*) s)) >> 1; - s += 2; - } - else { - k = (size_t) (*s >> 1); - s += 1; - } + k = keylen(s); + s += k < 128 ? 
1 : 2; /* skip keys that are longer than ours */ if (k != len) { @@ -283,11 +279,11 @@ void ahtable_del(ahtable_t* T, const char* key, size_t len) if (memcmp(s, key, len) == 0) { /* move everything over, resize the array */ unsigned char* t = s + len + sizeof(value_t); - s -= k > 255 ? 2 : 1; + s -= k < 128 ? 1 : 2; memmove(s, t, T->slot_sizes[i] - (size_t) (t - T->slots[i])); T->slot_sizes[i] -= (size_t) (t - s); --T->m; - return; + return 0; } /* key not found. */ else { @@ -297,22 +293,119 @@ void ahtable_del(ahtable_t* T, const char* key, size_t len) } // Key was not found. Do nothing. + return -1; } -struct ahtable_iter_t_ +static int cmpkey(const void* a_, const void* b_) +{ + slot_t a = *(slot_t*) a_; + slot_t b = *(slot_t*) b_; + + size_t ka = keylen(a), kb = keylen(b); + + a += ka < 128 ? 1 : 2; + b += kb < 128 ? 1 : 2; + + int c = memcmp(a, b, ka < kb ? ka : kb); + return c == 0 ? (int) ka - (int) kb : c; +} + + +/* Sorted/unsorted iterators are kept private and exposed by passing the +sorted flag to ahtable_iter_begin. */ + +typedef struct ahtable_sorted_iter_t_ +{ + const ahtable_t* T; // parent + slot_t* xs; // pointers to keys + size_t i; // current key +} ahtable_sorted_iter_t; + + +static ahtable_sorted_iter_t* ahtable_sorted_iter_begin(const ahtable_t* T) +{ + ahtable_sorted_iter_t* i = malloc_or_die(sizeof(ahtable_sorted_iter_t)); + i->T = T; + i->xs = malloc_or_die(T->m * sizeof(slot_t)); + i->i = 0; + + slot_t s; + size_t j, k, u; + for (j = 0, u = 0; j < T->n; ++j) { + s = T->slots[j]; + while (s < T->slots[j] + T->slot_sizes[j]) { + i->xs[u++] = s; + k = keylen(s); + s += k < 128 ? 
1 : 2; + s += k + sizeof(value_t); + } + } + + qsort(i->xs, T->m, sizeof(slot_t), cmpkey); + + return i; +} + + +static bool ahtable_sorted_iter_finished(ahtable_sorted_iter_t* i) +{ + return i->i >= i->T->m; +} + + +static void ahtable_sorted_iter_next(ahtable_sorted_iter_t* i) +{ + if (ahtable_sorted_iter_finished(i)) return; + ++i->i; +} + + +static void ahtable_sorted_iter_free(ahtable_sorted_iter_t* i) +{ + if (i == NULL) return; + free(i->xs); + free(i); +} + + +static const char* ahtable_sorted_iter_key(ahtable_sorted_iter_t* i, size_t* len) +{ + if (ahtable_sorted_iter_finished(i)) return NULL; + + slot_t s = i->xs[i->i]; + *len = keylen(s); + + return (const char*) (s + (*len < 128 ? 1 : 2)); +} + + +static value_t* ahtable_sorted_iter_val(ahtable_sorted_iter_t* i) +{ + if (ahtable_sorted_iter_finished(i)) return NULL; + + slot_t s = i->xs[i->i]; + size_t k = keylen(s); + + s += k < 128 ? 1 : 2; + s += k; + + return (value_t*) s; +} + + +typedef struct ahtable_unsorted_iter_t_ { const ahtable_t* T; // parent size_t i; // slot index slot_t s; // slot position -}; +} ahtable_unsorted_iter_t; - -ahtable_iter_t* ahtable_iter_begin(const ahtable_t* T) +static ahtable_unsorted_iter_t* ahtable_unsorted_iter_begin(const ahtable_t* T) { - ahtable_iter_t* i = malloc_or_die(sizeof(ahtable_iter_t)); + ahtable_unsorted_iter_t* i = malloc_or_die(sizeof(ahtable_unsorted_iter_t)); i->T = T; for (i->i = 0; i->i < i->T->n; ++i->i) { @@ -325,21 +418,19 @@ ahtable_iter_t* ahtable_iter_begin(const ahtable_t* T) } -void ahtable_iter_next(ahtable_iter_t* i) +static bool ahtable_unsorted_iter_finished(ahtable_unsorted_iter_t* i) { - if (ahtable_iter_finished(i)) return; + return i->i >= i->T->n; +} - size_t k; + +static void ahtable_unsorted_iter_next(ahtable_unsorted_iter_t* i) +{ + if (ahtable_unsorted_iter_finished(i)) return; /* get the key length */ - if (0x1 & *i->s) { - k = (size_t) ((*((uint16_t*) i->s)) >> 1); - i->s += 2; - } - else { - k = (size_t) (*i->s >> 1); - i->s 
+= 1; - } + size_t k = keylen(i->s); + i->s += k < 128 ? 1 : 2; /* skip to the next key */ i->s += k + sizeof(value_t); @@ -356,23 +447,15 @@ void ahtable_iter_next(ahtable_iter_t* i) } - -bool ahtable_iter_finished(ahtable_iter_t* i) -{ - return i->i >= i->T->n; -} - - -void ahtable_iter_free(ahtable_iter_t* i) +static void ahtable_unsorted_iter_free(ahtable_unsorted_iter_t* i) { free(i); } - -const char* ahtable_iter_key(ahtable_iter_t* i, size_t* len) +static const char* ahtable_unsorted_iter_key(ahtable_unsorted_iter_t* i, size_t* len) { - if (ahtable_iter_finished(i)) return NULL; + if (ahtable_unsorted_iter_finished(i)) return NULL; slot_t s = i->s; size_t k; @@ -390,9 +473,9 @@ const char* ahtable_iter_key(ahtable_iter_t* i, size_t* len) } -value_t* ahtable_iter_val(ahtable_iter_t* i) +static value_t* ahtable_unsorted_iter_val(ahtable_unsorted_iter_t* i) { - if (ahtable_iter_finished(i)) return NULL; + if (ahtable_unsorted_iter_finished(i)) return NULL; slot_t s = i->s; @@ -411,4 +494,58 @@ value_t* ahtable_iter_val(ahtable_iter_t* i) } +struct ahtable_iter_t_ +{ + bool sorted; + union { + ahtable_unsorted_iter_t* unsorted; + ahtable_sorted_iter_t* sorted; + } i; +}; + + +ahtable_iter_t* ahtable_iter_begin(const ahtable_t* T, bool sorted) { + ahtable_iter_t* i = malloc_or_die(sizeof(ahtable_iter_t)); + i->sorted = sorted; + if (sorted) i->i.sorted = ahtable_sorted_iter_begin(T); + else i->i.unsorted = ahtable_unsorted_iter_begin(T); + return i; +} + + +void ahtable_iter_next(ahtable_iter_t* i) +{ + if (i->sorted) ahtable_sorted_iter_next(i->i.sorted); + else ahtable_unsorted_iter_next(i->i.unsorted); +} + + +bool ahtable_iter_finished(ahtable_iter_t* i) +{ + if (i->sorted) return ahtable_sorted_iter_finished(i->i.sorted); + else return ahtable_unsorted_iter_finished(i->i.unsorted); +} + + +void ahtable_iter_free(ahtable_iter_t* i) +{ + if (i == NULL) return; + if (i->sorted) ahtable_sorted_iter_free(i->i.sorted); + else 
ahtable_unsorted_iter_free(i->i.unsorted); + free(i); +} + + +const char* ahtable_iter_key(ahtable_iter_t* i, size_t* len) +{ + if (i->sorted) return ahtable_sorted_iter_key(i->i.sorted, len); + else return ahtable_unsorted_iter_key(i->i.unsorted, len); +} + + +value_t* ahtable_iter_val(ahtable_iter_t* i) +{ + if (i->sorted) return ahtable_sorted_iter_val(i->i.sorted); + else return ahtable_unsorted_iter_val(i->i.unsorted); +} diff --git a/hat-trie/src/ahtable.h b/hat-trie/src/ahtable.h index 1a6f3a1..bf6f782 100644 --- a/hat-trie/src/ahtable.h +++ b/hat-trie/src/ahtable.h @@ -24,8 +24,8 @@ extern "C" { #endif #include -#include #include +#include "pstdint.h" #include "common.h" typedef unsigned char* slot_t; @@ -72,18 +72,19 @@ value_t* ahtable_get (ahtable_t*, const char* key, size_t len); value_t* ahtable_tryget (ahtable_t*, const char* key, size_t len); -void ahtable_del(ahtable_t*, const char* key, size_t len); +int ahtable_del(ahtable_t*, const char* key, size_t len); typedef struct ahtable_iter_t_ ahtable_iter_t; -ahtable_iter_t* ahtable_iter_begin (const ahtable_t*); +ahtable_iter_t* ahtable_iter_begin (const ahtable_t*, bool sorted); void ahtable_iter_next (ahtable_iter_t*); bool ahtable_iter_finished (ahtable_iter_t*); void ahtable_iter_free (ahtable_iter_t*); const char* ahtable_iter_key (ahtable_iter_t*, size_t* len); value_t* ahtable_iter_val (ahtable_iter_t*); + #ifdef __cplusplus } #endif diff --git a/hat-trie/src/hat-trie.c b/hat-trie/src/hat-trie.c index 310b541..8b87752 100644 --- a/hat-trie/src/hat-trie.c +++ b/hat-trie/src/hat-trie.c @@ -8,16 +8,21 @@ #include "hat-trie.h" #include "ahtable.h" #include "misc.h" +#include "pstdint.h" #include -#include #include +#define HT_UNUSED(x) x=x + /* maximum number of keys that may be stored in a bucket before it is burst */ static const size_t MAX_BUCKET_SIZE = 16384; +#define NODE_MAXCHAR 0xff // 0x7f for 7-bit ASCII +#define NODE_CHILDS (NODE_MAXCHAR+1) static const uint8_t NODE_TYPE_TRIE = 0x1; static 
const uint8_t NODE_TYPE_PURE_BUCKET = 0x2; static const uint8_t NODE_TYPE_HYBRID_BUCKET = 0x4; +static const uint8_t NODE_HAS_VAL = 0x8; struct trie_node_t_; @@ -38,37 +43,101 @@ typedef struct trie_node_t_ /* the value for the key that is consumed on a trie node */ value_t val; - bool has_val; /* Map a character to either a trie_node_t or a ahtable_t. The first byte * must be examined to determine which. */ - node_ptr xs[256]; + node_ptr xs[NODE_CHILDS]; } trie_node_t; +struct hattrie_t_ +{ + node_ptr root; // root node + size_t m; // number of stored keys +}; /* Create a new trie node with all pointer pointing to the given child (which * can be NULL). */ -static trie_node_t* alloc_trie_node(node_ptr child) +static trie_node_t* alloc_trie_node(hattrie_t* T, node_ptr child) { trie_node_t* node = malloc_or_die(sizeof(trie_node_t)); node->flag = NODE_TYPE_TRIE; - node->val = 0; - node->has_val = false; - + node->val = 0; + + /* pass T to allow custom allocator for trie. */ + HT_UNUSED(T); /* unused now */ + size_t i; - for (i = 0; i < 256; ++i) node->xs[i] = child; + for (i = 0; i < NODE_CHILDS; ++i) node->xs[i] = child; return node; } +/* iterate trie nodes until string is consumed or bucket is found */ +static node_ptr hattrie_consume(node_ptr *p, const char **k, size_t *l, unsigned brk) +{ + node_ptr node = p->t->xs[(unsigned char) **k]; + while (*node.flag & NODE_TYPE_TRIE && *l > brk) { + ++*k; + --*l; + *p = node; + node = node.t->xs[(unsigned char) **k]; + } -struct hattrie_t_ + /* copy and writeback variables if it's faster */ + + assert(*p->flag & NODE_TYPE_TRIE); + return node; +} + +/* use node value and return pointer to it */ +static inline value_t* hattrie_useval(hattrie_t *T, node_ptr n) { - node_ptr root; // root node - size_t m; // number of stored keys -}; + if (!(n.t->flag & NODE_HAS_VAL)) { + n.t->flag |= NODE_HAS_VAL; + ++T->m; + } + return &n.t->val; +} + +/* clear node value if exists */ +static inline int hattrie_clrval(hattrie_t *T, node_ptr 
n) +{ + if (n.t->flag & NODE_HAS_VAL) { + n.t->flag &= ~NODE_HAS_VAL; + n.t->val = 0; + --T->m; + return 0; + } + return -1; +} + +/* find node in trie */ +static node_ptr hattrie_find(hattrie_t* T, const char **key, size_t *len) +{ + node_ptr parent = T->root; + assert(*parent.flag & NODE_TYPE_TRIE); + if (*len == 0) return parent; + node_ptr node = hattrie_consume(&parent, key, len, 1); + + /* if the trie node consumes value, use it */ + if (*node.flag & NODE_TYPE_TRIE) { + if (!(node.t->flag & NODE_HAS_VAL)) { + node.flag = NULL; + } + return node; + } + + /* pure bucket holds only key suffixes, skip current char */ + if (*node.flag & NODE_TYPE_PURE_BUCKET) { + *key += 1; + *len -= 1; + } + + /* do not scan bucket, it's not needed for this operation */ + return node; +} hattrie_t* hattrie_create() { @@ -79,8 +148,8 @@ hattrie_t* hattrie_create() node.b = ahtable_create(); node.b->flag = NODE_TYPE_HYBRID_BUCKET; node.b->c0 = 0x00; - node.b->c1 = 0xff; - T->root.t = alloc_trie_node(node); + node.b->c1 = NODE_MAXCHAR; + T->root.t = alloc_trie_node(T, node); return T; } @@ -90,7 +159,7 @@ static void hattrie_free_node(node_ptr node) { if (*node.flag & NODE_TYPE_TRIE) { size_t i; - for (i = 0; i < 256; ++i) { + for (i = 0; i < NODE_CHILDS; ++i) { if (i > 0 && node.t->xs[i].t == node.t->xs[i - 1].t) continue; /* XXX: recursion might not be the best choice here. It is possible @@ -113,7 +182,7 @@ void hattrie_free(hattrie_t* T) /* Perform one split operation on the given node with the given parent. 
*/ -static void hattrie_split(node_ptr parent, node_ptr node) +static void hattrie_split(hattrie_t* T, node_ptr parent, node_ptr node) { /* only buckets may be split */ assert(*node.flag & NODE_TYPE_PURE_BUCKET || @@ -123,19 +192,19 @@ static void hattrie_split(node_ptr parent, node_ptr node) if (*node.flag & NODE_TYPE_PURE_BUCKET) { /* turn the pure bucket into a hybrid bucket */ - parent.t->xs[node.b->c0].t = alloc_trie_node(node); + parent.t->xs[node.b->c0].t = alloc_trie_node(T, node); /* if the bucket had an empty key, move it to the new trie node */ value_t* val = ahtable_tryget(node.b, NULL, 0); if (val) { parent.t->xs[node.b->c0].t->val = *val; - parent.t->xs[node.b->c0].t->has_val = true; + parent.t->xs[node.b->c0].t->flag |= NODE_HAS_VAL; *val = 0; ahtable_del(node.b, NULL, 0); } node.b->c0 = 0x00; - node.b->c1 = 0xff; + node.b->c1 = NODE_MAXCHAR; node.b->flag = NODE_TYPE_HYBRID_BUCKET; return; @@ -144,12 +213,12 @@ static void hattrie_split(node_ptr parent, node_ptr node) /* This is a hybrid bucket. Perform a proper split. */ /* count the number of occourances of every leading character */ - unsigned int cs[256]; // occurance count for leading chars - memset(cs, 0, 256 * sizeof(unsigned int)); + unsigned int cs[NODE_CHILDS]; // occurance count for leading chars + memset(cs, 0, NODE_CHILDS * sizeof(unsigned int)); size_t len; const char* key; - ahtable_iter_t* i = ahtable_iter_begin(node.b); + ahtable_iter_t* i = ahtable_iter_begin(node.b, false); while (!ahtable_iter_finished(i)) { key = ahtable_iter_key(i, &len); assert(len > 0); @@ -177,7 +246,7 @@ static void hattrie_split(node_ptr parent, node_ptr node) } /* now split into two node cooresponding to ranges [0, j] and - * [j + 1, 255], respectively. */ + * [j + 1, NODE_MAXCHAR], respectively. 
*/ /* create new left and right nodes */ @@ -222,7 +291,7 @@ static void hattrie_split(node_ptr parent, node_ptr node) /* distribute keys to the new left or right node */ value_t* u; value_t* v; - i = ahtable_iter_begin(node.b); + i = ahtable_iter_begin(node.b, false); while (!ahtable_iter_finished(i)) { key = ahtable_iter_key(i, &len); u = ahtable_iter_val(i); @@ -257,7 +326,6 @@ static void hattrie_split(node_ptr parent, node_ptr node) ahtable_free(node.b); } - value_t* hattrie_get(hattrie_t* T, const char* key, size_t len) { node_ptr parent = T->root; @@ -265,68 +333,36 @@ value_t* hattrie_get(hattrie_t* T, const char* key, size_t len) if (len == 0) return &parent.t->val; - node_ptr node = parent.t->xs[(unsigned char) *key]; - - while (*node.flag & NODE_TYPE_TRIE && len > 0) { - ++key; - --len; - parent = node; - node = node.t->xs[(unsigned char) *key]; - } - + /* consume all trie nodes, now parent must be trie and child anything */ + node_ptr node = hattrie_consume(&parent, &key, &len, 0); assert(*parent.flag & NODE_TYPE_TRIE); - /* if the key has been consumed on a trie node, use its value */ if (len == 0) { if (*node.flag & NODE_TYPE_TRIE) { - if (!node.t->has_val) { - node.t->has_val = true; - ++T->m; - } - return &node.t->val; + return hattrie_useval(T, node); } else if (*node.flag & NODE_TYPE_HYBRID_BUCKET) { - if (!parent.t->has_val) { - parent.t->has_val = true; - ++T->m; - } - return &parent.t->val; + return hattrie_useval(T, parent); } } /* preemptively split the bucket if it is full */ while (ahtable_size(node.b) >= MAX_BUCKET_SIZE) { - hattrie_split(parent, node); + hattrie_split(T, parent, node); /* after the split, the node pointer is invalidated, so we search from * the parent again. 
*/ - node = parent.t->xs[(unsigned char) *key]; - while (*node.flag & NODE_TYPE_TRIE && len > 0) { - ++key; - --len; - parent = node; - node = node.t->xs[(unsigned char) *key]; - } - - assert(*parent.flag & NODE_TYPE_TRIE); + node = hattrie_consume(&parent, &key, &len, 0); /* if the key has been consumed on a trie node, use its value */ if (len == 0) { if (*node.flag & NODE_TYPE_TRIE) { - if (!node.t->has_val) { - node.t->has_val = true; - ++T->m; - } - return &node.t->val; + return hattrie_useval(T, node); } else if (*node.flag & NODE_TYPE_HYBRID_BUCKET) { - if (!parent.t->has_val) { - parent.t->has_val = true; - ++T->m; - } - return &parent.t->val; + return hattrie_useval(T, parent); } } } @@ -350,34 +386,46 @@ value_t* hattrie_get(hattrie_t* T, const char* key, size_t len) value_t* hattrie_tryget(hattrie_t* T, const char* key, size_t len) { - node_ptr parent = T->root; - assert(*parent.flag & NODE_TYPE_TRIE); - - if (len == 0) return &parent.t->val; - node_ptr node = parent.t->xs[(unsigned char) *key]; - - while (*node.flag & NODE_TYPE_TRIE && len > 1) { - ++key; - --len; - parent = node; - node = node.t->xs[(unsigned char) *key]; + /* find node for given key */ + node_ptr node = hattrie_find(T, &key, &len); + if (node.flag == NULL) { + return NULL; } - - - /* if the key has been consumed on a trie node, use its value */ + + /* if the trie node consumes value, use it */ if (*node.flag & NODE_TYPE_TRIE) { - if (!node.t->has_val) { - node.t->has_val = true; - ++T->m; - } return &node.t->val; } - else if (*node.flag & NODE_TYPE_PURE_BUCKET) { - return ahtable_tryget(node.b, key + 1, len - 1); + + return ahtable_tryget(node.b, key, len); +} + + +int hattrie_del(hattrie_t* T, const char* key, size_t len) +{ + node_ptr parent = T->root; + assert(*parent.flag & NODE_TYPE_TRIE); + + /* find node for deletion */ + node_ptr node = hattrie_find(T, &key, &len); + if (node.flag == NULL) { + return -1; } - else { - return ahtable_tryget(node.b, key, len); + + /* if consumed 
on a trie node, clear the value */ + if (*node.flag & NODE_TYPE_TRIE) { + return hattrie_clrval(T, node); } + + /* remove from bucket */ + size_t m_old = ahtable_size(node.b); + int ret = ahtable_del(node.b, key, len); + T->m -= (m_old - ahtable_size(node.b)); + + /* merge empty buckets */ + /*! \todo */ + + return ret; } @@ -409,6 +457,7 @@ struct hattrie_iter_t_ value_t nil_val; const hattrie_t* T; + bool sorted; ahtable_iter_t* i; hattrie_node_stack_t* stack; }; @@ -450,15 +499,17 @@ static void hattrie_iter_nextnode(hattrie_iter_t* i) if (*node.flag & NODE_TYPE_TRIE) { hattrie_iter_pushchar(i, level, c); - if(node.t->has_val) { + if(node.t->flag & NODE_HAS_VAL) { i->has_nil_key = true; i->nil_val = node.t->val; } /* push all child nodes from right to left */ int j; - for (j = 255; j >= 0; --j) { - if (j < 255 && node.t->xs[j].t == node.t->xs[j + 1].t) continue; + for (j = NODE_MAXCHAR; j >= 0; --j) { + + /* skip repeated pointers to hybrid bucket */ + if (j < NODE_MAXCHAR && node.t->xs[j].t == node.t->xs[j + 1].t) continue; // push stack next = i->stack; @@ -477,15 +528,16 @@ static void hattrie_iter_nextnode(hattrie_iter_t* i) i->level = level - 1; } - i->i = ahtable_iter_begin(node.b); + i->i = ahtable_iter_begin(node.b, i->sorted); } } -hattrie_iter_t* hattrie_iter_begin(const hattrie_t* T) +hattrie_iter_t* hattrie_iter_begin(const hattrie_t* T, bool sorted) { hattrie_iter_t* i = malloc_or_die(sizeof(hattrie_iter_t)); i->T = T; + i->sorted = sorted; i->i = NULL; i->keysize = 16; i->key = malloc_or_die(i->keysize * sizeof(char)); @@ -517,7 +569,6 @@ hattrie_iter_t* hattrie_iter_begin(const hattrie_t* T) } - void hattrie_iter_next(hattrie_iter_t* i) { if (hattrie_iter_finished(i)) return; @@ -546,7 +597,6 @@ void hattrie_iter_next(hattrie_iter_t* i) } - bool hattrie_iter_finished(hattrie_iter_t* i) { return i->stack == NULL && i->i == NULL && !i->has_nil_key; diff --git a/hat-trie/src/hat-trie.h b/hat-trie/src/hat-trie.h index eebf5ed..d8439b6 100644 --- 
a/hat-trie/src/hat-trie.h +++ b/hat-trie/src/hat-trie.h @@ -48,10 +48,13 @@ value_t* hattrie_get (hattrie_t*, const char* key, size_t len); * exist. */ value_t* hattrie_tryget (hattrie_t*, const char* key, size_t len); +/** Delete a given key from trie. Returns 0 if successful or -1 if not found. + */ +int hattrie_del(hattrie_t* T, const char* key, size_t len); typedef struct hattrie_iter_t_ hattrie_iter_t; -hattrie_iter_t* hattrie_iter_begin (const hattrie_t*); +hattrie_iter_t* hattrie_iter_begin (const hattrie_t*, bool sorted); void hattrie_iter_next (hattrie_iter_t*); bool hattrie_iter_finished (hattrie_iter_t*); void hattrie_iter_free (hattrie_iter_t*); diff --git a/hat-trie/src/murmurhash3.h b/hat-trie/src/murmurhash3.h new file mode 100644 index 0000000..37fbf41 --- /dev/null +++ b/hat-trie/src/murmurhash3.h @@ -0,0 +1,12 @@ + +#ifndef MURMURHASH3_H +#define MURMURHASH3_H + +#include + +#include "pstdint.h" + +uint32_t hash(const char* data, size_t len); + +#endif + diff --git a/hat-trie/src/pstdint.h b/hat-trie/src/pstdint.h new file mode 100644 index 0000000..fa64dbe --- /dev/null +++ b/hat-trie/src/pstdint.h @@ -0,0 +1,800 @@ +/* A portable stdint.h + **************************************************************************** + * BSD License: + **************************************************************************** + * + * Copyright (c) 2005-2011 Paul Hsieh + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************** + * + * Version 0.1.12 + * + * The ANSI C standard committee, for the C99 standard, specified the + * inclusion of a new standard include file called stdint.h. This is + * a very useful and long desired include file which contains several + * very precise definitions for integer scalar types that is + * critically important for making portable several classes of + * applications including cryptography, hashing, variable length + * integer libraries and so on. But for most developers its likely + * useful just for programming sanity. + * + * The problem is that most compiler vendors have decided not to + * implement the C99 standard, and the next C++ language standard + * (which has a lot more mindshare these days) will be a long time in + * coming and its unknown whether or not it will include stdint.h or + * how much adoption it will have. 
Either way, it will be a long time + * before all compilers come with a stdint.h and it also does nothing + * for the extremely large number of compilers available today which + * do not include this file, or anything comparable to it. + * + * So that's what this file is all about. It's an attempt to build a + * single universal include file that works on as many platforms as + * possible to deliver what stdint.h is supposed to. A few things + * that should be noted about this file: + * + * 1) It is not guaranteed to be portable and/or present an identical + * interface on all platforms. The extreme variability of the + * ANSI C standard makes this an impossibility right from the + * very get go. It's really only meant to be useful for the vast + * majority of platforms that possess the capability of + * implementing usefully and precisely defined, standard sized + * integer scalars. Systems which are not intrinsically 2s + * complement may produce invalid constants. + * + * 2) There is an unavoidable use of non-reserved symbols. + * + * 3) Other standard include files are invoked. + * + * 4) This file may come in conflict with future platforms that do + * include stdint.h. The hope is that one or the other can be + * used with no real difference. + * + * 5) In the current version, if your platform can't represent + * int32_t, int16_t and int8_t, it just dumps out with a compiler + * error. + * + * 6) 64 bit integers may or may not be defined. Test for their + * presence with the test: #ifdef INT64_MAX or #ifdef UINT64_MAX. + * Note that this is different from the C99 specification which + * requires the existence of 64 bit support in the compiler. If + * this is not defined for your platform, yet it is capable of + * dealing with 64 bits then it is because this file has not yet + * been extended to cover all of your system's capabilities. + * + * 7) (u)intptr_t may or may not be defined. Test for its presence + * with the test: #ifdef PTRDIFF_MAX.
If this is not defined + * for your platform, then it is because this file has not yet + * been extended to cover all of your system's capabilities, not + * because it's optional. + * + * 8) The following might not be defined even if your platform is + * capable of defining it: + * + * WCHAR_MIN + * WCHAR_MAX + * (u)int64_t + * PTRDIFF_MIN + * PTRDIFF_MAX + * (u)intptr_t + * + * 9) The following have not been defined: + * + * WINT_MIN + * WINT_MAX + * + * 10) The criteria for defining (u)int_least(*)_t isn't clear, + * except for systems which don't have a type that precisely + * defined 8, 16, or 32 bit types (which this include file does + * not support anyways). Default definitions have been given. + * + * 11) The criteria for defining (u)int_fast(*)_t isn't something I + * would trust to any particular compiler vendor or the ANSI C + * committee. It is well known that "compatible systems" are + * commonly created that have very different performance + * characteristics from the systems they are compatible with, + * especially those whose vendors make both the compiler and the + * system. Default definitions have been given, but it's strongly + * recommended that users never use these definitions for any + * reason (they do *NOT* deliver any serious guarantee of + * improved performance -- not in this file, nor any vendor's + * stdint.h). + * + * 12) The following macros: + * + * PRINTF_INTMAX_MODIFIER + * PRINTF_INT64_MODIFIER + * PRINTF_INT32_MODIFIER + * PRINTF_INT16_MODIFIER + * PRINTF_LEAST64_MODIFIER + * PRINTF_LEAST32_MODIFIER + * PRINTF_LEAST16_MODIFIER + * PRINTF_INTPTR_MODIFIER + * + * are strings which have been defined as the modifiers required + * for the "d", "u" and "x" printf formats to correctly output + * (u)intmax_t, (u)int64_t, (u)int32_t, (u)int16_t, (u)least64_t, + * (u)least32_t, (u)least16_t and (u)intptr_t types respectively. + * PRINTF_INTPTR_MODIFIER is not defined for some systems which + * provide their own stdint.h.
PRINTF_INT64_MODIFIER is not + * defined if INT64_MAX is not defined. These are an extension + * beyond what C99 specifies must be in stdint.h. + * + * In addition, the following macros are defined: + * + * PRINTF_INTMAX_HEX_WIDTH + * PRINTF_INT64_HEX_WIDTH + * PRINTF_INT32_HEX_WIDTH + * PRINTF_INT16_HEX_WIDTH + * PRINTF_INT8_HEX_WIDTH + * PRINTF_INTMAX_DEC_WIDTH + * PRINTF_INT64_DEC_WIDTH + * PRINTF_INT32_DEC_WIDTH + * PRINTF_INT16_DEC_WIDTH + * PRINTF_INT8_DEC_WIDTH + * + * Which specifies the maximum number of characters required to + * print the number of that type in either hexadecimal or decimal. + * These are an extension beyond what C99 specifies must be in + * stdint.h. + * + * Compilers tested (all with 0 warnings at their highest respective + * settings): Borland Turbo C 2.0, WATCOM C/C++ 11.0 (16 bits and 32 + * bits), Microsoft Visual C++ 6.0 (32 bit), Microsoft Visual Studio + * .net (VC7), Intel C++ 4.0, GNU gcc v3.3.3 + * + * This file should be considered a work in progress. Suggestions for + * improvements, especially those which increase coverage are strongly + * encouraged. + * + * Acknowledgements + * + * The following people have made significant contributions to the + * development and testing of this file: + * + * Chris Howie + * John Steele Scott + * Dave Thorup + * John Dill + * + */ + +#include +#include +#include + +/* + * For gcc with _STDINT_H, fill in the PRINTF_INT*_MODIFIER macros, and + * do nothing else. On the Mac OS X version of gcc this is _STDINT_H_. 
+ */ + +#if ((defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined (__WATCOMC__) && (defined (_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_) || defined (__UINT_FAST64_TYPE__)) )) && !defined (_PSTDINT_H_INCLUDED) +#include +#define _PSTDINT_H_INCLUDED +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER +# endif +# ifndef PRINTF_INT64_HEX_WIDTH +# define PRINTF_INT64_HEX_WIDTH "16" +# endif +# ifndef PRINTF_INT32_HEX_WIDTH +# define PRINTF_INT32_HEX_WIDTH "8" +# endif +# ifndef PRINTF_INT16_HEX_WIDTH +# define PRINTF_INT16_HEX_WIDTH "4" +# endif +# ifndef PRINTF_INT8_HEX_WIDTH +# define PRINTF_INT8_HEX_WIDTH "2" +# endif +# ifndef PRINTF_INT64_DEC_WIDTH +# define PRINTF_INT64_DEC_WIDTH "20" +# endif +# ifndef PRINTF_INT32_DEC_WIDTH +# define PRINTF_INT32_DEC_WIDTH "10" +# endif +# ifndef PRINTF_INT16_DEC_WIDTH +# define PRINTF_INT16_DEC_WIDTH "5" +# endif +# ifndef PRINTF_INT8_DEC_WIDTH +# define PRINTF_INT8_DEC_WIDTH "3" +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH +# endif + +/* + * Something really weird is going on with Open Watcom. Just pull some of + * these duplicated definitions from Open Watcom's stdint.h file for now. 
+ */ + +# if defined (__WATCOMC__) && __WATCOMC__ >= 1250 +# if !defined (INT64_C) +# define INT64_C(x) (x + (INT64_MAX - INT64_MAX)) +# endif +# if !defined (UINT64_C) +# define UINT64_C(x) (x + (UINT64_MAX - UINT64_MAX)) +# endif +# if !defined (INT32_C) +# define INT32_C(x) (x + (INT32_MAX - INT32_MAX)) +# endif +# if !defined (UINT32_C) +# define UINT32_C(x) (x + (UINT32_MAX - UINT32_MAX)) +# endif +# if !defined (INT16_C) +# define INT16_C(x) (x) +# endif +# if !defined (UINT16_C) +# define UINT16_C(x) (x) +# endif +# if !defined (INT8_C) +# define INT8_C(x) (x) +# endif +# if !defined (UINT8_C) +# define UINT8_C(x) (x) +# endif +# if !defined (UINT64_MAX) +# define UINT64_MAX 18446744073709551615ULL +# endif +# if !defined (INT64_MAX) +# define INT64_MAX 9223372036854775807LL +# endif +# if !defined (UINT32_MAX) +# define UINT32_MAX 4294967295UL +# endif +# if !defined (INT32_MAX) +# define INT32_MAX 2147483647L +# endif +# if !defined (INTMAX_MAX) +# define INTMAX_MAX INT64_MAX +# endif +# if !defined (INTMAX_MIN) +# define INTMAX_MIN INT64_MIN +# endif +# endif +#endif + +#ifndef _PSTDINT_H_INCLUDED +#define _PSTDINT_H_INCLUDED + +#ifndef SIZE_MAX +# define SIZE_MAX (~(size_t)0) +#endif + +/* + * Deduce the type assignments from limits.h under the assumption that + * integer sizes in bits are powers of 2, and follow the ANSI + * definitions. 
+ */ + +#ifndef UINT8_MAX +# define UINT8_MAX 0xff +#endif +#ifndef uint8_t +# if (UCHAR_MAX == UINT8_MAX) || defined (S_SPLINT_S) + typedef unsigned char uint8_t; +# define UINT8_C(v) ((uint8_t) v) +# else +# error "Platform not supported" +# endif +#endif + +#ifndef INT8_MAX +# define INT8_MAX 0x7f +#endif +#ifndef INT8_MIN +# define INT8_MIN INT8_C(0x80) +#endif +#ifndef int8_t +# if (SCHAR_MAX == INT8_MAX) || defined (S_SPLINT_S) + typedef signed char int8_t; +# define INT8_C(v) ((int8_t) v) +# else +# error "Platform not supported" +# endif +#endif + +#ifndef UINT16_MAX +# define UINT16_MAX 0xffff +#endif +#ifndef uint16_t +#if (UINT_MAX == UINT16_MAX) || defined (S_SPLINT_S) + typedef unsigned int uint16_t; +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "" +# endif +# define UINT16_C(v) ((uint16_t) (v)) +#elif (USHRT_MAX == UINT16_MAX) + typedef unsigned short uint16_t; +# define UINT16_C(v) ((uint16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef INT16_MAX +# define INT16_MAX 0x7fff +#endif +#ifndef INT16_MIN +# define INT16_MIN INT16_C(0x8000) +#endif +#ifndef int16_t +#if (INT_MAX == INT16_MAX) || defined (S_SPLINT_S) + typedef signed int int16_t; +# define INT16_C(v) ((int16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "" +# endif +#elif (SHRT_MAX == INT16_MAX) + typedef signed short int16_t; +# define INT16_C(v) ((int16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef UINT32_MAX +# define UINT32_MAX (0xffffffffUL) +#endif +#ifndef uint32_t +#if (ULONG_MAX == UINT32_MAX) || defined (S_SPLINT_S) + typedef unsigned long uint32_t; +# define UINT32_C(v) v ## UL +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +#elif (UINT_MAX == UINT32_MAX) + typedef unsigned int uint32_t; +# ifndef 
PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +# define UINT32_C(v) v ## U +#elif (USHRT_MAX == UINT32_MAX) + typedef unsigned short uint32_t; +# define UINT32_C(v) ((unsigned short) (v)) +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef INT32_MAX +# define INT32_MAX (0x7fffffffL) +#endif +#ifndef INT32_MIN +# define INT32_MIN INT32_C(0x80000000) +#endif +#ifndef int32_t +#if (LONG_MAX == INT32_MAX) || defined (S_SPLINT_S) + typedef signed long int32_t; +# define INT32_C(v) v ## L +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +#elif (INT_MAX == INT32_MAX) + typedef signed int int32_t; +# define INT32_C(v) v +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#elif (SHRT_MAX == INT32_MAX) + typedef signed short int32_t; +# define INT32_C(v) ((short) (v)) +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#else +#error "Platform not supported" +#endif +#endif + +/* + * The macro stdint_int64_defined is temporarily used to record + * whether or not 64 integer support is available. It must be + * defined for any 64 integer extensions for new platforms that are + * added. 
+ */ + +#undef stdint_int64_defined +#if (defined(__STDC__) && defined(__STDC_VERSION__)) || defined (S_SPLINT_S) +# if (__STDC__ && __STDC_VERSION__ >= 199901L) || defined (S_SPLINT_S) +# define stdint_int64_defined + typedef long long int64_t; + typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# endif +#endif + +#if !defined (stdint_int64_defined) +# if defined(__GNUC__) +# define stdint_int64_defined + __extension__ typedef long long int64_t; + __extension__ typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# elif defined(__MWERKS__) || defined (__SUNPRO_C) || defined (__SUNPRO_CC) || defined (__APPLE_CC__) || defined (_LONG_LONG) || defined (_CRAYC) || defined (S_SPLINT_S) +# define stdint_int64_defined + typedef long long int64_t; + typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined (__BORLANDC__) && __BORLANDC__ > 0x460) || defined (__alpha) || defined (__DECC) +# define stdint_int64_defined + typedef __int64 int64_t; + typedef unsigned __int64 uint64_t; +# define UINT64_C(v) v ## UI64 +# define INT64_C(v) v ## I64 +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "I64" +# endif +# endif +#endif + +#if !defined (LONG_LONG_MAX) && defined (INT64_C) +# define LONG_LONG_MAX INT64_C (9223372036854775807) +#endif +#ifndef ULONG_LONG_MAX +# define ULONG_LONG_MAX UINT64_C (18446744073709551615) +#endif + +#if !defined (INT64_MAX) && defined (INT64_C) +# define INT64_MAX INT64_C (9223372036854775807) +#endif +#if !defined (INT64_MIN) && defined (INT64_C) +# 
define INT64_MIN INT64_C (-9223372036854775808) +#endif +#if !defined (UINT64_MAX) && defined (INT64_C) +# define UINT64_MAX UINT64_C (18446744073709551615) +#endif + +/* + * Width of hexadecimal for number field. + */ + +#ifndef PRINTF_INT64_HEX_WIDTH +# define PRINTF_INT64_HEX_WIDTH "16" +#endif +#ifndef PRINTF_INT32_HEX_WIDTH +# define PRINTF_INT32_HEX_WIDTH "8" +#endif +#ifndef PRINTF_INT16_HEX_WIDTH +# define PRINTF_INT16_HEX_WIDTH "4" +#endif +#ifndef PRINTF_INT8_HEX_WIDTH +# define PRINTF_INT8_HEX_WIDTH "2" +#endif + +#ifndef PRINTF_INT64_DEC_WIDTH +# define PRINTF_INT64_DEC_WIDTH "20" +#endif +#ifndef PRINTF_INT32_DEC_WIDTH +# define PRINTF_INT32_DEC_WIDTH "10" +#endif +#ifndef PRINTF_INT16_DEC_WIDTH +# define PRINTF_INT16_DEC_WIDTH "5" +#endif +#ifndef PRINTF_INT8_DEC_WIDTH +# define PRINTF_INT8_DEC_WIDTH "3" +#endif + +/* + * Ok, lets not worry about 128 bit integers for now. Moore's law says + * we don't need to worry about that until about 2040 at which point + * we'll have bigger things to worry about. 
+ */ + +#ifdef stdint_int64_defined + typedef int64_t intmax_t; + typedef uint64_t uintmax_t; +# define INTMAX_MAX INT64_MAX +# define INTMAX_MIN INT64_MIN +# define UINTMAX_MAX UINT64_MAX +# define UINTMAX_C(v) UINT64_C(v) +# define INTMAX_C(v) INT64_C(v) +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH +# endif +#else + typedef int32_t intmax_t; + typedef uint32_t uintmax_t; +# define INTMAX_MAX INT32_MAX +# define UINTMAX_MAX UINT32_MAX +# define UINTMAX_C(v) UINT32_C(v) +# define INTMAX_C(v) INT32_C(v) +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT32_MODIFIER +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT32_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT32_DEC_WIDTH +# endif +#endif + +/* + * Because this file currently only supports platforms which have + * precise powers of 2 as bit sizes for the default integers, the + * least definitions are all trivial. Its possible that a future + * version of this file could have different definitions. 
+ */ + +#ifndef stdint_least_defined + typedef int8_t int_least8_t; + typedef uint8_t uint_least8_t; + typedef int16_t int_least16_t; + typedef uint16_t uint_least16_t; + typedef int32_t int_least32_t; + typedef uint32_t uint_least32_t; +# define PRINTF_LEAST32_MODIFIER PRINTF_INT32_MODIFIER +# define PRINTF_LEAST16_MODIFIER PRINTF_INT16_MODIFIER +# define UINT_LEAST8_MAX UINT8_MAX +# define INT_LEAST8_MAX INT8_MAX +# define UINT_LEAST16_MAX UINT16_MAX +# define INT_LEAST16_MAX INT16_MAX +# define UINT_LEAST32_MAX UINT32_MAX +# define INT_LEAST32_MAX INT32_MAX +# define INT_LEAST8_MIN INT8_MIN +# define INT_LEAST16_MIN INT16_MIN +# define INT_LEAST32_MIN INT32_MIN +# ifdef stdint_int64_defined + typedef int64_t int_least64_t; + typedef uint64_t uint_least64_t; +# define PRINTF_LEAST64_MODIFIER PRINTF_INT64_MODIFIER +# define UINT_LEAST64_MAX UINT64_MAX +# define INT_LEAST64_MAX INT64_MAX +# define INT_LEAST64_MIN INT64_MIN +# endif +#endif +#undef stdint_least_defined + +/* + * The ANSI C committee pretending to know or specify anything about + * performance is the epitome of misguided arrogance. The mandate of + * this file is to *ONLY* ever support that absolute minimum + * definition of the fast integer types, for compatibility purposes. + * No extensions, and no attempt to suggest what may or may not be a + * faster integer type will ever be made in this file. Developers are + * warned to stay away from these types when using this or any other + * stdint.h. 
+ */ + +typedef int_least8_t int_fast8_t; +typedef uint_least8_t uint_fast8_t; +typedef int_least16_t int_fast16_t; +typedef uint_least16_t uint_fast16_t; +typedef int_least32_t int_fast32_t; +typedef uint_least32_t uint_fast32_t; +#define UINT_FAST8_MAX UINT_LEAST8_MAX +#define INT_FAST8_MAX INT_LEAST8_MAX +#define UINT_FAST16_MAX UINT_LEAST16_MAX +#define INT_FAST16_MAX INT_LEAST16_MAX +#define UINT_FAST32_MAX UINT_LEAST32_MAX +#define INT_FAST32_MAX INT_LEAST32_MAX +#define INT_FAST8_MIN INT_LEAST8_MIN +#define INT_FAST16_MIN INT_LEAST16_MIN +#define INT_FAST32_MIN INT_LEAST32_MIN +#ifdef stdint_int64_defined + typedef int_least64_t int_fast64_t; + typedef uint_least64_t uint_fast64_t; +# define UINT_FAST64_MAX UINT_LEAST64_MAX +# define INT_FAST64_MAX INT_LEAST64_MAX +# define INT_FAST64_MIN INT_LEAST64_MIN +#endif + +#undef stdint_int64_defined + +/* + * Whatever piecemeal, per compiler thing we can do about the wchar_t + * type limits. + */ + +#if defined(__WATCOMC__) || defined(_MSC_VER) || defined (__GNUC__) +# include +# ifndef WCHAR_MIN +# define WCHAR_MIN 0 +# endif +# ifndef WCHAR_MAX +# define WCHAR_MAX ((wchar_t)-1) +# endif +#endif + +/* + * Whatever piecemeal, per compiler/platform thing we can do about the + * (u)intptr_t types and limits. + */ + +#if defined (_MSC_VER) && defined (_UINTPTR_T_DEFINED) +# define STDINT_H_UINTPTR_T_DEFINED +#endif + +#ifndef STDINT_H_UINTPTR_T_DEFINED +# if defined (__alpha__) || defined (__ia64__) || defined (__x86_64__) || defined (_WIN64) +# define stdint_intptr_bits 64 +# elif defined (__WATCOMC__) || defined (__TURBOC__) +# if defined(__TINY__) || defined(__SMALL__) || defined(__MEDIUM__) +# define stdint_intptr_bits 16 +# else +# define stdint_intptr_bits 32 +# endif +# elif defined (__i386__) || defined (_WIN32) || defined (WIN32) +# define stdint_intptr_bits 32 +# elif defined (__INTEL_COMPILER) +/* TODO -- what did Intel do about x86-64? 
*/ +# endif + +# ifdef stdint_intptr_bits +# define stdint_intptr_glue3_i(a,b,c) a##b##c +# define stdint_intptr_glue3(a,b,c) stdint_intptr_glue3_i(a,b,c) +# ifndef PRINTF_INTPTR_MODIFIER +# define PRINTF_INTPTR_MODIFIER stdint_intptr_glue3(PRINTF_INT,stdint_intptr_bits,_MODIFIER) +# endif +# ifndef PTRDIFF_MAX +# define PTRDIFF_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) +# endif +# ifndef PTRDIFF_MIN +# define PTRDIFF_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) +# endif +# ifndef UINTPTR_MAX +# define UINTPTR_MAX stdint_intptr_glue3(UINT,stdint_intptr_bits,_MAX) +# endif +# ifndef INTPTR_MAX +# define INTPTR_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) +# endif +# ifndef INTPTR_MIN +# define INTPTR_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) +# endif +# ifndef INTPTR_C +# define INTPTR_C(x) stdint_intptr_glue3(INT,stdint_intptr_bits,_C)(x) +# endif +# ifndef UINTPTR_C +# define UINTPTR_C(x) stdint_intptr_glue3(UINT,stdint_intptr_bits,_C)(x) +# endif + typedef stdint_intptr_glue3(uint,stdint_intptr_bits,_t) uintptr_t; + typedef stdint_intptr_glue3( int,stdint_intptr_bits,_t) intptr_t; +# else +/* TODO -- This following is likely wrong for some platforms, and does + nothing for the definition of uintptr_t. */ + typedef ptrdiff_t intptr_t; +# endif +# define STDINT_H_UINTPTR_T_DEFINED +#endif + +/* + * Assumes sig_atomic_t is signed and we have a 2s complement machine. + */ + +#ifndef SIG_ATOMIC_MAX +# define SIG_ATOMIC_MAX ((((sig_atomic_t) 1) << (sizeof (sig_atomic_t)*CHAR_BIT-1)) - 1) +#endif + +#endif + +#if defined (__TEST_PSTDINT_FOR_CORRECTNESS) + +/* + * Please compile with the maximum warning settings to make sure macros are not + * defined more than once. 
+ */ + +#include +#include +#include + +#define glue3_aux(x,y,z) x ## y ## z +#define glue3(x,y,z) glue3_aux(x,y,z) + +#define DECLU(bits) glue3(uint,bits,_t) glue3(u,bits,=) glue3(UINT,bits,_C) (0); +#define DECLI(bits) glue3(int,bits,_t) glue3(i,bits,=) glue3(INT,bits,_C) (0); + +#define DECL(us,bits) glue3(DECL,us,) (bits) + +#define TESTUMAX(bits) glue3(u,bits,=) glue3(~,u,bits); if (glue3(UINT,bits,_MAX) glue3(!=,u,bits)) printf ("Something wrong with UINT%d_MAX\n", bits) + +int main () { + DECL(I,8) + DECL(U,8) + DECL(I,16) + DECL(U,16) + DECL(I,32) + DECL(U,32) +#ifdef INT64_MAX + DECL(I,64) + DECL(U,64) +#endif + intmax_t imax = INTMAX_C(0); + uintmax_t umax = UINTMAX_C(0); + char str0[256], str1[256]; + + sprintf (str0, "%d %x\n", 0, ~0); + + sprintf (str1, "%d %x\n", i8, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i8 : %s\n", str1); + sprintf (str1, "%u %x\n", u8, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with u8 : %s\n", str1); + sprintf (str1, "%d %x\n", i16, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i16 : %s\n", str1); + sprintf (str1, "%u %x\n", u16, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with u16 : %s\n", str1); + sprintf (str1, "%" PRINTF_INT32_MODIFIER "d %x\n", i32, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i32 : %s\n", str1); + sprintf (str1, "%" PRINTF_INT32_MODIFIER "u %x\n", u32, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with u32 : %s\n", str1); +#ifdef INT64_MAX + sprintf (str1, "%" PRINTF_INT64_MODIFIER "d %x\n", i64, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with i64 : %s\n", str1); +#endif + sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "d %x\n", imax, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with imax : %s\n", str1); + sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "u %x\n", umax, ~0); + if (0 != strcmp (str0, str1)) printf ("Something wrong with umax : %s\n", str1); + + 
TESTUMAX(8); + TESTUMAX(16); + TESTUMAX(32); +#ifdef INT64_MAX + TESTUMAX(64); +#endif + + return EXIT_SUCCESS; +} + +#endif diff --git a/hat-trie/src/superfasthash.c b/hat-trie/src/superfasthash.c deleted file mode 100644 index 6bd6a09..0000000 --- a/hat-trie/src/superfasthash.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2010, Paul Hsieh - * - * All rights reserved. Redistribution and use in source and binary forms, - * with or without modification, are permitted provided that the following - * conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither my name, Paul Hsieh, nor the names of any other contributors to - * the code use may not be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - -#include "superfasthash.h" -#include -#include - -#undef get16bits -#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ - || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) -#define get16bits(d) (*((const uint16_t *) (d))) -#endif - -#if !defined (get16bits) -#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ - +(uint32_t)(((const uint8_t *)(d))[0]) ) -#endif - -uint32_t hash(const char * data, int len) -{ - return hash_inc(data, len, (uint32_t) len); -} - -uint32_t hash_inc(const char * data, int len, uint32_t hash) -{ - uint32_t tmp; - int rem; - - if (len <= 0 || data == NULL) return 0; - - rem = len & 3; - len >>= 2; - - /* Main loop */ - for (;len > 0; len--) { - hash += get16bits (data); - tmp = (get16bits (data+2) << 11) ^ hash; - hash = (hash << 16) ^ tmp; - data += 2*sizeof (uint16_t); - hash += hash >> 11; - } - - /* Handle end cases */ - switch (rem) { - case 3: hash += get16bits (data); - hash ^= hash << 16; - hash ^= data[sizeof (uint16_t)] << 18; - hash += hash >> 11; - break; - case 2: hash += get16bits (data); - hash ^= hash << 11; - hash += hash >> 17; - break; - case 1: hash += *data; - hash ^= hash << 10; - hash += hash >> 1; - } - - /* Force "avalanching" of final 127 bits */ - hash ^= hash << 3; - hash += hash >> 5; - hash ^= hash << 4; - hash += hash >> 17; - hash ^= hash << 25; - hash += hash >> 6; - - return hash; -} - diff --git a/hat-trie/src/superfasthash.h b/hat-trie/src/superfasthash.h deleted file mode 100644 index 552430d..0000000 --- a/hat-trie/src/superfasthash.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2010, Paul Hsieh - * - * All rights reserved. Redistribution and use in source and binary forms, - * with or without modification, are permitted provided that the following - * conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither my name, Paul Hsieh, nor the names of any other contributors to - * the code use may not be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - */ - - -#ifndef SUPERFASTHASH_H -#define SUPERFASTHASH_H - -#include - -uint32_t hash(const char * data, int len); -uint32_t hash_inc(const char * data, int len, uint32_t hash); - -#endif diff --git a/hat-trie/test/Makefile.am b/hat-trie/test/Makefile.am index ede6e1e..30a5e31 100644 --- a/hat-trie/test/Makefile.am +++ b/hat-trie/test/Makefile.am @@ -1,6 +1,6 @@ TESTS = check_ahtable check_hattrie -check_PROGRAMS = check_ahtable check_hattrie +check_PROGRAMS = check_ahtable check_hattrie bench_sorted_iter check_ahtable_SOURCES = check_ahtable.c str_map.c check_ahtable_LDADD = $(top_builddir)/src/libhat-trie.la @@ -9,3 +9,7 @@ check_ahtable_CPPFLAGS = -I$(top_builddir)/src check_hattrie_SOURCES = check_hattrie.c str_map.c check_hattrie_LDADD = $(top_builddir)/src/libhat-trie.la check_hattrie_CPPFLAGS = -I$(top_builddir)/src + +bench_sorted_iter_SOURCES = bench_sorted_iter.c +bench_sorted_iter_LDADD = $(top_builddir)/src/libhat-trie.la +bench_sorted_iter_CPPFLAGS = -I$(top_builddir)/src diff --git a/hat-trie/test/bench_sorted_iter.c b/hat-trie/test/bench_sorted_iter.c new file mode 100644 index 0000000..0271bcb --- /dev/null +++ b/hat-trie/test/bench_sorted_iter.c @@ -0,0 +1,69 @@ + +/* A quick test of the degree to which ordered iteration is slower than unordered. */ + +#include "../src/hat-trie.h" +#include +#include + + +/* Simple random string generation. 
*/ +void randstr(char* x, size_t len) +{ + x[len] = '\0'; + while (len > 0) { + x[--len] = '\x20' + (rand() % ('\x7e' - '\x20' + 1)); + } +} + +int main() +{ + hattrie_t* T = hattrie_create(); + const size_t n = 1000000; // how many strings + const size_t m_low = 50; // minimum length of each string + const size_t m_high = 500; // maximum length of each string + char x[501]; + + size_t i, m; + for (i = 0; i < n; ++i) { + m = m_low + rand() % (m_high - m_low); + randstr(x, m); + *hattrie_get(T, x, m) = 1; + } + + hattrie_iter_t* it; + clock_t t0, t; + const size_t repetitions = 100; + size_t r; + + /* iterate in unsorted order */ + fprintf(stderr, "iterating out of order ... "); + t0 = clock(); + for (r = 0; r < repetitions; ++r) { + it = hattrie_iter_begin(T, false); + while (!hattrie_iter_finished(it)) { + hattrie_iter_next(it); + } + hattrie_iter_free(it); + } + t = clock(); + fprintf(stderr, "finished. (%0.2f seconds)\n", (double) (t - t0) / (double) CLOCKS_PER_SEC); + + + /* iterate in sorted order */ + fprintf(stderr, "iterating in order ... "); + t0 = clock(); + for (r = 0; r < repetitions; ++r) { + it = hattrie_iter_begin(T, true); + while (!hattrie_iter_finished(it)) { + hattrie_iter_next(it); + } + hattrie_iter_free(it); + } + t = clock(); + fprintf(stderr, "finished. 
(%0.2f seconds)\n", (double) (t - t0) / (double) CLOCKS_PER_SEC); + + + hattrie_free(T); + + return 0; +} diff --git a/hat-trie/test/check_ahtable.c b/hat-trie/test/check_ahtable.c index a549c14..5df6311 100644 --- a/hat-trie/test/check_ahtable.c +++ b/hat-trie/test/check_ahtable.c @@ -16,10 +16,10 @@ void randstr(char* x, size_t len) } -const size_t n = 2000000; // how many uniques strings +const size_t n = 100000; // how many unique strings const size_t m_low = 50; // minimum length of each string const size_t m_high = 500; // maximum length of each string -const size_t k = 2000000; // number of insertions +const size_t k = 200000; // number of insertions char** xs; ahtable_t* T; @@ -82,17 +82,27 @@ void test_ahtable_insert() *u, v); } } + + /* delete some keys: j counts the deletions, i is the random index of the key to delete */ + for (j = 0; j < k/100; ++j) { + i = rand() % n; + ahtable_del(T, xs[i], strlen(xs[i])); + str_map_del(M, xs[i], strlen(xs[i])); + u = ahtable_tryget(T, xs[i], strlen(xs[i])); + if (u) { + fprintf(stderr, "[error] deleted node found in ahtable\n"); + } + } fprintf(stderr, "done.\n"); } - void test_ahtable_iteration() { fprintf(stderr, "iterating through %zu keys ... \n", k); - ahtable_iter_t* i = ahtable_iter_begin(T); + ahtable_iter_t* i = ahtable_iter_begin(T, false); size_t count = 0; value_t* u; @@ -106,8 +116,7 @@ void test_ahtable_iteration() key = ahtable_iter_key(i, &len); u = ahtable_iter_val(i); - - v = str_map_get(M, key, len); + v = str_map_get(M, key, len); if (*u != v) { if (v == 0) { @@ -136,6 +145,63 @@ void test_ahtable_iteration() } +int cmpkey(const char* a, size_t ka, const char* b, size_t kb) +{ + int c = memcmp(a, b, ka < kb ? ka : kb); + return c == 0 ? (int) ka - (int) kb : c; +} + + +void test_ahtable_sorted_iteration() +{ + fprintf(stderr, "iterating in order through %zu keys ... 
\n", k); + + ahtable_iter_t* i = ahtable_iter_begin(T, true); + + size_t count = 0; + value_t* u; + value_t v; + + char* prev_key = malloc(m_high + 1); + size_t prev_len = 0; + + const char *key = NULL; + size_t len = 0; + + while (!ahtable_iter_finished(i)) { + memcpy(prev_key, key, len); + prev_len = len; + ++count; + + key = ahtable_iter_key(i, &len); + if (prev_key != NULL && cmpkey(prev_key, prev_len, key, len) > 0) { + fprintf(stderr, "[error] iteration is not correctly ordered.\n"); + } + + u = ahtable_iter_val(i); + v = str_map_get(M, key, len); + + if (*u != v) { + if (v == 0) { + fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *u, v); + } + else { + fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *u, v); + } + } + + // this way we will see an error if the same key is iterated through + // twice + str_map_set(M, key, len, 0); + + ahtable_iter_next(i); + } + + ahtable_iter_free(i); + free(prev_key); + + fprintf(stderr, "done.\n"); +} int main() @@ -145,10 +211,10 @@ int main() test_ahtable_iteration(); teardown(); + setup(); + test_ahtable_insert(); + test_ahtable_sorted_iteration(); + teardown(); + return 0; } - - - - - diff --git a/hat-trie/test/check_hattrie.c b/hat-trie/test/check_hattrie.c index ba047cb..797a981 100644 --- a/hat-trie/test/check_hattrie.c +++ b/hat-trie/test/check_hattrie.c @@ -15,12 +15,14 @@ void randstr(char* x, size_t len) } } - -const size_t n = 100000; // how many uniques strings +const size_t n = 100000; // how many unique strings const size_t m_low = 50; // minimum length of each string const size_t m_high = 500; // maximum length of each string const size_t k = 200000; // number of insertions +const size_t d = 50000; + char** xs; +char** ds; hattrie_t* T; str_map* M; @@ -30,6 +32,7 @@ void setup() { fprintf(stderr, "generating %zu keys ... 
", n); xs = malloc(n * sizeof(char*)); + ds = malloc(d * sizeof(char*)); size_t i; size_t m; for (i = 0; i < n; ++i) { @@ -37,6 +40,10 @@ void setup() xs[i] = malloc(m + 1); randstr(xs[i], m); } + for (i = 0; i < d; ++i) { + m = rand()%n; + ds[i] = xs[m]; + } T = hattrie_create(); M = str_map_create(); @@ -54,6 +61,7 @@ void teardown() free(xs[i]); } free(xs); + free(ds); } @@ -83,6 +91,17 @@ void test_hattrie_insert() } } + fprintf(stderr, "deleting %zu keys ... \n", d); + for (j = 0; j < d; ++j) { + str_map_del(M, ds[j], strlen(ds[j])); + hattrie_del(T, ds[j], strlen(ds[j])); + u = hattrie_tryget(T, ds[j], strlen(ds[j])); + if (u) { + fprintf(stderr, "[error] item %zu still found in trie after delete\n", + j); + } + } + fprintf(stderr, "done.\n"); } @@ -92,7 +111,7 @@ void test_hattrie_iteration() { fprintf(stderr, "iterating through %zu keys ... \n", k); - hattrie_iter_t* i = hattrie_iter_begin(T); + hattrie_iter_t* i = hattrie_iter_begin(T, false); size_t count = 0; value_t* u; @@ -135,6 +154,79 @@ void test_hattrie_iteration() fprintf(stderr, "done.\n"); } + +int cmpkey(const char* a, size_t ka, const char* b, size_t kb) +{ + int c = memcmp(a, b, ka < kb ? ka : kb); + return c == 0 ? (int) ka - (int) kb : c; +} + + +void test_hattrie_sorted_iteration() +{ + fprintf(stderr, "iterating in order through %zu keys ... 
\n", k); + + hattrie_iter_t* i = hattrie_iter_begin(T, true); + + size_t count = 0; + value_t* u; + value_t v; + + char* key_copy = malloc(m_high + 1); + char* prev_key = malloc(m_high + 1); + memset(prev_key, 0, m_high + 1); + size_t prev_len = 0; + + const char *key = NULL; + size_t len = 0; + + while (!hattrie_iter_finished(i)) { + memcpy(prev_key, key_copy, len); + prev_key[len] = '\0'; + prev_len = len; + ++count; + + key = hattrie_iter_key(i, &len); + + /* memory for key may be changed on iter, copy it */ + strncpy(key_copy, key, len); + + if (prev_key != NULL && cmpkey(prev_key, prev_len, key, len) > 0) { + fprintf(stderr, "[error] iteration is not correctly ordered.\n"); + } + + u = hattrie_iter_val(i); + v = str_map_get(M, key, len); + + if (*u != v) { + if (v == 0) { + fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *u, v); + } + else { + fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *u, v); + } + } + + // this way we will see an error if the same key is iterated through + // twice + str_map_set(M, key, len, 0); + + hattrie_iter_next(i); + } + + if (count != M->m) { + fprintf(stderr, "[error] iterated through %zu element, expected %zu\n", + count, M->m); + } + + hattrie_iter_free(i); + free(prev_key); + free(key_copy); + + fprintf(stderr, "done.\n"); +} + + void test_trie_non_ascii() { fprintf(stderr, "checking non-ascii... 
\n"); @@ -167,6 +259,11 @@ int main() test_hattrie_iteration(); teardown(); + setup(); + test_hattrie_insert(); + test_hattrie_sorted_iteration(); + teardown(); + return 0; } diff --git a/hat-trie/test/str_map.c b/hat-trie/test/str_map.c index 82be054..68303a3 100644 --- a/hat-trie/test/str_map.c +++ b/hat-trie/test/str_map.c @@ -213,4 +213,29 @@ value_t str_map_get(const str_map* T, const char* key, size_t keylen) return 0; } +void str_map_del(str_map* T, const char* key, size_t keylen) +{ + uint32_t h = hash(key, keylen) % T->n; + + str_map_pair* u = T->A[h]; + str_map_pair* p = NULL; + while (u) { + + if (u->keylen == keylen && memcmp(u->key, key, keylen) == 0) { + if (p) { + p->next = u->next; + } else { + T->A[h] = u->next; + } + free(u->key); + free(u); + --T->m; + return; + } + + p = u; + u = u->next; + } + +} diff --git a/hat-trie/test/str_map.h b/hat-trie/test/str_map.h index a452560..7d000d5 100644 --- a/hat-trie/test/str_map.h +++ b/hat-trie/test/str_map.h @@ -44,7 +44,7 @@ str_map* str_map_create(void); void str_map_destroy(str_map*); void str_map_set(str_map*, const char* key, size_t keylen, value_t value); value_t str_map_get(const str_map*, const char* key, size_t keylen); - +void str_map_del(str_map* T, const char* key, size_t keylen); #if defined(__cplusplus) }