diff --git a/.hgignore b/.hgignore new file mode 100644 index 0000000..2bdb155 --- /dev/null +++ b/.hgignore @@ -0,0 +1,15 @@ +^build +^MANIFEST$ +^dist +\.so$ +\.o$ +\.lo$ + +^stuff/ +\.rej$ +\.pyc$ +^.tox +\.orig$ +\.prof$ +\.coverage$ +\.git \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d71198e --- /dev/null +++ b/LICENSE @@ -0,0 +1,18 @@ +Copyright (c) Mikhail Korobov, 2012 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished +to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR +A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE +OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..258b2a5 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,14 @@ +include README.rst +include CHANGES.rst +include LICENSE +include tox.ini +include update_c.sh + +recursive-include hat-trie/src *.h *.c +include hat-trie/src/config.h.in +include hat-trie/configure +include hat-trie/configure.ac + +include src/hat_trie.pyx +include src/chat_datrie.pxd + diff --git a/hat-trie/COPYING b/hat-trie/COPYING new file mode 100644 index 0000000..bbc6dc3 --- /dev/null +++ b/hat-trie/COPYING @@ -0,0 +1,19 @@ +Copyright (C) 2011 by Daniel C. Jones + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + diff --git a/hat-trie/Makefile.am b/hat-trie/Makefile.am new file mode 100644 index 0000000..831d581 --- /dev/null +++ b/hat-trie/Makefile.am @@ -0,0 +1,8 @@ + +SUBDIRS = src test + +EXTRA_DIST = README.md COPYING + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = hat-trie-0.1.pc + diff --git a/hat-trie/README.md b/hat-trie/README.md new file mode 100644 index 0000000..5941011 --- /dev/null +++ b/hat-trie/README.md @@ -0,0 +1,34 @@ + +Hat-Trie +======== + +This is an ANSI C99 implementation of the HAT-trie data structure of Askitis and +Sinha, an extremely efficient (space and time) modern variant of tries. + +The version implemented here maps arrays of bytes to words (i.e., unsigned +longs), which can be used to store counts, pointers, etc, or not used at all if +you simply want to maintain a set of unique strings. + +For details see, + + 1. Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data + structure for strings. Proceedings of the thirtieth Australasian conference on + Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc. + + 2. Askitis, N., & Zobel, J. (2005). Cache-conscious collision resolution in + string hash tables. String Processing and Information Retrieval (pp. + 91–102). Springer. + + +Installation +------------ + + git clone git@github.com:dcjones/hat-trie.git + cd hat-trie + autoreconf -i + ./configure + make install + +To use the library, include `hat-trie.h` and link using `-lhat-trie`. + + diff --git a/hat-trie/TODO b/hat-trie/TODO new file mode 100644 index 0000000..be8bd3a --- /dev/null +++ b/hat-trie/TODO @@ -0,0 +1,6 @@ + +todo: + * Deletion in ahtable. + * Deletion in hattrie. 
+ + diff --git a/hat-trie/configure.ac b/hat-trie/configure.ac new file mode 100644 index 0000000..aa66c24 --- /dev/null +++ b/hat-trie/configure.ac @@ -0,0 +1,39 @@ + +AC_INIT([hat-trie], [0.1.0], [dcjones@cs.washington.edu]) +AM_INIT_AUTOMAKE([foreign]) +AC_CONFIG_HEADERS([config.h]) +m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) + +base_CFLAGS="-std=c99 -Wall -Wextra -pedantic" +opt_CFLAGS="${base_CFLAGS} -O3" +dbg_CFLAGS="${base_CFLAGS} -g -O0" + +AC_ARG_ENABLE([debugging], + [AS_HELP_STRING([--enable-debugging], + [enable debugging info (default is no)])], + [], [enable_debugging=no]) + +AS_IF([test "x$enable_debugging" = xyes], + [CFLAGS="$dbg_CFLAGS"], + [CFLAGS="$opt_CFLAGS"]) + + +AC_PROG_CC +AC_PROG_CPP +AC_PROG_INSTALL +AC_PROG_LN_S +AC_PROG_MAKE_SET +AC_DISABLE_SHARED +AC_PROG_LIBTOOL + +AC_C_BIGENDIAN([AC_MSG_ERROR([Big-endian systems are not currently supported.])]) +AC_CHECK_HEADERS([stdint.h stdlib.h]) +AC_HEADER_STDBOOL +AC_TYPE_SIZE_T +AC_TYPE_UINT16_T +AC_TYPE_UINT32_T +AC_TYPE_UINT8_T + +AC_CONFIG_FILES([hat-trie-0.1.pc Makefile src/Makefile test/Makefile]) +AC_OUTPUT + diff --git a/hat-trie/hat-trie-0.1.pc.in b/hat-trie/hat-trie-0.1.pc.in new file mode 100644 index 0000000..b694008 --- /dev/null +++ b/hat-trie/hat-trie-0.1.pc.in @@ -0,0 +1,12 @@ + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: @PACKAGE_NAME@ +Description: An efficient trie implementation. 
+Version: @PACKAGE_VERSION@ +Cflags: -I{includedir} +Libs: -L${libdir} + diff --git a/hat-trie/src/Makefile.am b/hat-trie/src/Makefile.am new file mode 100644 index 0000000..b0ca614 --- /dev/null +++ b/hat-trie/src/Makefile.am @@ -0,0 +1,11 @@ + +lib_LTLIBRARIES = libhat-trie.la + +libhat_trie_la_SOURCES = common.h \ + ahtable.h ahtable.c \ + hat-trie.h hat-trie.c \ + misc.h misc.c \ + superfasthash.h superfasthash.c + +pkginclude_HEADERS = hat-trie.h ahtable.h common.h + diff --git a/hat-trie/src/ahtable.c b/hat-trie/src/ahtable.c new file mode 100644 index 0000000..a0a812c --- /dev/null +++ b/hat-trie/src/ahtable.c @@ -0,0 +1,414 @@ +/* + * This file is part of hat-trie. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + +#include "ahtable.h" +#include "misc.h" +#include "superfasthash.h" +// #include "config.h" +#include +#include + + + +const double ahtable_max_load_factor = 100000.0; /* arbitrary large number => don't resize */ +const const size_t ahtable_initial_size = 4096; +static const uint16_t LONG_KEYLEN_MASK = 0x7fff; + + +ahtable_t* ahtable_create() +{ + return ahtable_create_n(ahtable_initial_size); +} + + +ahtable_t* ahtable_create_n(size_t n) +{ + ahtable_t* T = malloc_or_die(sizeof(ahtable_t)); + T->flag = 0; + T->c0 = T->c1 = '\0'; + + T->n = n; + T->m = 0; + T->max_m = (size_t) (ahtable_max_load_factor * (double) T->n); + T->slots = malloc_or_die(n * sizeof(slot_t)); + memset(T->slots, 0, n * sizeof(slot_t)); + + T->slot_sizes = malloc_or_die(n * sizeof(size_t)); + memset(T->slot_sizes, 0, n * sizeof(size_t)); + + return T; +} + + +void ahtable_free(ahtable_t* T) +{ + size_t i; + for (i = 0; i < T->n; ++i) free(T->slots[i]); + free(T->slots); + free(T->slot_sizes); + free(T); +} + + +size_t ahtable_size(const ahtable_t* T) +{ + return T->m; +} + + +void ahtable_clear(ahtable_t* T) +{ + size_t i; + for (i = 0; i < T->n; ++i) free(T->slots[i]); + T->n = ahtable_initial_size; + T->slots = realloc_or_die(T->slots, T->n * sizeof(slot_t)); + 
memset(T->slots, 0, T->n * sizeof(slot_t)); + + T->slot_sizes = realloc_or_die(T->slot_sizes, T->n * sizeof(size_t)); + memset(T->slot_sizes, 0, T->n * sizeof(size_t)); +} + + +static slot_t ins_key(slot_t s, const char* key, size_t len, value_t** val) +{ + // key length + if (len < 128) { + s[0] = (unsigned char) (len << 1); + s += 1; + } + else { + /* The most significant bit is set to indicate that two bytes are + * being used to store the key length. */ + *((uint16_t*) s) = ((uint16_t) len << 1) | 0x1; + s += 2; + } + + // key + memcpy(s, key, len * sizeof(unsigned char)); + s += len; + + // value + *val = (value_t*) s; + **val = 0; + s += sizeof(value_t); + + return s; +} + + +static void ahtable_expand(ahtable_t* T) +{ + /* Resizing a table is essentially building a brand new one. + * One little shortcut we can take on the memory allocation front is to + * figure out how much memory each slot needs in advance. + */ + size_t new_n = 2 * T->n; + size_t* slot_sizes = malloc_or_die(new_n * sizeof(size_t)); + memset(slot_sizes, 0, new_n * sizeof(size_t)); + + const char* key; + size_t len; + size_t m = 0; + ahtable_iter_t* i = ahtable_iter_begin(T); + while (!ahtable_iter_finished(i)) { + key = ahtable_iter_key(i, &len); + slot_sizes[hash(key, len) % new_n] += + len + sizeof(value_t) + (len >= 128 ? 2 : 1); + + ++m; + ahtable_iter_next(i); + } + assert(m == T->m); + ahtable_iter_free(i); + + + /* allocate slots */ + slot_t* slots = malloc_or_die(new_n * sizeof(slot_t)); + size_t j; + for (j = 0; j < new_n; ++j) { + if (slot_sizes[j] > 0) { + slots[j] = malloc_or_die(slot_sizes[j]); + } + else slots[j] = NULL; + } + + /* rehash values. A few shortcuts can be taken here as well, as we know + * there will be no collisions. Instead of the regular insertion routine, + * we keep track of the ends of every slot and simply insert keys. 
+ * */ + slot_t* slots_next = malloc_or_die(new_n * sizeof(slot_t)); + memcpy(slots_next, slots, new_n * sizeof(slot_t)); + size_t h; + m = 0; + value_t* u; + value_t* v; + i = ahtable_iter_begin(T); + while (!ahtable_iter_finished(i)) { + + key = ahtable_iter_key(i, &len); + h = hash(key, len) % new_n; + + slots_next[h] = ins_key(slots_next[h], key, len, &u); + v = ahtable_iter_val(i); + *u = *v; + + ++m; + ahtable_iter_next(i); + } + assert(m == T->m); + ahtable_iter_free(i); + + + free(slots_next); + for (j = 0; j < T->n; ++j) free(T->slots[j]); + + free(T->slots); + T->slots = slots; + + free(T->slot_sizes); + T->slot_sizes = slot_sizes; + + T->n = new_n; + T->max_m = (size_t) (ahtable_max_load_factor * (double) T->n); +} + + +static value_t* get_key(ahtable_t* T, const char* key, size_t len, bool insert_missing) +{ + /* if we are at capacity, preemptively resize */ + if (insert_missing && T->m >= T->max_m) { + ahtable_expand(T); + } + + + uint32_t i = hash(key, len) % T->n; + size_t k; + slot_t s; + value_t* val; + + /* search the array for our key */ + s = T->slots[i]; + while ((size_t) (s - T->slots[i]) < T->slot_sizes[i]) { + + /* get the key length */ + if (0x1 & *s) { + k = (size_t) (*((uint16_t*) s) >> 1); + s += 2; + } + else { + k = (size_t) (*s >> 1); + s += 1; + } + + /* skip keys that are longer than ours */ + if (k != len) { + s += k + sizeof(value_t); + continue; + } + + /* key found. */ + if (memcmp(s, key, len) == 0) { + return (value_t*) (s + len); + } + /* key not found. */ + else { + s += k + sizeof(value_t); + continue; + } + } + + + if (insert_missing) { + /* the key was not found, so we must insert it. */ + size_t new_size = T->slot_sizes[i]; + new_size += 1 + (len >= 128 ? 
1 : 0); // key length + new_size += len * sizeof(unsigned char); // key + new_size += sizeof(value_t); // value + + T->slots[i] = realloc_or_die(T->slots[i], new_size); + + ++T->m; + ins_key(T->slots[i] + T->slot_sizes[i], key, len, &val); + T->slot_sizes[i] = new_size; + + return val; + } + else return NULL; +} + + +value_t* ahtable_get(ahtable_t* T, const char* key, size_t len) +{ + return get_key(T, key, len, true); +} + + +value_t* ahtable_tryget(ahtable_t* T, const char* key, size_t len ) +{ + return get_key(T, key, len, false); +} + + +void ahtable_del(ahtable_t* T, const char* key, size_t len) +{ + uint32_t i = hash(key, len) % T->n; + size_t k; + slot_t s; + + /* search the array for our key */ + s = T->slots[i]; + while ((size_t) (s - T->slots[i]) < T->slot_sizes[i]) { + + /* get the key length */ + if (0x1 & *s) { + k = (size_t) (*((uint16_t*) s)) >> 1; + s += 2; + } + else { + k = (size_t) (*s >> 1); + s += 1; + } + + /* skip keys that are longer than ours */ + if (k != len) { + s += k + sizeof(value_t); + continue; + } + + /* key found. */ + if (memcmp(s, key, len) == 0) { + /* move everything over, resize the array */ + unsigned char* t = s + len + sizeof(value_t); + s -= k > 255 ? 2 : 1; + memmove(s, t, T->slot_sizes[i] - (size_t) (t - T->slots[i])); + T->slot_sizes[i] -= (size_t) (t - s); + --T->m; + return; + } + /* key not found. */ + else { + s += k + sizeof(value_t); + continue; + } + } + + // Key was not found. Do nothing. 
+} + + + +struct ahtable_iter_t_ +{ + const ahtable_t* T; // parent + size_t i; // slot index + slot_t s; // slot position +}; + + + +ahtable_iter_t* ahtable_iter_begin(const ahtable_t* T) +{ + ahtable_iter_t* i = malloc_or_die(sizeof(ahtable_iter_t)); + i->T = T; + + for (i->i = 0; i->i < i->T->n; ++i->i) { + i->s = T->slots[i->i]; + if ((size_t) (i->s - T->slots[i->i]) >= T->slot_sizes[i->i]) continue; + break; + } + + return i; +} + + +void ahtable_iter_next(ahtable_iter_t* i) +{ + if (ahtable_iter_finished(i)) return; + + size_t k; + + /* get the key length */ + if (0x1 & *i->s) { + k = (size_t) ((*((uint16_t*) i->s)) >> 1); + i->s += 2; + } + else { + k = (size_t) (*i->s >> 1); + i->s += 1; + } + + /* skip to the next key */ + i->s += k + sizeof(value_t); + + if ((size_t) (i->s - i->T->slots[i->i]) >= i->T->slot_sizes[i->i]) { + do { + ++i->i; + } while(i->i < i->T->n && + i->T->slot_sizes[i->i] == 0); + + if (i->i < i->T->n) i->s = i->T->slots[i->i]; + else i->s = NULL; + } +} + + + +bool ahtable_iter_finished(ahtable_iter_t* i) +{ + return i->i >= i->T->n; +} + + +void ahtable_iter_free(ahtable_iter_t* i) +{ + free(i); +} + + + +const char* ahtable_iter_key(ahtable_iter_t* i, size_t* len) +{ + if (ahtable_iter_finished(i)) return NULL; + + slot_t s = i->s; + size_t k; + if (0x1 & *s) { + k = (size_t) (*((uint16_t*) s)) >> 1; + s += 2; + } + else { + k = (size_t) (*s >> 1); + s += 1; + } + + *len = k; + return (const char*) s; +} + + +value_t* ahtable_iter_val(ahtable_iter_t* i) +{ + if (ahtable_iter_finished(i)) return NULL; + + slot_t s = i->s; + + size_t k; + if (0x1 & *s) { + k = (size_t) (*((uint16_t*) s)) >> 1; + s += 2; + } + else { + k = (size_t) (*s >> 1); + s += 1; + } + + s += k; + return (value_t*) s; +} + + + diff --git a/hat-trie/src/ahtable.h b/hat-trie/src/ahtable.h new file mode 100644 index 0000000..1a6f3a1 --- /dev/null +++ b/hat-trie/src/ahtable.h @@ -0,0 +1,92 @@ +/* + * This file is part of hat-trie. 
+ * + * Copyright (c) 2011 by Daniel C. Jones + * + * + * This is an implementation of the 'cache-conscious' hash tables described in, + * + * Askitis, N., & Zobel, J. (2005). Cache-conscious collision resolution in + * string hash tables. String Processing and Information Retrieval (pp. + * 91–102). Springer. + * + * Briefly, the idea is, as opposed to separate chaining with linked lists, to + * store keys contiguously in one big array, thereby improving the caching + * behavior, and reducing space requirements. + * + */ + +#ifndef HATTRIE_AHTABLE_H +#define HATTRIE_AHTABLE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include "common.h" + +typedef unsigned char* slot_t; + +typedef struct ahtable_t_ +{ + /* these fields are reserved for hattrie to fiddle with */ + uint8_t flag; + unsigned char c0; + unsigned char c1; + + size_t n; // number of slots + size_t m; // number of key/value pairs stored + size_t max_m; // number of stored keys before we resize + + size_t* slot_sizes; + slot_t* slots; +} ahtable_t; + +extern const double ahtable_max_load_factor; +extern const size_t ahtable_initial_size; + +ahtable_t* ahtable_create (void); // Create an empty hash table. +ahtable_t* ahtable_create_n (size_t n); // Create an empty hash table, with + // n slots reserved. + +void ahtable_free (ahtable_t*); // Free all memory used by a table. +void ahtable_clear (ahtable_t*); // Remove all entries. +size_t ahtable_size (const ahtable_t*); // Number of stored keys. + + +/** Find the given key in the table, inserting it if it does not exist, and + * returning a pointer to its value. + * + * This pointer is not guaranteed to be valid after additional calls to + * ahtable_get, ahtable_del, ahtable_clear, or other functions that modify the + * table. + */ +value_t* ahtable_get (ahtable_t*, const char* key, size_t len); + + +/** Find a given key in the table, returning a NULL pointer if it does not + * exist. 
/* A child of a trie node is either another trie node or a bucket (an
 * ahtable_t). This union holds a pointer to either kind without committing
 * to which one it is.
 *
 * Both ahtable_t and trie_node_t begin with a `uint8_t flag` member, so the
 * `flag` view reads that first byte regardless of the pointee's real type;
 * callers test it against the NODE_TYPE_* constants before using `b` or `t`.
 */
typedef union node_ptr_
{
    ahtable_t*           b;     /* valid when the node is a (pure or hybrid) bucket */
    struct trie_node_t_* t;     /* valid when the node is a trie node */
    uint8_t*             flag;  /* type tag shared by both node kinds */
} node_ptr;
The first byte + * must be examined to determine which. */ + node_ptr xs[256]; + +} trie_node_t; + + +/* Create a new trie node with all pointer pointing to the given child (which + * can be NULL). */ +static trie_node_t* alloc_trie_node(node_ptr child) +{ + trie_node_t* node = malloc_or_die(sizeof(trie_node_t)); + node->flag = NODE_TYPE_TRIE; + node->val = 0; + node->has_val = false; + + size_t i; + for (i = 0; i < 256; ++i) node->xs[i] = child; + return node; +} + + +struct hattrie_t_ +{ + node_ptr root; // root node + size_t m; // number of stored keys +}; + + + +hattrie_t* hattrie_create() +{ + hattrie_t* T = malloc_or_die(sizeof(hattrie_t)); + T->m = 0; + + node_ptr node; + node.b = ahtable_create(); + node.b->flag = NODE_TYPE_HYBRID_BUCKET; + node.b->c0 = 0x00; + node.b->c1 = 0xff; + T->root.t = alloc_trie_node(node); + + return T; +} + + +static void hattrie_free_node(node_ptr node) +{ + if (*node.flag & NODE_TYPE_TRIE) { + size_t i; + for (i = 0; i < 256; ++i) { + if (i > 0 && node.t->xs[i].t == node.t->xs[i - 1].t) continue; + + /* XXX: recursion might not be the best choice here. It is possible + * to build a very deep trie. */ + if (node.t->xs[i].t) hattrie_free_node(node.t->xs[i]); + } + free(node.t); + } + else { + ahtable_free(node.b); + } +} + + +void hattrie_free(hattrie_t* T) +{ + hattrie_free_node(T->root); + free(T); +} + +/* Perform one split operation on the given node with the given parent. 
+ */ +static void hattrie_split(node_ptr parent, node_ptr node) +{ + /* only buckets may be split */ + assert(*node.flag & NODE_TYPE_PURE_BUCKET || + *node.flag & NODE_TYPE_HYBRID_BUCKET); + + assert(*parent.flag & NODE_TYPE_TRIE); + + if (*node.flag & NODE_TYPE_PURE_BUCKET) { + /* turn the pure bucket into a hybrid bucket */ + parent.t->xs[node.b->c0].t = alloc_trie_node(node); + + /* if the bucket had an empty key, move it to the new trie node */ + value_t* val = ahtable_tryget(node.b, NULL, 0); + if (val) { + parent.t->xs[node.b->c0].t->val = *val; + parent.t->xs[node.b->c0].t->has_val = true; + *val = 0; + ahtable_del(node.b, NULL, 0); + } + + node.b->c0 = 0x00; + node.b->c1 = 0xff; + node.b->flag = NODE_TYPE_HYBRID_BUCKET; + + return; + } + + /* This is a hybrid bucket. Perform a proper split. */ + + /* count the number of occourances of every leading character */ + unsigned int cs[256]; // occurance count for leading chars + memset(cs, 0, 256 * sizeof(unsigned int)); + size_t len; + const char* key; + + ahtable_iter_t* i = ahtable_iter_begin(node.b); + while (!ahtable_iter_finished(i)) { + key = ahtable_iter_key(i, &len); + assert(len > 0); + cs[(size_t) key[0]] += 1; + ahtable_iter_next(i); + } + ahtable_iter_free(i); + + /* choose a split point */ + unsigned int left_m, right_m, all_m; + size_t j = node.b->c0; + all_m = ahtable_size(node.b); + left_m = cs[j]; + right_m = all_m - left_m; + int d; + + while (j + 1 < node.b->c1) { + d = abs((int) (left_m + cs[j + 1]) - (int) (right_m - cs[j + 1])); + if (d <= abs(left_m - right_m) && left_m + cs[j + 1] < all_m) { + j += 1; + left_m += cs[j]; + right_m -= cs[j]; + } + else break; + } + + /* now split into two node cooresponding to ranges [0, j] and + * [j + 1, 255], respectively. */ + + + /* create new left and right nodes */ + + /* TODO: Add a special case if either node is a hybrid bucket containing all + * the keys. In such a case, do not build a new table, just use the old one. 
+ * */ + size_t num_slots; + + + for (num_slots = ahtable_initial_size; + (double) left_m > ahtable_max_load_factor * (double) num_slots; + num_slots *= 2); + + node_ptr left, right; + left.b = ahtable_create_n(num_slots); + left.b->c0 = node.b->c0; + left.b->c1 = (uint8_t) j; + left.b->flag = left.b->c0 == left.b->c1 ? + NODE_TYPE_PURE_BUCKET : NODE_TYPE_HYBRID_BUCKET; + + + for (num_slots = ahtable_initial_size; + (double) right_m > ahtable_max_load_factor * (double) num_slots; + num_slots *= 2); + + right.b = ahtable_create_n(num_slots); + right.b->c0 = (uint8_t) j + 1; + right.b->c1 = node.b->c1; + right.b->flag = right.b->c0 == right.b->c1 ? + NODE_TYPE_PURE_BUCKET : NODE_TYPE_HYBRID_BUCKET; + + + /* update the parent's pointer */ + + size_t c; + for (c = (size_t) node.b->c0; c <= j; ++c) parent.t->xs[c] = left; + for (; c <= (size_t) node.b->c1; ++c) parent.t->xs[c] = right; + + + + /* distribute keys to the new left or right node */ + value_t* u; + value_t* v; + i = ahtable_iter_begin(node.b); + while (!ahtable_iter_finished(i)) { + key = ahtable_iter_key(i, &len); + u = ahtable_iter_val(i); + assert(len > 0); + + /* left */ + if ((size_t) key[0] <= j) { + if (*left.flag & NODE_TYPE_PURE_BUCKET) { + v = ahtable_get(left.b, key + 1, len - 1); + } + else { + v = ahtable_get(left.b, key, len); + } + *v = *u; + } + + /* right */ + else { + if (*right.flag & NODE_TYPE_PURE_BUCKET) { + v = ahtable_get(right.b, key + 1, len - 1); + } + else { + v = ahtable_get(right.b, key, len); + } + *v = *u; + } + + ahtable_iter_next(i); + } + + ahtable_iter_free(i); + ahtable_free(node.b); +} + + +value_t* hattrie_get(hattrie_t* T, const char* key, size_t len) +{ + node_ptr parent = T->root; + assert(*parent.flag & NODE_TYPE_TRIE); + + if (len == 0) return &parent.t->val; + node_ptr node = parent.t->xs[(size_t) *key]; + + while (*node.flag & NODE_TYPE_TRIE && len > 0) { + ++key; + --len; + parent = node; + node = node.t->xs[(size_t) *key]; + } + + assert(*parent.flag & 
NODE_TYPE_TRIE); + + + /* if the key has been consumed on a trie node, use its value */ + if (len == 0) { + if (*node.flag & NODE_TYPE_TRIE) { + if (!node.t->has_val) { + node.t->has_val = true; + ++T->m; + } + return &node.t->val; + } + else if (*node.flag & NODE_TYPE_HYBRID_BUCKET) { + if (!parent.t->has_val) { + parent.t->has_val = true; + ++T->m; + } + return &parent.t->val; + } + } + + + /* preemptively split the bucket if it is full */ + while (ahtable_size(node.b) >= MAX_BUCKET_SIZE) { + hattrie_split(parent, node); + + /* after the split, the node pointer is invalidated, so we search from + * the parent again. */ + node = parent.t->xs[(size_t) *key]; + while (*node.flag & NODE_TYPE_TRIE && len > 0) { + ++key; + --len; + parent = node; + node = node.t->xs[(size_t) *key]; + } + + assert(*parent.flag & NODE_TYPE_TRIE); + + /* if the key has been consumed on a trie node, use its value */ + if (len == 0) { + if (*node.flag & NODE_TYPE_TRIE) { + if (!node.t->has_val) { + node.t->has_val = true; + ++T->m; + } + return &node.t->val; + } + else if (*node.flag & NODE_TYPE_HYBRID_BUCKET) { + if (!parent.t->has_val) { + parent.t->has_val = true; + ++T->m; + } + return &parent.t->val; + } + } + } + + assert(*node.flag & NODE_TYPE_PURE_BUCKET || *node.flag & NODE_TYPE_HYBRID_BUCKET); + + assert(len > 0); + size_t m_old = node.b->m; + value_t* val; + if (*node.flag & NODE_TYPE_PURE_BUCKET) { + val = ahtable_get(node.b, key + 1, len - 1); + } + else { + val = ahtable_get(node.b, key, len); + } + T->m += (node.b->m - m_old); + + return val; +} + + +value_t* hattrie_tryget(hattrie_t* T, const char* key, size_t len) +{ + node_ptr parent = T->root; + assert(*parent.flag & NODE_TYPE_TRIE); + + if (len == 0) return &parent.t->val; + node_ptr node = parent.t->xs[(size_t) *key]; + + while (*node.flag & NODE_TYPE_TRIE && len > 1) { + ++key; + --len; + parent = node; + node = node.t->xs[(size_t) *key]; + } + + + /* if the key has been consumed on a trie node, use its value */ + if 
(*node.flag & NODE_TYPE_TRIE) { + if (!node.t->has_val) { + node.t->has_val = true; + ++T->m; + } + return &node.t->val; + } + else if (*node.flag & NODE_TYPE_PURE_BUCKET) { + return ahtable_tryget(node.b, key + 1, len - 1); + } + else { + return ahtable_tryget(node.b, key, len); + } +} + + +/* plan for iteration: + * This is tricky, as we have no parent pointers currently, and I would like to + * avoid adding them. That means maintaining a stack + * + */ + +typedef struct hattrie_node_stack_t_ +{ + char c; + size_t level; + + node_ptr node; + struct hattrie_node_stack_t_* next; + +} hattrie_node_stack_t; + + +struct hattrie_iter_t_ +{ + char* key; + size_t keysize; // space reserved for the key + size_t level; + + /* keep track of keys stored in trie nodes */ + bool has_nil_key; + value_t nil_val; + + const hattrie_t* T; + ahtable_iter_t* i; + hattrie_node_stack_t* stack; +}; + + +static void hattrie_iter_pushchar(hattrie_iter_t* i, size_t level, char c) +{ + if (i->keysize < level) { + i->keysize *= 2; + i->key = realloc_or_die(i->key, i->keysize * sizeof(char)); + } + + if (level > 0) { + i->key[level - 1] = c; + } + + i->level = level; +} + + +static void hattrie_iter_nextnode(hattrie_iter_t* i) +{ + if (i->stack == NULL) return; + + /* pop the stack */ + node_ptr node; + hattrie_node_stack_t* next; + char c; + size_t level; + + node = i->stack->node; + next = i->stack->next; + c = i->stack->c; + level = i->stack->level; + + free(i->stack); + i->stack = next; + + if (*node.flag & NODE_TYPE_TRIE) { + hattrie_iter_pushchar(i, level, c); + + if(node.t->has_val) { + i->has_nil_key = true; + i->nil_val = node.t->val; + } + + /* push all child nodes from right to left */ + int j; + for (j = 255; j >= 0; --j) { + if (j < 255 && node.t->xs[j].t == node.t->xs[j + 1].t) continue; + + // push stack + next = i->stack; + i->stack = malloc_or_die(sizeof(hattrie_node_stack_t)); + i->stack->node = node.t->xs[j]; + i->stack->next = next; + i->stack->level = level + 1; + 
i->stack->c = (char) j; + } + } + else { + if (*node.flag & NODE_TYPE_PURE_BUCKET) { + hattrie_iter_pushchar(i, level, c); + } + else { + i->level = level - 1; + } + + i->i = ahtable_iter_begin(node.b); + } +} + + +hattrie_iter_t* hattrie_iter_begin(const hattrie_t* T) +{ + hattrie_iter_t* i = malloc_or_die(sizeof(hattrie_iter_t)); + i->T = T; + i->i = NULL; + i->keysize = 16; + i->key = malloc_or_die(i->keysize * sizeof(char)); + i->level = 0; + i->has_nil_key = false; + i->nil_val = 0; + + i->stack = malloc_or_die(sizeof(hattrie_node_stack_t)); + i->stack->next = NULL; + i->stack->node = T->root; + i->stack->c = '\0'; + i->stack->level = 0; + + + while (((i->i == NULL || ahtable_iter_finished(i->i)) && !i->has_nil_key) && + i->stack != NULL ) { + + ahtable_iter_free(i->i); + i->i = NULL; + hattrie_iter_nextnode(i); + } + + if (i->i != NULL && ahtable_iter_finished(i->i)) { + ahtable_iter_free(i->i); + i->i = NULL; + } + + return i; +} + + + +void hattrie_iter_next(hattrie_iter_t* i) +{ + if (hattrie_iter_finished(i)) return; + + if (i->i != NULL && !ahtable_iter_finished(i->i)) { + ahtable_iter_next(i->i); + } + else if (i->has_nil_key) { + i->has_nil_key = false; + i->nil_val = 0; + hattrie_iter_nextnode(i); + } + + while (((i->i == NULL || ahtable_iter_finished(i->i)) && !i->has_nil_key) && + i->stack != NULL ) { + + ahtable_iter_free(i->i); + i->i = NULL; + hattrie_iter_nextnode(i); + } + + if (i->i != NULL && ahtable_iter_finished(i->i)) { + ahtable_iter_free(i->i); + i->i = NULL; + } +} + + + +bool hattrie_iter_finished(hattrie_iter_t* i) +{ + return i->stack == NULL && i->i == NULL && !i->has_nil_key; +} + + +void hattrie_iter_free(hattrie_iter_t* i) +{ + if (i == NULL) return; + if (i->i) ahtable_iter_free(i->i); + + hattrie_node_stack_t* next; + while (i->stack) { + next = i->stack->next; + free(i->stack); + i->stack = next; + } + + free(i->key); + free(i); +} + + +const char* hattrie_iter_key(hattrie_iter_t* i, size_t* len) +{ + if 
(hattrie_iter_finished(i)) return NULL; + + size_t sublen; + const char* subkey; + + if (i->has_nil_key) { + subkey = NULL; + sublen = 0; + } + else subkey = ahtable_iter_key(i->i, &sublen); + + if (i->keysize < i->level + sublen + 1) { + while (i->keysize < i->level + sublen + 1) i->keysize *= 2; + i->key = realloc_or_die(i->key, i->keysize * sizeof(char)); + } + + memcpy(i->key + i->level, subkey, sublen); + i->key[i->level + sublen] = '\0'; + + *len = i->level + sublen; + return i->key; +} + + +value_t* hattrie_iter_val(hattrie_iter_t* i) +{ + if (i->has_nil_key) return &i->nil_val; + + if (hattrie_iter_finished(i)) return NULL; + + return ahtable_iter_val(i->i); +} + + + diff --git a/hat-trie/src/hat-trie.h b/hat-trie/src/hat-trie.h new file mode 100644 index 0000000..eebf5ed --- /dev/null +++ b/hat-trie/src/hat-trie.h @@ -0,0 +1,67 @@ +/* + * This file is part of hat-trie + * + * Copyright (c) 2011 by Daniel C. Jones + * + * + * This is an implementation of the HAT-trie data structure described in, + * + * Askitis, N., & Sinha, R. (2007). HAT-trie: a cache-conscious trie-based data + * structure for strings. Proceedings of the thirtieth Australasian conference on + * Computer science-Volume 62 (pp. 97–105). Australian Computer Society, Inc. + * + * The HAT-trie is in essence a hybrid data structure, combining tries and hash + * tables in a clever way to try to get the best of both worlds. + * + */ + +#ifndef HATTRIE_HATTRIE_H +#define HATTRIE_HATTRIE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "common.h" +#include +#include + +typedef struct hattrie_t_ hattrie_t; + +hattrie_t* hattrie_create (void); //< Create an empty hat-trie. +void hattrie_free (hattrie_t*); //< Free all memory used by a trie. +hattrie_t* hattrie_dup (const hattrie_t*); //< Duplicate an existing trie. +void hattrie_clear (hattrie_t*); //< Remove all entries. + + +/** Find the given key in the trie, inserting it if it does not exist, and + * returning a pointer to it's key. 
+ * + * This pointer is not guaranteed to be valid after additional calls to + * hattrie_get, hattrie_del, hattrie_clear, or other functions that modify the + * trie. + */ +value_t* hattrie_get (hattrie_t*, const char* key, size_t len); + + +/** Find a given key in the table, returning a NULL pointer if it does not + * exist. */ +value_t* hattrie_tryget (hattrie_t*, const char* key, size_t len); + + +typedef struct hattrie_iter_t_ hattrie_iter_t; + +hattrie_iter_t* hattrie_iter_begin (const hattrie_t*); +void hattrie_iter_next (hattrie_iter_t*); +bool hattrie_iter_finished (hattrie_iter_t*); +void hattrie_iter_free (hattrie_iter_t*); +const char* hattrie_iter_key (hattrie_iter_t*, size_t* len); +value_t* hattrie_iter_val (hattrie_iter_t*); + +#ifdef __cplusplus +} +#endif + +#endif + + diff --git a/hat-trie/src/misc.c b/hat-trie/src/misc.c new file mode 100644 index 0000000..0530c34 --- /dev/null +++ b/hat-trie/src/misc.c @@ -0,0 +1,46 @@ +/* + * This file is part of hat-trie. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + +#include "misc.h" +#include + + +void* malloc_or_die(size_t n) +{ + void* p = malloc(n); + if (p == NULL && n != 0) { + fprintf(stderr, "Cannot allocate %zu bytes.\n", n); + exit(EXIT_FAILURE); + } + return p; +} + + +void* realloc_or_die(void* ptr, size_t n) +{ + void* p = realloc(ptr, n); + if (p == NULL && n != 0) { + fprintf(stderr, "Cannot allocate %zu bytes.\n", n); + exit(EXIT_FAILURE); + } + return p; +} + + +FILE* fopen_or_die(const char* path, const char* mode) +{ + FILE* f = fopen(path, mode); + if (f == NULL) { + fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode); + exit(EXIT_FAILURE); + } + return f; +} + + + + diff --git a/hat-trie/src/misc.h b/hat-trie/src/misc.h new file mode 100644 index 0000000..7223b8b --- /dev/null +++ b/hat-trie/src/misc.h @@ -0,0 +1,22 @@ +/* + * This file is part of hat-trie. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * misc : + * miscellaneous functions. 
+ * + */ + +#ifndef LINESET_MISC_H +#define LINESET_MISC_H + +#include + +void* malloc_or_die(size_t); +void* realloc_or_die(void*, size_t); +FILE* fopen_or_die(const char*, const char*); + +#endif + + diff --git a/hat-trie/src/superfasthash.c b/hat-trie/src/superfasthash.c new file mode 100644 index 0000000..6bd6a09 --- /dev/null +++ b/hat-trie/src/superfasthash.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2010, Paul Hsieh + * + * All rights reserved. Redistribution and use in source and binary forms, + * with or without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither my name, Paul Hsieh, nor the names of any other contributors to + * the code use may not be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include "superfasthash.h" +#include +#include + +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +uint32_t hash(const char * data, int len) +{ + return hash_inc(data, len, (uint32_t) len); +} + +uint32_t hash_inc(const char * data, int len, uint32_t hash) +{ + uint32_t tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + diff --git a/hat-trie/src/superfasthash.h b/hat-trie/src/superfasthash.h new file mode 100644 index 0000000..552430d --- /dev/null +++ b/hat-trie/src/superfasthash.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010, Paul Hsieh + * + * All rights reserved. Redistribution and use in source and binary forms, + * with or without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither my name, Paul Hsieh, nor the names of any other contributors to + * the code use may not be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + + +#ifndef SUPERFASTHASH_H +#define SUPERFASTHASH_H + +#include + +uint32_t hash(const char * data, int len); +uint32_t hash_inc(const char * data, int len, uint32_t hash); + +#endif diff --git a/hat-trie/test/Makefile.am b/hat-trie/test/Makefile.am new file mode 100644 index 0000000..ede6e1e --- /dev/null +++ b/hat-trie/test/Makefile.am @@ -0,0 +1,11 @@ + +TESTS = check_ahtable check_hattrie +check_PROGRAMS = check_ahtable check_hattrie + +check_ahtable_SOURCES = check_ahtable.c str_map.c +check_ahtable_LDADD = $(top_builddir)/src/libhat-trie.la +check_ahtable_CPPFLAGS = -I$(top_builddir)/src + +check_hattrie_SOURCES = check_hattrie.c str_map.c +check_hattrie_LDADD = $(top_builddir)/src/libhat-trie.la +check_hattrie_CPPFLAGS = -I$(top_builddir)/src diff --git a/hat-trie/test/check_ahtable.c b/hat-trie/test/check_ahtable.c new file mode 100644 index 0000000..a549c14 --- /dev/null +++ b/hat-trie/test/check_ahtable.c @@ -0,0 +1,154 @@ + +#include +#include +#include + +#include "str_map.h" +#include "../src/ahtable.h" + +/* Simple random string generation. */ +void randstr(char* x, size_t len) +{ + x[len] = '\0'; + while (len > 0) { + x[--len] = '\x20' + (rand() % ('\x7e' - '\x20' + 1)); + } +} + + +const size_t n = 2000000; // how many unique strings +const size_t m_low = 50; // minimum length of each string +const size_t m_high = 500; // maximum length of each string +const size_t k = 2000000; // number of insertions +char** xs; + +ahtable_t* T; +str_map* M; + + +void setup() +{ + fprintf(stderr, "generating %zu keys ... 
", n); + xs = malloc(n * sizeof(char*)); + size_t i; + size_t m; + for (i = 0; i < n; ++i) { + m = m_low + rand() % (m_high - m_low); + xs[i] = malloc(m + 1); + randstr(xs[i], m); + } + + T = ahtable_create(); + M = str_map_create(); + fprintf(stderr, "done.\n"); +} + + +void teardown() +{ + ahtable_free(T); + str_map_destroy(M); + + size_t i; + for (i = 0; i < n; ++i) { + free(xs[i]); + } + free(xs); +} + + +void test_ahtable_insert() +{ + fprintf(stderr, "inserting %zu keys ... \n", k); + + size_t i, j; + value_t* u; + value_t v; + + for (j = 0; j < k; ++j) { + i = rand() % n; + + + v = 1 + str_map_get(M, xs[i], strlen(xs[i])); + str_map_set(M, xs[i], strlen(xs[i]), v); + + + u = ahtable_get(T, xs[i], strlen(xs[i])); + *u += 1; + + + if (*u != v) { + fprintf(stderr, "[error] tally mismatch (reported: %lu, correct: %lu)\n", + *u, v); + } + } + + fprintf(stderr, "done.\n"); +} + + + +void test_ahtable_iteration() +{ + fprintf(stderr, "iterating through %zu keys ... \n", k); + + ahtable_iter_t* i = ahtable_iter_begin(T); + + size_t count = 0; + value_t* u; + value_t v; + + size_t len; + const char* key; + + while (!ahtable_iter_finished(i)) { + ++count; + + key = ahtable_iter_key(i, &len); + u = ahtable_iter_val(i); + + v = str_map_get(M, key, len); + + if (*u != v) { + if (v == 0) { + fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *u, v); + } + else { + fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *u, v); + } + } + + // this way we will see an error if the same key is iterated through + // twice + str_map_set(M, key, len, 0); + + ahtable_iter_next(i); + } + + if (count != M->m) { + fprintf(stderr, "[error] iterated through %zu element, expected %zu\n", + count, M->m); + } + + ahtable_iter_free(i); + + fprintf(stderr, "done.\n"); +} + + + + +int main() +{ + setup(); + test_ahtable_insert(); + test_ahtable_iteration(); + teardown(); + + return 0; +} + + + + + diff --git a/hat-trie/test/check_hattrie.c b/hat-trie/test/check_hattrie.c 
new file mode 100644 index 0000000..4485b43 --- /dev/null +++ b/hat-trie/test/check_hattrie.c @@ -0,0 +1,154 @@ + +#include +#include +#include + +#include "str_map.h" +#include "../src/hat-trie.h" + +/* Simple random string generation. */ +void randstr(char* x, size_t len) +{ + x[len] = '\0'; + while (len > 0) { + x[--len] = '\x20' + (rand() % ('\x7e' - '\x20' + 1)); + } +} + + +const size_t n = 100000; // how many unique strings +const size_t m_low = 50; // minimum length of each string +const size_t m_high = 500; // maximum length of each string +const size_t k = 200000; // number of insertions +char** xs; + +hattrie_t* T; +str_map* M; + + +void setup() +{ + fprintf(stderr, "generating %zu keys ... ", n); + xs = malloc(n * sizeof(char*)); + size_t i; + size_t m; + for (i = 0; i < n; ++i) { + m = m_low + rand() % (m_high - m_low); + xs[i] = malloc(m + 1); + randstr(xs[i], m); + } + + T = hattrie_create(); + M = str_map_create(); + fprintf(stderr, "done.\n"); +} + + +void teardown() +{ + hattrie_free(T); + str_map_destroy(M); + + size_t i; + for (i = 0; i < n; ++i) { + free(xs[i]); + } + free(xs); +} + + +void test_hattrie_insert() +{ + fprintf(stderr, "inserting %zu keys ... \n", k); + + size_t i, j; + value_t* u; + value_t v; + + for (j = 0; j < k; ++j) { + i = rand() % n; + + + v = 1 + str_map_get(M, xs[i], strlen(xs[i])); + str_map_set(M, xs[i], strlen(xs[i]), v); + + + u = hattrie_get(T, xs[i], strlen(xs[i])); + *u += 1; + + + if (*u != v) { + fprintf(stderr, "[error] tally mismatch (reported: %lu, correct: %lu)\n", + *u, v); + } + } + + fprintf(stderr, "done.\n"); +} + + + +void test_hattrie_iteration() +{ + fprintf(stderr, "iterating through %zu keys ... 
\n", k); + + hattrie_iter_t* i = hattrie_iter_begin(T); + + size_t count = 0; + value_t* u; + value_t v; + + size_t len; + const char* key; + + while (!hattrie_iter_finished(i)) { + ++count; + + key = hattrie_iter_key(i, &len); + u = hattrie_iter_val(i); + + v = str_map_get(M, key, len); + + if (*u != v) { + if (v == 0) { + fprintf(stderr, "[error] incorrect iteration (%lu, %lu)\n", *u, v); + } + else { + fprintf(stderr, "[error] incorrect iteration tally (%lu, %lu)\n", *u, v); + } + } + + // this way we will see an error if the same key is iterated through + // twice + str_map_set(M, key, len, 0); + + hattrie_iter_next(i); + } + + if (count != M->m) { + fprintf(stderr, "[error] iterated through %zu element, expected %zu\n", + count, M->m); + } + + hattrie_iter_free(i); + + fprintf(stderr, "done.\n"); +} + + + + +int main() +{ + setup(); + test_hattrie_insert(); + test_hattrie_iteration(); + teardown(); + + return 0; +} + + + + + diff --git a/hat-trie/test/str_map.c b/hat-trie/test/str_map.c new file mode 100644 index 0000000..82be054 --- /dev/null +++ b/hat-trie/test/str_map.c @@ -0,0 +1,216 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. 
Jones + * + */ + + +#include "str_map.h" +#include "misc.h" +#include +#include +#include + + +static const size_t INITIAL_TABLE_SIZE = 16; +static const double MAX_LOAD = 0.77; + + +/* + * Paul Hsieh's SuperFastHash + * http://www.azillionmonkeys.com/qed/hash.html + */ + + +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +static uint32_t hash(const char * data, size_t len) { + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + + + +static void rehash(str_map* T, size_t new_n); +static void clear(str_map*); + + + +str_map* str_map_create() +{ + str_map* T = malloc_or_die(sizeof(str_map)); + T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(str_map_pair*)); + memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(str_map_pair*)); + T->n = INITIAL_TABLE_SIZE; + T->m = 0; + T->max_m = T->n * MAX_LOAD; + + return T; +} + + +void str_map_destroy(str_map* T) +{ + if (T != NULL) { + 
clear(T); + free(T->A); + free(T); + } +} + + + +void clear(str_map* T) +{ + str_map_pair* u; + size_t i; + for (i = 0; i < T->n; i++) { + while (T->A[i]) { + u = T->A[i]->next; + free(T->A[i]->key); + free(T->A[i]); + T->A[i] = u; + } + } + + T->m = 0; +} + + +static void insert_without_copy(str_map* T, str_map_pair* V) +{ + uint32_t h = hash(V->key, V->keylen) % T->n; + V->next = T->A[h]; + T->A[h] = V; + T->m++; +} + + + +static void rehash(str_map* T, size_t new_n) +{ + str_map U; + U.n = new_n; + U.m = 0; + U.max_m = U.n * MAX_LOAD; + U.A = malloc_or_die(U.n * sizeof(str_map_pair*)); + memset(U.A, 0, U.n * sizeof(str_map_pair*)); + + str_map_pair *j, *k; + size_t i; + for (i = 0; i < T->n; i++) { + j = T->A[i]; + while (j) { + k = j->next; + insert_without_copy(&U, j); + j = k; + } + T->A[i] = NULL; + } + + free(T->A); + T->A = U.A; + T->n = U.n; + T->max_m = U.max_m; +} + + +void str_map_set(str_map* T, const char* key, size_t keylen, value_t value) +{ + if (T->m >= T->max_m) rehash(T, T->n * 2); + + uint32_t h = hash(key, keylen) % T->n; + + str_map_pair* u = T->A[h]; + + while (u) { + if (u->keylen == keylen && memcmp(u->key, key, keylen) == 0) { + u->value = value; + return; + } + + u = u->next; + } + + u = malloc_or_die(sizeof(str_map_pair)); + u->key = malloc_or_die(keylen); + memcpy(u->key, key, keylen); + u->keylen = keylen; + u->value = value; + + u->next = T->A[h]; + T->A[h] = u; + + T->m++; +} + + +value_t str_map_get(const str_map* T, const char* key, size_t keylen) +{ + uint32_t h = hash(key, keylen) % T->n; + + str_map_pair* u = T->A[h]; + + while (u) { + if (u->keylen == keylen && memcmp(u->key, key, keylen) == 0) { + return u->value; + } + + u = u->next; + } + + return 0; +} + + diff --git a/hat-trie/test/str_map.h b/hat-trie/test/str_map.h new file mode 100644 index 0000000..a452560 --- /dev/null +++ b/hat-trie/test/str_map.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2011 by Daniel C. 
Jones + * + * hash : + * A quick and simple hash table mapping strings to things. + * + */ + + +#ifndef ISOLATOR_STR_MAP_H +#define ISOLATOR_STR_MAP_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include + +#include "common.h" + + +typedef struct str_map_pair_ +{ + char* key; + size_t keylen; + value_t value; + + struct str_map_pair_* next; +} str_map_pair; + + +typedef struct +{ + str_map_pair** A; /* table proper */ + size_t n; /* table size */ + size_t m; /* hashed items */ + size_t max_m; /* max hashed items before rehash */ +} str_map; + + + +str_map* str_map_create(void); +void str_map_destroy(str_map*); +void str_map_set(str_map*, const char* key, size_t keylen, value_t value); +value_t str_map_get(const str_map*, const char* key, size_t keylen); + + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..e725fdd --- /dev/null +++ b/setup.py @@ -0,0 +1,49 @@ +#! /usr/bin/env python +import os +from distutils.core import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext + +HATTRIE_DIR = 'hat-trie/src' +HATTRIE_FILE_NAMES = ['ahtable.c', 'hat-trie.c', 'misc.c', 'superfasthash.c'] +HATTRIE_FILES = [os.path.join(HATTRIE_DIR, name) for name in HATTRIE_FILE_NAMES] + +setup( + name="hat-trie", + version="0.1", + description="State-of-art Trie for Python", + long_description = open('README.rst').read() + open('CHANGES.rst').read(), + author='Mikhail Korobov', + author_email='kmike84@gmail.com', + url='https://github.com/kmike/hat-trie/', + #packages = ['hat_trie'], + cmdclass = {'build_ext': build_ext}, + + ext_modules = [ + Extension( + "hat_trie", + ['src/hat_trie.pyx', 'src/chat_trie.pxd'] + HATTRIE_FILES, + #['src/datrie.c', 'src/cdatrie.c', 'src/stdio_ext.c'] + HATTRIE_FILES, + include_dirs=['hat-trie/src'], + ) + ], + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'Intended Audience :: 
Science/Research', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Cython', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: Implementation :: CPython', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'Topic :: Text Processing :: Linguistic', + ], +) diff --git a/src/chat_trie.pxd b/src/chat_trie.pxd new file mode 100644 index 0000000..6bf7940 --- /dev/null +++ b/src/chat_trie.pxd @@ -0,0 +1,33 @@ +cdef extern from "../hat-trie/src/hat-trie.h": + + cdef int value_t + cdef int size_t + + ctypedef struct hattrie_t: + pass + + hattrie_t* hattrie_create (void) # Create an empty hat-trie. + void hattrie_free (hattrie_t*) # Free all memory used by a trie. + hattrie_t* hattrie_dup (const hattrie_t*) # Duplicate an existing trie. + void hattrie_clear (hattrie_t*) # Remove all entries. + + + # Find the given key in the trie, inserting it if it does not exist, and + # returning a pointer to its value. + # This pointer is not guaranteed to be valid after additional calls to + # hattrie_get, hattrie_del, hattrie_clear, or other functions that + # modify the trie. + value_t* hattrie_get (hattrie_t*, char* key, size_t len) + + # Find a given key in the table, returning a NULL pointer if it does not exist. 
+ value_t* hattrie_tryget (hattrie_t*, char* key, size_t len) + + ctypedef struct hattrie_iter_t: + pass + + hattrie_iter_t* hattrie_iter_begin (const hattrie_t*) + void hattrie_iter_next (hattrie_iter_t*) + bool hattrie_iter_finished (hattrie_iter_t*) + void hattrie_iter_free (hattrie_iter_t*) + char* hattrie_iter_key (hattrie_iter_t*, size_t* len) + value_t* hattrie_iter_val (hattrie_iter_t*) diff --git a/src/hat_trie.pyx b/src/hat_trie.pyx new file mode 100644 index 0000000..e69de29 diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..ca4e616 --- /dev/null +++ b/tox.ini @@ -0,0 +1,16 @@ +[tox] +envlist = py26,py27,py32,py33 + +[testenv] +deps = + cython + pytest + # psutil +commands= + py.test [] +# python bench/speed.py + +[testenv:pypy] +deps = + git+https://github.com/cython/cython.git@8102e17127206b51d7a419a3e9673ad795672a7d#egg=cython + pytest diff --git a/update_c.sh b/update_c.sh new file mode 100755 index 0000000..91b0e1a --- /dev/null +++ b/update_c.sh @@ -0,0 +1,2 @@ +#!/bin/sh +cython src/hat_trie.pyx src/chat_trie.pxd -a \ No newline at end of file