> 文章列表 > Twitter-高性能hash_map

Twitter-高性能hash_map

Twitter-高性能hash_map

学习twitter的高性能散列表源码:

个人认为Twitter散列表的优点
1、使用C宏定义实现C++泛型的思想;
2、散列函数冲突小;
3、使用bitmap思想,标志位占用空间小;
4、自动扩展容量,判断扩容的条件;

个人认为Twitter散列表的缺点
1、值的类型都用指针指向,当值类型是简单整型时,指针本身会额外占用空间;(虽然值类型是基本整型的情况较少,但可以通过宏把值类型直接选定为基本类型)

1、使用C的#define写类似于C++泛型的代码;

/*
 * KHASH_INIT2: instantiate one hash table.
 *
 * Expands to the struct typedef produced by __KHASH_TYPE followed by the
 * function definitions produced by __KHASH_IMPL.
 *
 *   name          symbol used to build the generated identifiers
 *   SCOPE         storage/linkage specifiers placed before every function
 *   khkey_t       key type
 *   khval_t       value type
 *   kh_is_map     non-zero for a map (values stored), 0 for a set
 *   __hash_func   hash function or macro applied to a key
 *   __hash_equal  equality function or macro applied to two keys
 */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    __KHASH_TYPE(name, khkey_t, khval_t) \
    __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)

name:散列表的名称可以自定义;

khval_t:散列表的值类型可以自定义,在散列表里是指针;

khkey_t:散列表的key类型不可以随意自定义,但是可以选择32位整数、64位整数的key(文末的便捷宏里还提供了字符串 const char* 的key);

__hash_func:散列函数可以选择,提供很多种散列函数;

/* --- BEGIN OF HASH FUNCTIONS --- */

/*! @function
  @abstract     Integer hash function
  @param  key   The integer [khint32_t]
  @return       The hash value [khint_t]
 */
#define kh_int_hash_func(key) ((khint32_t)(key))
/*! @function
  @abstract     Integer comparison function
 */
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     64-bit integer hash function
  @param  key   The integer [khint64_t]
  @return       The hash value [khint_t]
  @discussion   XORs the key with itself shifted right by 33 and left by 11,
                then truncates the result to khint32_t.
 */
#define kh_int64_hash_func(key) ((khint32_t)(((key) >> 33) ^ (key) ^ ((key) << 11)))
/*! @function
  @abstract     64-bit integer comparison function
 */
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     const char* hash function
  @param  s     Pointer to a null terminated string
  @return       The hash value
  @discussion   Starts from the first character and, for every following
                character c, replaces the hash h with (h << 5) - h + c.
                An empty string hashes to 0.
 */
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
    khint_t hash = (khint_t)*s;

    /* Empty string: nothing more to fold in. */
    if (hash == 0)
        return hash;

    /* Fold in the remaining characters up to (not including) the terminator. */
    while (*++s)
        hash = (hash << 5) - hash + (khint_t)*s;

    return hash;
}
/*! @function
  @abstract     Another interface to const char* hash function
  @param  key   Pointer to a null terminated string [const char*]
  @return       The hash value [khint_t]
 */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function@abstract     Const char* comparison function*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)static kh_inline khint_t __ac_Wang_hash(khint_t key)
{key += ~(key << 15);key ^=  (key >> 10);key +=  (key << 3);key ^=  (key >> 6);key += ~(key << 11);key ^=  (key >> 16);return key;
}
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)

__hash_equal:散列表里用来判断两个key是否相等的函数;它与key的类型配套(32位整数、64位整数,以及字符串对应的kh_str_hash_equal),所以一般不需要另外选择;

下面是散列表类型结构体的定义:

__KHASH_TYPE(name, khkey_t, khval_t)
/*
 * __KHASH_TYPE: declare the table struct `kh_<name>_t` (tag `kh_<name>_s`).
 *
 * Fields:
 *   n_buckets    number of buckets
 *   size         number of stored elements
 *   n_occupied   number of buckets that are no longer empty
 *   upper_bound  n_occupied threshold at which an insert triggers a resize
 *   flags        per-bucket status bits, packed into khint32_t words
 *   keys         array of keys, indexed by bucket
 *   vals         array of values, indexed by bucket
 */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
    typedef struct kh_##name##_s { \
        khint_t n_buckets, size, n_occupied, upper_bound; \
        khint32_t *flags; \
        khkey_t *keys; \
        khval_t *vals; \
    } kh_##name##_t;

下面是根据name和khval_t等参数生成的函数定义:

__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/*
 * __KHASH_IMPL: define the functions of one hash-table instance.
 *
 * Generated functions (each prefixed with SCOPE):
 *   kh_init_<name>()         allocate one table struct with kcalloc
 *   kh_destroy_<name>(h)     free keys, flags, vals and the struct; a null h
 *                            is accepted
 *   kh_clear_<name>(h)       fill the flag words with 0xaa and reset size and
 *                            n_occupied to 0, keeping the arrays
 *   kh_get_<name>(h, key)    return the bucket index holding key, or
 *                            h->n_buckets when it is absent (0 for a table
 *                            without buckets)
 *   kh_resize_<name>(h, n)   rehash into a table of n buckets (n is passed
 *                            through kroundup32 and raised to at least 4);
 *                            returns 0 on success or when n is too small for
 *                            the current size, -1 when an allocation fails
 *   kh_put_<name>(h, key, ret)
 *                            insert key and return its bucket index; *ret is
 *                            -1 on resize failure, 0 if the key was already
 *                            present, 1 if an empty bucket was used, 2 if a
 *                            deleted bucket was reused
 *   kh_del_<name>(h, x)      mark bucket x deleted and decrement size, if x
 *                            is a live bucket
 *
 * Probing in get/put/resize advances by an increasing step:
 * i = (i + (++step)) & mask.
 *
 * NOTE(review): in the shrink branch of kh_resize the krealloc results are
 * assigned straight to h->keys / h->vals without a failure check, unlike the
 * expand branch - confirm this is acceptable for the allocator in use.
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    SCOPE kh_##name##_t *kh_init_##name(void) { \
        return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
    } \
    SCOPE void kh_destroy_##name(kh_##name##_t *h) \
    { \
        if (h) { \
            kfree((void *)h->keys); kfree(h->flags); \
            kfree((void *)h->vals); \
            kfree(h); \
        } \
    } \
    SCOPE void kh_clear_##name(kh_##name##_t *h) \
    { \
        if (h && h->flags) { \
            memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
            h->size = h->n_occupied = 0; \
        } \
    } \
    SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
    { \
        if (h->n_buckets) { \
            khint_t k, i, last, mask, step = 0; \
            mask = h->n_buckets - 1; \
            k = __hash_func(key); i = k & mask; \
            last = i; \
            while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                i = (i + (++step)) & mask; \
                if (i == last) return h->n_buckets; \
            } \
            return __ac_iseither(h->flags, i)? h->n_buckets : i; \
        } else return 0; \
    } \
    SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
    { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
        khint32_t *new_flags = 0; \
        khint_t j = 1; \
        { \
            kroundup32(new_n_buckets); \
            if (new_n_buckets < 4) new_n_buckets = 4; \
            if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
            else { /* hash table size to be changed (shrink or expand); rehash */ \
                new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
                if (!new_flags) return -1; \
                memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
                if (h->n_buckets < new_n_buckets) { /* expand */ \
                    khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
                    if (!new_keys) { kfree(new_flags); return -1; } \
                    h->keys = new_keys; \
                    if (kh_is_map) { \
                        khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
                        if (!new_vals) { kfree(new_flags); return -1; } \
                        h->vals = new_vals; \
                    } \
                } /* otherwise shrink */ \
            } \
        } \
        if (j) { /* rehashing is needed */ \
            for (j = 0; j != h->n_buckets; ++j) { \
                if (__ac_iseither(h->flags, j) == 0) { \
                    khkey_t key = h->keys[j]; \
                    khval_t val; \
                    khint_t new_mask; \
                    new_mask = new_n_buckets - 1; \
                    if (kh_is_map) val = h->vals[j]; \
                    __ac_set_isdel_true(h->flags, j); \
                    while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
                        khint_t k, i, step = 0; \
                        k = __hash_func(key); \
                        i = k & new_mask; \
                        while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
                        __ac_set_isempty_false(new_flags, i); \
                        if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
                            { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
                            if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
                            __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
                        } else { /* write the element and jump out of the loop */ \
                            h->keys[i] = key; \
                            if (kh_is_map) h->vals[i] = val; \
                            break; \
                        } \
                    } \
                } \
            } \
            if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
                h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
                if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
            } \
            kfree(h->flags); /* free the working space */ \
            h->flags = new_flags; \
            h->n_buckets = new_n_buckets; \
            h->n_occupied = h->size; \
            h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
        } \
        return 0; \
    } \
    SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
    { \
        khint_t x; \
        if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
            if (h->n_buckets > (h->size<<1)) { \
                if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
                    *ret = -1; return h->n_buckets; \
                } \
            } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
                *ret = -1; return h->n_buckets; \
            } \
        } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
        { \
            khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
            x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
            if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
            else { \
                last = i; \
                while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                    if (__ac_isdel(h->flags, i)) site = i; \
                    i = (i + (++step)) & mask; \
                    if (i == last) { x = site; break; } \
                } \
                if (x == h->n_buckets) { \
                    if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
                    else x = i; \
                } \
            } \
        } \
        if (__ac_isempty(h->flags, x)) { /* not present at all */ \
            h->keys[x] = key; \
            __ac_set_isboth_false(h->flags, x); \
            ++h->size; ++h->n_occupied; \
            *ret = 1; \
        } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
            h->keys[x] = key; \
            __ac_set_isboth_false(h->flags, x); \
            ++h->size; \
            *ret = 2; \
        } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
        return x; \
    } \
    SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
    { \
        if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
            __ac_set_isdel_true(h->flags, x); \
            --h->size; \
        } \
    }

2、散列表支持动态扩展桶的数目;

在put函数里面,会判断,如果满足

h->n_occupied >= h->upper_bound

则通过resize进行扩展:

khint_t x;
/* The table has reached its occupancy limit: rebuild it before inserting. */
if (h->n_occupied >= h->upper_bound) {
    /*
     * When the buckets outnumber twice the stored elements, ask for
     * n_buckets - 1 (clear "deleted" elements); otherwise ask for
     * n_buckets + 1 (expand the hash table).
     */
    khint_t wanted = (h->n_buckets > (h->size << 1))
                         ? h->n_buckets - 1
                         : h->n_buckets + 1;

    if (kh_resize_##name(h, wanted) < 0) {
        /* Resize failed: report -1 and hand back the end iterator. */
        *ret = -1;
        return h->n_buckets;
    }
}

3、使用bitmap作为桶有没有被放置的标记;

khint32_t *flags;// bit flags of the hash table, kept as a bitmap

加上一系列对于bitmap的快速位操作,进行判断空、某个位置是否存在key等

/*
 * Per-bucket status bits. Every bucket i owns two bits inside
 * flag[i >> 4], at bit offset (i & 0xf) << 1:
 *   bit 1 (value 2) - bucket is empty
 *   bit 0 (value 1) - bucket is deleted
 * The test macros return the masked bits (non-zero means set), not 0/1.
 * Both arguments are parenthesized so expressions such as `base + 1`
 * can be passed as the bucket index.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))

4、定义一些工具函数

// Function declarations
/*
 * __KHASH_PROTOTYPES: declare (extern) the functions of the table `name`:
 * init, destroy, clear, get, resize, put and del.
 * The khval_t parameter is accepted but not used by any declaration.
 */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
    extern kh_##name##_t *kh_init_##name(void); \
    extern void kh_destroy_##name(kh_##name##_t *h); \
    extern void kh_clear_##name(kh_##name##_t *h); \
    extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
    extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
    extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
    extern void kh_del_##name(kh_##name##_t *h, khint_t x);
// Convenience macros: the generated functions are called through the table name and the operation name.

/*!
  @abstract Type of the hash table.
  @param  name  Name of the hash table [symbol]
 */
#define khash_t(name) kh_##name##_t

/*! @function
  @abstract     Initiate a hash table.
  @param  name  Name of the hash table [symbol]
  @return       Pointer to the hash table [khash_t(name)*]
 */
#define kh_init(name) kh_init_##name()

/*! @function
  @abstract     Destroy a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_destroy(name, h) kh_destroy_##name(h)

/*! @function
  @abstract     Reset a hash table without deallocating memory.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_clear(name, h) kh_clear_##name(h)

/*! @function
  @abstract     Resize a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  s     New size [khint_t]
 */
#define kh_resize(name, h, s) kh_resize_##name(h, s)

/*! @function
  @abstract     Insert a key to the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @param  r     Extra return code: -1 if the operation failed;
                0 if the key is present in the hash table;
                1 if the bucket is empty (never used); 2 if the element in
                the bucket has been deleted [int*]
  @return       Iterator to the inserted element [khint_t]
 */
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)

/*! @function
  @abstract     Retrieve a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
 */
#define kh_key(h, x) ((h)->keys[(x)])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[(x)])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[(x)])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
 */
#define kh_begin(h) ((khint_t)(0))

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*! @function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
    for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
        if (!kh_exist(h,__i)) continue; \
        (kvar) = kh_key(h,__i); \
        (vvar) = kh_val(h,__i); \
        code; \
    } }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
    for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
        if (!kh_exist(h,__i)) continue; \
        (vvar) = kh_val(h,__i); \
        code; \
    } }

/* More convenient interfaces */
/* NOTE(review): these expand to KHASH_INIT, which is not defined in this excerpt. */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
    KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
    KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
    KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
    KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/* Key type used by the string-keyed tables below. */
typedef const char *kh_cstr_t;

/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name) \
    KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
    KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)

使用散列表

/* Instantiate a map named `32` with 32-bit integer keys and char values. */
KHASH_MAP_INIT_INT(32, char)

/*
 * Usage walk-through: create a table, insert key 5 with value 10, look up
 * an absent key (10) and a present key (5), delete key 5, overwrite the
 * value of every remaining live bucket, then destroy the table.
 *
 * Returns early if the table cannot be allocated or the insert fails, so
 * that no value is written through an invalid iterator.
 */
void test_khash_map(void)
{
    int ret, is_missing;
    khiter_t k;
    khash_t(32) *h = kh_init(32);

    if (!h)
        return; /* allocation failed: nothing to clean up */

    k = kh_put(32, h, 5, &ret);
    if (ret < 0) { /* insert failed: k is kh_end(h), not a usable bucket */
        kh_destroy(32, h);
        return;
    }
    kh_value(h, k) = 10;

    k = kh_get(32, h, 10);
    is_missing = (k == kh_end(h)); /* key 10 was never inserted */
    (void)is_missing;

    k = kh_get(32, h, 5);
    kh_del(32, h, k);

    for (k = kh_begin(h); k != kh_end(h); ++k) {
        if (kh_exist(h, k))
            kh_value(h, k) = 1;
    }

    kh_destroy(32, h);
}

世纪图书馆