Twitter-高性能hash_map
学习twitter的高性能散列表源码:
个人认为Twitter散列表的优点:
1、使用C宏定义实现C++泛型的思想;
2、散列函数冲突小;
3、使用bitmap思想,标志位占用空间小;
4、自动扩展容量,判断扩容的条件;
个人认为Twitter散列表的缺点:
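1、值通过指针(khval_t *vals)访问,个人认为当值类型是简单整型时有指针占用空间的缺点;(虽然值类型是基本整型的情况较少,但可以通过宏参数把值类型选定为基本类型。注:从 __KHASH_TYPE 的定义看,vals 是一个 khval_t 数组,值直接存放在数组元素中,只有当 khval_t 本身被指定为指针类型时才会多出一层指针开销)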
1、使用C的#define写类似于C++泛型的代码;
/*
 * Instantiate a complete hash table: the struct type followed by all of its
 * functions.
 *
 *   name          suffix used in every generated identifier (kh_<name>_t, kh_put_<name>, ...)
 *   SCOPE         tokens placed in front of each generated function definition
 *   khkey_t       key type
 *   khval_t       value type
 *   kh_is_map     non-zero to store values (map), zero for a key-only set
 *   __hash_func   function or macro mapping a key to a hash value
 *   __hash_equal  function or macro comparing two keys for equality
 *
 * Each continuation line must end in a single backslash immediately followed
 * by the newline; otherwise the preprocessor sees a stray '\' token.
 */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	__KHASH_TYPE(name, khkey_t, khval_t) \
	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
name:散列表的名称可以自定义;
khval_t:散列表的值类型,可以自定义;在散列表结构体里以 khval_t *vals(指向值数组的指针)的形式保存;
khkey_t:散列表的key类型;便捷宏提供了32位整数、64位整数和字符串(const char*)三种key;
__hash_func:散列函数可以选择,提供很多种散列函数;
/* --- BEGIN OF HASH FUNCTIONS --- */

/*! @function
  @abstract     Integer hash function
  @param  key   The integer [khint32_t]
  @return       The hash value [khint_t]
  @discussion   The key itself, cast to khint32_t, is used as the hash.
 */
#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
  @abstract     Integer comparison function
  @discussion   Compares the two keys with ==.
 */
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     64-bit integer hash function
  @param  key   The integer [khint64_t]
  @return       The hash value [khint_t]
  @discussion   XORs key>>33, key and key<<11 together, then truncates the
                result to khint32_t.
 */
#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
  @abstract     64-bit integer comparison function
  @discussion   Compares the two keys with ==.
 */
#define kh_int64_hash_equal(a, b) ((a) == (b))
/*! @function
  @abstract     const char* hash function
  @param  s     Pointer to a null terminated string
  @return       The hash value
  @discussion   The first character seeds the hash; every following character
                is folded in as hash = (hash << 5) - hash + c. An empty string
                hashes to 0.
 */
static kh_inline khint_t __ac_X31_hash_string(const char *s)
{
	const char *cursor;
	khint_t hash = (khint_t)*s;

	/* Empty string: nothing beyond the terminator to fold in. */
	if (hash == 0)
		return hash;

	for (cursor = s + 1; *cursor != '\0'; ++cursor)
		hash = (hash << 5) - hash + (khint_t)*cursor;

	return hash;
}
/*! @function
  @abstract     Another interface to const char* hash function
  @param  key   Pointer to a null terminated string [const char*]
  @return       The hash value [khint_t]
  @discussion   Forwards directly to __ac_X31_hash_string().
 */
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
/*! @function@abstract Const char* comparison function*/
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)static kh_inline khint_t __ac_Wang_hash(khint_t key)
{key += ~(key << 15);key ^= (key >> 10);key += (key << 3);key ^= (key >> 6);key += ~(key << 11);key ^= (key >> 16);return key;
}
#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
__hash_equal:判断两个key是否相等的比较函数;与散列函数配套提供:整数key用kh_int_hash_equal/kh_int64_hash_equal(直接用==比较),字符串key用kh_str_hash_equal(用strcmp比较);
下面的宏定义散列表的类型结构体:
__KHASH_TYPE(name, khkey_t, khval_t)
/*
 * Declare the table struct kh_<name>_s and its typedef kh_<name>_t.
 *
 *   n_buckets    number of buckets
 *   size         number of live elements
 *   n_occupied   number of buckets that are not flagged empty
 *   upper_bound  n_occupied threshold at which kh_put_<name>() resizes
 *   flags        packed per-bucket status bits
 *   keys         array of khkey_t, indexed by bucket
 *   vals         array of khval_t, indexed by bucket
 *
 * Each continuation line must end in a single backslash immediately followed
 * by the newline.
 */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
	typedef struct kh_##name##_s { \
		khint_t n_buckets, size, n_occupied, upper_bound; \
		khint32_t *flags; \
		khkey_t *keys; \
		khval_t *vals; \
	} kh_##name##_t;
下面的宏按name、khkey_t和khval_t生成散列表的各个操作函数:
__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
/*
 * Generate the function definitions for one hash table instantiation.
 *
 *   kh_init_<name>()          allocate a zeroed table; returns the kcalloc() result
 *   kh_destroy_<name>(h)      free keys, flags, vals and the table; h may be null
 *   kh_clear_<name>(h)        mark every bucket empty without freeing memory
 *   kh_get_<name>(h, key)     bucket index of key, or h->n_buckets if absent
 *   kh_resize_<name>(h, n)    rehash into n buckets (n is passed through
 *                             kroundup32 and raised to at least 4);
 *                             returns 0 on success, -1 on allocation failure
 *   kh_put_<name>(h, key, r)  insert key; *r is -1 on failure, 0 if the key was
 *                             already present, 1 if an empty bucket was used,
 *                             2 if a deleted bucket was reused
 *   kh_del_<name>(h, x)       flag bucket x deleted and decrement size
 *
 * Probing: on a collision the step grows by one each time and the index is
 * masked with n_buckets - 1.
 *
 * Flags are filled with 0xaa so that every 2-bit pair reads "empty, not deleted".
 *
 * Each continuation line must end in a single backslash immediately followed
 * by the newline; comments inside the body must use the block-comment form.
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
	SCOPE kh_##name##_t *kh_init_##name(void) { \
		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
	} \
	SCOPE void kh_destroy_##name(kh_##name##_t *h) \
	{ \
		if (h) { \
			kfree((void *)h->keys); kfree(h->flags); \
			kfree((void *)h->vals); \
			kfree(h); \
		} \
	} \
	SCOPE void kh_clear_##name(kh_##name##_t *h) \
	{ \
		if (h && h->flags) { \
			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
			h->size = h->n_occupied = 0; \
		} \
	} \
	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
	{ \
		if (h->n_buckets) { \
			khint_t k, i, last, mask, step = 0; \
			mask = h->n_buckets - 1; \
			k = __hash_func(key); i = k & mask; \
			last = i; \
			/* Skip deleted buckets and mismatching keys; stop at the first empty bucket. */ \
			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
				i = (i + (++step)) & mask; \
				if (i == last) return h->n_buckets; \
			} \
			return __ac_iseither(h->flags, i)? h->n_buckets : i; \
		} else return 0; /* empty table: 0 == n_buckets, i.e. "not found" */ \
	} \
	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
		khint32_t *new_flags = 0; \
		khint_t j = 1; \
		{ \
			kroundup32(new_n_buckets); \
			if (new_n_buckets < 4) new_n_buckets = 4; \
			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
			else { /* hash table size to be changed (shrink or expand); rehash */ \
				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (!new_flags) return -1; \
				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
				if (h->n_buckets < new_n_buckets) { /* expand */ \
					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
					if (!new_keys) { kfree(new_flags); return -1; } \
					h->keys = new_keys; \
					if (kh_is_map) { \
						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
						if (!new_vals) { kfree(new_flags); return -1; } \
						h->vals = new_vals; \
					} \
				} /* otherwise shrink */ \
			} \
		} \
		if (j) { /* rehashing is needed */ \
			for (j = 0; j != h->n_buckets; ++j) { \
				if (__ac_iseither(h->flags, j) == 0) { \
					khkey_t key = h->keys[j]; \
					khval_t val; \
					khint_t new_mask; \
					new_mask = new_n_buckets - 1; \
					if (kh_is_map) val = h->vals[j]; \
					__ac_set_isdel_true(h->flags, j); \
					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
						khint_t k, i, step = 0; \
						k = __hash_func(key); \
						i = k & new_mask; \
						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
						__ac_set_isempty_false(new_flags, i); \
						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
						} else { /* write the element and jump out of the loop */ \
							h->keys[i] = key; \
							if (kh_is_map) h->vals[i] = val; \
							break; \
						} \
					} \
				} \
			} \
			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
				/* Assuming krealloc behaves like realloc: on failure the old, larger */ \
				/* block is still valid, so keep it instead of storing a null pointer. */ \
				khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
				if (shrunk_keys) h->keys = shrunk_keys; \
				if (kh_is_map) { \
					khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
					if (shrunk_vals) h->vals = shrunk_vals; \
				} \
			} \
			kfree(h->flags); /* free the working space */ \
			h->flags = new_flags; \
			h->n_buckets = new_n_buckets; \
			h->n_occupied = h->size; \
			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
		} \
		return 0; \
	} \
	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
	{ \
		khint_t x; \
		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
			if (h->n_buckets > (h->size<<1)) { \
				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
					*ret = -1; return h->n_buckets; \
				} \
			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
				*ret = -1; return h->n_buckets; \
			} \
		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
		{ \
			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
			else { \
				last = i; \
				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
					if (__ac_isdel(h->flags, i)) site = i; /* remember a reusable deleted bucket */ \
					i = (i + (++step)) & mask; \
					if (i == last) { x = site; break; } \
				} \
				if (x == h->n_buckets) { \
					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
					else x = i; \
				} \
			} \
		} \
		if (__ac_isempty(h->flags, x)) { /* not present at all */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; ++h->n_occupied; \
			*ret = 1; \
		} else if (__ac_isdel(h->flags, x)) { /* deleted */ \
			h->keys[x] = key; \
			__ac_set_isboth_false(h->flags, x); \
			++h->size; \
			*ret = 2; \
		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
		return x; \
	} \
	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
	{ \
		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
			__ac_set_isdel_true(h->flags, x); \
			--h->size; \
		} \
	}
2、散列表支持动态扩展桶的数目;
在put函数里面,会判断,如果满足
h->n_occupied >= h->upper_bound
则通过resize进行扩展:
/* Excerpt from the body of kh_put_##name inside __KHASH_IMPL. */
khint_t x;
/* n_occupied counts every bucket not flagged empty (live or deleted). */
if (h->n_occupied >= h->upper_bound) { /* update the hash table */
	/* More than twice as many buckets as live elements: rehash rather than grow. */
	if (h->n_buckets > (h->size<<1)) {
		if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */
			*ret = -1; return h->n_buckets;
		}
	} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */
		*ret = -1; return h->n_buckets;
	}
}
3、使用bitmap作为桶有没有被放置的标记;
khint32_t *flags;//散列表里的bit标志,bitmap
加上一系列对于bitmap的快速位操作,进行判断空、某个位置是否存在key等
/*
 * Per-bucket status bits, packed 16 buckets to a 32-bit flag word.
 * Bucket i lives in word i>>4 at bit offset (i&0xf)<<1:
 *   bit 1 (value 2) set -> bucket is empty
 *   bit 0 (value 1) set -> bucket is deleted
 *
 *   __ac_isempty / __ac_isdel / __ac_iseither  test the empty bit, the deleted
 *                                              bit, or both (non-zero if set)
 *   __ac_set_is*_false                         clear the named bit(s)
 *   __ac_set_isdel_true                        set the deleted bit
 *
 * Both macro arguments are parenthesized so that expression arguments such
 * as `a | b` or `base + off` address the intended bucket.
 */
#define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2)
#define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1)
#define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1))
4、定义一些工具函数
/* Function declarations */
/*
 * Emit extern declarations for the functions of table `name`: init, destroy,
 * clear, get, resize, put and del. khval_t is accepted for symmetry with the
 * other instantiation macros but does not appear in any signature.
 *
 * Each continuation line must end in a single backslash immediately followed
 * by the newline.
 */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
	extern kh_##name##_t *kh_init_##name(void); \
	extern void kh_destroy_##name(kh_##name##_t *h); \
	extern void kh_clear_##name(kh_##name##_t *h); \
	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
/* Convenience macros: call the generated functions through the table name and a descriptive verb. */

/*!
  @abstract Type of the hash table.
  @param  name  Name of the hash table [symbol]
 */
#define khash_t(name) kh_##name##_t

/*! @function
  @abstract     Initiate a hash table.
  @param  name  Name of the hash table [symbol]
  @return       Pointer to the hash table [khash_t(name)*]
 */
#define kh_init(name) kh_init_##name()

/*! @function
  @abstract     Destroy a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_destroy(name, h) kh_destroy_##name(h)

/*! @function
  @abstract     Reset a hash table without deallocating memory.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
 */
#define kh_clear(name, h) kh_clear_##name(h)

/*! @function
  @abstract     Resize a hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  s     New size [khint_t]
 */
#define kh_resize(name, h, s) kh_resize_##name(h, s)

/*! @function
  @abstract     Insert a key to the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @param  r     Extra return code: -1 if the operation failed;
                0 if the key is present in the hash table;
                1 if the bucket is empty (never used); 2 if the element in
                the bucket has been deleted [int*]
  @return       Iterator to the inserted element [khint_t]
 */
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)

/*! @function
  @abstract     Retrieve a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
 */
#define kh_key(h, x) ((h)->keys[x])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[x])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[x])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
 */
#define kh_begin(h) (khint_t)(0)

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*! @function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
 */
/*
 * Implementation note: expands to a brace-enclosed block that declares its own
 * index __i and pastes `code` inside the loop body, so buckets that fail
 * kh_exist() are skipped before any assignment. Each continuation line must
 * end in a single backslash immediately followed by the newline.
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(kvar) = kh_key(h,__i); \
		(vvar) = kh_val(h,__i); \
		code; \
	} }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(vvar) = kh_val(h,__i); \
		code; \
	} }

/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/*
 * Key type used by the string set/map instantiations below. It must be a
 * declaration of its own: on the same line as the preceding #define it would
 * become part of that macro's replacement text.
 */
typedef const char *kh_cstr_t;

/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name) \
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
使用散列表
KHASH_MAP_INIT_INT(32, char)

/*
 * Usage walk-through for an int -> char map named "32": create the table,
 * insert key 5 with value 10, look up an absent key (10), delete key 5, set
 * the value of every remaining live bucket to 1, then destroy the table.
 *
 * Allocation failures are handled explicitly: kh_init() may return a null
 * pointer, and a failed kh_put() reports ret == -1 and returns the end
 * iterator, which must not be used as an index.
 */
void test_khash_map(void)
{
	int ret, is_missing;
	khiter_t k;
	khash_t(32) *h = kh_init(32);

	if (!h)
		return;

	k = kh_put(32, h, 5, &ret);
	if (ret < 0) {
		kh_destroy(32, h);
		return;
	}
	kh_value(h, k) = 10;

	k = kh_get(32, h, 10);
	is_missing = (k == kh_end(h));
	(void)is_missing; /* kept for illustration only */

	k = kh_get(32, h, 5);
	kh_del(32, h, k);

	for (k = kh_begin(h); k != kh_end(h); ++k) {
		if (kh_exist(h, k))
			kh_value(h, k) = 1;
	}

	kh_destroy(32, h);
}