Jabberd 2 哈希表设计(xhash源码)

Image

根据Andrew Binstock, “Hashing Rehashed,” Dr. Dobb’s Journal, April 1996. 这位论文的算法生成一个哈希值, 以该值来索引其在一个固定长度的哈希表中的位置.因为是定长并且采用链地址法处理散列冲突, 所以它的哈希表用数组的形式表示(zen[…]), 每个数组元素分别代表一个哈希桶的head节点(整个桶是由_xhn节点组成的双向链表构成).

主要的两个结构体:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
typedef struct xhn_struct
{
struct xhn_struct *next;
struct xhn_struct *prev;
const char *key;
int keylen;
void *val;
} *xhn, _xhn;

typedef struct xht_struct
{
pool_t p;
int prime;
int dirty;
int count;
struct xhn_struct *zen;
struct xhn_struct *free_list; // list of zaped elements to be reused.
int iter_bucket;
xhn iter_node;
int *stat;
} *xht, _xht;

几个主要的函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
static int _xhasher(const char *s, int len) // 生成hash值
{
/* ELF hash uses unsigned chars and unsigned arithmetic for portability */
const unsigned char *name = (const unsigned char *)s;
unsigned long h = 0, g;
int i;

for(i=0;i<len;i++)
{ /* do some fancy bitwanking on the string */
h = (h << 4) + (unsigned long)(name[i]);
if ((g = (h & 0xF0000000UL))!=0)
h ^= (g >> 24);
h &= ~g;

}

return (int)h;
}

static xhn _xhash_node_new(xht h, int index);

这个函数返回在index指定的桶中要附加一个节点的确切位置.
如果位于zen数组的桶的首节点为空,则直接返回zen[index% h->prime];
否则从free_list中看有没有被删除过的可用节点.有则直接取出改点,否则从内存池里分配一个.
然后将该节点放在桶的首节点后面.代码注释上有误,说是桶的首节点.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
static xhn _xhash_node_new(xht h, int index)
{
xhn n;
int i = index % h->prime;

/* track total */
h->count++;

#ifdef XHASH_DEBUG
h->stat[i]++;
#endif

// if the zen[i] is empty, reuse it, else get a new one.
n = &h->zen[i];

if( n->key != NULL )
{
if( h->free_list )
{
n = h->free_list;
h->free_list = h->free_list->next;
}else
n = pmalloco(h->p, sizeof(_xhn));

//add it to the bucket list head.
n->prev = &h->zen[i];
n->next = h->zen[i].next;

if( n->next ) n->next->prev = n;
h->zen[i].next = n;
}

return n;
}

static xhn _xhash_node_get(xht h, const char *key, int len, int index);

返回一个完全吻合的哈希节点.

1
2
3
4
5
6
7
8
9
static xhn _xhash_node_get(xht h, const char *key, int len, int index)
{
xhn n;
int i = index % h->prime;
for(n = &h->zen[i]; n != NULL; n = n->next)
if(n->key != NULL && (n->keylen==len) && (strncmp(key, n->key, len) == 0))
return n;
return NULL;
}

xht xhash_new(int prime);

新建一个预设长度哈希表结构(_xht),主要的创建函数. 先创建内存池, 由该内存池管理整个哈希对象.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
xht xhash_new(int prime)
{
xht xnew;
pool_t p;

/* log_debug(ZONE,"creating new hash table of size %d",prime); */

/**
* NOTE:
* all xhash's memory should be allocated from the pool by using pmalloco()/pmallocx(),
* so that the xhash_free() can just call pool_free() simply.
*/

p = pool_heap(sizeof(_xhn)*prime + sizeof(_xht));
xnew = pmalloco(p, sizeof(_xht));
xnew->prime = prime;
xnew->p = p;
xnew->zen = pmalloco(p, sizeof(_xhn)*prime); /* array of xhn size of prime */

xnew->free_list = NULL;

xnew->iter_bucket = -1;
xnew->iter_node = NULL;

#ifdef XHASH_DEBUG
xnew->stat = pmalloco(p, sizeof(int)*prime );
#else
xnew->stat = NULL;
#endif

return xnew;
}

void xhash_put(xht h, const char *key, void *val);

void xhash_putx(xht h, const char *key, int len, void *val);

根据关键字key的hash值插入一个节点.如果key已存在则直接复写.

如果不存在则用上面的**_xhash_node_new**返回节点位置后写入.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
void xhash_putx(xht h, const char *key, int len, void *val)
{
int index;
xhn n;

if(h == NULL || key == NULL)
return;

index = _xhasher(key,len);

/* dirty the xht */
h->dirty++;

/* if existing key, replace it */
if((n = _xhash_node_get(h, key, len, index)) != NULL)
{
/* log_debug(ZONE,"replacing %s with new val %X",key,val); */

n->key = key;
n->keylen = len;
n->val = val;
return;
}

/* log_debug(ZONE,"saving %s val %X",key,val); */

/* new node */
n = _xhash_node_new(h, index);
n->key = key;
n->keylen = len;
n->val = val;
}

void xhash_put(xht h, const char *key, void *val)
{
if(h == NULL || key == NULL) return;
xhash_putx(h,key,strlen(key),val);
}

void xhash_zap_inner( xht h, xhn n, int index);

剔除一个节点. 如果节点位于zen数组的桶首节点或者是当前迭代位置节点则直接写NULL来代表删除, 否则加入free_list的头部, 桶首节点不能加入free_list.

看出在这点删除上是可以快速删除的, 而且, 第一次插入也不需要重新分配内存, 非常高效.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
void xhash_zap_inner( xht h, xhn n, int index)
{
int i = index % h->prime;

// if element:n is in bucket list and it's not the current iter
if( &h->zen[i] != n && h->iter_node != n )
{
if(n->prev) n->prev->next = n->next;
if(n->next) n->next->prev = n->prev;

// add it to the free_list head.
n->prev = NULL;
n->next = h->free_list;
h->free_list = n;
}

//empty the value.
n->key = NULL;
n->val = NULL;

/* dirty the xht and track the total */
h->dirty++;
h->count--;

#ifdef XHASH_DEBUG
h->stat[i]--;
#endif
}

void xhash_walk(xht h, xhash_walker w, void *arg);

遍历所有桶的所有节点,并调用回调函数.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
void xhash_walk(xht h, xhash_walker w, void *arg)
{
int i;
xhn n;

if(h == NULL || w == NULL)
return;

/* log_debug(ZONE,"walking %X",h); */

for(i = 0; i < h->prime; i++)
for(n = &h->zen[i]; n != NULL; n = n->next)
if(n->key != NULL && n->val != NULL)
(*w)(n->key, n->keylen, n->val, arg);
}

int xhash_iter_first(xht h);

把迭代位置设-1后,调用一次xhash_iter_next, 来让迭代位置reset到首位.

1
2
3
4
5
6
7
8
9
/** iteration */
int xhash_iter_first(xht h) {
if(h == NULL) return 0;

h->iter_bucket = -1;
h->iter_node = NULL;

return xhash_iter_next(h);
}

int xhash_iter_next(xht h);

这个函数相对比较复杂,让迭代器指向当前位置下一个不为空的节点.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
//首先在iter_node当前的桶里遍历,

     /* next in this bucket */
    h->iter_node = h->iter_node ? h->iter_node->next : NULL;
    while(h->iter_node != NULL) {
        xhn n = h->iter_node;
        if(n->key != NULL && n->val != NULL)
            return 1;
        h->iter_node = n->next;

// xhash_zap_inner里有说到如果是当前迭代器位置则直接置key为NULL,而不加入free_list
//这个next函数如果遇到这样的节点会将其加入free_list里,相当于清理桶的作用.
        if (n != &h->zen[h->iter_bucket]) {
            if(n->prev) n->prev->next = n->next;
            if(n->next) n->next->prev = n->prev;
            // add it to the free_list head.
            n->prev = NULL;
            n->next = h->free_list;
            h->free_list = n;
        }
    }
//如果都没有找到,从后续的桶里逐个迭代
    /* next bucket */
    for(h->iter_bucket++; h->iter_bucket < h->prime; h->iter_bucket++) {
        h->iter_node = &h->zen[h->iter_bucket];
        while(h->iter_node != NULL) {
            if(h->iter_node->key != NULL && h->iter_node->val != NULL)
                return 1;
            h->iter_node = h->iter_node->next;

            // 在别的桶里就没有清理
        }
    }

其它函数都简单易懂, 不过有个dirty变量, 目测是用来统计对hash表的访问(添加/删除/修改)次数.

使用范例:

1
2
3
// 使用时key和val都可以在pool中分配.
elem = pmalloco(xhash_pool(c->hash), sizeof(struct config_elem_st));
xhash_put(c->hash, pstrdup(xhash_pool(c->hash), "id"), elem);

总结:

优点:散列值计算时间复杂度O(n), 每个位置的首次散列速度快, 内存的复用都很简单高效.链地址法处理散列冲突, 适用于大多程序.

如果要说缺点:散列表长固定, 散列的性能取决于初始化设定的表长.如果有严格的性能要求可以考虑用动态hash表替换.