Jabberd 2 哈希表设计(xhash源码)

2014-07-25 C/C++ C/C++, jabberd2, xmpp 0 Comments Word Count: 1.8k(words) Read Count: 9(minutes)

根据Andrew Binstock, “Hashing Rehashed,” Dr. Dobb’s Journal, April 1996. 这位论文的算法生成一个哈希值, 以该值来索引其在一个固定长度的哈希表中的位置.因为是定长并且采用链地址法处理散列冲突, 所以它的哈希表用数组的形式表示(zen[…]), 每个数组元素分别代表一个哈希桶的head节点(整个桶是由_xhn节点组成的双向链表构成).

主要的两个结构体:

typedef struct xhn_struct
{
    struct xhn_struct *next;
    struct xhn_struct *prev;
    const char *key;
    int keylen;
    void *val;
} *xhn, _xhn;

typedef struct xht_struct
{
    pool_t p;
    int prime;
    int dirty;
    int count;
    struct xhn_struct *zen;
    struct xhn_struct *free_list; // list of zaped elements to be reused.
    int iter_bucket;
    xhn iter_node;
    int *stat;
} *xht, _xht;

几个主要的函数:

static int _xhasher(const char *s, int len) // 生成hash值
{
    /* ELF hash uses unsigned chars and unsigned arithmetic for portability */
    const unsigned char *name = (const unsigned char *)s;
    unsigned long h = 0, g;
    int i;

    for(i=0;i<len;i++)
    { /* do some fancy bitwanking on the string */
        h = (h << 4) + (unsigned long)(name[i]);
        if ((g = (h & 0xF0000000UL))!=0)
            h ^= (g >> 24);
        h &= ~g;

    }

    return (int)h;
}

static xhn _xhash_node_new(xht h, int index);

这个函数返回在index指定的桶中要附加一个节点的确切位置.
如果位于zen数组的桶的首节点为空,则直接返回zen[index% h->prime];
否则从free_list中看有没有被删除过的可用节点.有则直接取出改点,否则从内存池里分配一个.
然后将该节点放在桶的首节点后面.代码注释上有误,说是桶的首节点.

static xhn _xhash_node_new(xht h, int index)
{
    xhn n;
    int i = index % h->prime;

    /* track total */
    h->count++;

#ifdef XHASH_DEBUG
    h->stat[i]++;
#endif

    // if the zen[i] is empty, reuse it, else get a new one.
    n = &h->zen[i];

    if( n->key != NULL )
    {
        if( h->free_list )
        {
            n = h->free_list;
            h->free_list = h->free_list->next;
        }else
            n = pmalloco(h->p, sizeof(_xhn));

        //add it to the bucket list head.
        n->prev = &h->zen[i];
        n->next = h->zen[i].next;

        if( n->next ) n->next->prev = n;
        h->zen[i].next = n;
    }

    return n;
}

static xhn _xhash_node_get(xht h, const char *key, int len, int index);

返回一个完全吻合的哈希节点.

static xhn _xhash_node_get(xht h, const char *key, int len, int index)
{
    xhn n;
    int i = index % h->prime;
    for(n = &h->zen[i]; n != NULL; n = n->next)
        if(n->key != NULL && (n->keylen==len) && (strncmp(key, n->key, len) == 0))
            return n;
    return NULL;
}

xht xhash_new(int prime);

新建一个预设长度哈希表结构(_xht),主要的创建函数. 先创建内存池, 由该内存池管理整个哈希对象.

xht xhash_new(int prime)
{
    xht xnew;
    pool_t p;

/*    log_debug(ZONE,"creating new hash table of size %d",prime); */

    /**
     * NOTE:
     * all xhash's memory should be allocated from the pool by using pmalloco()/pmallocx(),
     * so that the xhash_free() can just call pool_free() simply.
     */

    p = pool_heap(sizeof(_xhn)*prime + sizeof(_xht));
    xnew = pmalloco(p, sizeof(_xht));
    xnew->prime = prime;
    xnew->p = p;
    xnew->zen = pmalloco(p, sizeof(_xhn)*prime); /* array of xhn size of prime */

    xnew->free_list = NULL;

    xnew->iter_bucket = -1;
    xnew->iter_node = NULL;

#ifdef XHASH_DEBUG
    xnew->stat = pmalloco(p, sizeof(int)*prime );
#else
    xnew->stat = NULL;
#endif

    return xnew;
}

void xhash_put(xht h, const char *key, void *val);
void xhash_putx(xht h, const char *key, int len, void *val);

根据关键字key的hash值插入一个节点.如果key已存在则直接复写.

如果不存在则用上面的**_xhash_node_new**返回节点位置后写入.

void xhash_putx(xht h, const char *key, int len, void *val)
{
    int index;
    xhn n;

    if(h == NULL || key == NULL)
        return;

    index = _xhasher(key,len);

    /* dirty the xht */
    h->dirty++;

    /* if existing key, replace it */
    if((n = _xhash_node_get(h, key, len, index)) != NULL)
    {
/*        log_debug(ZONE,"replacing %s with new val %X",key,val); */

        n->key = key;
        n->keylen = len;
        n->val = val;
        return;
    }

/*    log_debug(ZONE,"saving %s val %X",key,val); */

    /* new node */
    n = _xhash_node_new(h, index);
    n->key = key;
    n->keylen = len;
    n->val = val;
}

void xhash_put(xht h, const char *key, void *val)
{
    if(h == NULL || key == NULL) return;
    xhash_putx(h,key,strlen(key),val);
}

void xhash_zap_inner( xht h, xhn n, int index);

剔除一个节点. 如果节点位于zen数组的桶首节点或者是当前迭代位置节点则直接写NULL来代表删除, 否则加入free_list的头部, 桶首节点不能加入free_list.

看出在这点删除上是可以快速删除的, 而且, 第一次插入也不需要重新分配内存, 非常高效.

void xhash_zap_inner( xht h, xhn n, int index)
{
    int i = index % h->prime;

    // if element:n is in bucket list and it's not the current iter
    if( &h->zen[i] != n && h->iter_node != n )
    {
        if(n->prev) n->prev->next = n->next;
        if(n->next) n->next->prev = n->prev;

        // add it to the free_list head.
        n->prev = NULL;
        n->next = h->free_list;
        h->free_list = n;
    }

    //empty the value.
    n->key = NULL;
    n->val = NULL;

    /* dirty the xht and track the total */
    h->dirty++;
    h->count--;

#ifdef XHASH_DEBUG
    h->stat[i]--;
#endif
}

void xhash_walk(xht h, xhash_walker w, void *arg);

遍历所有桶的所有节点,并调用回调函数.

void xhash_walk(xht h, xhash_walker w, void *arg)
{
    int i;
    xhn n;

    if(h == NULL || w == NULL)
        return;

/*    log_debug(ZONE,"walking %X",h); */

    for(i = 0; i < h->prime; i++)
        for(n = &h->zen[i]; n != NULL; n = n->next)
            if(n->key != NULL && n->val != NULL)
                (*w)(n->key, n->keylen, n->val, arg);
}

int xhash_iter_first(xht h);

把迭代位置设-1后,调用一次xhash_iter_next, 来让迭代位置reset到首位.

/** iteration */
int xhash_iter_first(xht h) {
    if(h == NULL) return 0;

    h->iter_bucket = -1;
    h->iter_node = NULL;

    return xhash_iter_next(h);
}

int xhash_iter_next(xht h);

这个函数相对比较复杂,让迭代器指向当前位置下一个不为空的节点.

//首先在iter_node当前的桶里遍历,

     /* next in this bucket */
    h->iter_node = h->iter_node ? h->iter_node->next : NULL;
    while(h->iter_node != NULL) {
        xhn n = h->iter_node;
        if(n->key != NULL && n->val != NULL)
            return 1;
        h->iter_node = n->next;

// xhash_zap_inner里有说到如果是当前迭代器位置则直接置key为NULL,而不加入free_list
//这个next函数如果遇到这样的节点会将其加入free_list里,相当于清理桶的作用.
        if (n != &h->zen[h->iter_bucket]) {
            if(n->prev) n->prev->next = n->next;
            if(n->next) n->next->prev = n->prev;
            // add it to the free_list head.
            n->prev = NULL;
            n->next = h->free_list;
            h->free_list = n;
        }
    }
//如果都没有找到,从后续的桶里逐个迭代
    /* next bucket */
    for(h->iter_bucket++; h->iter_bucket < h->prime; h->iter_bucket++) {
        h->iter_node = &h->zen[h->iter_bucket];
        while(h->iter_node != NULL) {
            if(h->iter_node->key != NULL && h->iter_node->val != NULL)
                return 1;
            h->iter_node = h->iter_node->next;

            // 在别的桶里就没有清理
        }
    }

其它函数都简单易懂, 不过有个dirty变量, 目测是用来统计对hash表的访问(添加/删除/修改)次数.

使用范例:

1
2
3

// 使用时key和val都可以在pool中分配.
elem = pmalloco(xhash_pool(c->hash), sizeof(struct config_elem_st));
xhash_put(c->hash, pstrdup(xhash_pool(c->hash), "id"), elem);

总结:

优点:散列值计算时间复杂度O(n), 每个位置的首次散列速度快, 内存的复用都很简单高效.链地址法处理散列冲突, 适用于大多程序.

如果要说缺点:散列表长固定, 散列的性能取决于初始化设定的表长.如果有严格的性能要求可以考虑用动态hash表替换.

不插电码农

A coder, love sketch and drumming.