在 C 语言中实现字典的快速方法

在用 C 编写程序时,我错过的一件事情是字典数据结构。在 C 语言中实现一个最方便的方法是什么?我不是在寻找性能,而是从头开始编码它的容易程度。我也不希望它是一般的——像 char*int这样的就可以了。但我确实希望它能够存储任意数量的物品。

这更像是一种练习。我知道有第三方图书馆可供使用。但是想一想,它们并不存在。在这种情况下,您能够实现满足上述要求的字典的最快方法是什么。

272649 次浏览

最快的的方法是使用一个已经存在的实现,比如 Uthash

而且,如果您想自己编写 真的代码,那么可以检查和重用 uthash的算法。它是 BSD 授权的,所以除了传达版权声明的要求之外,你可以无限制地使用它。

根据散列创建一个简单的散列函数和一些结构的链表,分配要在其中插入值的链表。也可以使用散列来检索它。

不久前我做了一个简单的实现:

...
#define K 16 // chaining coefficient


struct dict
{
char *name; /* name of key */
int val;   /*  value */
struct dict *next; /* link field */
};


typedef struct dict dict;
dict *table[K];
int initialized = 0;




void  putval ( char *,int);


void init_dict()
{
initialized = 1;
int i;
for(i=0;iname = (char *) malloc (strlen(key_name)+1);
ptr->val = sval;
strcpy (ptr->name,key_name);




ptr->next = (struct dict *)table[hsh];
table[hsh] = ptr;


}




int getval ( char *key_name )
{
int hsh = hash(key_name);
dict *ptr;
for (ptr = table[hsh]; ptr != (dict *) 0;
ptr = (dict *)ptr->next)
if (strcmp (ptr->name,key_name) == 0)
return ptr->val;
return -1;
}

C 程序设计语言的6.6节提供了一个简单的 dictionary (hashtable)数据结构。我不认为一个有用的字典实现会比这更简单。为了方便起见,我在这里复制了代码。

struct nlist { /* table entry: */
struct nlist *next; /* next entry in chain */
char *name; /* defined name */
char *defn; /* replacement text */
};


#define HASHSIZE 101
static struct nlist *hashtab[HASHSIZE]; /* pointer table */


/* hash: form hash value for string s */
unsigned hash(char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31 * hashval;
return hashval % HASHSIZE;
}


/* lookup: look for s in hashtab */
struct nlist *lookup(char *s)
{
struct nlist *np;
for (np = hashtab[hash(s)]; np != NULL; np = np->next)
if (strcmp(s, np->name) == 0)
return np; /* found */
return NULL; /* not found */
}


char *strdup(char *);
/* install: put (name, defn) in hashtab */
struct nlist *install(char *name, char *defn)
{
struct nlist *np;
unsigned hashval;
if ((np = lookup(name)) == NULL) { /* not found */
np = (struct nlist *) malloc(sizeof(*np));
if (np == NULL || (np->name = strdup(name)) == NULL)
return NULL;
hashval = hash(name);
np->next = hashtab[hashval];
hashtab[hashval] = np;
} else /* already there */
free((void *) np->defn); /*free previous defn */
if ((np->defn = strdup(defn)) == NULL)
return NULL;
return np;
}


char *strdup(char *s) /* make a duplicate of s */
{
char *p;
p = (char *) malloc(strlen(s)+1); /* +1 for ’\0’ */
if (p != NULL)
strcpy(p, s);
return p;
}

请注意,如果两个字符串的哈希值发生冲突,可能会导致 O(n)查找时间。可以通过增加 HASHSIZE的值来减少碰撞的可能性。有关数据结构的完整讨论,请参阅本书。

哈希是关键。我认为应该使用查找表和散列键。您可以在线找到许多散列函数。

最快的方法是使用二叉树,最坏的情况也只有 O (logn)。

为了便于实现,很难通过数组进行天真的搜索。除了一些错误检查之外,这是一个完整的实现(未经测试)。

typedef struct dict_entry_s {
const char *key;
int value;
} dict_entry_s;


typedef struct dict_s {
int len;
int cap;
dict_entry_s *entry;
} dict_s, *dict_t;


int dict_find_index(dict_t dict, const char *key) {
for (int i = 0; i < dict->len; i++) {
if (!strcmp(dict->entry[i], key)) {
return i;
}
}
return -1;
}


int dict_find(dict_t dict, const char *key, int def) {
int idx = dict_find_index(dict, key);
return idx == -1 ? def : dict->entry[idx].value;
}


void dict_add(dict_t dict, const char *key, int value) {
int idx = dict_find_index(dict, key);
if (idx != -1) {
dict->entry[idx].value = value;
return;
}
if (dict->len == dict->cap) {
dict->cap *= 2;
dict->entry = realloc(dict->entry, dict->cap * sizeof(dict_entry_s));
}
dict->entry[dict->len].key = strdup(key);
dict->entry[dict->len].value = value;
dict->len++;
}


dict_t dict_new(void) {
dict_s proto = {0, 10, malloc(10 * sizeof(dict_entry_s))};
dict_t d = malloc(sizeof(dict_s));
*d = proto;
return d;
}


void dict_free(dict_t dict) {
for (int i = 0; i < dict->len; i++) {
free(dict->entry[i].key);
}
free(dict->entry);
free(dict);
}

这里有一个快速实现,我用它从一个字符串中获得一个“ Matrix”(struct)。 你可以拥有一个更大的数组,并在运行过程中改变它的值:

typedef struct  { int** lines; int isDefined; }mat;
mat matA, matB, matC, matD, matE, matF;


/* an auxilary struct to be used in a dictionary */
typedef struct  { char* str; mat *matrix; }stringToMat;


/* creating a 'dictionary' for a mat name to its mat. lower case only! */
stringToMat matCases [] =
{
{ "mat_a", &matA },
{ "mat_b", &matB },
{ "mat_c", &matC },
{ "mat_d", &matD },
{ "mat_e", &matE },
{ "mat_f", &matF },
};


mat* getMat(char * str)
{
stringToMat* pCase;
mat * selected = NULL;
if (str != NULL)
{
/* runing on the dictionary to get the mat selected */
for(pCase = matCases; pCase != matCases + sizeof(matCases) / sizeof(matCases[0]); pCase++ )
{
if(!strcmp( pCase->str, str))
selected = (pCase->matrix);
}
if (selected == NULL)
printf("%s is not a valid matrix name\n", str);
}
else
printf("expected matrix name, got NULL\n");
return selected;
}

GLib 和 gnulib

如果您没有更具体的需求,那么这些可能是您最好的选择,因为它们广泛可用、便携并且可能有效。

参见: 有没有具有通用数据结构的开源 C 库?

我很惊讶没有人提到 Hsearch/hcreate的库集,虽然在 windows 上不可用,但是是由 POSIX 强制的,因此在 Linux/GNU 系统中可用。

该链接有一个简单而完整的基本示例,很好地解释了它的用法。

它甚至有线程安全的变种,易于使用和非常性能。

另外,你可以使用 Google CityHash:

#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>


#include <byteswap.h>


#include "city.h"


void swap(uint32* a, uint32* b) {
int temp = *a;
*a = *b;
*b = temp;
}


#define PERMUTE3(a, b, c) swap(&a, &b); swap(&a, &c);


// Magic numbers for 32-bit hashing.  Copied from Murmur3.
static const uint32 c1 = 0xcc9e2d51;
static const uint32 c2 = 0x1b873593;


static uint32 UNALIGNED_LOAD32(const char *p) {
uint32 result;
memcpy(&result, p, sizeof(result));
return result;
}


static uint32 Fetch32(const char *p) {
return UNALIGNED_LOAD32(p);
}


// A 32-bit to 32-bit integer hash copied from Murmur3.
static uint32 fmix(uint32 h)
{
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}


static uint32 Rotate32(uint32 val, int shift) {
// Avoid shifting by 32: doing so yields an undefined result.
return shift == 0 ? val : ((val >> shift) | (val << (32 - shift)));
}


static uint32 Mur(uint32 a, uint32 h) {
// Helper from Murmur3 for combining two 32-bit values.
a *= c1;
a = Rotate32(a, 17);
a *= c2;
h ^= a;
h = Rotate32(h, 19);
return h * 5 + 0xe6546b64;
}


static uint32 Hash32Len13to24(const char *s, size_t len) {
uint32 a = Fetch32(s - 4 + (len >> 1));
uint32 b = Fetch32(s + 4);
uint32 c = Fetch32(s + len - 8);
uint32 d = Fetch32(s + (len >> 1));
uint32 e = Fetch32(s);
uint32 f = Fetch32(s + len - 4);
uint32 h = len;


return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h)))))));
}


static uint32 Hash32Len0to4(const char *s, size_t len) {
uint32 b = 0;
uint32 c = 9;
for (size_t i = 0; i < len; i++) {
signed char v = s[i];
b = b * c1 + v;
c ^= b;
}
return fmix(Mur(b, Mur(len, c)));
}


static uint32 Hash32Len5to12(const char *s, size_t len) {
uint32 a = len, b = len * 5, c = 9, d = b;
a += Fetch32(s);
b += Fetch32(s + len - 4);
c += Fetch32(s + ((len >> 1) & 4));
return fmix(Mur(c, Mur(b, Mur(a, d))));
}


uint32 CityHash32(const char *s, size_t len) {
if (len <= 24) {
return len <= 12 ?
(len <= 4 ? Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) :
Hash32Len13to24(s, len);
}


// len > 24
uint32 h = len, g = c1 * len, f = g;
uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2;
uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2;
uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2;
uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2;
uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2;
h ^= a0;
h = Rotate32(h, 19);
h = h * 5 + 0xe6546b64;
h ^= a2;
h = Rotate32(h, 19);
h = h * 5 + 0xe6546b64;
g ^= a1;
g = Rotate32(g, 19);
g = g * 5 + 0xe6546b64;
g ^= a3;
g = Rotate32(g, 19);
g = g * 5 + 0xe6546b64;
f += a4;
f = Rotate32(f, 19);
f = f * 5 + 0xe6546b64;
size_t iters = (len - 1) / 20;
do {
uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2;
uint32 a1 = Fetch32(s + 4);
uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2;
uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2;
uint32 a4 = Fetch32(s + 16);
h ^= a0;
h = Rotate32(h, 18);
h = h * 5 + 0xe6546b64;
f += a1;
f = Rotate32(f, 19);
f = f * c1;
g += a2;
g = Rotate32(g, 18);
g = g * 5 + 0xe6546b64;
h ^= a3 + a1;
h = Rotate32(h, 19);
h = h * 5 + 0xe6546b64;
g ^= a4;
g = bswap_32(g) * 5;
h += a4 * 5;
h = bswap_32(h);
f += a0;
PERMUTE3(f, h, g);
s += 20;
} while (--iters != 0);
g = Rotate32(g, 11) * c1;
g = Rotate32(g, 17) * c1;
f = Rotate32(f, 11) * c1;
f = Rotate32(f, 17) * c1;
h = Rotate32(h + g, 19);
h = h * 5 + 0xe6546b64;
h = Rotate32(h, 17) * c1;
h = Rotate32(h + f, 19);
h = h * 5 + 0xe6546b64;
h = Rotate32(h, 17) * c1;
return h;
}

Redis 是一个很好的参考,因为 redis 是一个键值系统,它的哈希表就是一个很好的例子