在用 C 编写程序时,我错过的一件事情是字典数据结构。在 C 语言中实现一个最方便的方法是什么?我不是在寻找性能,而是从头开始编码它的容易程度。我也不希望它是一般的——像 char*→ int这样的就可以了。但我确实希望它能够存储任意数量的物品。
char*
int
这更像是一种练习。我知道有第三方图书馆可供使用。但是想一想,它们并不存在。在这种情况下,您能够实现满足上述要求的字典的最快方法是什么。
最快的的方法是使用一个已经存在的实现,比如 Uthash。
而且,如果您想自己编写 真的代码,那么可以检查和重用 uthash的算法。它是 BSD 授权的,所以除了传达版权声明的要求之外,你可以无限制地使用它。
uthash
散列表是简单“ Dictionary”的传统实现。如果您不关心速度或大小,只需 寻找它。有许多免费可用的实现。
这是我看到的第一个图片——乍一看,我觉得还不错。(这是相当基本的。如果您真的希望它拥有无限数量的数据,那么您需要在表内存增长时添加一些逻辑来“ realloc”表内存。)
根据散列创建一个简单的散列函数和一些结构的链表,分配要在其中插入值的链表。也可以使用散列来检索它。
不久前我做了一个简单的实现:
... #define K 16 // chaining coefficient struct dict { char *name; /* name of key */ int val; /* value */ struct dict *next; /* link field */ }; typedef struct dict dict; dict *table[K]; int initialized = 0; void putval ( char *,int); void init_dict() { initialized = 1; int i; for(i=0;iname = (char *) malloc (strlen(key_name)+1); ptr->val = sval; strcpy (ptr->name,key_name); ptr->next = (struct dict *)table[hsh]; table[hsh] = ptr; } int getval ( char *key_name ) { int hsh = hash(key_name); dict *ptr; for (ptr = table[hsh]; ptr != (dict *) 0; ptr = (dict *)ptr->next) if (strcmp (ptr->name,key_name) == 0) return ptr->val; return -1; }
C 程序设计语言的6.6节提供了一个简单的 dictionary (hashtable)数据结构。我不认为一个有用的字典实现会比这更简单。为了方便起见,我在这里复制了代码。
struct nlist { /* table entry: */ struct nlist *next; /* next entry in chain */ char *name; /* defined name */ char *defn; /* replacement text */ }; #define HASHSIZE 101 static struct nlist *hashtab[HASHSIZE]; /* pointer table */ /* hash: form hash value for string s */ unsigned hash(char *s) { unsigned hashval; for (hashval = 0; *s != '\0'; s++) hashval = *s + 31 * hashval; return hashval % HASHSIZE; } /* lookup: look for s in hashtab */ struct nlist *lookup(char *s) { struct nlist *np; for (np = hashtab[hash(s)]; np != NULL; np = np->next) if (strcmp(s, np->name) == 0) return np; /* found */ return NULL; /* not found */ } char *strdup(char *); /* install: put (name, defn) in hashtab */ struct nlist *install(char *name, char *defn) { struct nlist *np; unsigned hashval; if ((np = lookup(name)) == NULL) { /* not found */ np = (struct nlist *) malloc(sizeof(*np)); if (np == NULL || (np->name = strdup(name)) == NULL) return NULL; hashval = hash(name); np->next = hashtab[hashval]; hashtab[hashval] = np; } else /* already there */ free((void *) np->defn); /*free previous defn */ if ((np->defn = strdup(defn)) == NULL) return NULL; return np; } char *strdup(char *s) /* make a duplicate of s */ { char *p; p = (char *) malloc(strlen(s)+1); /* +1 for ’\0’ */ if (p != NULL) strcpy(p, s); return p; }
请注意,如果两个字符串的哈希值发生冲突,可能会导致 O(n)查找时间。可以通过增加 HASHSIZE的值来减少碰撞的可能性。有关数据结构的完整讨论,请参阅本书。
O(n)
HASHSIZE
哈希是关键。我认为应该使用查找表和散列键。您可以在线找到许多散列函数。
最快的方法是使用二叉树,最坏的情况也只有 O (logn)。
为了便于实现,很难通过数组进行天真的搜索。除了一些错误检查之外,这是一个完整的实现(未经测试)。
typedef struct dict_entry_s { const char *key; int value; } dict_entry_s; typedef struct dict_s { int len; int cap; dict_entry_s *entry; } dict_s, *dict_t; int dict_find_index(dict_t dict, const char *key) { for (int i = 0; i < dict->len; i++) { if (!strcmp(dict->entry[i], key)) { return i; } } return -1; } int dict_find(dict_t dict, const char *key, int def) { int idx = dict_find_index(dict, key); return idx == -1 ? def : dict->entry[idx].value; } void dict_add(dict_t dict, const char *key, int value) { int idx = dict_find_index(dict, key); if (idx != -1) { dict->entry[idx].value = value; return; } if (dict->len == dict->cap) { dict->cap *= 2; dict->entry = realloc(dict->entry, dict->cap * sizeof(dict_entry_s)); } dict->entry[dict->len].key = strdup(key); dict->entry[dict->len].value = value; dict->len++; } dict_t dict_new(void) { dict_s proto = {0, 10, malloc(10 * sizeof(dict_entry_s))}; dict_t d = malloc(sizeof(dict_s)); *d = proto; return d; } void dict_free(dict_t dict) { for (int i = 0; i < dict->len; i++) { free(dict->entry[i].key); } free(dict->entry); free(dict); }
这里有一个快速实现,我用它从一个字符串中获得一个“ Matrix”(struct)。 你可以拥有一个更大的数组,并在运行过程中改变它的值:
typedef struct { int** lines; int isDefined; }mat; mat matA, matB, matC, matD, matE, matF; /* an auxilary struct to be used in a dictionary */ typedef struct { char* str; mat *matrix; }stringToMat; /* creating a 'dictionary' for a mat name to its mat. lower case only! */ stringToMat matCases [] = { { "mat_a", &matA }, { "mat_b", &matB }, { "mat_c", &matC }, { "mat_d", &matD }, { "mat_e", &matE }, { "mat_f", &matF }, }; mat* getMat(char * str) { stringToMat* pCase; mat * selected = NULL; if (str != NULL) { /* runing on the dictionary to get the mat selected */ for(pCase = matCases; pCase != matCases + sizeof(matCases) / sizeof(matCases[0]); pCase++ ) { if(!strcmp( pCase->str, str)) selected = (pCase->matrix); } if (selected == NULL) printf("%s is not a valid matrix name\n", str); } else printf("expected matrix name, got NULL\n"); return selected; }
GLib 和 gnulib
如果您没有更具体的需求,那么这些可能是您最好的选择,因为它们广泛可用、便携并且可能有效。
GNOME 项目编写的 GLib: https://developer.gnome.org/glib/。在以下位置记录了几个容器: https://developer.gnome.org/glib/stable/glib-data-types.html,包括“散列表”和“平衡二叉树”。许可证: LGPL
Gnulib: GNU 项目的 https://www.gnu.org/software/gnulib/。您需要将源代码复制粘贴到代码中。在 https://www.gnu.org/software/gnulib/MODULES.html#ansic_ext_container中记录了几个容器,包括“ rbtree-list”、“ linkedhash-list”和“ rbtreehash-list”。GPL 许可证。
参见: 有没有具有通用数据结构的开源 C 库?
我很惊讶没有人提到 Hsearch/hcreate的库集,虽然在 windows 上不可用,但是是由 POSIX 强制的,因此在 Linux/GNU 系统中可用。
该链接有一个简单而完整的基本示例,很好地解释了它的用法。
它甚至有线程安全的变种,易于使用和非常性能。
另外,你可以使用 Google CityHash:
#include <stdlib.h> #include <stddef.h> #include <stdio.h> #include <string.h> #include <byteswap.h> #include "city.h" void swap(uint32* a, uint32* b) { int temp = *a; *a = *b; *b = temp; } #define PERMUTE3(a, b, c) swap(&a, &b); swap(&a, &c); // Magic numbers for 32-bit hashing. Copied from Murmur3. static const uint32 c1 = 0xcc9e2d51; static const uint32 c2 = 0x1b873593; static uint32 UNALIGNED_LOAD32(const char *p) { uint32 result; memcpy(&result, p, sizeof(result)); return result; } static uint32 Fetch32(const char *p) { return UNALIGNED_LOAD32(p); } // A 32-bit to 32-bit integer hash copied from Murmur3. static uint32 fmix(uint32 h) { h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; h *= 0xc2b2ae35; h ^= h >> 16; return h; } static uint32 Rotate32(uint32 val, int shift) { // Avoid shifting by 32: doing so yields an undefined result. return shift == 0 ? val : ((val >> shift) | (val << (32 - shift))); } static uint32 Mur(uint32 a, uint32 h) { // Helper from Murmur3 for combining two 32-bit values. a *= c1; a = Rotate32(a, 17); a *= c2; h ^= a; h = Rotate32(h, 19); return h * 5 + 0xe6546b64; } static uint32 Hash32Len13to24(const char *s, size_t len) { uint32 a = Fetch32(s - 4 + (len >> 1)); uint32 b = Fetch32(s + 4); uint32 c = Fetch32(s + len - 8); uint32 d = Fetch32(s + (len >> 1)); uint32 e = Fetch32(s); uint32 f = Fetch32(s + len - 4); uint32 h = len; return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h))))))); } static uint32 Hash32Len0to4(const char *s, size_t len) { uint32 b = 0; uint32 c = 9; for (size_t i = 0; i < len; i++) { signed char v = s[i]; b = b * c1 + v; c ^= b; } return fmix(Mur(b, Mur(len, c))); } static uint32 Hash32Len5to12(const char *s, size_t len) { uint32 a = len, b = len * 5, c = 9, d = b; a += Fetch32(s); b += Fetch32(s + len - 4); c += Fetch32(s + ((len >> 1) & 4)); return fmix(Mur(c, Mur(b, Mur(a, d)))); } uint32 CityHash32(const char *s, size_t len) { if (len <= 24) { return len <= 12 ? (len <= 4 ? Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) : Hash32Len13to24(s, len); } // len > 24 uint32 h = len, g = c1 * len, f = g; uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2; uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2; uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2; uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2; uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2; h ^= a0; h = Rotate32(h, 19); h = h * 5 + 0xe6546b64; h ^= a2; h = Rotate32(h, 19); h = h * 5 + 0xe6546b64; g ^= a1; g = Rotate32(g, 19); g = g * 5 + 0xe6546b64; g ^= a3; g = Rotate32(g, 19); g = g * 5 + 0xe6546b64; f += a4; f = Rotate32(f, 19); f = f * 5 + 0xe6546b64; size_t iters = (len - 1) / 20; do { uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2; uint32 a1 = Fetch32(s + 4); uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2; uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2; uint32 a4 = Fetch32(s + 16); h ^= a0; h = Rotate32(h, 18); h = h * 5 + 0xe6546b64; f += a1; f = Rotate32(f, 19); f = f * c1; g += a2; g = Rotate32(g, 18); g = g * 5 + 0xe6546b64; h ^= a3 + a1; h = Rotate32(h, 19); h = h * 5 + 0xe6546b64; g ^= a4; g = bswap_32(g) * 5; h += a4 * 5; h = bswap_32(h); f += a0; PERMUTE3(f, h, g); s += 20; } while (--iters != 0); g = Rotate32(g, 11) * c1; g = Rotate32(g, 17) * c1; f = Rotate32(f, 11) * c1; f = Rotate32(f, 17) * c1; h = Rotate32(h + g, 19); h = h * 5 + 0xe6546b64; h = Rotate32(h, 17) * c1; h = Rotate32(h + f, 19); h = h * 5 + 0xe6546b64; h = Rotate32(h, 17) * c1; return h; }
Redis 是一个很好的参考,因为 redis 是一个键值系统,它的哈希表就是一个很好的例子