1 | /* |
2 | ******************************************************************************* |
3 | * Implementation of (2^1+,2) cuckoo hashing, where 2^1+ indicates that each |
4 | * hash bucket contains 2^n cells, for n >= 1, and 2 indicates that two hash |
5 | * functions are employed. The original cuckoo hashing algorithm was described |
6 | * in: |
7 | * |
8 | * Pagh, R., F.F. Rodler (2004) Cuckoo Hashing. Journal of Algorithms |
9 | * 51(2):122-144. |
10 | * |
11 | * Generalization of cuckoo hashing was discussed in: |
12 | * |
13 | * Erlingsson, U., M. Manasse, F. McSherry (2006) A cool and practical |
14 | * alternative to traditional hash tables. In Proceedings of the 7th |
15 | * Workshop on Distributed Data and Structures (WDAS'06), Santa Clara, CA, |
16 | * January 2006. |
17 | * |
18 | * This implementation uses precisely two hash functions because that is the |
19 | * fewest that can work, and supporting multiple hashes is an implementation |
20 | * burden. Here is a reproduction of Figure 1 from Erlingsson et al. (2006) |
21 | * that shows approximate expected maximum load factors for various |
22 | * configurations: |
23 | * |
24 | * | #cells/bucket | |
25 | * #hashes | 1 | 2 | 4 | 8 | |
26 | * --------+-------+-------+-------+-------+ |
27 | * 1 | 0.006 | 0.006 | 0.03 | 0.12 | |
28 | * 2 | 0.49 | 0.86 |>0.93< |>0.96< | |
29 | * 3 | 0.91 | 0.97 | 0.98 | 0.999 | |
30 | * 4 | 0.97 | 0.99 | 0.999 | | |
31 | * |
32 | * The number of cells per bucket is chosen such that a bucket fits in one cache |
33 | * line. So, on 32- and 64-bit systems, we use (8,2) and (4,2) cuckoo hashing, |
34 | * respectively. |
35 | * |
36 | ******************************************************************************/ |
37 | #include "jemalloc/internal/jemalloc_preamble.h" |
38 | |
39 | #include "jemalloc/internal/ckh.h" |
40 | |
41 | #include "jemalloc/internal/jemalloc_internal_includes.h" |
42 | |
43 | #include "jemalloc/internal/assert.h" |
44 | #include "jemalloc/internal/hash.h" |
45 | #include "jemalloc/internal/malloc_io.h" |
46 | #include "jemalloc/internal/prng.h" |
47 | #include "jemalloc/internal/util.h" |
48 | |
49 | /******************************************************************************/ |
50 | /* Function prototypes for non-inline static functions. */ |
51 | |
52 | static bool ckh_grow(tsd_t *tsd, ckh_t *ckh); |
53 | static void ckh_shrink(tsd_t *tsd, ckh_t *ckh); |
54 | |
55 | /******************************************************************************/ |
56 | |
57 | /* |
58 | * Search bucket for key and return the cell number if found; SIZE_T_MAX |
59 | * otherwise. |
60 | */ |
61 | static size_t |
62 | ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key) { |
63 | ckhc_t *cell; |
64 | unsigned i; |
65 | |
66 | for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { |
67 | cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; |
68 | if (cell->key != NULL && ckh->keycomp(key, cell->key)) { |
69 | return (bucket << LG_CKH_BUCKET_CELLS) + i; |
70 | } |
71 | } |
72 | |
73 | return SIZE_T_MAX; |
74 | } |
75 | |
76 | /* |
77 | * Search table for key and return cell number if found; SIZE_T_MAX otherwise. |
78 | */ |
79 | static size_t |
80 | ckh_isearch(ckh_t *ckh, const void *key) { |
81 | size_t hashes[2], bucket, cell; |
82 | |
83 | assert(ckh != NULL); |
84 | |
85 | ckh->hash(key, hashes); |
86 | |
87 | /* Search primary bucket. */ |
88 | bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); |
89 | cell = ckh_bucket_search(ckh, bucket, key); |
90 | if (cell != SIZE_T_MAX) { |
91 | return cell; |
92 | } |
93 | |
94 | /* Search secondary bucket. */ |
95 | bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); |
96 | cell = ckh_bucket_search(ckh, bucket, key); |
97 | return cell; |
98 | } |
99 | |
100 | static bool |
101 | ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, |
102 | const void *data) { |
103 | ckhc_t *cell; |
104 | unsigned offset, i; |
105 | |
106 | /* |
107 | * Cycle through the cells in the bucket, starting at a random position. |
108 | * The randomness avoids worst-case search overhead as buckets fill up. |
109 | */ |
110 | offset = (unsigned)prng_lg_range_u64(&ckh->prng_state, |
111 | LG_CKH_BUCKET_CELLS); |
112 | for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { |
113 | cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + |
114 | ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))]; |
115 | if (cell->key == NULL) { |
116 | cell->key = key; |
117 | cell->data = data; |
118 | ckh->count++; |
119 | return false; |
120 | } |
121 | } |
122 | |
123 | return true; |
124 | } |
125 | |
126 | /* |
127 | * No space is available in bucket. Randomly evict an item, then try to find an |
128 | * alternate location for that item. Iteratively repeat this |
129 | * eviction/relocation procedure until either success or detection of an |
130 | * eviction/relocation bucket cycle. |
131 | */ |
132 | static bool |
133 | ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, |
134 | void const **argdata) { |
135 | const void *key, *data, *tkey, *tdata; |
136 | ckhc_t *cell; |
137 | size_t hashes[2], bucket, tbucket; |
138 | unsigned i; |
139 | |
140 | bucket = argbucket; |
141 | key = *argkey; |
142 | data = *argdata; |
143 | while (true) { |
144 | /* |
145 | * Choose a random item within the bucket to evict. This is |
146 | * critical to correct function, because without (eventually) |
147 | * evicting all items within a bucket during iteration, it |
148 | * would be possible to get stuck in an infinite loop if there |
149 | * were an item for which both hashes indicated the same |
150 | * bucket. |
151 | */ |
152 | i = (unsigned)prng_lg_range_u64(&ckh->prng_state, |
153 | LG_CKH_BUCKET_CELLS); |
154 | cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; |
155 | assert(cell->key != NULL); |
156 | |
157 | /* Swap cell->{key,data} and {key,data} (evict). */ |
158 | tkey = cell->key; tdata = cell->data; |
159 | cell->key = key; cell->data = data; |
160 | key = tkey; data = tdata; |
161 | |
162 | #ifdef CKH_COUNT |
163 | ckh->nrelocs++; |
164 | #endif |
165 | |
166 | /* Find the alternate bucket for the evicted item. */ |
167 | ckh->hash(key, hashes); |
168 | tbucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); |
169 | if (tbucket == bucket) { |
170 | tbucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) |
171 | - 1); |
172 | /* |
173 | * It may be that (tbucket == bucket) still, if the |
174 | * item's hashes both indicate this bucket. However, |
175 | * we are guaranteed to eventually escape this bucket |
176 | * during iteration, assuming pseudo-random item |
177 | * selection (true randomness would make infinite |
178 | * looping a remote possibility). The reason we can |
179 | * never get trapped forever is that there are two |
180 | * cases: |
181 | * |
182 | * 1) This bucket == argbucket, so we will quickly |
183 | * detect an eviction cycle and terminate. |
184 | * 2) An item was evicted to this bucket from another, |
185 | * which means that at least one item in this bucket |
186 | * has hashes that indicate distinct buckets. |
187 | */ |
188 | } |
189 | /* Check for a cycle. */ |
190 | if (tbucket == argbucket) { |
191 | *argkey = key; |
192 | *argdata = data; |
193 | return true; |
194 | } |
195 | |
196 | bucket = tbucket; |
197 | if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { |
198 | return false; |
199 | } |
200 | } |
201 | } |
202 | |
203 | static bool |
204 | ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata) { |
205 | size_t hashes[2], bucket; |
206 | const void *key = *argkey; |
207 | const void *data = *argdata; |
208 | |
209 | ckh->hash(key, hashes); |
210 | |
211 | /* Try to insert in primary bucket. */ |
212 | bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); |
213 | if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { |
214 | return false; |
215 | } |
216 | |
217 | /* Try to insert in secondary bucket. */ |
218 | bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); |
219 | if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { |
220 | return false; |
221 | } |
222 | |
223 | /* |
224 | * Try to find a place for this item via iterative eviction/relocation. |
225 | */ |
226 | return ckh_evict_reloc_insert(ckh, bucket, argkey, argdata); |
227 | } |
228 | |
229 | /* |
230 | * Try to rebuild the hash table from scratch by inserting all items from the |
231 | * old table into the new. |
232 | */ |
233 | static bool |
234 | ckh_rebuild(ckh_t *ckh, ckhc_t *aTab) { |
235 | size_t count, i, nins; |
236 | const void *key, *data; |
237 | |
238 | count = ckh->count; |
239 | ckh->count = 0; |
240 | for (i = nins = 0; nins < count; i++) { |
241 | if (aTab[i].key != NULL) { |
242 | key = aTab[i].key; |
243 | data = aTab[i].data; |
244 | if (ckh_try_insert(ckh, &key, &data)) { |
245 | ckh->count = count; |
246 | return true; |
247 | } |
248 | nins++; |
249 | } |
250 | } |
251 | |
252 | return false; |
253 | } |
254 | |
/*
 * Double the table size (possibly repeatedly) and rebuild.  Returns false on
 * success, true if allocation failed or the needed size exceeds the largest
 * supported size class.
 */
static bool
ckh_grow(tsd_t *tsd, ckh_t *ckh) {
	bool ret;
	ckhc_t *tab, *ttab;
	unsigned lg_prevbuckets, lg_curcells;

#ifdef CKH_COUNT
	ckh->ngrows++;
#endif

	/*
	 * It is possible (though unlikely, given well behaved hashes) that the
	 * table will have to be doubled more than once in order to create a
	 * usable table.
	 */
	lg_prevbuckets = ckh->lg_curbuckets;
	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS;
	while (true) {
		size_t usize;

		lg_curcells++;
		usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
		/* Bail out on size overflow / oversized request. */
		if (unlikely(usize == 0
		    || usize > SC_LARGE_MAXCLASS)) {
			ret = true;
			goto label_return;
		}
		/* Cell array; allocation is zeroed (ipallocztm), so all keys
		 * start out NULL. */
		tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE,
		    true, NULL, true, arena_ichoose(tsd, NULL));
		if (tab == NULL) {
			ret = true;
			goto label_return;
		}
		/* Swap in new table. */
		ttab = ckh->tab;
		ckh->tab = tab;
		tab = ttab;
		ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;

		if (!ckh_rebuild(ckh, tab)) {
			/* Rebuild succeeded; free the old table. */
			idalloctm(tsd_tsdn(tsd), tab, NULL, NULL, true, true);
			break;
		}

		/* Rebuilding failed, so back out partially rebuilt table. */
		idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true);
		ckh->tab = tab;
		ckh->lg_curbuckets = lg_prevbuckets;
	}

	ret = false;
label_return:
	return ret;
}
309 | |
/*
 * Halve the table size and rebuild.  Failures (OOM, oversized request, or a
 * rebuild that cannot fit all items) are not reported; the table is simply
 * left at its current size.
 */
static void
ckh_shrink(tsd_t *tsd, ckh_t *ckh) {
	ckhc_t *tab, *ttab;
	size_t usize;
	unsigned lg_prevbuckets, lg_curcells;

	/*
	 * It is possible (though unlikely, given well behaved hashes) that the
	 * table rebuild will fail.
	 */
	lg_prevbuckets = ckh->lg_curbuckets;
	lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1;
	usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE);
	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
		return;
	}
	tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL,
	    true, arena_ichoose(tsd, NULL));
	if (tab == NULL) {
		/*
		 * An OOM error isn't worth propagating, since it doesn't
		 * prevent this or future operations from proceeding.
		 */
		return;
	}
	/* Swap in new table. */
	ttab = ckh->tab;
	ckh->tab = tab;
	tab = ttab;
	ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS;

	if (!ckh_rebuild(ckh, tab)) {
		/* Rebuild succeeded; free the old table. */
		idalloctm(tsd_tsdn(tsd), tab, NULL, NULL, true, true);
#ifdef CKH_COUNT
		ckh->nshrinks++;
#endif
		return;
	}

	/* Rebuilding failed, so back out partially rebuilt table. */
	idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true);
	ckh->tab = tab;
	ckh->lg_curbuckets = lg_prevbuckets;
#ifdef CKH_COUNT
	ckh->nshrinkfails++;
#endif
}
357 | |
/*
 * Initialize ckh with capacity for at least minitems items.  Returns false on
 * success, true on allocation failure or unsupported size.  On success the
 * table owns an allocation that must eventually be released via ckh_delete().
 */
bool
ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *ckh_hash,
    ckh_keycomp_t *keycomp) {
	bool ret;
	size_t mincells, usize;
	unsigned lg_mincells;

	assert(minitems > 0);
	assert(ckh_hash != NULL);
	assert(keycomp != NULL);

#ifdef CKH_COUNT
	ckh->ngrows = 0;
	ckh->nshrinks = 0;
	ckh->nshrinkfails = 0;
	ckh->ninserts = 0;
	ckh->nrelocs = 0;
#endif
	ckh->prng_state = 42; /* Value doesn't really matter. */
	ckh->count = 0;

	/*
	 * Find the minimum power of 2 that is large enough to fit minitems
	 * entries. We are using (2+,2) cuckoo hashing, which has an expected
	 * maximum load factor of at least ~0.86, so 0.75 is a conservative load
	 * factor that will typically allow mincells items to fit without ever
	 * growing the table.
	 */
	assert(LG_CKH_BUCKET_CELLS > 0);
	/* mincells is roughly 4/3 * minitems, rounded up (0.75 load factor). */
	mincells = ((minitems + (3 - (minitems % 3))) / 3) << 2;
	for (lg_mincells = LG_CKH_BUCKET_CELLS;
	    (ZU(1) << lg_mincells) < mincells;
	    lg_mincells++) {
		/* Do nothing. */
	}
	ckh->lg_minbuckets = lg_mincells - LG_CKH_BUCKET_CELLS;
	ckh->lg_curbuckets = lg_mincells - LG_CKH_BUCKET_CELLS;
	ckh->hash = ckh_hash;
	ckh->keycomp = keycomp;

	usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE);
	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
		ret = true;
		goto label_return;
	}
	/* Cell array; allocation is zeroed (ipallocztm), so all keys start
	 * out NULL. */
	ckh->tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true,
	    NULL, true, arena_ichoose(tsd, NULL));
	if (ckh->tab == NULL) {
		ret = true;
		goto label_return;
	}

	ret = false;
label_return:
	return ret;
}
414 | |
/*
 * Tear down ckh: optionally dump operation counters (CKH_VERBOSE builds),
 * free the cell array, and junk-fill the struct in debug builds to help
 * catch use-after-delete.
 */
void
ckh_delete(tsd_t *tsd, ckh_t *ckh) {
	assert(ckh != NULL);

#ifdef CKH_VERBOSE
	malloc_printf(
	    "%s(%p): ngrows: %" FMTu64", nshrinks: %" FMTu64","
	    " nshrinkfails: %" FMTu64", ninserts: %" FMTu64","
	    " nrelocs: %" FMTu64"\n" , __func__, ckh,
	    (unsigned long long)ckh->ngrows,
	    (unsigned long long)ckh->nshrinks,
	    (unsigned long long)ckh->nshrinkfails,
	    (unsigned long long)ckh->ninserts,
	    (unsigned long long)ckh->nrelocs);
#endif

	idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true);
	if (config_debug) {
		memset(ckh, JEMALLOC_FREE_JUNK, sizeof(ckh_t));
	}
}
436 | |
437 | size_t |
438 | ckh_count(ckh_t *ckh) { |
439 | assert(ckh != NULL); |
440 | |
441 | return ckh->count; |
442 | } |
443 | |
444 | bool |
445 | ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data) { |
446 | size_t i, ncells; |
447 | |
448 | for (i = *tabind, ncells = (ZU(1) << (ckh->lg_curbuckets + |
449 | LG_CKH_BUCKET_CELLS)); i < ncells; i++) { |
450 | if (ckh->tab[i].key != NULL) { |
451 | if (key != NULL) { |
452 | *key = (void *)ckh->tab[i].key; |
453 | } |
454 | if (data != NULL) { |
455 | *data = (void *)ckh->tab[i].data; |
456 | } |
457 | *tabind = i + 1; |
458 | return false; |
459 | } |
460 | } |
461 | |
462 | return true; |
463 | } |
464 | |
465 | bool |
466 | ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data) { |
467 | bool ret; |
468 | |
469 | assert(ckh != NULL); |
470 | assert(ckh_search(ckh, key, NULL, NULL)); |
471 | |
472 | #ifdef CKH_COUNT |
473 | ckh->ninserts++; |
474 | #endif |
475 | |
476 | while (ckh_try_insert(ckh, &key, &data)) { |
477 | if (ckh_grow(tsd, ckh)) { |
478 | ret = true; |
479 | goto label_return; |
480 | } |
481 | } |
482 | |
483 | ret = false; |
484 | label_return: |
485 | return ret; |
486 | } |
487 | |
488 | bool |
489 | ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, |
490 | void **data) { |
491 | size_t cell; |
492 | |
493 | assert(ckh != NULL); |
494 | |
495 | cell = ckh_isearch(ckh, searchkey); |
496 | if (cell != SIZE_T_MAX) { |
497 | if (key != NULL) { |
498 | *key = (void *)ckh->tab[cell].key; |
499 | } |
500 | if (data != NULL) { |
501 | *data = (void *)ckh->tab[cell].data; |
502 | } |
503 | ckh->tab[cell].key = NULL; |
504 | ckh->tab[cell].data = NULL; /* Not necessary. */ |
505 | |
506 | ckh->count--; |
507 | /* Try to halve the table if it is less than 1/4 full. */ |
508 | if (ckh->count < (ZU(1) << (ckh->lg_curbuckets |
509 | + LG_CKH_BUCKET_CELLS - 2)) && ckh->lg_curbuckets |
510 | > ckh->lg_minbuckets) { |
511 | /* Ignore error due to OOM. */ |
512 | ckh_shrink(tsd, ckh); |
513 | } |
514 | |
515 | return false; |
516 | } |
517 | |
518 | return true; |
519 | } |
520 | |
521 | bool |
522 | ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) { |
523 | size_t cell; |
524 | |
525 | assert(ckh != NULL); |
526 | |
527 | cell = ckh_isearch(ckh, searchkey); |
528 | if (cell != SIZE_T_MAX) { |
529 | if (key != NULL) { |
530 | *key = (void *)ckh->tab[cell].key; |
531 | } |
532 | if (data != NULL) { |
533 | *data = (void *)ckh->tab[cell].data; |
534 | } |
535 | return false; |
536 | } |
537 | |
538 | return true; |
539 | } |
540 | |
/* Default string hash function: hash the NUL-terminated key's contents. */
void
ckh_string_hash(const void *key, size_t r_hash[2]) {
	const char *s = (const char *)key;

	hash(s, strlen(s), 0x94122f33U, r_hash);
}
545 | |
/*
 * Default string key comparison function: keys are equal iff their string
 * contents match.  Both keys must be non-NULL, NUL-terminated strings.
 */
bool
ckh_string_keycomp(const void *k1, const void *k2) {
	assert(k1 != NULL);
	assert(k2 != NULL);

	/*
	 * Cast to const char * rather than char *: the parameters are
	 * const-qualified and the keys are never modified, so stripping the
	 * qualifier was both unnecessary and const-incorrect.
	 */
	return strcmp((const char *)k1, (const char *)k2) == 0;
}
553 | |
/* Default pointer hash function: hash the pointer's bit pattern itself. */
void
ckh_pointer_hash(const void *key, size_t r_hash[2]) {
	union {
		const void *ptr;
		size_t bits;
	} conv;

	/* Type-pun through a union; size_t must be pointer-sized. */
	assert(sizeof(conv.ptr) == sizeof(conv.bits));
	conv.ptr = key;
	hash(&conv.bits, sizeof(conv.bits), 0xd983396eU, r_hash);
}
565 | |
/* Default pointer key comparison function: identity comparison. */
bool
ckh_pointer_keycomp(const void *k1, const void *k2) {
	return k1 == k2;
}
570 | |