Home | History | Annotate | Line # | Download | only in kern
subr_thmap.c revision 1.1
      1 /*-
      2  * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu>
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     24  * SUCH DAMAGE.
     25  *
     26  * Upstream: https://github.com/rmind/thmap/
     27  */
     28 
     29 /*
     30  * Concurrent trie-hash map.
     31  *
     32  * The data structure is conceptually a radix trie on hashed keys.
     33  * Keys are hashed using a 32-bit function.  The root level is a special
     34  * case: it is managed using the compare-and-swap (CAS) atomic operation
     35  * and has a fanout of 64.  The subsequent levels are constructed using
     36  * intermediate nodes with a fanout of 16 (using 4 bits).  As more levels
     37  * are created, more blocks of the 32-bit hash value might be generated
     38  * by incrementing the seed parameter of the hash function.
     39  *
     40  * Concurrency
     41  *
     42  * - READERS: Descending is simply walking through the slot values of
     43  *   the intermediate nodes.  It is lock-free as there is no intermediate
     44  *   state: the slot is either empty or has a pointer to the child node.
     45  *   The main assumptions here are the following:
     46  *
     47  *   i) modifications must preserve consistency with the respect to the
     48  *   readers i.e. the readers can only see the valid node values;
     49  *
     50  *   ii) any invalid view must "fail" the reads, e.g. by making them
     51  *   re-try from the root; this is a case for deletions and is achieved
     52  *   using the NODE_DELETED flag.
     53  *
     54  *   iii) the node destruction must be synchronised with the readers,
     55  *   e.g. by using the Epoch-based reclamation or other techniques.
     56  *
     57  * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which
     58  *   is implemented using the NODE_LOCKED bit) -- it provides mutual
     59  *   exclusion amongst concurrent writers.  The lock order for the nodes
     60  *   is "bottom-up" i.e. they are locked as we ascend the trie.  A key
     61  *   constraint here is that parent pointer never changes.
     62  *
     63  * - DELETES: In addition to writer's locking, the deletion keeps the
     64  *   intermediate nodes in a valid state and sets the NODE_DELETED flag,
     65  *   to indicate that the readers must re-start the walk from the root.
     66  *   As the levels are collapsed, NODE_DELETED gets propagated up-tree.
     67  *   The leaf nodes just stay as-is until they are reclaimed.
     68  *
     69  * - ROOT LEVEL: The root level is a special case, as it is implemented
     70  *   as an array (rather than intermediate node).  The root-level slot can
     71  *   only be set using CAS and it can only be set to a valid intermediate
     72  *   node.  The root-level slot can only be cleared when the node it points
     73  *   at becomes empty, is locked and marked as NODE_DELETED (this causes
     74  *   the insert/delete operations to re-try until the slot is set to NULL).
     75  *
     76  * References:
     77  *
     78  *	W. Litwin, 1981, Trie Hashing.
     79  *	Proceedings of the 1981 ACM SIGMOD, p. 19-29
     80  *	https://dl.acm.org/citation.cfm?id=582322
     81  *
     82  *	P. L. Lehman and S. B. Yao.
     83  *	Efficient locking for concurrent operations on B-trees.
     84  *	ACM TODS, 6(4):650-670, 1981
     85  *	https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf
     86  */
     87 
     88 #ifdef _KERNEL
     89 #include <sys/cdefs.h>
     90 #include <sys/param.h>
     91 #include <sys/types.h>
     92 #include <sys/thmap.h>
     93 #include <sys/kmem.h>
     94 #include <sys/lock.h>
     95 #include <sys/atomic.h>
     96 #include <sys/hash.h>
     97 #else
     98 #include <stdio.h>
     99 #include <stdlib.h>
    100 #include <stdbool.h>
    101 #include <stddef.h>
    102 #include <inttypes.h>
    103 #include <string.h>
    104 #include <limits.h>
    105 
    106 #include "thmap.h"
    107 #include "utils.h"
    108 #endif
    109 
    110 /*
    111  * NetBSD kernel wrappers
    112  */
    113 #ifdef _KERNEL
    114 #define	ASSERT KASSERT
    115 #define	DEBUG 1
    116 #define	atomic_thread_fence(x) x
    117 #define	memory_order_stores membar_producer()
    118 #define	memory_order_loads membar_consumer()
    119 #define	atomic_cas_32_p(p, e, n) (atomic_cas_32((p), (e), (n)) == (e))
    120 #define	atomic_cas_ptr_p(p, e, n) \
    121     (atomic_cas_ptr((p), (void *)(e), (void *)(n)) == (e))
    122 #define	atomic_exchange atomic_swap_ptr
    123 #define	murmurhash3 murmurhash2
    124 #endif
    125 
    126 /*
    127  * The root level fanout is 64 (indexed by the last 6 bits of the hash
    128  * value XORed with the length).  Each subsequent level, represented by
    129  * intermediate nodes, has a fanout of 16 (using 4 bits).
    130  *
    131  * The hash function produces 32-bit values.
    132  */
    133 
    134 #define	HASHVAL_BITS	(32)
    135 #define	HASHVAL_MOD	(HASHVAL_BITS - 1)
    136 #define	HASHVAL_SHIFT	(5)
    137 
    138 #define	ROOT_BITS	(6)
    139 #define	ROOT_SIZE	(1 << ROOT_BITS)
    140 #define	ROOT_MASK	(ROOT_SIZE - 1)
    141 #define	ROOT_MSBITS	(HASHVAL_BITS - ROOT_BITS)
    142 
    143 #define	LEVEL_BITS	(4)
    144 #define	LEVEL_SIZE	(1 << LEVEL_BITS)
    145 #define	LEVEL_MASK	(LEVEL_SIZE - 1)
    146 
    147 /*
    148  * Instead of raw pointers, we use offsets from the base address.
    149  * This accommodates the use of this data structure in shared memory,
    150  * where mappings can be in different address spaces.
    151  *
    152  * The pointers must be aligned, since pointer tagging is used to
    153  * differentiate the intermediate nodes from leaves.  We reserve the
    154  * least significant bit.
    155  */
    156 typedef uintptr_t thmap_ptr_t;
    157 
    158 #define	THMAP_NULL		((thmap_ptr_t)0)
    159 
    160 #define	THMAP_LEAF_BIT		(0x1)
    161 
    162 #define	THMAP_ALIGNED_P(p)	(((uintptr_t)(p) & 3) == 0)
    163 #define	THMAP_ALIGN(p)		((uintptr_t)(p) & ~(uintptr_t)3)
    164 #define	THMAP_INODE_P(p)	(((uintptr_t)(p) & THMAP_LEAF_BIT) == 0)
    165 
    166 #define	THMAP_GETPTR(th, p)	((void *)((th)->baseptr + (uintptr_t)(p)))
    167 #define	THMAP_GETOFF(th, p)	((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr))
    168 #define	THMAP_NODE(th, p)	THMAP_GETPTR(th, THMAP_ALIGN(p))
    169 
    170 /*
    171  * State field.
    172  */
    173 
    174 #define	NODE_LOCKED		(1U << 31)		// lock (writers)
    175 #define	NODE_DELETED		(1U << 30)		// node deleted
    176 #define	NODE_COUNT(s)		((s) & 0x3fffffff)	// slot count mask
    177 
    178 /*
    179  * There are two types of nodes:
    180  * - Intermediate nodes -- arrays pointing to another level or a leaf;
    181  * - Leaves, which store a key-value pair.
    182  */
    183 
    184 typedef struct {
    185 	uint32_t	state;
    186 	thmap_ptr_t	parent;
    187 	thmap_ptr_t	slots[LEVEL_SIZE];
    188 } thmap_inode_t;
    189 
    190 #define	THMAP_INODE_LEN	sizeof(thmap_inode_t)
    191 
    192 typedef struct {
    193 	thmap_ptr_t	key;
    194 	size_t		len;
    195 	void *		val;
    196 } thmap_leaf_t;
    197 
    198 typedef struct {
    199 	unsigned	rslot;		// root-level slot index
    200 	unsigned	level;		// current level in the tree
    201 	unsigned	hashidx;	// current hash index (block of bits)
    202 	uint32_t	hashval;	// current hash value
    203 } thmap_query_t;
    204 
    205 typedef struct {
    206 	uintptr_t	addr;
    207 	size_t		len;
    208 	void *		next;
    209 } thmap_gc_t;
    210 
    211 #define	THMAP_ROOT_LEN	(sizeof(thmap_ptr_t) * ROOT_SIZE)
    212 
    213 struct thmap {
    214 	uintptr_t	baseptr;
    215 	thmap_ptr_t *	root;
    216 	unsigned	flags;
    217 	const thmap_ops_t *ops;
    218 	thmap_gc_t *	gc_list;
    219 };
    220 
    221 static void	stage_mem_gc(thmap_t *, uintptr_t, size_t);
    222 
    223 /*
    224  * A few low-level helper routines.
    225  */
    226 
    227 static uintptr_t
    228 alloc_wrapper(size_t len)
    229 {
    230 	return (uintptr_t)kmem_intr_alloc(len, KM_SLEEP);
    231 }
    232 
    233 static void
    234 free_wrapper(uintptr_t addr, size_t len)
    235 {
    236 	kmem_intr_free((void *)addr, len);
    237 }
    238 
/* Default operations: plain kernel allocator (base pointer is zero). */
static const thmap_ops_t thmap_default_ops = {
	.alloc = alloc_wrapper,
	.free = free_wrapper
};
    243 
    244 /*
    245  * NODE LOCKING.
    246  */
    247 
    248 #ifdef DEBUG
    249 static inline bool
    250 node_locked_p(const thmap_inode_t *node)
    251 {
    252 	return (node->state & NODE_LOCKED) != 0;
    253 }
    254 #endif
    255 
/*
 * lock_node: acquire the per-node spin-lock (the NODE_LOCKED bit),
 * spinning with exponential backoff while another writer holds it.
 */
static void
lock_node(thmap_inode_t *node)
{
	unsigned bcount = SPINLOCK_BACKOFF_MIN;
	uint32_t s;
again:
	s = node->state;
	if (s & NODE_LOCKED) {
		SPINLOCK_BACKOFF(bcount);
		goto again;
	}
	/*
	 * CAS will issue a full memory fence for us.
	 *
	 * WARNING: for optimisations purposes, callers rely on us
	 * issuing load and store fence
	 */
	if (!atomic_cas_32_p(&node->state, s, s | NODE_LOCKED)) {
		/* Lost the race against another writer: restart the spin. */
		bcount = SPINLOCK_BACKOFF_MIN;
		goto again;
	}
}
    278 
/*
 * unlock_node: release the per-node spin-lock.  The store fence makes
 * all modifications done under the lock visible before the release.
 */
static void
unlock_node(thmap_inode_t *node)
{
	uint32_t s = node->state & ~NODE_LOCKED;

	ASSERT(node_locked_p(node));
	atomic_thread_fence(memory_order_stores);
	node->state = s; // atomic store
}
    288 
    289 /*
    290  * HASH VALUE AND KEY OPERATIONS.
    291  */
    292 
    293 static inline void
    294 hashval_init(thmap_query_t *query, const void * restrict key, size_t len)
    295 {
    296 	const uint32_t hashval = murmurhash3(key, len, 0);
    297 
    298 	query->rslot = ((hashval >> ROOT_MSBITS) ^ len) & ROOT_MASK;
    299 	query->level = 0;
    300 	query->hashval = hashval;
    301 	query->hashidx = 0;
    302 }
    303 
    304 /*
    305  * hashval_getslot: given the key, compute the hash (if not already cached)
    306  * and return the offset for the current level.
    307  */
    308 static unsigned
    309 hashval_getslot(thmap_query_t *query, const void * restrict key, size_t len)
    310 {
    311 	const unsigned offset = query->level * LEVEL_BITS;
    312 	const unsigned shift = offset & HASHVAL_MOD;
    313 	const unsigned i = offset >> HASHVAL_SHIFT;
    314 
    315 	if (query->hashidx != i) {
    316 		/* Generate a hash value for a required range. */
    317 		query->hashval = murmurhash3(key, len, i);
    318 		query->hashidx = i;
    319 	}
    320 	return (query->hashval >> shift) & LEVEL_MASK;
    321 }
    322 
    323 static unsigned
    324 hashval_getleafslot(const thmap_t *thmap,
    325     const thmap_leaf_t *leaf, unsigned level)
    326 {
    327 	const void *key = THMAP_GETPTR(thmap, leaf->key);
    328 	const unsigned offset = level * LEVEL_BITS;
    329 	const unsigned shift = offset & HASHVAL_MOD;
    330 	const unsigned i = offset >> HASHVAL_SHIFT;
    331 
    332 	return (murmurhash3(key, leaf->len, i) >> shift) & LEVEL_MASK;
    333 }
    334 
    335 static inline unsigned
    336 hashval_getl0slot(const thmap_t *thmap, const thmap_query_t *query,
    337     const thmap_leaf_t *leaf)
    338 {
    339 	if (__predict_true(query->hashidx == 0)) {
    340 		return query->hashval & LEVEL_MASK;
    341 	}
    342 	return hashval_getleafslot(thmap, leaf, 0);
    343 }
    344 
    345 static bool
    346 key_cmp_p(const thmap_t *thmap, const thmap_leaf_t *leaf,
    347     const void * restrict key, size_t len)
    348 {
    349 	const void *leafkey = THMAP_GETPTR(thmap, leaf->key);
    350 	return len == leaf->len && memcmp(key, leafkey, len) == 0;
    351 }
    352 
    353 /*
    354  * INTER-NODE OPERATIONS.
    355  */
    356 
    357 static thmap_inode_t *
    358 node_create(thmap_t *thmap, thmap_inode_t *parent)
    359 {
    360 	thmap_inode_t *node;
    361 	uintptr_t p;
    362 
    363 	p = thmap->ops->alloc(THMAP_INODE_LEN);
    364 	if (!p) {
    365 		return NULL;
    366 	}
    367 	node = THMAP_GETPTR(thmap, p);
    368 	ASSERT(THMAP_ALIGNED_P(node));
    369 
    370 	memset(node, 0, THMAP_INODE_LEN);
    371 	if (parent) {
    372 		node->state = NODE_LOCKED;
    373 		node->parent = THMAP_GETOFF(thmap, parent);
    374 	}
    375 	return node;
    376 }
    377 
    378 static void
    379 node_insert(thmap_inode_t *node, unsigned slot, thmap_ptr_t child)
    380 {
    381 	ASSERT(node_locked_p(node) || node->parent == THMAP_NULL);
    382 	ASSERT((node->state & NODE_DELETED) == 0);
    383 	ASSERT(node->slots[slot] == THMAP_NULL);
    384 
    385 	ASSERT(NODE_COUNT(node->state) < LEVEL_SIZE);
    386 
    387 	node->slots[slot] = child;
    388 	node->state++;
    389 }
    390 
    391 static void
    392 node_remove(thmap_inode_t *node, unsigned slot)
    393 {
    394 	ASSERT(node_locked_p(node));
    395 	ASSERT((node->state & NODE_DELETED) == 0);
    396 	ASSERT(node->slots[slot] != THMAP_NULL);
    397 
    398 	ASSERT(NODE_COUNT(node->state) > 0);
    399 	ASSERT(NODE_COUNT(node->state) <= LEVEL_SIZE);
    400 
    401 	node->slots[slot] = THMAP_NULL;
    402 	node->state--;
    403 }
    404 
    405 /*
    406  * LEAF OPERATIONS.
    407  */
    408 
    409 static thmap_leaf_t *
    410 leaf_create(const thmap_t *thmap, const void *key, size_t len, void *val)
    411 {
    412 	thmap_leaf_t *leaf;
    413 	uintptr_t leaf_off, key_off;
    414 
    415 	leaf_off = thmap->ops->alloc(sizeof(thmap_leaf_t));
    416 	if (!leaf_off) {
    417 		return NULL;
    418 	}
    419 	leaf = THMAP_GETPTR(thmap, leaf_off);
    420 	ASSERT(THMAP_ALIGNED_P(leaf));
    421 
    422 	if ((thmap->flags & THMAP_NOCOPY) == 0) {
    423 		/*
    424 		 * Copy the key.
    425 		 */
    426 		key_off = thmap->ops->alloc(len);
    427 		if (!key_off) {
    428 			thmap->ops->free(leaf_off, sizeof(thmap_leaf_t));
    429 			return NULL;
    430 		}
    431 		memcpy(THMAP_GETPTR(thmap, key_off), key, len);
    432 		leaf->key = key_off;
    433 	} else {
    434 		/* Otherwise, we use a reference. */
    435 		leaf->key = (uintptr_t)key;
    436 	}
    437 	leaf->len = len;
    438 	leaf->val = val;
    439 	return leaf;
    440 }
    441 
    442 static void
    443 leaf_free(const thmap_t *thmap, thmap_leaf_t *leaf)
    444 {
    445 	if ((thmap->flags & THMAP_NOCOPY) == 0) {
    446 		thmap->ops->free(leaf->key, leaf->len);
    447 	}
    448 	thmap->ops->free(THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
    449 }
    450 
    451 static thmap_leaf_t *
    452 get_leaf(const thmap_t *thmap, thmap_inode_t *parent, unsigned slot)
    453 {
    454 	thmap_ptr_t node;
    455 
    456 	node = parent->slots[slot];
    457 	if (THMAP_INODE_P(node)) {
    458 		return NULL;
    459 	}
    460 	return THMAP_NODE(thmap, node);
    461 }
    462 
    463 /*
    464  * ROOT OPERATIONS.
    465  */
    466 
/*
 * root_try_put: attempt to install a new intermediate node, holding
 * the given leaf, into the root-level slot using CAS.
 *
 * => Returns true on success; false if the slot is (or became)
 *    non-empty, in which case the caller descends the tree instead.
 */
static inline bool
root_try_put(thmap_t *thmap, const thmap_query_t *query, thmap_leaf_t *leaf)
{
	const unsigned i = query->rslot;
	thmap_inode_t *node;
	thmap_ptr_t nptr;
	unsigned slot;

	/*
	 * Must pre-check first.
	 */
	if (thmap->root[i]) {
		return false;
	}

	/*
	 * Create an intermediate node.  Since there is no parent set,
	 * it will be created unlocked and the CAS operation will issue
	 * the store memory fence for us.
	 *
	 * NOTE(review): node_create() may return NULL on allocation
	 * failure, which node_insert() below would dereference.  The
	 * default KM_SLEEP allocator cannot fail, but a custom
	 * thmap_ops_t allocator could -- worth confirming upstream.
	 */
	node = node_create(thmap, NULL);
	slot = hashval_getl0slot(thmap, query, leaf);
	node_insert(node, slot, THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT);
	nptr = THMAP_GETOFF(thmap, node);
again:
	if (thmap->root[i]) {
		/* Lost the race: free our node; the caller will descend. */
		thmap->ops->free(nptr, THMAP_INODE_LEN);
		return false;
	}
	if (!atomic_cas_ptr_p(&thmap->root[i], THMAP_NULL, nptr)) {
		goto again;
	}
	return true;
}
    501 
    502 /*
    503  * find_edge_node: given the hash, traverse the tree to find the edge node.
    504  *
    505  * => Returns an aligned (clean) pointer to the parent node.
    506  * => Returns the slot number and sets current level.
    507  */
static thmap_inode_t *
find_edge_node(const thmap_t *thmap, thmap_query_t *query,
    const void * restrict key, size_t len, unsigned *slot)
{
	thmap_ptr_t root_slot = thmap->root[query->rslot];
	thmap_inode_t *parent;
	thmap_ptr_t node;
	unsigned off;

	ASSERT(query->level == 0);

	parent = THMAP_NODE(thmap, root_slot);
	if (!parent) {
		/* Empty root slot: nothing to walk. */
		return NULL;
	}
descend:
	off = hashval_getslot(query, key, len);
	node = parent->slots[off];

	/* Ensure the parent load happens before the child load. */
	atomic_thread_fence(memory_order_loads);

	/* Descend the tree until we find a leaf or empty slot. */
	if (node && THMAP_INODE_P(node)) {
		parent = THMAP_NODE(thmap, node);
		query->level++;
		goto descend;
	}
	if (parent->state & NODE_DELETED) {
		/* Invalid view: the caller must re-start from the root. */
		return NULL;
	}
	*slot = off;
	return parent;
}
    542 
    543 /*
    544  * find_edge_node_locked: traverse the tree, like find_edge_node(),
    545  * but attempt to lock the edge node.
    546  *
    547  * => Returns NULL if the deleted node is found.  This indicates that
    548  *    the caller must re-try from the root, as the root slot might have
    549  *    changed too.
    550  */
static thmap_inode_t *
find_edge_node_locked(const thmap_t *thmap, thmap_query_t *query,
    const void * restrict key, size_t len, unsigned *slot)
{
	thmap_inode_t *node;
	thmap_ptr_t target;
retry:
	/*
	 * Find the edge node and lock it!  Re-check the state since
	 * the tree might change by the time we acquire the lock.
	 */
	node = find_edge_node(thmap, query, key, len, slot);
	if (!node) {
		/* The root slot is empty -- let the caller decide. */
		query->level = 0;
		return NULL;
	}
	lock_node(node);
	if (__predict_false(node->state & NODE_DELETED)) {
		/*
		 * The node has been deleted.  The tree might have a new
		 * shape now, therefore we must re-start from the root.
		 */
		unlock_node(node);
		query->level = 0;
		return NULL;
	}
	target = node->slots[*slot];
	if (__predict_false(target && THMAP_INODE_P(target))) {
		/*
		 * The target slot has been changed and it is now an
		 * intermediate node.  Re-start from the top internode.
		 */
		unlock_node(node);
		query->level = 0;
		goto retry;
	}
	/* Success: the edge node is returned locked. */
	return node;
}
    590 
    591 /*
    592  * thmap_get: lookup a value given the key.
    593  */
    594 void *
    595 thmap_get(thmap_t *thmap, const void *key, size_t len)
    596 {
    597 	thmap_query_t query;
    598 	thmap_inode_t *parent;
    599 	thmap_leaf_t *leaf;
    600 	unsigned slot;
    601 
    602 	hashval_init(&query, key, len);
    603 	parent = find_edge_node(thmap, &query, key, len, &slot);
    604 	if (!parent) {
    605 		return NULL;
    606 	}
    607 	leaf = get_leaf(thmap, parent, slot);
    608 	if (!leaf) {
    609 		return NULL;
    610 	}
    611 	if (!key_cmp_p(thmap, leaf, key, len)) {
    612 		return NULL;
    613 	}
    614 	return leaf->val;
    615 }
    616 
    617 /*
    618  * thmap_put: insert a value given the key.
    619  *
    620  * => If the key is already present, return the associated value.
    621  * => Otherwise, on successful insert, return the given value.
    622  */
void *
thmap_put(thmap_t *thmap, const void *key, size_t len, void *val)
{
	thmap_query_t query;
	thmap_leaf_t *leaf, *other;
	thmap_inode_t *parent, *child;
	unsigned slot, other_slot;
	thmap_ptr_t target;

	/*
	 * First, pre-allocate and initialise the leaf node.
	 *
	 * NOTE: locking of the edge node below will issue the
	 * store fence for us.
	 */
	leaf = leaf_create(thmap, key, len, val);
	if (__predict_false(!leaf)) {
		/* Allocation failure: nothing was inserted. */
		return NULL;
	}
	hashval_init(&query, key, len);
retry:
	/*
	 * Try to insert into the root first, if its slot is empty.
	 */
	if (root_try_put(thmap, &query, leaf)) {
		/* Success: the leaf was inserted; no locking involved. */
		return val;
	}

	/*
	 * Find the edge node and the target slot.
	 */
	parent = find_edge_node_locked(thmap, &query, key, len, &slot);
	if (!parent) {
		/* A node was deleted or the root slot changed: re-try. */
		goto retry;
	}
	target = parent->slots[slot]; // tagged offset
	if (THMAP_INODE_P(target)) {
		/*
		 * Empty slot: simply insert the new leaf.  The store
		 * fence is already issued for us.
		 */
		target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
		node_insert(parent, slot, target);
		goto out;
	}

	/*
	 * Collision or duplicate.
	 */
	other = THMAP_NODE(thmap, target);
	if (key_cmp_p(thmap, other, key, len)) {
		/*
		 * Duplicate.  Free the pre-allocated leaf and
		 * return the present value.
		 */
		leaf_free(thmap, leaf);
		val = other->val;
		goto out;
	}
descend:
	/*
	 * Collision -- expand the tree.  Create an intermediate node
	 * which will be locked (NODE_LOCKED) for us.  At this point,
	 * we advance to the next level.
	 */
	child = node_create(thmap, parent);
	if (__predict_false(!child)) {
		/* Allocation failure mid-expansion: give up cleanly. */
		leaf_free(thmap, leaf);
		val = NULL;
		goto out;
	}
	query.level++;

	/*
	 * Insert the other (colliding) leaf first.
	 */
	other_slot = hashval_getleafslot(thmap, other, query.level);
	target = THMAP_GETOFF(thmap, other) | THMAP_LEAF_BIT;
	node_insert(child, other_slot, target);

	/*
	 * Insert the intermediate node into the parent node.
	 * It becomes the new parent for the our new leaf.
	 *
	 * Ensure that stores to the child (and leaf) reach the
	 * global visibility before it gets inserted to the parent.
	 */
	atomic_thread_fence(memory_order_stores);
	parent->slots[slot] = THMAP_GETOFF(thmap, child);

	/* Hand over the lock: release the parent, keep the child locked. */
	unlock_node(parent);
	ASSERT(node_locked_p(child));
	parent = child;

	/*
	 * Get the new slot and check for another collision
	 * at the next level.
	 */
	slot = hashval_getslot(&query, key, len);
	if (slot == other_slot) {
		/* Another collision -- descend and expand again. */
		goto descend;
	}

	/* Insert our new leaf once we expanded enough. */
	target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT;
	node_insert(parent, slot, target);
out:
	unlock_node(parent);
	return val;
}
    735 
    736 /*
    737  * thmap_del: remove the entry given the key.
    738  */
void *
thmap_del(thmap_t *thmap, const void *key, size_t len)
{
	thmap_query_t query;
	thmap_leaf_t *leaf;
	thmap_inode_t *parent;
	unsigned slot;
	void *val;

	hashval_init(&query, key, len);
	parent = find_edge_node_locked(thmap, &query, key, len, &slot);
	if (!parent) {
		/* Root slot empty: not found. */
		return NULL;
	}
	leaf = get_leaf(thmap, parent, slot);
	if (!leaf || !key_cmp_p(thmap, leaf, key, len)) {
		/* Not found. */
		unlock_node(parent);
		return NULL;
	}

	/* Remove the leaf. */
	ASSERT(THMAP_NODE(thmap, parent->slots[slot]) == leaf);
	node_remove(parent, slot);

	/*
	 * Collapse the levels if removing the last item.
	 */
	while (query.level && NODE_COUNT(parent->state) == 0) {
		thmap_inode_t *node = parent;

		ASSERT(node->state == NODE_LOCKED);

		/*
		 * Ascend one level up.
		 * => Mark our current parent as deleted.
		 * => Lock the parent one level up.
		 */
		query.level--;
		slot = hashval_getslot(&query, key, len);
		parent = THMAP_NODE(thmap, node->parent);
		ASSERT(parent != NULL);

		lock_node(parent);
		ASSERT((parent->state & NODE_DELETED) == 0);

		/* Readers seeing this flag will re-start from the root. */
		node->state |= NODE_DELETED;
		unlock_node(node); // memory_order_stores

		ASSERT(THMAP_NODE(thmap, parent->slots[slot]) == node);
		node_remove(parent, slot);

		/* Stage the removed node for G/C. */
		stage_mem_gc(thmap, THMAP_GETOFF(thmap, node), THMAP_INODE_LEN);
	}

	/*
	 * If the top node is empty, then we need to remove it from the
	 * root level.  Mark the node as deleted and clear the slot.
	 *
	 * Note: acquiring the lock on the top node effectively prevents
	 * the root slot from changing.
	 */
	if (NODE_COUNT(parent->state) == 0) {
		const unsigned rslot = query.rslot;
		const thmap_ptr_t nptr = thmap->root[rslot];

		ASSERT(query.level == 0);
		ASSERT(parent->parent == THMAP_NULL);
		ASSERT(THMAP_GETOFF(thmap, parent) == nptr);

		/* Mark as deleted and remove from the root-level slot. */
		parent->state |= NODE_DELETED;
		atomic_thread_fence(memory_order_stores);
		thmap->root[rslot] = THMAP_NULL;

		stage_mem_gc(thmap, nptr, THMAP_INODE_LEN);
	}
	unlock_node(parent);

	/*
	 * Save the value and stage the leaf for G/C.
	 * The memory is reclaimed later via thmap_stage_gc()/thmap_gc().
	 */
	val = leaf->val;
	if ((thmap->flags & THMAP_NOCOPY) == 0) {
		stage_mem_gc(thmap, leaf->key, leaf->len);
	}
	stage_mem_gc(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t));
	return val;
}
    830 
    831 /*
    832  * G/C routines.
    833  */
    834 
    835 static void
    836 stage_mem_gc(thmap_t *thmap, uintptr_t addr, size_t len)
    837 {
    838 	thmap_gc_t *head, *gc;
    839 
    840 	gc = kmem_intr_alloc(sizeof(thmap_gc_t), KM_SLEEP);
    841 	gc->addr = addr;
    842 	gc->len = len;
    843 retry:
    844 	gc->next = head = thmap->gc_list;
    845 	if (!atomic_cas_ptr_p(&thmap->gc_list, head, gc)) {
    846 		goto retry;
    847 	}
    848 }
    849 
    850 void *
    851 thmap_stage_gc(thmap_t *thmap)
    852 {
    853 	return atomic_exchange(&thmap->gc_list, NULL);
    854 }
    855 
    856 void
    857 thmap_gc(thmap_t *thmap, void *ref)
    858 {
    859 	thmap_gc_t *gc = ref;
    860 
    861 	while (gc) {
    862 		thmap_gc_t *next = gc->next;
    863 		thmap->ops->free(gc->addr, gc->len);
    864 		kmem_intr_free(gc, sizeof(thmap_gc_t));
    865 		gc = next;
    866 	}
    867 }
    868 
    869 /*
    870  * thmap_create: construct a new trie-hash map object.
    871  */
    872 thmap_t *
    873 thmap_create(uintptr_t baseptr, const thmap_ops_t *ops, unsigned flags)
    874 {
    875 	thmap_t *thmap;
    876 	uintptr_t root;
    877 
    878 	/*
    879 	 * Setup the map object.
    880 	 */
    881 	if (!THMAP_ALIGNED_P(baseptr)) {
    882 		return NULL;
    883 	}
    884 	thmap = kmem_zalloc(sizeof(thmap_t), KM_SLEEP);
    885 	if (!thmap) {
    886 		return NULL;
    887 	}
    888 	thmap->baseptr = baseptr;
    889 	thmap->ops = ops ? ops : &thmap_default_ops;
    890 	thmap->flags = flags;
    891 
    892 	if ((thmap->flags & THMAP_SETROOT) == 0) {
    893 		/* Allocate the root level. */
    894 		root = thmap->ops->alloc(THMAP_ROOT_LEN);
    895 		thmap->root = THMAP_GETPTR(thmap, root);
    896 		if (!thmap->root) {
    897 			kmem_free(thmap, sizeof(thmap_t));
    898 			return NULL;
    899 		}
    900 		memset(thmap->root, 0, THMAP_ROOT_LEN);
    901 	}
    902 	return thmap;
    903 }
    904 
    905 int
    906 thmap_setroot(thmap_t *thmap, uintptr_t root_off)
    907 {
    908 	if (thmap->root) {
    909 		return -1;
    910 	}
    911 	thmap->root = THMAP_GETPTR(thmap, root_off);
    912 	return 0;
    913 }
    914 
    915 uintptr_t
    916 thmap_getroot(const thmap_t *thmap)
    917 {
    918 	return THMAP_GETOFF(thmap, thmap->root);
    919 }
    920 
    921 void
    922 thmap_destroy(thmap_t *thmap)
    923 {
    924 	uintptr_t root = THMAP_GETOFF(thmap, thmap->root);
    925 	void *ref;
    926 
    927 	ref = thmap_stage_gc(thmap);
    928 	thmap_gc(thmap, ref);
    929 
    930 	if ((thmap->flags & THMAP_SETROOT) == 0) {
    931 		thmap->ops->free(root, THMAP_ROOT_LEN);
    932 	}
    933 	kmem_free(thmap, sizeof(thmap_t));
    934 }
    935