1 /* $NetBSD: subr_thmap.c,v 1.17 2026/01/04 03:21:04 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2018 Mindaugas Rasiukevicius <rmind at noxt eu> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * Upstream: https://github.com/rmind/thmap/ 29 */ 30 31 /* 32 * Concurrent trie-hash map. 33 * 34 * The data structure is conceptually a radix trie on hashed keys. 35 * Keys are hashed using a 32-bit function. The root level is a special 36 * case: it is managed using the compare-and-swap (CAS) atomic operation 37 * and has a fanout of 64. The subsequent levels are constructed using 38 * intermediate nodes with a fanout of 16 (using 4 bits). As more levels 39 * are created, more blocks of the 32-bit hash value might be generated 40 * by incrementing the level parameter of the hash function. 41 * 42 * Concurrency 43 * 44 * - READERS: Descending is simply walking through the slot values of 45 * the intermediate nodes. It is lock-free as there is no intermediate 46 * state: the slot is either empty or has a pointer to the child node. 47 * The main assumptions here are the following: 48 * 49 * i) modifications must preserve consistency with respect to the 50 * readers, i.e. the readers can only see valid node values; 51 * 52 * ii) any invalid view must "fail" the reads, e.g. by making them 53 * re-try from the root; this is the case for deletions and is achieved 54 * using the NODE_DELETED flag. 55 * 56 * iii) node destruction must be synchronized with the readers, 57 * e.g. by using Epoch-based reclamation or other techniques. 58 * 59 * - WRITERS AND LOCKING: Each intermediate node has a spin-lock (which 60 * is implemented using the NODE_LOCKED bit) -- it provides mutual 61 * exclusion amongst concurrent writers. The lock order for the nodes 62 * is "bottom-up", i.e. they are locked as we ascend the trie. A key 63 * constraint here is that the parent pointer never changes. 64 * 65 * - DELETES: In addition to the writers' locking, the deletion keeps the 66 * intermediate nodes in a valid state and sets the NODE_DELETED flag, 67 * to indicate that the readers must re-start the walk from the root. 68 * As the levels are collapsed, NODE_DELETED gets propagated up-tree.
69 * The leaf nodes just stay as-is until they are reclaimed. 70 * 71 * - ROOT LEVEL: The root level is a special case, as it is implemented 72 * as an array (rather than an intermediate node). The root-level slot can 73 * only be set using CAS and it can only be set to a valid intermediate 74 * node. The root-level slot can only be cleared when the node it points 75 * at becomes empty, is locked and marked as NODE_DELETED (this causes 76 * the insert/delete operations to re-try until the slot is set to NULL). 77 * 78 * References: 79 * 80 * W. Litwin, 1981, Trie Hashing. 81 * Proceedings of the 1981 ACM SIGMOD, p. 19-29 82 * https://dl.acm.org/citation.cfm?id=582322 83 * 84 * P. L. Lehman and S. B. Yao. 85 * Efficient locking for concurrent operations on B-trees. 86 * ACM TODS, 6(4):650-670, 1981 87 * https://www.csd.uoc.gr/~hy460/pdf/p650-lehman.pdf 88 */ 89 90 #ifdef _KERNEL 91 92 #include <sys/cdefs.h> 93 #include <sys/param.h> 94 #include <sys/types.h> 95 96 #include <sys/atomic.h> 97 #include <sys/cprng.h> 98 #include <sys/hash.h> 99 #include <sys/kmem.h> 100 #include <sys/lock.h> 101 #include <sys/sdt.h> 102 #include <sys/thmap.h> 103 104 #define THMAP_RCSID(a) __KERNEL_RCSID(0, a) 105 106 #else 107 108 #include <inttypes.h> 109 #include <limits.h> 110 #include <stdbool.h> 111 #include <stddef.h> 112 #include <stdio.h> 113 #include <stdlib.h> 114 #include <string.h> 115 116 #define THMAP_RCSID(a) __RCSID(a) 117 118 #define SET_ERROR(e) (e) 119 120 #include "thmap.h" 121 #include "utils.h" 122 123 #endif 124 125 THMAP_RCSID("$NetBSD: subr_thmap.c,v 1.17 2026/01/04 03:21:04 riastradh Exp $"); 126 127 #include <crypto/blake2/blake2s.h> 128 129 /* 130 * NetBSD kernel wrappers 131 */ 132 #ifdef _KERNEL 133 #define ASSERT KASSERT 134 #define atomic_thread_fence(x) membar_release() /* only used for release order */ 135 #define atomic_compare_exchange_weak_explicit_32(p, e, n, m1, m2) \ 136 (atomic_cas_32((p), *(e), (n)) == *(e)) 137 #define atomic_compare_exchange_weak_explicit_ptr(p, e, n, m1, m2) \ 138 (atomic_cas_ptr((p), *(void **)(e), (void *)(n)) == *(void **)(e)) 139 #define atomic_exchange_explicit(o, n, m1) atomic_swap_ptr((o), (n)) 140 #define murmurhash3 murmurhash2 141 #endif 142 143 /* 144 * The root level fanout is 64 (indexed by the top 6 bits of the hash 145 * value XORed with the length). Each subsequent level, represented by 146 * intermediate nodes, has a fanout of 16 (using 4 bits). 147 * 148 * The hash function produces 32-bit values. 149 */ 150 151 #define HASHVAL_SEEDLEN (16) 152 #define HASHVAL_BITS (32) 153 #define HASHVAL_MOD (HASHVAL_BITS - 1) 154 #define HASHVAL_SHIFT (5) 155 156 #define ROOT_BITS (6) 157 #define ROOT_SIZE (1 << ROOT_BITS) 158 #define ROOT_MASK (ROOT_SIZE - 1) 159 #define ROOT_MSBITS (HASHVAL_BITS - ROOT_BITS) 160 161 #define LEVEL_BITS (4) 162 #define LEVEL_SIZE (1 << LEVEL_BITS) 163 #define LEVEL_MASK (LEVEL_SIZE - 1) 164 165 /* 166 * Instead of raw pointers, we use offsets from the base address. 167 * This accommodates the use of this data structure in shared memory, 168 * where mappings can be in different address spaces. 169 * 170 * The pointers must be aligned, since pointer tagging is used to 171 * differentiate the intermediate nodes from leaves. We reserve the 172 * least significant bit.
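 *
 * For example, a slot holding (THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT)
 * refers to a leaf, while an even non-NULL offset refers to an intermediate
 * node; THMAP_INODE_P() below tests the tag bit and THMAP_NODE() strips it
 * before converting the offset back into a pointer.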
173 */ 174 typedef uintptr_t thmap_ptr_t; 175 typedef uintptr_t atomic_thmap_ptr_t; // C11 _Atomic 176 177 #define THMAP_NULL ((thmap_ptr_t)0) 178 179 #define THMAP_LEAF_BIT (0x1) 180 181 #define THMAP_ALIGNED_P(p) (((uintptr_t)(p) & 3) == 0) 182 #define THMAP_ALIGN(p) ((uintptr_t)(p) & ~(uintptr_t)3) 183 #define THMAP_INODE_P(p) (((uintptr_t)(p) & THMAP_LEAF_BIT) == 0) 184 185 #define THMAP_GETPTR(th, p) ((void *)((th)->baseptr + (uintptr_t)(p))) 186 #define THMAP_GETOFF(th, p) ((thmap_ptr_t)((uintptr_t)(p) - (th)->baseptr)) 187 #define THMAP_NODE(th, p) THMAP_GETPTR(th, THMAP_ALIGN(p)) 188 189 /* 190 * State field. 191 */ 192 193 #define NODE_LOCKED (1U << 31) // lock (writers) 194 #define NODE_DELETED (1U << 30) // node deleted 195 #define NODE_COUNT(s) ((s) & 0x3fffffff) // slot count mask 196 197 /* 198 * There are two types of nodes: 199 * - Intermediate nodes -- arrays pointing to another level or a leaf; 200 * - Leaves, which store a key-value pair. 201 */ 202 203 typedef struct { 204 uint32_t state; // C11 _Atomic 205 thmap_ptr_t parent; 206 atomic_thmap_ptr_t slots[LEVEL_SIZE]; 207 } thmap_inode_t; 208 209 #define THMAP_INODE_LEN sizeof(thmap_inode_t) 210 211 typedef struct { 212 thmap_ptr_t key; 213 size_t len; 214 void * val; 215 } thmap_leaf_t; 216 217 typedef struct { 218 const uint8_t * seed; // secret seed 219 unsigned rslot; // root-level slot index 220 unsigned level; // current level in the tree 221 unsigned hashidx; // current hash index (block of bits) 222 uint32_t hashval; // current hash value 223 } thmap_query_t; 224 225 union thmap_align { 226 void * p; 227 uint64_t v; 228 }; 229 230 typedef struct thmap_gc thmap_gc_t; 231 struct thmap_gc { 232 size_t len; 233 thmap_gc_t * next; 234 char data[] __aligned(sizeof(union thmap_align)); 235 }; 236 237 #define THMAP_ROOT_LEN (sizeof(thmap_ptr_t) * ROOT_SIZE) 238 239 struct thmap { 240 uintptr_t baseptr; 241 atomic_thmap_ptr_t * root; 242 unsigned flags; 243 const thmap_ops_t * ops; 244 thmap_gc_t * gc_list; // C11 _Atomic 245 uint8_t seed[HASHVAL_SEEDLEN]; 246 }; 247 248 static void stage_mem_gc(thmap_t *, uintptr_t, size_t); 249 250 /* 251 * A few low-level helper routines. 252 */ 253 254 static uintptr_t 255 alloc_wrapper(size_t len) 256 { 257 return (uintptr_t)kmem_intr_alloc(len, KM_NOSLEEP); 258 } 259 260 static void 261 free_wrapper(uintptr_t addr, size_t len) 262 { 263 kmem_intr_free((void *)addr, len); 264 } 265 266 static const thmap_ops_t thmap_default_ops = { 267 .alloc = alloc_wrapper, 268 .free = free_wrapper 269 }; 270 271 static uintptr_t 272 gc_alloc(const thmap_t *thmap, size_t len) 273 { 274 const size_t alloclen = offsetof(struct thmap_gc, data[len]); 275 const uintptr_t gcaddr = thmap->ops->alloc(alloclen); 276 277 if (!gcaddr) 278 return 0; 279 280 thmap_gc_t *const gc = THMAP_GETPTR(thmap, gcaddr); 281 gc->len = len; 282 return THMAP_GETOFF(thmap, &gc->data[0]); 283 } 284 285 static void 286 gc_free(const thmap_t *thmap, uintptr_t addr, size_t len) 287 { 288 const size_t alloclen = offsetof(struct thmap_gc, data[len]); 289 char *const ptr = THMAP_GETPTR(thmap, addr); 290 thmap_gc_t *const gc = container_of(ptr, struct thmap_gc, data[0]); 291 const uintptr_t gcaddr = THMAP_GETOFF(thmap, gc); 292 293 KASSERTMSG(gc->len == len, "thmap=%p ops=%p addr=%p len=%zu" 294 " gc=%p gc->len=%zu", 295 thmap, thmap->ops, (void *)addr, len, gc, gc->len); 296 thmap->ops->free(gcaddr, alloclen); 297 } 298 299 /* 300 * NODE LOCKING. 
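 *
 * A sketch of the protocol implemented below: the NODE_LOCKED bit of
 * the state field doubles as a spin lock.  Writers take it in
 * lock_node() with a CAS loop and spin backoff (acquire semantics) and
 * drop it in unlock_node() with a store-release; readers never take it.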
301 */ 302 303 static inline bool __diagused 304 node_locked_p(thmap_inode_t *node) 305 { 306 return (atomic_load_relaxed(&node->state) & NODE_LOCKED) != 0; 307 } 308 309 static void 310 lock_node(thmap_inode_t *node) 311 { 312 unsigned bcount = SPINLOCK_BACKOFF_MIN; 313 uint32_t s; 314 again: 315 s = atomic_load_relaxed(&node->state); 316 if (s & NODE_LOCKED) { 317 SPINLOCK_BACKOFF(bcount); 318 goto again; 319 } 320 /* Acquire from prior release in unlock_node(). */ 321 if (!atomic_compare_exchange_weak_explicit_32(&node->state, 322 &s, s | NODE_LOCKED, memory_order_acquire, memory_order_relaxed)) { 323 bcount = SPINLOCK_BACKOFF_MIN; 324 goto again; 325 } 326 } 327 328 static void 329 unlock_node(thmap_inode_t *node) 330 { 331 uint32_t s = atomic_load_relaxed(&node->state) & ~NODE_LOCKED; 332 333 ASSERT(node_locked_p(node)); 334 /* Release to subsequent acquire in lock_node(). */ 335 atomic_store_release(&node->state, s); 336 } 337 338 /* 339 * HASH VALUE AND KEY OPERATIONS. 340 */ 341 342 static inline uint32_t 343 hash(const uint8_t seed[static HASHVAL_SEEDLEN], const void *key, size_t len, 344 uint32_t level) 345 { 346 struct blake2s B; 347 uint32_t h; 348 349 if (level == 0) 350 return murmurhash3(key, len, 0); 351 352 /* 353 * Byte order is not significant here because this is 354 * intentionally secret and independent for each thmap. 355 * 356 * XXX We get 32 bytes of output at a time; we could march 357 * through them sequentially rather than throwing away 28 bytes 358 * and recomputing BLAKE2 each time. But the number of 359 * iterations ought to be geometric in the collision 360 * probability at each level, which should be very small anyway. 361 */ 362 blake2s_init(&B, sizeof h, seed, HASHVAL_SEEDLEN); 363 blake2s_update(&B, &level, sizeof level); 364 blake2s_update(&B, key, len); 365 blake2s_final(&B, &h); 366 367 return h; 368 } 369 370 static inline void 371 hashval_init(thmap_query_t *query, const uint8_t seed[static HASHVAL_SEEDLEN], 372 const void * restrict key, size_t len) 373 { 374 const uint32_t hashval = hash(seed, key, len, 0); 375 376 query->seed = seed; 377 query->rslot = ((hashval >> ROOT_MSBITS) ^ len) & ROOT_MASK; 378 query->level = 0; 379 query->hashval = hashval; 380 query->hashidx = 0; 381 } 382 383 /* 384 * hashval_getslot: given the key, compute the hash (if not already cached) 385 * and return the offset for the current level. 386 */ 387 static unsigned 388 hashval_getslot(thmap_query_t *query, const void * restrict key, size_t len) 389 { 390 const unsigned offset = query->level * LEVEL_BITS; 391 const unsigned shift = offset & HASHVAL_MOD; 392 const unsigned i = offset >> HASHVAL_SHIFT; 393 394 if (query->hashidx != i) { 395 /* Generate a hash value for a required range.
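 * Each level consumes LEVEL_BITS (4) bits, so one 32-bit hash value
 * covers eight levels.  For example, at level 8 the bit offset is 32:
 * shift becomes 0 and i becomes 1, so a fresh hash is generated for
 * block index 1 and cached in the query until the index changes again.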
*/ 396 query->hashval = hash(query->seed, key, len, i); 397 query->hashidx = i; 398 } 399 return (query->hashval >> shift) & LEVEL_MASK; 400 } 401 402 static unsigned 403 hashval_getleafslot(const thmap_t *thmap, 404 const thmap_leaf_t *leaf, unsigned level) 405 { 406 const void *key = THMAP_GETPTR(thmap, leaf->key); 407 const unsigned offset = level * LEVEL_BITS; 408 const unsigned shift = offset & HASHVAL_MOD; 409 const unsigned i = offset >> HASHVAL_SHIFT; 410 411 return (hash(thmap->seed, key, leaf->len, i) >> shift) & LEVEL_MASK; 412 } 413 414 static inline unsigned 415 hashval_getl0slot(const thmap_t *thmap, const thmap_query_t *query, 416 const thmap_leaf_t *leaf) 417 { 418 if (__predict_true(query->hashidx == 0)) { 419 return query->hashval & LEVEL_MASK; 420 } 421 return hashval_getleafslot(thmap, leaf, 0); 422 } 423 424 static bool 425 key_cmp_p(const thmap_t *thmap, const thmap_leaf_t *leaf, 426 const void * restrict key, size_t len) 427 { 428 const void *leafkey = THMAP_GETPTR(thmap, leaf->key); 429 return len == leaf->len && memcmp(key, leafkey, len) == 0; 430 } 431 432 /* 433 * INTER-NODE OPERATIONS. 434 */ 435 436 static thmap_inode_t * 437 node_create(thmap_t *thmap, thmap_inode_t *parent) 438 { 439 thmap_inode_t *node; 440 uintptr_t p; 441 442 p = gc_alloc(thmap, THMAP_INODE_LEN); 443 if (!p) { 444 return NULL; 445 } 446 node = THMAP_GETPTR(thmap, p); 447 ASSERT(THMAP_ALIGNED_P(node)); 448 449 memset(node, 0, THMAP_INODE_LEN); 450 if (parent) { 451 /* Not yet published, no need for ordering. */ 452 atomic_store_relaxed(&node->state, NODE_LOCKED); 453 node->parent = THMAP_GETOFF(thmap, parent); 454 } 455 return node; 456 } 457 458 static void 459 node_insert(thmap_inode_t *node, unsigned slot, thmap_ptr_t child) 460 { 461 ASSERT(node_locked_p(node) || node->parent == THMAP_NULL); 462 ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0); 463 ASSERT(atomic_load_relaxed(&node->slots[slot]) == THMAP_NULL); 464 465 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) < LEVEL_SIZE); 466 467 /* 468 * If node is public already, caller is responsible for issuing 469 * release fence; if node is not public, no ordering is needed. 470 * Hence relaxed ordering. 471 */ 472 atomic_store_relaxed(&node->slots[slot], child); 473 atomic_store_relaxed(&node->state, 474 atomic_load_relaxed(&node->state) + 1); 475 } 476 477 static void 478 node_remove(thmap_inode_t *node, unsigned slot) 479 { 480 ASSERT(node_locked_p(node)); 481 ASSERT((atomic_load_relaxed(&node->state) & NODE_DELETED) == 0); 482 ASSERT(atomic_load_relaxed(&node->slots[slot]) != THMAP_NULL); 483 484 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) > 0); 485 ASSERT(NODE_COUNT(atomic_load_relaxed(&node->state)) <= LEVEL_SIZE); 486 487 /* Element will be GC-ed later; no need for ordering here. */ 488 atomic_store_relaxed(&node->slots[slot], THMAP_NULL); 489 atomic_store_relaxed(&node->state, 490 atomic_load_relaxed(&node->state) - 1); 491 } 492 493 /* 494 * LEAF OPERATIONS. 495 */ 496 497 static thmap_leaf_t * 498 leaf_create(const thmap_t *thmap, const void *key, size_t len, void *val) 499 { 500 thmap_leaf_t *leaf; 501 uintptr_t leaf_off, key_off; 502 503 leaf_off = gc_alloc(thmap, sizeof(thmap_leaf_t)); 504 if (!leaf_off) { 505 return NULL; 506 } 507 leaf = THMAP_GETPTR(thmap, leaf_off); 508 ASSERT(THMAP_ALIGNED_P(leaf)); 509 510 if ((thmap->flags & THMAP_NOCOPY) == 0) { 511 /* 512 * Copy the key. 
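 * The copy is allocated with gc_alloc() so that, on removal, it can be
 * staged for G/C together with the leaf (see thmap_del()).  With
 * THMAP_NOCOPY the caller's key memory is referenced directly instead
 * and must remain valid for the lifetime of the entry.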
513 */ 514 key_off = gc_alloc(thmap, len); 515 if (!key_off) { 516 gc_free(thmap, leaf_off, sizeof(thmap_leaf_t)); 517 return NULL; 518 } 519 memcpy(THMAP_GETPTR(thmap, key_off), key, len); 520 leaf->key = key_off; 521 } else { 522 /* Otherwise, we use a reference. */ 523 leaf->key = (uintptr_t)key; 524 } 525 leaf->len = len; 526 leaf->val = val; 527 return leaf; 528 } 529 530 static void 531 leaf_free(const thmap_t *thmap, thmap_leaf_t *leaf) 532 { 533 if ((thmap->flags & THMAP_NOCOPY) == 0) { 534 gc_free(thmap, leaf->key, leaf->len); 535 } 536 gc_free(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t)); 537 } 538 539 static thmap_leaf_t * 540 get_leaf(const thmap_t *thmap, thmap_inode_t *parent, unsigned slot) 541 { 542 thmap_ptr_t node; 543 544 /* Consume from prior release in thmap_put(). */ 545 node = atomic_load_consume(&parent->slots[slot]); 546 if (THMAP_INODE_P(node)) { 547 return NULL; 548 } 549 return THMAP_NODE(thmap, node); 550 } 551 552 /* 553 * ROOT OPERATIONS. 554 */ 555 556 /* 557 * root_try_put: Try to set a root pointer at query->rslot. 558 * 559 * => Implies release operation on success. 560 * => Implies no ordering on failure. 561 */ 562 static inline int 563 root_try_put(thmap_t *thmap, const thmap_query_t *query, thmap_leaf_t *leaf) 564 { 565 thmap_ptr_t expected; 566 const unsigned i = query->rslot; 567 thmap_inode_t *node; 568 thmap_ptr_t nptr; 569 unsigned slot; 570 571 /* 572 * Must pre-check first. No ordering required because we will 573 * check again before taking any actions, and start over if 574 * this changes from null. 575 */ 576 if (atomic_load_relaxed(&thmap->root[i])) { 577 return SET_ERROR(EEXIST); 578 } 579 580 /* 581 * Create an intermediate node. Since there is no parent set, 582 * it will be created unlocked and the CAS operation will 583 * release it to readers. 584 */ 585 node = node_create(thmap, NULL); 586 if (__predict_false(node == NULL)) { 587 return SET_ERROR(ENOMEM); 588 } 589 slot = hashval_getl0slot(thmap, query, leaf); 590 node_insert(node, slot, THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT); 591 nptr = THMAP_GETOFF(thmap, node); 592 again: 593 if (atomic_load_relaxed(&thmap->root[i])) { 594 gc_free(thmap, nptr, THMAP_INODE_LEN); 595 return SET_ERROR(EEXIST); 596 } 597 /* Release to subsequent consume in find_edge_node(). */ 598 expected = THMAP_NULL; 599 if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->root[i], &expected, 600 nptr, memory_order_release, memory_order_relaxed)) { 601 goto again; 602 } 603 return 0; 604 } 605 606 /* 607 * find_edge_node: given the hash, traverse the tree to find the edge node. 608 * 609 * => Returns an aligned (clean) pointer to the parent node. 610 * => Returns the slot number and sets current level. 611 */ 612 static thmap_inode_t * 613 find_edge_node(const thmap_t *thmap, thmap_query_t *query, 614 const void * restrict key, size_t len, unsigned *slot) 615 { 616 thmap_ptr_t root_slot; 617 thmap_inode_t *parent; 618 thmap_ptr_t node; 619 unsigned off; 620 621 ASSERT(query->level == 0); 622 623 /* Consume from prior release in root_try_put(). */ 624 root_slot = atomic_load_consume(&thmap->root[query->rslot]); 625 parent = THMAP_NODE(thmap, root_slot); 626 if (!parent) { 627 return NULL; 628 } 629 descend: 630 off = hashval_getslot(query, key, len); 631 /* Consume from prior release in thmap_put(). */ 632 node = atomic_load_consume(&parent->slots[off]); 633 634 /* Descend the tree until we find a leaf or empty slot. 
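 * At each step the next LEVEL_BITS of the hash select a slot: an even
 * non-NULL value is a child node to descend into, while a tagged
 * (THMAP_LEAF_BIT) value or an empty slot terminates the walk.  No
 * locks are taken on this path.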
*/ 635 if (node && THMAP_INODE_P(node)) { 636 parent = THMAP_NODE(thmap, node); 637 query->level++; 638 goto descend; 639 } 640 /* 641 * NODE_DELETED does not become stale until GC runs, which 642 * cannot happen while we are in the middle of an operation, 643 * hence relaxed ordering. 644 */ 645 if (atomic_load_relaxed(&parent->state) & NODE_DELETED) { 646 return NULL; 647 } 648 *slot = off; 649 return parent; 650 } 651 652 /* 653 * find_edge_node_locked: traverse the tree, like find_edge_node(), 654 * but attempt to lock the edge node. 655 * 656 * => Returns NULL if the deleted node is found. This indicates that 657 * the caller must re-try from the root, as the root slot might have 658 * changed too. 659 */ 660 static thmap_inode_t * 661 find_edge_node_locked(const thmap_t *thmap, thmap_query_t *query, 662 const void * restrict key, size_t len, unsigned *slot) 663 { 664 thmap_inode_t *node; 665 thmap_ptr_t target; 666 retry: 667 /* 668 * Find the edge node and lock it! Re-check the state since 669 * the tree might change by the time we acquire the lock. 670 */ 671 node = find_edge_node(thmap, query, key, len, slot); 672 if (!node) { 673 /* The root slot is empty -- let the caller decide. */ 674 query->level = 0; 675 return NULL; 676 } 677 lock_node(node); 678 if (__predict_false(atomic_load_relaxed(&node->state) & NODE_DELETED)) { 679 /* 680 * The node has been deleted. The tree might have a new 681 * shape now, therefore we must re-start from the root. 682 */ 683 unlock_node(node); 684 query->level = 0; 685 return NULL; 686 } 687 target = atomic_load_relaxed(&node->slots[*slot]); 688 if (__predict_false(target && THMAP_INODE_P(target))) { 689 /* 690 * The target slot has been changed and it is now an 691 * intermediate node. Re-start from the top internode. 692 */ 693 unlock_node(node); 694 query->level = 0; 695 goto retry; 696 } 697 return node; 698 } 699 700 /* 701 * thmap_get: lookup a value given the key. 702 */ 703 void * 704 thmap_get(thmap_t *thmap, const void *key, size_t len) 705 { 706 thmap_query_t query; 707 thmap_inode_t *parent; 708 thmap_leaf_t *leaf; 709 unsigned slot; 710 711 hashval_init(&query, thmap->seed, key, len); 712 parent = find_edge_node(thmap, &query, key, len, &slot); 713 if (!parent) { 714 return NULL; 715 } 716 leaf = get_leaf(thmap, parent, slot); 717 if (!leaf) { 718 return NULL; 719 } 720 if (!key_cmp_p(thmap, leaf, key, len)) { 721 return NULL; 722 } 723 return leaf->val; 724 } 725 726 /* 727 * thmap_put: insert a value given the key. 728 * 729 * => If the key is already present, return the associated value. 730 * => Otherwise, on successful insert, return the given value. 731 */ 732 void * 733 thmap_put(thmap_t *thmap, const void *key, size_t len, void *val) 734 { 735 thmap_query_t query; 736 thmap_leaf_t *leaf, *other; 737 thmap_inode_t *parent, *child; 738 unsigned slot, other_slot; 739 thmap_ptr_t target; 740 741 /* 742 * First, pre-allocate and initialize the leaf node. 743 */ 744 leaf = leaf_create(thmap, key, len, val); 745 if (__predict_false(!leaf)) { 746 return NULL; 747 } 748 hashval_init(&query, thmap->seed, key, len); 749 retry: 750 /* 751 * Try to insert into the root first, if its slot is empty. 752 */ 753 switch (root_try_put(thmap, &query, leaf)) { 754 case 0: 755 /* Success: the leaf was inserted; no locking involved. 
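 * The CAS in root_try_put() published the new level-0 node together
 * with the leaf (release semantics), so no additional release fence
 * is needed on this path.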
*/ 756 return val; 757 case EEXIST: 758 break; 759 case ENOMEM: 760 return NULL; 761 default: 762 __unreachable(); 763 } 764 765 /* 766 * Release node via store in node_insert (*) to subsequent 767 * consume in get_leaf() or find_edge_node(). 768 */ 769 atomic_thread_fence(memory_order_release); 770 771 /* 772 * Find the edge node and the target slot. 773 */ 774 parent = find_edge_node_locked(thmap, &query, key, len, &slot); 775 if (!parent) { 776 goto retry; 777 } 778 target = atomic_load_relaxed(&parent->slots[slot]); // tagged offset 779 if (THMAP_INODE_P(target)) { 780 /* 781 * Empty slot: simply insert the new leaf. The release 782 * fence is already issued for us. 783 */ 784 target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT; 785 node_insert(parent, slot, target); /* (*) */ 786 goto out; 787 } 788 789 /* 790 * Collision or duplicate. 791 */ 792 other = THMAP_NODE(thmap, target); 793 if (key_cmp_p(thmap, other, key, len)) { 794 /* 795 * Duplicate. Free the pre-allocated leaf and 796 * return the present value. 797 */ 798 leaf_free(thmap, leaf); 799 val = other->val; 800 goto out; 801 } 802 descend: 803 /* 804 * Collision -- expand the tree. Create an intermediate node 805 * which will be locked (NODE_LOCKED) for us. At this point, 806 * we advance to the next level. 807 */ 808 child = node_create(thmap, parent); 809 if (__predict_false(!child)) { 810 leaf_free(thmap, leaf); 811 val = NULL; 812 goto out; 813 } 814 query.level++; 815 816 /* 817 * Insert the other (colliding) leaf first. The new child is 818 * not yet published, so memory order is relaxed. 819 */ 820 other_slot = hashval_getleafslot(thmap, other, query.level); 821 target = THMAP_GETOFF(thmap, other) | THMAP_LEAF_BIT; 822 node_insert(child, other_slot, target); 823 824 /* 825 * Insert the intermediate node into the parent node. 826 * It becomes the new parent for our new leaf. 827 * 828 * Ensure that stores to the child (and leaf) reach global 829 * visibility before it gets inserted into the parent, as 830 * consumed by get_leaf() or find_edge_node(). 831 */ 832 atomic_store_release(&parent->slots[slot], THMAP_GETOFF(thmap, child)); 833 834 unlock_node(parent); 835 ASSERT(node_locked_p(child)); 836 parent = child; 837 838 /* 839 * Get the new slot and check for another collision 840 * at the next level. 841 */ 842 slot = hashval_getslot(&query, key, len); 843 if (slot == other_slot) { 844 /* Another collision -- descend and expand again. */ 845 goto descend; 846 } 847 848 /* 849 * Insert our new leaf once we have expanded enough. The release 850 * fence is already issued for us. 851 */ 852 target = THMAP_GETOFF(thmap, leaf) | THMAP_LEAF_BIT; 853 node_insert(parent, slot, target); /* (*) */ 854 out: 855 unlock_node(parent); 856 return val; 857 } 858 859 /* 860 * thmap_del: remove the entry given the key. 861 */ 862 void * 863 thmap_del(thmap_t *thmap, const void *key, size_t len) 864 { 865 thmap_query_t query; 866 thmap_leaf_t *leaf; 867 thmap_inode_t *parent; 868 unsigned slot; 869 void *val; 870 871 hashval_init(&query, thmap->seed, key, len); 872 parent = find_edge_node_locked(thmap, &query, key, len, &slot); 873 if (!parent) { 874 /* Root slot empty: not found. */ 875 return NULL; 876 } 877 leaf = get_leaf(thmap, parent, slot); 878 if (!leaf || !key_cmp_p(thmap, leaf, key, len)) { 879 /* Not found. */ 880 unlock_node(parent); 881 return NULL; 882 } 883 884 /* Remove the leaf.
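 * The leaf is unlinked from its slot while the node lock is held.
 * Readers that already fetched the old slot value may still
 * dereference the leaf; that is safe because its memory is only
 * staged for G/C below and freed later by thmap_gc(), once the
 * caller knows no readers remain.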
*/ 885 ASSERT(THMAP_NODE(thmap, atomic_load_relaxed(&parent->slots[slot])) 886 == leaf); 887 node_remove(parent, slot); 888 889 /* 890 * Collapse the levels if removing the last item. 891 */ 892 while (query.level && 893 NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) { 894 thmap_inode_t *node = parent; 895 896 ASSERT(atomic_load_relaxed(&node->state) == NODE_LOCKED); 897 898 /* 899 * Ascend one level up. 900 * => Mark our current parent as deleted. 901 * => Lock the parent one level up. 902 */ 903 query.level--; 904 slot = hashval_getslot(&query, key, len); 905 parent = THMAP_NODE(thmap, node->parent); 906 ASSERT(parent != NULL); 907 908 lock_node(parent); 909 ASSERT((atomic_load_relaxed(&parent->state) & NODE_DELETED) 910 == 0); 911 912 /* 913 * Lock is exclusive, so nobody else can be writing at 914 * the same time, and no need for atomic R/M/W, but 915 * readers may read without the lock and so need atomic 916 * load/store. No ordering here needed because the 917 * entry itself stays valid until GC. 918 */ 919 atomic_store_relaxed(&node->state, 920 atomic_load_relaxed(&node->state) | NODE_DELETED); 921 unlock_node(node); // memory_order_release 922 923 ASSERT(THMAP_NODE(thmap, 924 atomic_load_relaxed(&parent->slots[slot])) == node); 925 node_remove(parent, slot); 926 927 /* Stage the removed node for G/C. */ 928 stage_mem_gc(thmap, THMAP_GETOFF(thmap, node), THMAP_INODE_LEN); 929 } 930 931 /* 932 * If the top node is empty, then we need to remove it from the 933 * root level. Mark the node as deleted and clear the slot. 934 * 935 * Note: acquiring the lock on the top node effectively prevents 936 * the root slot from changing. 937 */ 938 if (NODE_COUNT(atomic_load_relaxed(&parent->state)) == 0) { 939 const unsigned rslot = query.rslot; 940 const thmap_ptr_t nptr = 941 atomic_load_relaxed(&thmap->root[rslot]); 942 943 ASSERT(query.level == 0); 944 ASSERT(parent->parent == THMAP_NULL); 945 ASSERT(THMAP_GETOFF(thmap, parent) == nptr); 946 947 /* Mark as deleted and remove from the root-level slot. */ 948 atomic_store_relaxed(&parent->state, 949 atomic_load_relaxed(&parent->state) | NODE_DELETED); 950 atomic_store_relaxed(&thmap->root[rslot], THMAP_NULL); 951 952 stage_mem_gc(thmap, nptr, THMAP_INODE_LEN); 953 } 954 unlock_node(parent); 955 956 /* 957 * Save the value and stage the leaf for G/C. 958 */ 959 val = leaf->val; 960 if ((thmap->flags & THMAP_NOCOPY) == 0) { 961 stage_mem_gc(thmap, leaf->key, leaf->len); 962 } 963 stage_mem_gc(thmap, THMAP_GETOFF(thmap, leaf), sizeof(thmap_leaf_t)); 964 return val; 965 } 966 967 /* 968 * G/C routines. 969 */ 970 971 static void 972 stage_mem_gc(thmap_t *thmap, uintptr_t addr, size_t len) 973 { 974 char *const ptr = THMAP_GETPTR(thmap, addr); 975 thmap_gc_t *head, *gc; 976 977 gc = container_of(ptr, struct thmap_gc, data[0]); 978 KASSERTMSG(gc->len == len, 979 "thmap=%p ops=%p ptr=%p len=%zu gc=%p gc->len=%zu", 980 thmap, thmap->ops, (char *)addr, len, gc, gc->len); 981 retry: 982 head = atomic_load_relaxed(&thmap->gc_list); 983 gc->next = head; // not yet published 984 985 /* Release to subsequent acquire in thmap_stage_gc(). */ 986 if (!atomic_compare_exchange_weak_explicit_ptr(&thmap->gc_list, &head, gc, 987 memory_order_release, memory_order_relaxed)) { 988 goto retry; 989 } 990 } 991 992 void * 993 thmap_stage_gc(thmap_t *thmap) 994 { 995 /* Acquire from prior release in stage_mem_gc(). 
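 * The exchange detaches the entire pending list in one step.  The
 * caller is expected to wait for concurrent readers to drain (e.g.
 * via EBR or another deferred-reclamation scheme) before handing
 * the returned reference to thmap_gc().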
*/ 996 return atomic_exchange_explicit(&thmap->gc_list, NULL, 997 memory_order_acquire); 998 } 999 1000 void 1001 thmap_gc(thmap_t *thmap, void *ref) 1002 { 1003 thmap_gc_t *gc = ref; 1004 1005 while (gc) { 1006 thmap_gc_t *next = gc->next; 1007 gc_free(thmap, THMAP_GETOFF(thmap, &gc->data[0]), gc->len); 1008 gc = next; 1009 } 1010 } 1011 1012 /* 1013 * thmap_create: construct a new trie-hash map object. 1014 */ 1015 thmap_t * 1016 thmap_create(uintptr_t baseptr, const thmap_ops_t *ops, unsigned flags) 1017 { 1018 thmap_t *thmap; 1019 uintptr_t root; 1020 1021 /* 1022 * Setup the map object. 1023 */ 1024 if (!THMAP_ALIGNED_P(baseptr)) { 1025 return NULL; 1026 } 1027 thmap = kmem_zalloc(sizeof(thmap_t), KM_SLEEP); 1028 thmap->baseptr = baseptr; 1029 thmap->ops = ops ? ops : &thmap_default_ops; 1030 thmap->flags = flags; 1031 1032 if ((thmap->flags & THMAP_SETROOT) == 0) { 1033 /* Allocate the root level. */ 1034 root = gc_alloc(thmap, THMAP_ROOT_LEN); 1035 if (!root) { 1036 kmem_free(thmap, sizeof(thmap_t)); 1037 return NULL; 1038 } 1039 thmap->root = THMAP_GETPTR(thmap, root); 1040 memset(thmap->root, 0, THMAP_ROOT_LEN); 1041 } 1042 1043 cprng_strong(kern_cprng, thmap->seed, sizeof thmap->seed, 0); 1044 1045 return thmap; 1046 } 1047 1048 int 1049 thmap_setroot(thmap_t *thmap, uintptr_t root_off) 1050 { 1051 if (thmap->root) { 1052 return -1; 1053 } 1054 thmap->root = THMAP_GETPTR(thmap, root_off); 1055 return 0; 1056 } 1057 1058 uintptr_t 1059 thmap_getroot(const thmap_t *thmap) 1060 { 1061 return THMAP_GETOFF(thmap, thmap->root); 1062 } 1063 1064 void 1065 thmap_destroy(thmap_t *thmap) 1066 { 1067 uintptr_t root = THMAP_GETOFF(thmap, thmap->root); 1068 void *ref; 1069 1070 ref = thmap_stage_gc(thmap); 1071 thmap_gc(thmap, ref); 1072 1073 if ((thmap->flags & THMAP_SETROOT) == 0) { 1074 gc_free(thmap, root, THMAP_ROOT_LEN); 1075 } 1076 kmem_free(thmap, sizeof(thmap_t)); 1077 } 1078
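/*
 * Example usage (a sketch only, not part of the module): a kernel
 * caller using the default allocator, with a hypothetical payload
 * "the_value".  The quiescence step between thmap_stage_gc() and
 * thmap_gc() would normally be provided by EBR, passive
 * serialization or a similar deferred-reclamation mechanism.
 *
 *	static int the_value;
 *	thmap_t *map = thmap_create(0, NULL, 0);
 *	void *ref;
 *
 *	(void)thmap_put(map, "key", 3, &the_value);	// insert; returns existing value on duplicate
 *	(void)thmap_get(map, "key", 3);			// lock-free lookup
 *	(void)thmap_del(map, "key", 3);			// unlink; memory is not freed yet
 *
 *	ref = thmap_stage_gc(map);			// detach the deferred-free list
 *	// ... wait until no readers may hold stale references ...
 *	thmap_gc(map, ref);				// now reclaim the staged memory
 *	thmap_destroy(map);
 */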