/* $NetBSD: qp_p.h,v 1.3 2026/01/29 18:37:49 christos Exp $ */

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*
 * For an overview, see doc/design/qp-trie.md
 *
 * This private header defines the internal data structures used by
 * the qp-trie implementation.
 */

#pragma once

#include <isc/refcount.h>

#include <dns/qp.h>

/***********************************************************************
 *
 *  interior node basics
 */

/*
 * A qp-trie node is almost always one of two types: branch or leaf.
 * (A third type is used only to anchor the root of a trie; see below.)
 *
 * A node contains a 64-bit word and a 32-bit word. In order to avoid
 * unwanted padding, they are declared as three 32-bit words; this keeps
 * the size down to 12 bytes. They are in native endian order, so getting
 * the 64-bit part should compile down to an unaligned load.
 *
 * The node type is identified by the least significant bits of the 64-bit
 * word.
 *
 * In a leaf node:
 * - The 64-bit word is used to store a pointer value. (Pointers must be
 *   word-aligned so the least significant bits are zero; those bits can
 *   then act as a node tag to indicate that this is a leaf. This
 *   requirement is enforced by the make_leaf() constructor.)
 * - The 32-bit word is used to store an integer value. Both the
 *   pointer and integer values can be retrieved when looking up a key.
 *
 * In a branch node:
 * - The 64-bit word is subdivided into three portions: the least
 *   significant bits are the node type (for a branch, 0x1); the
 *   most significant 15 bits are an offset value into the key, and
 *   the 47 bits in the middle are a bitmap; see the documentation
 *   for the SHIFT_* enum below.
 * - The 32-bit word is a reference (dns_qpref_t) to the packed sparse
 *   vector of "twigs", i.e. child nodes. A branch node has at least
 *   two and at most 47 twigs. (The qp-trie update functions ensure that
 *   branches actually branch, i.e. a branch cannot have only one child.)
 *
 * A third node type, reader nodes, anchors the root of a trie.
 * A pair of reader nodes together contain a packed `dns_qpreader_t`.
 * See the section on "packed reader nodes" for details.
 */
struct dns_qpnode {
#if WORDS_BIGENDIAN
        uint32_t bighi, biglo, small;
#else
        uint32_t biglo, bighi, small;
#endif
};

/*
 * The possible values of the node type tag. Type tags must fit in two bits
 * for compatibility with 4-byte pointer alignment on 32-bit systems.
 */
enum {
        LEAF_TAG = 0,   /* leaf node */
        BRANCH_TAG = 1, /* branch node */
        READER_TAG = 2, /* reader node */
        TAG_MASK = 3,   /* mask covering tag bits */
};

/*
 * This code does not work on CPUs with large pointers, e.g. CHERI capability
 * architectures. When porting to that kind of machine, a `dns_qpnode` should
 * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
 * branch node will fit in the same space with room to spare.
 */
STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
              "pointers must fit in 64 bits");

/*
 * The 64-bit word in a branch node is comprised of a node type tag, a
 * bitmap, and an offset into the key. It is called an "index word" because
 * it describes how to access the twigs vector (think "database index").
 * The following enum sets up the bit positions of these parts.
 *
 * The bitmap is just above the type tag. The `dns_qp_bits_for_byte[]` table
 * is used to fill in a key so that bit tests can work directly against the
 * index word without superfluous masking or shifting; we don't need to
 * mask out the bitmap before testing a bit, but we do need to mask the
 * bitmap before calling popcount.
 *
 * The byte offset into the key is at the top of the word, so that it
 * can be extracted with just a shift, with no masking needed.
 *
 * The names are SHIFT_thing because they are dns_qpshift_t values. (See
 * below for the various `qp_*` type declarations.)
 *
 * These values are relatively fixed in practice: SHIFT_NOBYTE needs
 * to leave space for the type tag, and the implementation of
 * `dns_qpkey_fromname()` depends on the bitmap being large enough.
 * The symbolic names avoid mystery numbers in the code.
 */
enum {
        SHIFT_NOBYTE = 2,  /* label separator has no byte value */
        SHIFT_BITMAP,      /* many bits here */
        SHIFT_OFFSET = 49, /* offset of byte in key */
};

/***********************************************************************
 *
 *  garbage collector tuning parameters
 */

/*
 * A "cell" is a location that can contain a `dns_qpnode_t`, and a "chunk"
 * is a moderately large array of cells. A big trie can occupy
 * multiple chunks. (Unlike other nodes, a trie's root node lives in
 * its `struct dns_qp` instead of being allocated in a cell.)
 *
 * The qp-trie allocator hands out space for twigs vectors. Allocations are
 * made sequentially from one of the chunks; this kind of "sequential
 * allocator" is also known as a "bump allocator", so in `struct dns_qp`
 * (see below) the allocation chunk is called `bump`.
 */

/*
 * Number of cells in a chunk is a power of 2, which must have space for
 * a full twigs vector (48 wide). When testing, use a much smaller chunk
 * size to make the allocator work harder.
 */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define QP_CHUNK_LOG_MIN 7
#define QP_CHUNK_LOG_MAX 7
#else
#define QP_CHUNK_LOG_MIN 3
#define QP_CHUNK_LOG_MAX 12
#endif

STATIC_ASSERT(2 <= QP_CHUNK_LOG_MIN && QP_CHUNK_LOG_MIN <= QP_CHUNK_LOG_MAX,
              "qp-trie min chunk size is unreasonable");
STATIC_ASSERT(6 <= QP_CHUNK_LOG_MAX && QP_CHUNK_LOG_MAX <= 20,
              "qp-trie max chunk size is unreasonable");

#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG_MAX)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(dns_qpnode_t))

STATIC_ASSERT(QP_SAFETY_MARGIN >= QP_CHUNK_BYTES,
              "qp-trie safety margin too small");

/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG_MAX`.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG_MAX + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12% overhead seems reasonable)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)

/*
 * Compact automatically when we pass this threshold: when there is a lot
 * of free space in absolute terms, and when we have freed more than half
 * of the space we allocated.
 *
 * The current compaction algorithm scans the whole trie, so it is important
 * to scale the threshold based on the size of the trie to avoid quadratic
 * behaviour. XXXFANF find an algorithm that scans less of the trie!
 *
 * During a modification transaction, when we copy-on-write some twigs we
 * count the old copy as "free", because they will be when the transaction
 * commits.
 * But they cannot be recovered immediately so they are also
 * counted as on hold, and discounted when we decide whether to compact.
 */
#define QP_GC_HEURISTIC(qp, free) \
        ((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)

/*
 * The chunk base and usage arrays are resized geometrically and start off
 * with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)

/*
 * Constructors and accessors for dns_qpref_t values, defined here to show
 * how the dns_qpref_t, dns_qpchunk_t, dns_qpcell_t types relate to each
 * other.
 */

/* Combine a chunk number and a cell index into a single twig reference. */
static inline dns_qpref_t
make_ref(dns_qpchunk_t chunk, dns_qpcell_t cell) {
        return QP_CHUNK_SIZE * chunk + cell;
}

/* Which chunk does this reference point into? */
static inline dns_qpchunk_t
ref_chunk(dns_qpref_t ref) {
        return ref / QP_CHUNK_SIZE;
}

/* Which cell within its chunk does this reference name? */
static inline dns_qpcell_t
ref_cell(dns_qpref_t ref) {
        return ref % QP_CHUNK_SIZE;
}

/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to a value that should trigger an obvious bug. See qp_init()
 * and get_root() below.
 */
#define INVALID_REF ((dns_qpref_t)~0UL)

/***********************************************************************
 *
 *  chunk arrays
 */

/*
 * A `dns_qp_t` contains two arrays holding information about each chunk.
 *
 * The `base` array holds pointers to the base of each chunk.
 * The `usage` array holds the allocator's state for each chunk.
 *
 * The `base` array is used by the hot qp-trie traversal paths. It can
 * be shared by multiple versions of a trie, which are tracked with a
 * refcount. Old versions of the trie can retain old versions of the
 * `base` array.
 *
 * In multithreaded code, the `usage` array is only used when the
 * `dns_qpmulti_t` mutex is held, and there is only one version of
 * it in active use (maybe with a snapshot for rollback support).
 *
 * The two arrays are separate because they have rather different
 * access patterns, different lifetimes, and different element sizes.
 */

/*
 * For most purposes we don't need to know exactly which cells are
 * in use in a chunk, we only need to know how many of them there are.
 *
 * After we have finished allocating from a chunk, the `used` counter
 * is the size we need to know for shrinking the chunk and for
 * scanning it to detach leaf values before the chunk is free()d. The
 * `free` counter tells us when the chunk needs compacting and when it
 * has become empty.
 *
 * The `exists` flag allows the chunk scanning loops to look at the
 * usage array only.
 *
 * In multithreaded code, we mark chunks as `immutable` when a modify
 * transaction is opened. (We don't mark them immutable on commit,
 * because the old bump chunk must remain mutable between write
 * transactions, but it must become immutable when an update
 * transaction is opened.)
 *
 * There are a few flags used to mark which chunks are still needed by
 * snapshots after the chunks have passed their normal reclamation
 * phase.
 */
typedef struct qp_usage {
        /*% the allocation point, increases monotonically */
        dns_qpcell_t used : QP_USAGE_BITS;
        /*% the actual size of the allocation */
        dns_qpcell_t capacity : QP_USAGE_BITS;
        /*% count of nodes no longer needed, also monotonic */
        dns_qpcell_t free : QP_USAGE_BITS;
        /*% qp->base->ptr[chunk] != NULL */
        bool exists : 1;
        /*% is this chunk shared? [MT] */
        bool immutable : 1;
        /*% already subtracted from multi->*_count [MT] */
        bool discounted : 1;
        /*% is a snapshot using this chunk? [MT] */
        bool snapshot : 1;
        /*% tried to free it but a snapshot needs it [MT] */
        bool snapfree : 1;
        /*% for mark/sweep snapshot flag updates [MT] */
        bool snapmark : 1;
} qp_usage_t;

/*
 * The chunks are owned by the current version of the `base` array.
 * When the array is resized, the old version might still be in use by
 * concurrent readers, in which case it is free()d later when its
 * refcount drops to zero.
 *
 * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
 * from packed readers, but not from `dns_qpread_t` nor from
 * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
 * would wreck multicore scalability; instead we rely on RCU.
 *
 * The `usage` array determines when a chunk is no longer needed: old
 * chunk pointers in old `base` arrays are ignored. (They can become
 * dangling pointers to free memory, but they will never be
 * dereferenced.)
 *
 * We ensure that individual chunk base pointers remain immutable
 * after assignment, and they are not cleared until the chunk is
 * free()d, after all readers have departed. Slots can be reused, and
 * we allow transactions to fill or re-fill empty slots adjacent to
 * busy slots that are in use by readers.
 */
struct dns_qpbase {
        unsigned int magic;
        isc_refcount_t refcount;
        /* flexible array of per-chunk base pointers */
        dns_qpnode_t *ptr[];
};

/*
 * Chunks that may be in use by readers are reclaimed asynchronously.
 * When a transaction commits, immutable chunks that are now empty are
 * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
 */
typedef struct qp_rcuctx {
        unsigned int magic;
        struct rcu_head rcu_head;
        isc_mem_t *mctx;
        dns_qpmulti_t *multi;
        ISC_LINK(struct qp_rcuctx) link;
        /* number of entries in the flexible `chunk` array below */
        dns_qpchunk_t count;
        dns_qpchunk_t chunk[];
} qp_rcuctx_t;

/*
 * Returns true when the base array can be free()d.
 */
static inline bool
qpbase_unref(dns_qpreadable_t qpr) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        /* the caller must free the array iff we dropped the last ref */
        return qp->base != NULL &&
               isc_refcount_decrement(&qp->base->refcount) == 1;
}

/*
 * Now we know about `dns_qpreader_t` and `dns_qpbase_t`,
 * here's how we convert a twig reference into a pointer.
 */
static inline dns_qpnode_t *
ref_ptr(dns_qpreadable_t qpr, dns_qpref_t ref) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        return qp->base->ptr[ref_chunk(ref)] + ref_cell(ref);
}

/***********************************************************************
 *
 *  main qp-trie structures
 */

#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPCHAIN_MAGIC  ISC_MAGIC('q', 'p', 'c', 'h')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)      ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPCHAIN_VALID(qp) ISC_MAGIC_VALID(qp, QPCHAIN_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)   ISC_MAGIC_VALID(qp, QPRCU_MAGIC)

/*
 * Polymorphic initialization of the `dns_qpreader_t` prefix.
 *
 * The location of the root node is actually a dns_qpref_t, but is
 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
 * many internal details into the public API.
 *
 * The `uctx` and `methods` support callbacks into the user's code.
 * They are constant after initialization.
 */
#define QP_INIT(qp, m, x)                 \
        (*(qp) = (typeof(*(qp))){         \
                 .magic = QP_MAGIC,       \
                 .root_ref = INVALID_REF, \
                 .uctx = x,               \
                 .methods = m,            \
         })

/*
 * Snapshots have some extra cleanup machinery.
 *
 * Originally, a snapshot was basically just a `dns_qpread_t`
 * allocated on the heap, with the extra behaviour that memory
 * reclamation is suppressed for a particular trie while it has any
 * snapshots. However that design gets into trouble for a zone with
 * frequent updates and many zone transfers.
 *
 * Instead, each snapshot records which chunks it needs. When a
 * snapshot is created, it makes a copy of the `base` array, except
 * for chunks that are empty and waiting to be reclaimed. When a
 * snapshot is destroyed, we can traverse the list of snapshots to
 * accurately mark which chunks are still needed.
 *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
 * not muddled up with the wrong `dns_qpmulti_t`.
 *
 * A trie's `base` array might have grown after the snapshot was
 * created, so it records its own `chunk_max`.
 */
struct dns_qpsnap {
        DNS_QPREADER_FIELDS;
        dns_qpmulti_t *whence;
        uint32_t chunk_max;
        ISC_LINK(struct dns_qpsnap) link;
};

/*
 * Read-write access to a qp-trie requires extra fields to support the
 * allocator and garbage collector.
 *
 * Bare instances of a `struct dns_qp` are used for stand-alone
 * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
 * wraps a `dns_qp_t` with a mutex and other fields that are only needed
 * at the start or end of a transaction.
 *
 * Allocations are made sequentially in the `bump` chunk. A sequence
 * of lightweight write transactions can use the same `bump` chunk, so
 * its prefix before `fender` is immutable, and the rest is mutable.
 *
 * To decide when to compact and reclaim space, QP_NEEDGC() and
 * QP_AUTOGC() (see QP_GC_HEURISTIC() above) examine the values of
 * `used_count`, `free_count`, and `hold_count`. The `hold_count`
 * tracks nodes that need to be retained while readers are using
 * them; they are free but cannot be reclaimed until the transaction
 * has committed, so the `hold_count` is discounted by QP_AUTOGC()
 * during a transaction.
 *
 * There are some flags that alter the behaviour of write transactions.
 *
 * - The `transaction_mode` indicates whether the current transaction is a
 *   light write or a heavy update, or (between transactions) the previous
 *   transaction's mode, because the setup for the next transaction
 *   depends on how the previous one committed. The mode is set at the
 *   start of each transaction. It is QP_NONE in a single-threaded qp-trie
 *   to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
 *
 * - The `compact_all` flag is used when every node in the trie should be
 *   copied. (Usually compaction aims to avoid moving nodes out of
 *   unfragmented chunks.) It is used when compaction is explicitly
 *   requested via `dns_qp_compact()`, and as an emergency mechanism if
 *   normal compaction failed to clear the automatic-GC condition.
 *   (This emergency is a bug even though we have a rescue mechanism.)
 *
 * - When a qp-trie is destroyed while it has pending cleanup work, its
 *   `destroy` flag is set so that it is destroyed by the reclaim worker.
 *   (Because items cannot be removed from the middle of the cleanup list.)
 *
 * - When built with fuzzing support, we can use mprotect() and munmap()
 *   to ensure that incorrect memory accesses cause fatal errors. The
 *   `write_protect` flag must be set straight after the `dns_qpmulti_t`
 *   is created, then left unchanged.
 *
 * Some of the dns_qp_t fields are only needed for multithreaded transactions
 * (marked [MT] below) but the same code paths are also used for single-
 * threaded writes.
 */
struct dns_qp {
        DNS_QPREADER_FIELDS;
        /*% memory context (const) */
        isc_mem_t *mctx;
        /*% array of per-chunk allocation counters */
        qp_usage_t *usage;
        /*% number of slots in `chunk` and `usage` arrays */
        dns_qpchunk_t chunk_max;
        /*% which chunk is used for allocations */
        dns_qpchunk_t bump;
        /*% nodes in the `bump` chunk below `fender` are read only [MT] */
        dns_qpcell_t fender;
        /*% number of leaf nodes */
        dns_qpcell_t leaf_count;
        /*% total of all usage[] counters */
        dns_qpcell_t used_count, free_count;
        /*% free cells that cannot be recovered right now */
        dns_qpcell_t hold_count;
        /*% capacity of last allocated chunk, for exponential chunk growth */
        dns_qpcell_t chunk_capacity;
        /*% what kind of transaction was most recently started [MT] */
        enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
        /*% compact the entire trie [MT] */
        bool compact_all : 1;
        /*% optionally when compiled with fuzzing support [MT] */
        bool write_protect : 1;
};

/*
 * Concurrent access to a qp-trie.
 *
 * The `reader` pointer provides wait-free access to the current version
 * of the trie. See the "packed reader nodes" section below for a
 * description of what it points to.
 *
 * The main object under the protection of the mutex is the `writer`
 * containing all the allocator state. There can be a backup copy when
 * we want to be able to rollback an update transaction.
 *
 * There is a `reader_ref` which corresponds to the `reader` pointer
 * (`ref_ptr(multi->reader_ref) == multi->reader`).
 * The `reader_ref` is necessary when freeing the space used by the
 * reader, because there isn't a good way to recover a dns_qpref_t
 * from a dns_qpnode_t pointer.
 *
 * There is a per-trie list of snapshots that is used for reclaiming
 * memory when a snapshot is destroyed.
 *
 * Finally, we maintain a global list of `dns_qpmulti_t` objects that
 * need asynchronous safe memory recovery.
 */
struct dns_qpmulti {
        uint32_t magic;
        /*% RCU-protected pointer to current packed reader */
        dns_qpnode_t *reader;
        /*% the mutex protects the rest of this structure */
        isc_mutex_t mutex;
        /*% ref_ptr(writer, reader_ref) == reader */
        dns_qpref_t reader_ref;
        /*% the main working structure */
        dns_qp_t writer;
        /*% saved allocator state to support rollback */
        dns_qp_t *rollback;
        /*% all snapshots of this trie */
        ISC_LIST(dns_qpsnap_t) snapshots;
        /*% refcount for memory reclamation */
        isc_refcount_t references;
};

/***********************************************************************
 *
 *  interior node constructors and accessors
 */

/*
 * See the comments under "interior node basics" above, which explain
 * the layout of nodes as implemented by the following functions.
 *
 * These functions are (mostly) constructors and getters. Imagine how
 * much less code there would be if C had sum types with control over
 * the layout...
 */

/*
 * Get the 64-bit word of a node. The two 32-bit halves are stored in
 * native endian order (see `struct dns_qpnode` above) so this should
 * compile down to an unaligned 64-bit load.
 */
static inline uint64_t
node64(dns_qpnode_t *n) {
        uint64_t lo = n->biglo;
        uint64_t hi = n->bighi;
        return lo | (hi << 32);
}

/*
 * Get the 32-bit word of a node.
572 */ 573 static inline uint32_t 574 node32(dns_qpnode_t *n) { 575 return n->small; 576 } 577 578 /* 579 * Create a node from its parts 580 */ 581 static inline dns_qpnode_t 582 make_node(uint64_t big, uint32_t small) { 583 return (dns_qpnode_t){ 584 .biglo = (uint32_t)(big), 585 .bighi = (uint32_t)(big >> 32), 586 .small = small, 587 }; 588 } 589 590 /* 591 * Extract a pointer from a node's 64 bit word. The double cast is to avoid 592 * a warning about mismatched pointer/integer sizes on 32 bit systems. 593 */ 594 static inline void * 595 node_pointer(dns_qpnode_t *n) { 596 return (void *)(uintptr_t)(node64(n) & ~TAG_MASK); 597 } 598 599 /* 600 * Examine a node's tag bits 601 */ 602 static inline uint32_t 603 node_tag(dns_qpnode_t *n) { 604 return n->biglo & TAG_MASK; 605 } 606 607 /* 608 * simplified for the hot path 609 */ 610 static inline bool 611 is_branch(dns_qpnode_t *n) { 612 return n->biglo & BRANCH_TAG; 613 } 614 615 /* leaf nodes *********************************************************/ 616 617 /* 618 * Get a leaf's pointer value. 619 */ 620 static inline void * 621 leaf_pval(dns_qpnode_t *n) { 622 return node_pointer(n); 623 } 624 625 /* 626 * Get a leaf's integer value 627 */ 628 static inline uint32_t 629 leaf_ival(dns_qpnode_t *n) { 630 return node32(n); 631 } 632 633 /* 634 * Create a leaf node from its parts 635 */ 636 static inline dns_qpnode_t 637 make_leaf(const void *pval, uint32_t ival) { 638 dns_qpnode_t leaf = make_node((uintptr_t)pval, ival); 639 REQUIRE(node_tag(&leaf) == LEAF_TAG); 640 return leaf; 641 } 642 643 /* branch nodes *******************************************************/ 644 645 /* 646 * The following function names use plural `twigs` when they work on a 647 * branch's twigs vector as a whole, and singular `twig` when they work on 648 * a particular twig. 
649 */ 650 651 /* 652 * Get a branch node's index word 653 */ 654 static inline uint64_t 655 branch_index(dns_qpnode_t *n) { 656 return node64(n); 657 } 658 659 /* 660 * Get a reference to a branch node's child twigs. 661 */ 662 static inline dns_qpref_t 663 branch_twigs_ref(dns_qpnode_t *n) { 664 return node32(n); 665 } 666 667 /* 668 * Bit positions in the bitmap come directly from the key. DNS names are 669 * converted to keys using the tables declared at the end of this file. 670 */ 671 static inline dns_qpshift_t 672 qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) { 673 if (offset < len) { 674 return key[offset]; 675 } else { 676 return SHIFT_NOBYTE; 677 } 678 } 679 680 /* 681 * Extract a branch node's offset field, used to index the key. 682 */ 683 static inline size_t 684 branch_key_offset(dns_qpnode_t *n) { 685 return (size_t)(branch_index(n) >> SHIFT_OFFSET); 686 } 687 688 /* 689 * Which bit identifies the twig of this node for this key? 690 */ 691 static inline dns_qpshift_t 692 branch_keybit(dns_qpnode_t *n, const dns_qpkey_t key, size_t len) { 693 return qpkey_bit(key, len, branch_key_offset(n)); 694 } 695 696 /* 697 * Get a pointer to a the first twig of a branch (this also functions 698 * as a pointer to the entire twig vector). 699 */ 700 static inline dns_qpnode_t * 701 branch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) { 702 return ref_ptr(qpr, branch_twigs_ref(n)); 703 } 704 705 /* 706 * Warm up the cache while calculating which twig we want. 707 */ 708 static inline void 709 prefetch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) { 710 __builtin_prefetch(ref_ptr(qpr, branch_twigs_ref(n))); 711 } 712 713 /* root node **********************************************************/ 714 715 /* 716 * Get a pointer to the root node, checking if the trie is empty. 
 */
static inline dns_qpnode_t *
get_root(dns_qpreadable_t qpr) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        /* INVALID_REF marks an empty trie; see qp_init() */
        if (qp->root_ref == INVALID_REF) {
                return NULL;
        } else {
                return ref_ptr(qp, qp->root_ref);
        }
}

/*
 * When we need to move the root node, we avoid repeating allocation
 * logistics by making a temporary fake branch node that has
 * `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`
 * just enough to treat the root node as a vector of one twig.
 */
#define MOVABLE_ROOT(qp)                                   \
        (&(dns_qpnode_t){                                  \
                .biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
                .small = qp->root_ref,                     \
        })

/***********************************************************************
 *
 *  bitmap popcount shenanigans
 */

/*
 * How many twigs appear in the vector before the one corresponding to the
 * given bit? Calculated using popcount of part of the branch's bitmap.
 *
 * To calculate a mask that covers the lesser bits in the bitmap,
 * we subtract 1 to set all lesser bits, and subtract the tag mask
 * because the type tag is not part of the bitmap.
 */
static inline dns_qpweight_t
branch_count_bitmap_before(dns_qpnode_t *n, dns_qpshift_t bit) {
        uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
        uint64_t bitmap = branch_index(n) & mask;
        return (dns_qpweight_t)__builtin_popcountll(bitmap);
}

/*
 * How many twigs does this branch have?
 *
 * The offset is directly after the bitmap so the offset's lesser bits
 * cover the whole bitmap, and the bitmap's weight is the number of twigs.
 */
static inline dns_qpweight_t
branch_twigs_size(dns_qpnode_t *n) {
        return branch_count_bitmap_before(n, SHIFT_OFFSET);
}

/*
 * Position of a twig within the packed sparse vector.
 */
static inline dns_qpweight_t
branch_twig_pos(dns_qpnode_t *n, dns_qpshift_t bit) {
        return branch_count_bitmap_before(n, bit);
}

/*
 * Get a pointer to the twig for a given bit number.
 */
static inline dns_qpnode_t *
branch_twig_ptr(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpshift_t bit) {
        return ref_ptr(qpr, branch_twigs_ref(n) + branch_twig_pos(n, bit));
}

/*
 * Is the twig identified by this bit present?
 */
static inline bool
branch_has_twig(dns_qpnode_t *n, dns_qpshift_t bit) {
        return branch_index(n) & (1ULL << bit);
}

/* twig logistics *****************************************************/

/* Move a twigs vector; memmove() allows source and destination to overlap. */
static inline void
move_twigs(dns_qpnode_t *to, dns_qpnode_t *from, dns_qpweight_t size) {
        memmove(to, from, size * sizeof(dns_qpnode_t));
}

/* Zero a twigs vector. */
static inline void
zero_twigs(dns_qpnode_t *twigs, dns_qpweight_t size) {
        memset(twigs, 0, size * sizeof(dns_qpnode_t));
}

/***********************************************************************
 *
 *  packed reader nodes
 */

/*
 * The purpose of these packed reader nodes is to simplify safe memory
 * reclamation for a multithreaded qp-trie.
 *
 * After the `reader` pointer in a qpmulti is replaced, we need to wait
 * for a grace period before we can reclaim the memory that is no longer
 * needed by the trie. So we need some kind of structure to hold
 * pointers to the (logically) detached memory until it is safe to free.
 * This memory includes the chunks and the `base` arrays.
 *
 * Packed reader nodes save us from having to track `dns_qpread_t`
 * objects as distinct allocations: the packed reader nodes get
 * reclaimed when the chunk containing their cells is reclaimed.
 * When a real `dns_qpread_t` object is needed, it is allocated on the
 * stack (it must not live longer than an isc_loop callback) and the
 * packed reader is unpacked into it.
 *
 * Chunks are owned by the current `base` array, so unused chunks are
 * held there until they are free()d. Old `base` arrays are attached
 * to packed reader nodes with a refcount. When a chunk is reclaimed,
 * it is scanned so that `chunk_free()` can call `detach_leaf()` on
 * any remaining references to leaf objects. Similarly, it calls
 * `qpbase_unref()` to reclaim old `base` arrays.
 */

/*
 * Two nodes is just enough space for the information needed by
 * readers and for deferred memory reclamation.
 */
#define READER_SIZE 2

/*
 * Create a packed reader; space for the reader should have been
 * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
 * Node 0 packs the `multi` pointer (tagged READER_TAG) with a magic
 * number; node 1 packs the current `base` array with the root ref.
 */
static inline void
make_reader(dns_qpnode_t *reader, dns_qpmulti_t *multi) {
        dns_qp_t *qp = &multi->writer;
        reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
        reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
}

/* Check the tags and magic number of a packed reader pair. */
static inline bool
reader_valid(dns_qpnode_t *reader) {
        return reader != NULL && //
               node_tag(&reader[0]) == READER_TAG &&
               node_tag(&reader[1]) == READER_TAG &&
               node32(&reader[0]) == QPREADER_MAGIC;
}

/*
 * Verify and unpack a reader. We return the `multi` pointer to use in
 * consistency checks.
 */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, dns_qpnode_t *reader) {
        INSIST(reader_valid(reader));
        dns_qpmulti_t *multi = node_pointer(&reader[0]);
        dns_qpbase_t *base = node_pointer(&reader[1]);
        INSIST(QPMULTI_VALID(multi));
        INSIST(QPBASE_VALID(base));
        *qp = (dns_qpreader_t){
                .magic = QP_MAGIC,
                .uctx = multi->writer.uctx,
                .methods = multi->writer.methods,
                .root_ref = node32(&reader[1]),
                .base = base,
        };
        return multi;
}

/***********************************************************************
 *
 *  method invocation helpers
 */

/* Tell the user's code to attach a reference to a leaf's value. */
static inline void
attach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

/* Tell the user's code to detach a reference to a leaf's value. */
static inline void
detach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
}

/*
 * Ask the user's code to reconstruct the lookup key for a leaf;
 * returns the length of the key, which must fit in a dns_qpkey_t.
 */
static inline size_t
leaf_qpkey(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpkey_t key) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
                                          leaf_ival(n));
        INSIST(len < sizeof(dns_qpkey_t));
        return len;
}

/*
 * Ask the user's code to write the trie's name into `buf`;
 * returns `buf` for the caller's convenience.
 */
static inline char *
triename(dns_qpreadable_t qpr, char *buf, size_t size) {
        dns_qpreader_t *qp = dns_qpreader(qpr);
        qp->methods->triename(qp->uctx, buf, size);
        return buf;
}

#define TRIENAME(qp) \
        triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)

/***********************************************************************
 *
 *  converting DNS names to trie keys
 */

/*
 * This is a deliberate simplification of the hostname characters,
 * because it doesn't matter much if we treat a few extra characters
 * favourably: there is plenty of space in the index word for a slightly
/*
 * larger bitmap.
 */
static inline bool
qp_common_character(uint8_t byte) {
        /* '-' .. '9' covers hyphen, dot, slash, and the digits */
        if ('-' <= byte && byte <= '9') {
                return true;
        }
        /* '_' .. 'z' covers underscore, backquote, and lowercase letters */
        return '_' <= byte && byte <= 'z';
}

/*
 * Lookup table mapping bytes in DNS names to bit positions, used
 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
 */
extern uint16_t dns_qp_bits_for_byte[];

/*
 * And the reverse, mapping bit positions to characters, so the tests
 * can print diagnostics involving qp-trie keys.
 */
extern uint8_t dns_qp_byte_for_bit[];

/**********************************************************************/