/* $NetBSD: qp_p.h,v 1.2.4.2 2025/08/02 05:53:28 perseant Exp $ */
2
3 /*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16 /*
17 * For an overview, see doc/design/qp-trie.md
18 *
19 * This private header defines the internal data structures,
20 */
21
22 #pragma once
23
24 /***********************************************************************
25 *
26 * interior node basics
27 */
28
29 /*
30 * A qp-trie node is almost always one of two types: branch or leaf.
31 * (A third type is used only to anchor the root of a trie; see below.)
32 *
33 * A node contains a 64-bit word and a 32-bit word. In order to avoid
34 * unwanted padding, they are declared as three 32-bit words; this keeps
35 * the size down to 12 bytes. They are in native endian order, so getting
36 * the 64-bit part should compile down to an unaligned load.
37 *
38 * The node type is identified by the least significant bits of the 64-bit
39 * word.
40 *
41 * In a leaf node:
42 * - The 64-bit word is used to store a pointer value. (Pointers must be
43 * word-aligned so the least significant bits are zero; those bits can
44 * then act as a node tag to indicate that this is a leaf. This
45 * requirement is enforced by the make_leaf() constructor.)
46 * - The 32-bit word is used to store an integer value. Both the
47 * pointer and integer values can be retrieved when looking up a key.
48 *
49 * In a branch node:
50 * - The 64-bit word is subdivided into three portions: the least
51 * significant bits are the node type (for a branch, 0x1); the
 * most significant 15 bits are an offset value into the key, and
53 * the 47 bits in the middle are a bitmap; see the documentation
54 * for the SHIFT_* enum below.
55 * - The 32-bit word is a reference (dns_qpref_t) to the packed sparse
56 * vector of "twigs", i.e. child nodes. A branch node has at least
57 * two and at most 47 twigs. (The qp-trie update functions ensure that
58 * branches actually branch, i.e. a branch cannot have only one child.)
59 *
60 * A third node type, reader nodes, anchors the root of a trie.
61 * A pair of reader nodes together contain a packed `dns_qpreader_t`.
62 * See the section on "packed reader nodes" for details.
63 */
/*
 * One qp-trie node, declared as three 32-bit fields (instead of one
 * uint64_t plus one uint32_t) to keep sizeof == 12 with no padding.
 * The two halves of the 64-bit word are ordered so that a native
 * unaligned 64-bit load of the big half yields the correct value;
 * see node64() and make_node() below.
 */
struct dns_qpnode {
#if WORDS_BIGENDIAN
	uint32_t bighi, biglo, small;
#else
	uint32_t biglo, bighi, small;
#endif
};
71
72 /*
73 * The possible values of the node type tag. Type tags must fit in two bits
74 * for compatibility with 4-byte pointer alignment on 32-bit systems.
75 */
enum {
	LEAF_TAG = 0,	/* leaf node (low bits of a word-aligned pointer) */
	BRANCH_TAG = 1, /* branch node */
	READER_TAG = 2, /* reader node (anchors a trie root) */
	TAG_MASK = 3,	/* mask covering both tag bits */
};
82
/*
 * This code does not work on CPUs with large pointers, e.g. CHERI capability
 * architectures. When porting to that kind of machine, a `dns_qpnode` should
 * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
 * branch node will fit in the same space with room to spare.
 */
STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
	      "pointers must fit in 64 bits");
91
92 /*
93 * The 64-bit word in a branch node is comprised of a node type tag, a
94 * bitmap, and an offset into the key. It is called an "index word" because
95 * it describes how to access the twigs vector (think "database index").
96 * The following enum sets up the bit positions of these parts.
97 *
98 * The bitmap is just above the type tag. The `dns_qp_bits_for_byte[]` table
99 * is used to fill in a key so that bit tests can work directly against the
100 * index word without superfluous masking or shifting; we don't need to
101 * mask out the bitmap before testing a bit, but we do need to mask the
102 * bitmap before calling popcount.
103 *
104 * The byte offset into the key is at the top of the word, so that it
105 * can be extracted with just a shift, with no masking needed.
106 *
107 * The names are SHIFT_thing because they are dns_qpshift_t values. (See
108 * below for the various `qp_*` type declarations.)
109 *
110 * These values are relatively fixed in practice: SHIFT_NOBYTE needs
111 * to leave space for the type tag, and the implementation of
112 * `dns_qpkey_fromname()` depends on the bitmap being large enough.
113 * The symbolic names avoid mystery numbers in the code.
114 */
enum {
	SHIFT_NOBYTE = 2, /* label separator has no byte value */
	SHIFT_BITMAP,	  /* many bits here (bitmap occupies bits 3..48) */
	SHIFT_OFFSET = 49, /* top 15 bits: offset of byte in key */
};
120
121 /***********************************************************************
122 *
123 * garbage collector tuning parameters
124 */
125
126 /*
127 * A "cell" is a location that can contain a `dns_qpnode_t`, and a "chunk"
128 * is a moderately large array of cells. A big trie can occupy
129 * multiple chunks. (Unlike other nodes, a trie's root node lives in
130 * its `struct dns_qp` instead of being allocated in a cell.)
131 *
132 * The qp-trie allocator hands out space for twigs vectors. Allocations are
133 * made sequentially from one of the chunks; this kind of "sequential
134 * allocator" is also known as a "bump allocator", so in `struct dns_qp`
135 * (see below) the allocation chunk is called `bump`.
136 */
137
/*
 * The number of cells in a chunk is a power of 2, which must have space
 * for a full twigs vector (48 wide). When fuzzing, use a much smaller
 * chunk size to make the allocator work harder.
 */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define QP_CHUNK_LOG 7
#else
#define QP_CHUNK_LOG 10
#endif

STATIC_ASSERT(6 <= QP_CHUNK_LOG && QP_CHUNK_LOG <= 20,
	      "qp-trie chunk size is unreasonable");

#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(dns_qpnode_t))
154
/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it must be able to count from 0 up to and including `1 << QP_CHUNK_LOG`,
 * hence one bit more than QP_CHUNK_LOG.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12% overhead seems reasonable)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)
167
/*
 * Compact automatically when we pass this threshold: when there is a lot
 * of free space in absolute terms, and when we have freed more than half
 * of the space we allocated.
 *
 * The current compaction algorithm scans the whole trie, so it is important
 * to scale the threshold based on the size of the trie to avoid quadratic
 * behaviour. XXXFANF find an algorithm that scans less of the trie!
 *
 * During a modification transaction, when we copy-on-write some twigs we
 * count the old copy as "free", because they will be when the transaction
 * commits. But they cannot be recovered immediately so they are also
 * counted as on hold, and discounted when we decide whether to compact
 * (QP_AUTOGC below subtracts hold_count; QP_NEEDGC does not).
 */
#define QP_GC_HEURISTIC(qp, free) \
	((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)
187
/*
 * The chunk base and usage arrays are resized geometrically (growing
 * by about 1.5x each time) and start off with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)
193
194 /*
195 * Constructors and accessors for dns_qpref_t values, defined here to show
196 * how the dns_qpref_t, dns_qpchunk_t, dns_qpcell_t types relate to each other
197 */
198
199 static inline dns_qpref_t
200 make_ref(dns_qpchunk_t chunk, dns_qpcell_t cell) {
201 return QP_CHUNK_SIZE * chunk + cell;
202 }
203
204 static inline dns_qpchunk_t
205 ref_chunk(dns_qpref_t ref) {
206 return ref / QP_CHUNK_SIZE;
207 }
208
209 static inline dns_qpcell_t
210 ref_cell(dns_qpref_t ref) {
211 return ref % QP_CHUNK_SIZE;
212 }
213
/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to a value (all bits set) that should trigger an obvious bug if it
 * is ever dereferenced. See qp_init() and get_root() below.
 */
#define INVALID_REF ((dns_qpref_t)~0UL)
220
221 /***********************************************************************
222 *
223 * chunk arrays
224 */
225
226 /*
227 * A `dns_qp_t` contains two arrays holding information about each chunk.
228 *
229 * The `base` array holds pointers to the base of each chunk.
 * The `usage` array holds the allocator's state for each chunk.
231 *
232 * The `base` array is used by the hot qp-trie traversal paths. It can
233 * be shared by multiple versions of a trie, which are tracked with a
234 * refcount. Old versions of the trie can retain old versions of the
235 * `base` array.
236 *
237 * In multithreaded code, the `usage` array is only used when the
238 * `dns_qpmulti_t` mutex is held, and there is only one version of
239 * it in active use (maybe with a snapshot for rollback support).
240 *
241 * The two arrays are separate because they have rather different
242 * access patterns, different lifetimes, and different element sizes.
243 */
244
245 /*
246 * For most purposes we don't need to know exactly which cells are
247 * in use in a chunk, we only need to know how many of them there are.
248 *
249 * After we have finished allocating from a chunk, the `used` counter
250 * is the size we need to know for shrinking the chunk and for
251 * scanning it to detach leaf values before the chunk is free()d. The
252 * `free` counter tells us when the chunk needs compacting and when it
253 * has become empty.
254 *
255 * The `exists` flag allows the chunk scanning loops to look at the
256 * usage array only.
257 *
258 * In multithreaded code, we mark chunks as `immutable` when a modify
259 * transaction is opened. (We don't mark them immutable on commit,
260 * because the old bump chunk must remain mutable between write
261 * transactions, but it must become immutable when an update
262 * transaction is opened.)
263 *
264 * There are a few flags used to mark which chunks are still needed by
265 * snapshots after the chunks have passed their normal reclamation
266 * phase.
267 */
typedef struct qp_usage {
	/*% the allocation point, increases monotonically */
	dns_qpcell_t used : QP_USAGE_BITS;
	/*% count of cells no longer needed, also monotonic */
	dns_qpcell_t free : QP_USAGE_BITS;
	/*% chunk is allocated, i.e. qp->base->ptr[chunk] != NULL */
	bool exists : 1;
	/*% is this chunk shared with readers? [MT] */
	bool immutable : 1;
	/*% already subtracted from multi->*_count [MT] */
	bool discounted : 1;
	/*% is a snapshot using this chunk? [MT] */
	bool snapshot : 1;
	/*% tried to free it but a snapshot needs it [MT] */
	bool snapfree : 1;
	/*% for mark/sweep snapshot flag updates [MT] */
	bool snapmark : 1;
} qp_usage_t;
286
287 /*
288 * The chunks are owned by the current version of the `base` array.
289 * When the array is resized, the old version might still be in use by
290 * concurrent readers, in which case it is free()d later when its
291 * refcount drops to zero.
292 *
293 * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
294 * from packed readers, but not from `dns_qpread_t` nor from
295 * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
296 * would wreck multicore scalability; instead we rely on RCU.
297 *
298 * The `usage` array determines when a chunk is no longer needed: old
299 * chunk pointers in old `base` arrays are ignored. (They can become
300 * dangling pointers to free memory, but they will never be
301 * dereferenced.)
302 *
303 * We ensure that individual chunk base pointers remain immutable
304 * after assignment, and they are not cleared until the chunk is
305 * free()d, after all readers have departed. Slots can be reused, and
306 * we allow transactions to fill or re-fill empty slots adjacent to
307 * busy slots that are in use by readers.
308 */
struct dns_qpbase {
	unsigned int magic;	 /* QPBASE_MAGIC */
	isc_refcount_t refcount; /* from dns_qp_t and packed readers only */
	/* flexible array of per-chunk base pointers */
	dns_qpnode_t *ptr[];
};
314
315 /*
316 * Chunks that may be in use by readers are reclaimed asynchronously.
317 * When a transaction commits, immutable chunks that are now empty are
318 * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
319 */
typedef struct qp_rcuctx {
	unsigned int magic; /* QPRCU_MAGIC */
	struct rcu_head rcu_head;
	isc_mem_t *mctx;
	dns_qpmulti_t *multi;
	dns_qpchunk_t count; /* number of entries in chunk[] */
	/* flexible array of chunk numbers to reclaim after a grace period */
	dns_qpchunk_t chunk[];
} qp_rcuctx_t;
328
329 /*
330 * Returns true when the base array can be free()d.
331 */
332 static inline bool
333 qpbase_unref(dns_qpreadable_t qpr) {
334 dns_qpreader_t *qp = dns_qpreader(qpr);
335 return qp->base != NULL &&
336 isc_refcount_decrement(&qp->base->refcount) == 1;
337 }
338
339 /*
340 * Now we know about `dns_qpreader_t` and `dns_qpbase_t`,
341 * here's how we convert a twig reference into a pointer.
342 */
343 static inline dns_qpnode_t *
344 ref_ptr(dns_qpreadable_t qpr, dns_qpref_t ref) {
345 dns_qpreader_t *qp = dns_qpreader(qpr);
346 return qp->base->ptr[ref_chunk(ref)] + ref_cell(ref);
347 }
348
/***********************************************************************
 *
 * main qp-trie structures
 */

/* struct magic numbers, checked by the *_VALID() macros below */
#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPCHAIN_MAGIC  ISC_MAGIC('q', 'p', 'c', 'h')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)	  ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPCHAIN_VALID(qp) ISC_MAGIC_VALID(qp, QPCHAIN_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)	  ISC_MAGIC_VALID(qp, QPRCU_MAGIC)
368
/*
 * Polymorphic initialization of the `dns_qpreader_t` prefix.
 *
 * The location of the root node is actually a dns_qpref_t, but is
 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
 * many internal details into the public API. An empty trie uses
 * INVALID_REF as its root (see get_root() below).
 *
 * The `uctx` and `methods` support callbacks into the user's code.
 * They are constant after initialization.
 */
#define QP_INIT(qp, m, x)                \
	(*(qp) = (typeof(*(qp))){        \
		.magic = QP_MAGIC,       \
		.root_ref = INVALID_REF, \
		.uctx = x,               \
		.methods = m,            \
	})
386
387 /*
388 * Snapshots have some extra cleanup machinery.
389 *
390 * Originally, a snapshot was basically just a `dns_qpread_t`
391 * allocated on the heap, with the extra behaviour that memory
392 * reclamation is suppressed for a particular trie while it has any
393 * snapshots. However that design gets into trouble for a zone with
394 * frequent updates and many zone transfers.
395 *
396 * Instead, each snapshot records which chunks it needs. When a
397 * snapshot is created, it makes a copy of the `base` array, except
398 * for chunks that are empty and waiting to be reclaimed. When a
399 * snapshot is destroyed, we can traverse the list of snapshots to
400 * accurately mark which chunks are still needed.
401 *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
403 * not muddled up with the wrong `dns_qpmulti_t`.
404 *
405 * A trie's `base` array might have grown after the snapshot was
406 * created, so it records its own `chunk_max`.
407 */
struct dns_qpsnap {
	DNS_QPREADER_FIELDS;
	/*% which dns_qpmulti_t this snapshot belongs to */
	dns_qpmulti_t *whence;
	/*% size of the base array when the snapshot was created */
	uint32_t chunk_max;
	/*% membership of the multi's `snapshots` list */
	ISC_LINK(struct dns_qpsnap) link;
};
414
415 /*
416 * Read-write access to a qp-trie requires extra fields to support the
417 * allocator and garbage collector.
418 *
419 * Bare instances of a `struct dns_qp` are used for stand-alone
420 * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
421 * wraps a `dns_qp_t` with a mutex and other fields that are only needed
422 * at the start or end of a transaction.
423 *
424 * Allocations are made sequentially in the `bump` chunk. A sequence
425 * of lightweight write transactions can use the same `bump` chunk, so
426 * its prefix before `fender` is immutable, and the rest is mutable.
427 *
428 * To decide when to compact and reclaim space, QP_MAX_GARBAGE() examines
429 * the values of `used_count`, `free_count`, and `hold_count`. The
430 * `hold_count` tracks nodes that need to be retained while readers are
431 * using them; they are free but cannot be reclaimed until the transaction
432 * has committed, so the `hold_count` is discounted from QP_MAX_GARBAGE()
433 * during a transaction.
434 *
435 * There are some flags that alter the behaviour of write transactions.
436 *
437 * - The `transaction_mode` indicates whether the current transaction is a
438 * light write or a heavy update, or (between transactions) the previous
439 * transaction's mode, because the setup for the next transaction
440 * depends on how the previous one committed. The mode is set at the
441 * start of each transaction. It is QP_NONE in a single-threaded qp-trie
442 * to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
443 *
444 * - The `compact_all` flag is used when every node in the trie should be
 * copied. (Usually compaction aims to avoid moving nodes out of
446 * unfragmented chunks.) It is used when compaction is explicitly
447 * requested via `dns_qp_compact()`, and as an emergency mechanism if
448 * normal compaction failed to clear the QP_MAX_GARBAGE() condition.
 * (This emergency is a bug even though we have a rescue mechanism.)
450 *
451 * - When a qp-trie is destroyed while it has pending cleanup work, its
452 * `destroy` flag is set so that it is destroyed by the reclaim worker.
453 * (Because items cannot be removed from the middle of the cleanup list.)
454 *
455 * - When built with fuzzing support, we can use mprotect() and munmap()
456 * to ensure that incorrect memory accesses cause fatal errors. The
457 * `write_protect` flag must be set straight after the `dns_qpmulti_t`
458 * is created, then left unchanged.
459 *
460 * Some of the dns_qp_t fields are only needed for multithreaded transactions
461 * (marked [MT] below) but the same code paths are also used for single-
462 * threaded writes.
463 */
struct dns_qp {
	DNS_QPREADER_FIELDS;
	/*% memory context (const) */
	isc_mem_t *mctx;
	/*% array of per-chunk allocation counters */
	qp_usage_t *usage;
	/*% number of slots in `base->ptr` and `usage` arrays */
	dns_qpchunk_t chunk_max;
	/*% which chunk is used for sequential allocations */
	dns_qpchunk_t bump;
	/*% nodes in the `bump` chunk below `fender` are read only [MT] */
	dns_qpcell_t fender;
	/*% number of leaf nodes */
	dns_qpcell_t leaf_count;
	/*% totals of all usage[].used and usage[].free counters */
	dns_qpcell_t used_count, free_count;
	/*% free cells that cannot be recovered right now */
	dns_qpcell_t hold_count;
	/*% what kind of transaction was most recently started [MT] */
	enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
	/*% compact the entire trie, not just fragmented chunks [MT] */
	bool compact_all : 1;
	/*% optionally when compiled with fuzzing support [MT] */
	bool write_protect : 1;
};
489
490 /*
491 * Concurrent access to a qp-trie.
492 *
493 * The `reader` pointer provides wait-free access to the current version
494 * of the trie. See the "packed reader nodes" section below for a
495 * description of what it points to.
496 *
497 * The main object under the protection of the mutex is the `writer`
498 * containing all the allocator state. There can be a backup copy when
499 * we want to be able to rollback an update transaction.
500 *
501 * There is a `reader_ref` which corresponds to the `reader` pointer
502 * (`ref_ptr(multi->reader_ref) == multi->reader`). The `reader_ref` is
503 * necessary when freeing the space used by the reader, because there
504 * isn't a good way to recover a dns_qpref_t from a dns_qpnode_t pointer.
505 *
506 * There is a per-trie list of snapshots that is used for reclaiming
507 * memory when a snapshot is destroyed.
508 *
509 * Finally, we maintain a global list of `dns_qpmulti_t` objects that
510 * need asynchronous safe memory recovery.
511 */
struct dns_qpmulti {
	uint32_t magic; /* QPMULTI_MAGIC */
	/*% RCU-protected pointer to current packed reader */
	dns_qpnode_t *reader;
	/*% the mutex protects the rest of this structure */
	isc_mutex_t mutex;
	/*% ref_ptr(writer, reader_ref) == reader */
	dns_qpref_t reader_ref;
	/*% the main working structure */
	dns_qp_t writer;
	/*% saved allocator state to support rollback */
	dns_qp_t *rollback;
	/*% all snapshots of this trie */
	ISC_LIST(dns_qpsnap_t) snapshots;
};
527
528 /***********************************************************************
529 *
530 * interior node constructors and accessors
531 */
532
533 /*
534 * See the comments under "interior node basics" above, which explain
535 * the layout of nodes as implemented by the following functions.
536 *
537 * These functions are (mostly) constructors and getters. Imagine how
538 * much less code there would be if C had sum types with control over
539 * the layout...
540 */
541
542 /*
543 * Get the 64-bit word of a node.
544 */
545 static inline uint64_t
546 node64(dns_qpnode_t *n) {
547 uint64_t lo = n->biglo;
548 uint64_t hi = n->bighi;
549 return lo | (hi << 32);
550 }
551
552 /*
553 * Get the 32-bit word of a node.
554 */
555 static inline uint32_t
556 node32(dns_qpnode_t *n) {
557 return n->small;
558 }
559
560 /*
561 * Create a node from its parts
562 */
563 static inline dns_qpnode_t
564 make_node(uint64_t big, uint32_t small) {
565 return (dns_qpnode_t){
566 .biglo = (uint32_t)(big),
567 .bighi = (uint32_t)(big >> 32),
568 .small = small,
569 };
570 }
571
572 /*
573 * Extract a pointer from a node's 64 bit word. The double cast is to avoid
574 * a warning about mismatched pointer/integer sizes on 32 bit systems.
575 */
576 static inline void *
577 node_pointer(dns_qpnode_t *n) {
578 return (void *)(uintptr_t)(node64(n) & ~TAG_MASK);
579 }
580
581 /*
582 * Examine a node's tag bits
583 */
584 static inline uint32_t
585 node_tag(dns_qpnode_t *n) {
586 return n->biglo & TAG_MASK;
587 }
588
589 /*
590 * simplified for the hot path
591 */
592 static inline bool
593 is_branch(dns_qpnode_t *n) {
594 return n->biglo & BRANCH_TAG;
595 }
596
597 /* leaf nodes *********************************************************/
598
599 /*
600 * Get a leaf's pointer value.
601 */
602 static inline void *
603 leaf_pval(dns_qpnode_t *n) {
604 return node_pointer(n);
605 }
606
607 /*
608 * Get a leaf's integer value
609 */
610 static inline uint32_t
611 leaf_ival(dns_qpnode_t *n) {
612 return node32(n);
613 }
614
615 /*
616 * Create a leaf node from its parts
617 */
618 static inline dns_qpnode_t
619 make_leaf(const void *pval, uint32_t ival) {
620 dns_qpnode_t leaf = make_node((uintptr_t)pval, ival);
621 REQUIRE(node_tag(&leaf) == LEAF_TAG);
622 return leaf;
623 }
624
625 /* branch nodes *******************************************************/
626
627 /*
628 * The following function names use plural `twigs` when they work on a
629 * branch's twigs vector as a whole, and singular `twig` when they work on
630 * a particular twig.
631 */
632
633 /*
634 * Get a branch node's index word
635 */
636 static inline uint64_t
637 branch_index(dns_qpnode_t *n) {
638 return node64(n);
639 }
640
641 /*
642 * Get a reference to a branch node's child twigs.
643 */
644 static inline dns_qpref_t
645 branch_twigs_ref(dns_qpnode_t *n) {
646 return node32(n);
647 }
648
649 /*
650 * Bit positions in the bitmap come directly from the key. DNS names are
651 * converted to keys using the tables declared at the end of this file.
652 */
653 static inline dns_qpshift_t
654 qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) {
655 if (offset < len) {
656 return key[offset];
657 } else {
658 return SHIFT_NOBYTE;
659 }
660 }
661
662 /*
663 * Extract a branch node's offset field, used to index the key.
664 */
665 static inline size_t
666 branch_key_offset(dns_qpnode_t *n) {
667 return (size_t)(branch_index(n) >> SHIFT_OFFSET);
668 }
669
670 /*
671 * Which bit identifies the twig of this node for this key?
672 */
673 static inline dns_qpshift_t
674 branch_keybit(dns_qpnode_t *n, const dns_qpkey_t key, size_t len) {
675 return qpkey_bit(key, len, branch_key_offset(n));
676 }
677
678 /*
679 * Get a pointer to a the first twig of a branch (this also functions
680 * as a pointer to the entire twig vector).
681 */
682 static inline dns_qpnode_t *
683 branch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
684 return ref_ptr(qpr, branch_twigs_ref(n));
685 }
686
687 /*
688 * Warm up the cache while calculating which twig we want.
689 */
690 static inline void
691 prefetch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
692 __builtin_prefetch(ref_ptr(qpr, branch_twigs_ref(n)));
693 }
694
695 /* root node **********************************************************/
696
697 /*
698 * Get a pointer to the root node, checking if the trie is empty.
699 */
700 static inline dns_qpnode_t *
701 get_root(dns_qpreadable_t qpr) {
702 dns_qpreader_t *qp = dns_qpreader(qpr);
703 if (qp->root_ref == INVALID_REF) {
704 return NULL;
705 } else {
706 return ref_ptr(qp, qp->root_ref);
707 }
708 }
709
/*
 * When we need to move the root node, we avoid repeating allocation
 * logistics by making a temporary fake branch node that has
 * `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`,
 * just enough to treat the root node as a vector of one twig.
 * (A single bitmap bit is set, so the popcount-based size is 1.)
 */
#define MOVABLE_ROOT(qp)                                   \
	(&(dns_qpnode_t){                                  \
		.biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
		.small = qp->root_ref,                     \
	})
721
722 /***********************************************************************
723 *
724 * bitmap popcount shenanigans
725 */
726
727 /*
728 * How many twigs appear in the vector before the one corresponding to the
729 * given bit? Calculated using popcount of part of the branch's bitmap.
730 *
731 * To calculate a mask that covers the lesser bits in the bitmap,
732 * we subtract 1 to set all lesser bits, and subtract the tag mask
733 * because the type tag is not part of the bitmap.
734 */
735 static inline dns_qpweight_t
736 branch_count_bitmap_before(dns_qpnode_t *n, dns_qpshift_t bit) {
737 uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
738 uint64_t bitmap = branch_index(n) & mask;
739 return (dns_qpweight_t)__builtin_popcountll(bitmap);
740 }
741
742 /*
743 * How many twigs does this branch have?
744 *
745 * The offset is directly after the bitmap so the offset's lesser bits
746 * covers the whole bitmap, and the bitmap's weight is the number of twigs.
747 */
748 static inline dns_qpweight_t
749 branch_twigs_size(dns_qpnode_t *n) {
750 return branch_count_bitmap_before(n, SHIFT_OFFSET);
751 }
752
753 /*
754 * Position of a twig within the packed sparse vector.
755 */
756 static inline dns_qpweight_t
757 branch_twig_pos(dns_qpnode_t *n, dns_qpshift_t bit) {
758 return branch_count_bitmap_before(n, bit);
759 }
760
761 /*
762 * Get a pointer to the twig for a given bit number.
763 */
764 static inline dns_qpnode_t *
765 branch_twig_ptr(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpshift_t bit) {
766 return ref_ptr(qpr, branch_twigs_ref(n) + branch_twig_pos(n, bit));
767 }
768
769 /*
770 * Is the twig identified by this bit present?
771 */
772 static inline bool
773 branch_has_twig(dns_qpnode_t *n, dns_qpshift_t bit) {
774 return branch_index(n) & (1ULL << bit);
775 }
776
777 /* twig logistics *****************************************************/
778
779 static inline void
780 move_twigs(dns_qpnode_t *to, dns_qpnode_t *from, dns_qpweight_t size) {
781 memmove(to, from, size * sizeof(dns_qpnode_t));
782 }
783
784 static inline void
785 zero_twigs(dns_qpnode_t *twigs, dns_qpweight_t size) {
786 memset(twigs, 0, size * sizeof(dns_qpnode_t));
787 }
788
789 /***********************************************************************
790 *
791 * packed reader nodes
792 */
793
794 /*
795 * The purpose of these packed reader nodes is to simplify safe memory
796 * reclamation for a multithreaded qp-trie.
797 *
798 * After the `reader` pointer in a qpmulti is replaced, we need to wait
799 * for a grace period before we can reclaim the memory that is no longer
800 * needed by the trie. So we need some kind of structure to hold
801 * pointers to the (logically) detached memory until it is safe to free.
802 * This memory includes the chunks and the `base` arrays.
803 *
804 * Packed reader nodes save us from having to track `dns_qpread_t`
805 * objects as distinct allocations: the packed reader nodes get
 * reclaimed when the chunk containing their cells is reclaimed.
807 * When a real `dns_qpread_t` object is needed, it is allocated on the
808 * stack (it must not live longer than a isc_loop callback) and the
809 * packed reader is unpacked into it.
810 *
811 * Chunks are owned by the current `base` array, so unused chunks are
812 * held there until they are free()d. Old `base` arrays are attached
813 * to packed reader nodes with a refcount. When a chunk is reclaimed,
814 * it is scanned so that `chunk_free()` can call `detach_leaf()` on
815 * any remaining references to leaf objects. Similarly, it calls
816 * `qpbase_unref()` to reclaim old `base` arrays.
817 */
818
/*
 * Two nodes is just enough space for the information needed by
 * readers and for deferred memory reclamation; see make_reader()
 * and unpack_reader() below for the exact layout.
 */
#define READER_SIZE 2
824
825 /*
826 * Create a packed reader; space for the reader should have been
827 * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
828 */
829 static inline void
830 make_reader(dns_qpnode_t *reader, dns_qpmulti_t *multi) {
831 dns_qp_t *qp = &multi->writer;
832 reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
833 reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
834 }
835
836 static inline bool
837 reader_valid(dns_qpnode_t *reader) {
838 return reader != NULL && //
839 node_tag(&reader[0]) == READER_TAG &&
840 node_tag(&reader[1]) == READER_TAG &&
841 node32(&reader[0]) == QPREADER_MAGIC;
842 }
843
844 /*
845 * Verify and unpack a reader. We return the `multi` pointer to use in
846 * consistency checks.
847 */
/*
 * Verify and unpack a reader into a caller-provided dns_qpreader_t.
 * We return the `multi` pointer to use in consistency checks.
 */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, dns_qpnode_t *reader) {
	INSIST(reader_valid(reader));
	/* reader[0] packs the dns_qpmulti_t, reader[1] the base array */
	dns_qpmulti_t *multi = node_pointer(&reader[0]);
	dns_qpbase_t *base = node_pointer(&reader[1]);
	INSIST(QPMULTI_VALID(multi));
	INSIST(QPBASE_VALID(base));
	*qp = (dns_qpreader_t){
		.magic = QP_MAGIC,
		.uctx = multi->writer.uctx,
		.methods = multi->writer.methods,
		/* the root ref was saved in reader[1]'s 32-bit word */
		.root_ref = node32(&reader[1]),
		.base = base,
	};
	return multi;
}
864
865 /***********************************************************************
866 *
867 * method invocation helpers
868 */
869
870 static inline void
871 attach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
872 dns_qpreader_t *qp = dns_qpreader(qpr);
873 qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
874 }
875
876 static inline void
877 detach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
878 dns_qpreader_t *qp = dns_qpreader(qpr);
879 qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
880 }
881
882 static inline size_t
883 leaf_qpkey(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpkey_t key) {
884 dns_qpreader_t *qp = dns_qpreader(qpr);
885 size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
886 leaf_ival(n));
887 INSIST(len < sizeof(dns_qpkey_t));
888 return len;
889 }
890
891 static inline char *
892 triename(dns_qpreadable_t qpr, char *buf, size_t size) {
893 dns_qpreader_t *qp = dns_qpreader(qpr);
894 qp->methods->triename(qp->uctx, buf, size);
895 return buf;
896 }
897
/* convenience wrapper: format the trie's name into a temporary buffer */
#define TRIENAME(qp) \
	triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)
900
901 /***********************************************************************
902 *
903 * converting DNS names to trie keys
904 */
905
906 /*
907 * This is a deliberate simplification of the hostname characters,
908 * because it doesn't matter much if we treat a few extra characters
909 * favourably: there is plenty of space in the index word for a
910 * slightly larger bitmap.
911 */
/*
 * Is this a common hostname byte? Covers '-' through '9' (which
 * includes '.', '/', and the digits) and '_' through 'z' (which
 * includes '`' and the lowercase letters).
 */
static inline bool
qp_common_character(uint8_t byte) {
	bool in_digit_range = byte >= '-' && byte <= '9';
	bool in_lower_range = byte >= '_' && byte <= 'z';
	return in_digit_range || in_lower_range;
}
916
917 /*
918 * Lookup table mapping bytes in DNS names to bit positions, used
919 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
920 */
921 extern uint16_t dns_qp_bits_for_byte[];
922
923 /*
924 * And the reverse, mapping bit positions to characters, so the tests
925 * can print diagnostics involving qp-trie keys.
926 */
927 extern uint8_t dns_qp_byte_for_bit[];
928
929 /**********************************************************************/
930