qp_p.h revision 1.3 1 /* $NetBSD: qp_p.h,v 1.3 2026/01/29 18:37:49 christos Exp $ */
2
3 /*
4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5 *
6 * SPDX-License-Identifier: MPL-2.0
7 *
8 * This Source Code Form is subject to the terms of the Mozilla Public
9 * License, v. 2.0. If a copy of the MPL was not distributed with this
10 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11 *
12 * See the COPYRIGHT file distributed with this work for additional
13 * information regarding copyright ownership.
14 */
15
16 /*
17 * For an overview, see doc/design/qp-trie.md
18 *
 * This private header defines the internal data structures.
20 */
21
22 #pragma once
23
24 #include <isc/refcount.h>
25
26 #include <dns/qp.h>
27
28 /***********************************************************************
29 *
30 * interior node basics
31 */
32
33 /*
34 * A qp-trie node is almost always one of two types: branch or leaf.
35 * (A third type is used only to anchor the root of a trie; see below.)
36 *
37 * A node contains a 64-bit word and a 32-bit word. In order to avoid
38 * unwanted padding, they are declared as three 32-bit words; this keeps
39 * the size down to 12 bytes. They are in native endian order, so getting
40 * the 64-bit part should compile down to an unaligned load.
41 *
42 * The node type is identified by the least significant bits of the 64-bit
43 * word.
44 *
45 * In a leaf node:
46 * - The 64-bit word is used to store a pointer value. (Pointers must be
47 * word-aligned so the least significant bits are zero; those bits can
48 * then act as a node tag to indicate that this is a leaf. This
49 * requirement is enforced by the make_leaf() constructor.)
50 * - The 32-bit word is used to store an integer value. Both the
51 * pointer and integer values can be retrieved when looking up a key.
52 *
53 * In a branch node:
54 * - The 64-bit word is subdivided into three portions: the least
55 * significant bits are the node type (for a branch, 0x1); the
 * most significant 15 bits are an offset value into the key, and
57 * the 47 bits in the middle are a bitmap; see the documentation
58 * for the SHIFT_* enum below.
59 * - The 32-bit word is a reference (dns_qpref_t) to the packed sparse
60 * vector of "twigs", i.e. child nodes. A branch node has at least
61 * two and at most 47 twigs. (The qp-trie update functions ensure that
62 * branches actually branch, i.e. a branch cannot have only one child.)
63 *
64 * A third node type, reader nodes, anchors the root of a trie.
65 * A pair of reader nodes together contain a packed `dns_qpreader_t`.
66 * See the section on "packed reader nodes" for details.
67 */
/*
 * One 12-byte node: a 64-bit word stored as two 32-bit halves plus a
 * 32-bit word, with the halves ordered so that reading `biglo`/`bighi`
 * as a single 64-bit value works in native endianness (see node64()).
 */
struct dns_qpnode {
#if WORDS_BIGENDIAN
	uint32_t bighi, biglo, small;
#else
	uint32_t biglo, bighi, small;
#endif
};
75
76 /*
77 * The possible values of the node type tag. Type tags must fit in two bits
78 * for compatibility with 4-byte pointer alignment on 32-bit systems.
79 */
enum {
	LEAF_TAG = 0,	/* leaf node: tagged pointer plus integer value */
	BRANCH_TAG = 1, /* branch node: bitmap, offset, twigs reference */
	READER_TAG = 2, /* reader node: anchors the root of a trie */
	TAG_MASK = 3,	/* mask covering the two tag bits */
};
86
/*
 * This code does not work on CPUs with large pointers, e.g. CHERI capability
 * architectures. When porting to that kind of machine, a `dns_qpnode` should
 * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
 * branch node will fit in the same space with room to spare.
 */
STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
	      "pointers must fit in 64 bits");
95
96 /*
97 * The 64-bit word in a branch node is comprised of a node type tag, a
98 * bitmap, and an offset into the key. It is called an "index word" because
99 * it describes how to access the twigs vector (think "database index").
100 * The following enum sets up the bit positions of these parts.
101 *
102 * The bitmap is just above the type tag. The `dns_qp_bits_for_byte[]` table
103 * is used to fill in a key so that bit tests can work directly against the
104 * index word without superfluous masking or shifting; we don't need to
105 * mask out the bitmap before testing a bit, but we do need to mask the
106 * bitmap before calling popcount.
107 *
108 * The byte offset into the key is at the top of the word, so that it
109 * can be extracted with just a shift, with no masking needed.
110 *
111 * The names are SHIFT_thing because they are dns_qpshift_t values. (See
112 * below for the various `qp_*` type declarations.)
113 *
114 * These values are relatively fixed in practice: SHIFT_NOBYTE needs
115 * to leave space for the type tag, and the implementation of
116 * `dns_qpkey_fromname()` depends on the bitmap being large enough.
117 * The symbolic names avoid mystery numbers in the code.
118 */
enum {
	SHIFT_NOBYTE = 2, /* label separator has no byte value */
	SHIFT_BITMAP,	  /* many bits here: one per common byte value */
	SHIFT_OFFSET = 49, /* offset of byte in key (top 15 bits) */
};
124
125 /***********************************************************************
126 *
127 * garbage collector tuning parameters
128 */
129
130 /*
131 * A "cell" is a location that can contain a `dns_qpnode_t`, and a "chunk"
132 * is a moderately large array of cells. A big trie can occupy
133 * multiple chunks. (Unlike other nodes, a trie's root node lives in
134 * its `struct dns_qp` instead of being allocated in a cell.)
135 *
136 * The qp-trie allocator hands out space for twigs vectors. Allocations are
137 * made sequentially from one of the chunks; this kind of "sequential
138 * allocator" is also known as a "bump allocator", so in `struct dns_qp`
139 * (see below) the allocation chunk is called `bump`.
140 */
141
142 /*
143 * Number of cells in a chunk is a power of 2, which must have space for
144 * a full twigs vector (48 wide). When testing, use a much smaller chunk
145 * size to make the allocator work harder.
146 */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
/* fixed small chunks under fuzzing make the allocator work harder */
#define QP_CHUNK_LOG_MIN 7
#define QP_CHUNK_LOG_MAX 7
#else
#define QP_CHUNK_LOG_MIN 3
#define QP_CHUNK_LOG_MAX 12
#endif

STATIC_ASSERT(2 <= QP_CHUNK_LOG_MIN && QP_CHUNK_LOG_MIN <= QP_CHUNK_LOG_MAX,
	      "qp-trie min chunk size is unreasonable");
STATIC_ASSERT(6 <= QP_CHUNK_LOG_MAX && QP_CHUNK_LOG_MAX <= 20,
	      "qp-trie max chunk size is unreasonable");

/* cells per (largest) chunk, and the same expressed in bytes */
#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG_MAX)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(dns_qpnode_t))

STATIC_ASSERT(QP_SAFETY_MARGIN >= QP_CHUNK_BYTES,
	      "qp-trie safety margin too small");

/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG_MAX`.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG_MAX + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12% overhead seems reasonable)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)
178
179 /*
180 * Compact automatically when we pass this threshold: when there is a lot
181 * of free space in absolute terms, and when we have freed more than half
182 * of the space we allocated.
183 *
184 * The current compaction algorithm scans the whole trie, so it is important
185 * to scale the threshold based on the size of the trie to avoid quadratic
186 * behaviour. XXXFANF find an algorithm that scans less of the trie!
187 *
188 * During a modification transaction, when we copy-on-write some twigs we
189 * count the old copy as "free", because they will be when the transaction
190 * commits. But they cannot be recovered immediately so they are also
191 * counted as on hold, and discounted when we decide whether to compact.
192 */
/* compact when free space is large both absolutely and relative to use */
#define QP_GC_HEURISTIC(qp, free) \
	((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
/* on-hold cells cannot be recovered yet, so discount them */
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)

/*
 * The chunk base and usage arrays are resized geometrically and start off
 * with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)
204
205 /*
206 * Constructors and accessors for dns_qpref_t values, defined here to show
207 * how the dns_qpref_t, dns_qpchunk_t, dns_qpcell_t types relate to each other
208 */
209
210 static inline dns_qpref_t
211 make_ref(dns_qpchunk_t chunk, dns_qpcell_t cell) {
212 return QP_CHUNK_SIZE * chunk + cell;
213 }
214
static inline dns_qpchunk_t
ref_chunk(dns_qpref_t ref) {
	/* recover the chunk number packed in by make_ref() */
	return ref / QP_CHUNK_SIZE;
}
219
static inline dns_qpcell_t
ref_cell(dns_qpref_t ref) {
	/* recover the in-chunk cell number packed in by make_ref() */
	return ref % QP_CHUNK_SIZE;
}
224
/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to a value that should trigger an obvious bug. See qp_init()
 * and get_root() below.
 */
#define INVALID_REF ((dns_qpref_t)~0UL)
231
232 /***********************************************************************
233 *
234 * chunk arrays
235 */
236
237 /*
238 * A `dns_qp_t` contains two arrays holding information about each chunk.
239 *
240 * The `base` array holds pointers to the base of each chunk.
241 * The `usage` array hold the allocator's state for each chunk.
242 *
243 * The `base` array is used by the hot qp-trie traversal paths. It can
244 * be shared by multiple versions of a trie, which are tracked with a
245 * refcount. Old versions of the trie can retain old versions of the
246 * `base` array.
247 *
248 * In multithreaded code, the `usage` array is only used when the
249 * `dns_qpmulti_t` mutex is held, and there is only one version of
250 * it in active use (maybe with a snapshot for rollback support).
251 *
252 * The two arrays are separate because they have rather different
253 * access patterns, different lifetimes, and different element sizes.
254 */
255
256 /*
257 * For most purposes we don't need to know exactly which cells are
258 * in use in a chunk, we only need to know how many of them there are.
259 *
260 * After we have finished allocating from a chunk, the `used` counter
261 * is the size we need to know for shrinking the chunk and for
262 * scanning it to detach leaf values before the chunk is free()d. The
263 * `free` counter tells us when the chunk needs compacting and when it
264 * has become empty.
265 *
266 * The `exists` flag allows the chunk scanning loops to look at the
267 * usage array only.
268 *
269 * In multithreaded code, we mark chunks as `immutable` when a modify
270 * transaction is opened. (We don't mark them immutable on commit,
271 * because the old bump chunk must remain mutable between write
272 * transactions, but it must become immutable when an update
273 * transaction is opened.)
274 *
275 * There are a few flags used to mark which chunks are still needed by
276 * snapshots after the chunks have passed their normal reclamation
277 * phase.
278 */
/* per-chunk allocator bookkeeping; see the comment above for details */
typedef struct qp_usage {
	/*% the allocation point, increases monotonically */
	dns_qpcell_t used : QP_USAGE_BITS;
	/*% the actual size of the allocation */
	dns_qpcell_t capacity : QP_USAGE_BITS;
	/*% count of cells no longer needed, also monotonic */
	dns_qpcell_t free : QP_USAGE_BITS;
	/*% qp->base->ptr[chunk] != NULL */
	bool exists : 1;
	/*% is this chunk shared with readers? [MT] */
	bool immutable : 1;
	/*% already subtracted from multi->*_count [MT] */
	bool discounted : 1;
	/*% is a snapshot using this chunk? [MT] */
	bool snapshot : 1;
	/*% tried to free it but a snapshot needs it [MT] */
	bool snapfree : 1;
	/*% for mark/sweep snapshot flag updates [MT] */
	bool snapmark : 1;
} qp_usage_t;
299
300 /*
301 * The chunks are owned by the current version of the `base` array.
302 * When the array is resized, the old version might still be in use by
303 * concurrent readers, in which case it is free()d later when its
304 * refcount drops to zero.
305 *
306 * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
307 * from packed readers, but not from `dns_qpread_t` nor from
308 * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
309 * would wreck multicore scalability; instead we rely on RCU.
310 *
311 * The `usage` array determines when a chunk is no longer needed: old
312 * chunk pointers in old `base` arrays are ignored. (They can become
313 * dangling pointers to free memory, but they will never be
314 * dereferenced.)
315 *
316 * We ensure that individual chunk base pointers remain immutable
317 * after assignment, and they are not cleared until the chunk is
318 * free()d, after all readers have departed. Slots can be reused, and
319 * we allow transactions to fill or re-fill empty slots adjacent to
320 * busy slots that are in use by readers.
321 */
/* refcounted, resizable array of chunk base pointers */
struct dns_qpbase {
	unsigned int magic;	  /* QPBASE_MAGIC */
	isc_refcount_t refcount;  /* owners: dns_qp_t and packed readers */
	dns_qpnode_t *ptr[];	  /* one base pointer per chunk slot */
};
327
328 /*
329 * Chunks that may be in use by readers are reclaimed asynchronously.
330 * When a transaction commits, immutable chunks that are now empty are
331 * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
332 */
/* context passed to call_rcu() to reclaim chunks after a grace period */
typedef struct qp_rcuctx {
	unsigned int magic;	     /* QPRCU_MAGIC */
	struct rcu_head rcu_head;    /* for call_rcu() */
	isc_mem_t *mctx;
	dns_qpmulti_t *multi;	     /* trie the chunks belong to */
	ISC_LINK(struct qp_rcuctx) link;
	dns_qpchunk_t count;	     /* number of entries in chunk[] */
	dns_qpchunk_t chunk[];	     /* chunks waiting to be reclaimed */
} qp_rcuctx_t;
342
343 /*
344 * Returns true when the base array can be free()d.
345 */
346 static inline bool
347 qpbase_unref(dns_qpreadable_t qpr) {
348 dns_qpreader_t *qp = dns_qpreader(qpr);
349 return qp->base != NULL &&
350 isc_refcount_decrement(&qp->base->refcount) == 1;
351 }
352
353 /*
354 * Now we know about `dns_qpreader_t` and `dns_qpbase_t`,
355 * here's how we convert a twig reference into a pointer.
356 */
357 static inline dns_qpnode_t *
358 ref_ptr(dns_qpreadable_t qpr, dns_qpref_t ref) {
359 dns_qpreader_t *qp = dns_qpreader(qpr);
360 return qp->base->ptr[ref_chunk(ref)] + ref_cell(ref);
361 }
362
363 /***********************************************************************
364 *
365 * main qp-trie structures
366 */
367
/* magic numbers for run-time sanity checking of each structure type */
#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPCHAIN_MAGIC  ISC_MAGIC('q', 'p', 'c', 'h')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)	  ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPCHAIN_VALID(qp) ISC_MAGIC_VALID(qp, QPCHAIN_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)	  ISC_MAGIC_VALID(qp, QPRCU_MAGIC)
382
383 /*
384 * Polymorphic initialization of the `dns_qpreader_t` prefix.
385 *
386 * The location of the root node is actually a dns_qpref_t, but is
387 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
388 * many internal details into the public API.
389 *
390 * The `uctx` and `methods` support callbacks into the user's code.
391 * They are constant after initialization.
392 */
/* initialize the common dns_qpreader_t prefix of any qp-trie structure */
#define QP_INIT(qp, m, x)                 \
	(*(qp) = (typeof(*(qp))){         \
		 .magic = QP_MAGIC,       \
		 .root_ref = INVALID_REF, \
		 .uctx = x,               \
		 .methods = m,            \
	 })
400
401 /*
402 * Snapshots have some extra cleanup machinery.
403 *
404 * Originally, a snapshot was basically just a `dns_qpread_t`
405 * allocated on the heap, with the extra behaviour that memory
406 * reclamation is suppressed for a particular trie while it has any
407 * snapshots. However that design gets into trouble for a zone with
408 * frequent updates and many zone transfers.
409 *
410 * Instead, each snapshot records which chunks it needs. When a
411 * snapshot is created, it makes a copy of the `base` array, except
412 * for chunks that are empty and waiting to be reclaimed. When a
413 * snapshot is destroyed, we can traverse the list of snapshots to
414 * accurately mark which chunks are still needed.
415 *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
417 * not muddled up with the wrong `dns_qpmulti_t`.
418 *
419 * A trie's `base` array might have grown after the snapshot was
420 * created, so it records its own `chunk_max`.
421 */
struct dns_qpsnap {
	DNS_QPREADER_FIELDS;
	/*% the dns_qpmulti_t this snapshot was taken from */
	dns_qpmulti_t *whence;
	/*% size of this snapshot's own copy of the `base` array */
	uint32_t chunk_max;
	/*% entry in the owning trie's list of snapshots */
	ISC_LINK(struct dns_qpsnap) link;
};
428
429 /*
430 * Read-write access to a qp-trie requires extra fields to support the
431 * allocator and garbage collector.
432 *
433 * Bare instances of a `struct dns_qp` are used for stand-alone
434 * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
435 * wraps a `dns_qp_t` with a mutex and other fields that are only needed
436 * at the start or end of a transaction.
437 *
438 * Allocations are made sequentially in the `bump` chunk. A sequence
439 * of lightweight write transactions can use the same `bump` chunk, so
440 * its prefix before `fender` is immutable, and the rest is mutable.
441 *
442 * To decide when to compact and reclaim space, QP_MAX_GARBAGE() examines
443 * the values of `used_count`, `free_count`, and `hold_count`. The
444 * `hold_count` tracks nodes that need to be retained while readers are
445 * using them; they are free but cannot be reclaimed until the transaction
446 * has committed, so the `hold_count` is discounted from QP_MAX_GARBAGE()
447 * during a transaction.
448 *
449 * There are some flags that alter the behaviour of write transactions.
450 *
451 * - The `transaction_mode` indicates whether the current transaction is a
452 * light write or a heavy update, or (between transactions) the previous
453 * transaction's mode, because the setup for the next transaction
454 * depends on how the previous one committed. The mode is set at the
455 * start of each transaction. It is QP_NONE in a single-threaded qp-trie
456 * to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
457 *
458 * - The `compact_all` flag is used when every node in the trie should be
 * copied. (Usually compaction aims to avoid moving nodes out of
460 * unfragmented chunks.) It is used when compaction is explicitly
461 * requested via `dns_qp_compact()`, and as an emergency mechanism if
462 * normal compaction failed to clear the QP_MAX_GARBAGE() condition.
 * (This emergency is a bug even though we have a rescue mechanism.)
464 *
465 * - When a qp-trie is destroyed while it has pending cleanup work, its
466 * `destroy` flag is set so that it is destroyed by the reclaim worker.
467 * (Because items cannot be removed from the middle of the cleanup list.)
468 *
469 * - When built with fuzzing support, we can use mprotect() and munmap()
470 * to ensure that incorrect memory accesses cause fatal errors. The
471 * `write_protect` flag must be set straight after the `dns_qpmulti_t`
472 * is created, then left unchanged.
473 *
474 * Some of the dns_qp_t fields are only needed for multithreaded transactions
475 * (marked [MT] below) but the same code paths are also used for single-
476 * threaded writes.
477 */
/* read-write qp-trie: a reader prefix plus allocator/GC state */
struct dns_qp {
	DNS_QPREADER_FIELDS;
	/*% memory context (const) */
	isc_mem_t *mctx;
	/*% array of per-chunk allocation counters */
	qp_usage_t *usage;
	/*% number of slots in `chunk` and `usage` arrays */
	dns_qpchunk_t chunk_max;
	/*% which chunk is used for allocations */
	dns_qpchunk_t bump;
	/*% nodes in the `bump` chunk below `fender` are read only [MT] */
	dns_qpcell_t fender;
	/*% number of leaf nodes */
	dns_qpcell_t leaf_count;
	/*% totals of all usage[] counters */
	dns_qpcell_t used_count, free_count;
	/*% free cells that cannot be recovered right now */
	dns_qpcell_t hold_count;
	/*% capacity of last allocated chunk, for exponential chunk growth */
	dns_qpcell_t chunk_capacity;
	/*% what kind of transaction was most recently started [MT] */
	enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
	/*% compact the entire trie [MT] */
	bool compact_all : 1;
	/*% optionally when compiled with fuzzing support [MT] */
	bool write_protect : 1;
};
505
506 /*
507 * Concurrent access to a qp-trie.
508 *
509 * The `reader` pointer provides wait-free access to the current version
510 * of the trie. See the "packed reader nodes" section below for a
511 * description of what it points to.
512 *
513 * The main object under the protection of the mutex is the `writer`
514 * containing all the allocator state. There can be a backup copy when
515 * we want to be able to rollback an update transaction.
516 *
517 * There is a `reader_ref` which corresponds to the `reader` pointer
518 * (`ref_ptr(multi->reader_ref) == multi->reader`). The `reader_ref` is
519 * necessary when freeing the space used by the reader, because there
520 * isn't a good way to recover a dns_qpref_t from a dns_qpnode_t pointer.
521 *
522 * There is a per-trie list of snapshots that is used for reclaiming
523 * memory when a snapshot is destroyed.
524 *
525 * Finally, we maintain a global list of `dns_qpmulti_t` objects that
526 * need asynchronous safe memory recovery.
527 */
/* wrapper providing concurrent (multi-threaded) access to a qp-trie */
struct dns_qpmulti {
	/*% QPMULTI_MAGIC */
	uint32_t magic;
	/*% RCU-protected pointer to current packed reader */
	dns_qpnode_t *reader;
	/*% the mutex protects the rest of this structure */
	isc_mutex_t mutex;
	/*% ref_ptr(writer, reader_ref) == reader */
	dns_qpref_t reader_ref;
	/*% the main working structure */
	dns_qp_t writer;
	/*% saved allocator state to support rollback */
	dns_qp_t *rollback;
	/*% all snapshots of this trie */
	ISC_LIST(dns_qpsnap_t) snapshots;
	/*% refcount for memory reclamation */
	isc_refcount_t references;
};
545
546 /***********************************************************************
547 *
548 * interior node constructors and accessors
549 */
550
551 /*
552 * See the comments under "interior node basics" above, which explain
553 * the layout of nodes as implemented by the following functions.
554 *
555 * These functions are (mostly) constructors and getters. Imagine how
556 * much less code there would be if C had sum types with control over
557 * the layout...
558 */
559
560 /*
561 * Get the 64-bit word of a node.
562 */
563 static inline uint64_t
564 node64(dns_qpnode_t *n) {
565 uint64_t lo = n->biglo;
566 uint64_t hi = n->bighi;
567 return lo | (hi << 32);
568 }
569
570 /*
571 * Get the 32-bit word of a node.
572 */
static inline uint32_t
node32(dns_qpnode_t *n) {
	/* the 32-bit word: leaf integer value, or twigs ref in a branch */
	return n->small;
}
577
578 /*
579 * Create a node from its parts
580 */
581 static inline dns_qpnode_t
582 make_node(uint64_t big, uint32_t small) {
583 return (dns_qpnode_t){
584 .biglo = (uint32_t)(big),
585 .bighi = (uint32_t)(big >> 32),
586 .small = small,
587 };
588 }
589
590 /*
591 * Extract a pointer from a node's 64 bit word. The double cast is to avoid
592 * a warning about mismatched pointer/integer sizes on 32 bit systems.
593 */
594 static inline void *
595 node_pointer(dns_qpnode_t *n) {
596 return (void *)(uintptr_t)(node64(n) & ~TAG_MASK);
597 }
598
599 /*
600 * Examine a node's tag bits
601 */
static inline uint32_t
node_tag(dns_qpnode_t *n) {
	/* the tag lives in the least significant bits of the 64-bit word */
	return n->biglo & TAG_MASK;
}
606
607 /*
608 * simplified for the hot path
609 */
610 static inline bool
611 is_branch(dns_qpnode_t *n) {
612 return n->biglo & BRANCH_TAG;
613 }
614
615 /* leaf nodes *********************************************************/
616
617 /*
618 * Get a leaf's pointer value.
619 */
static inline void *
leaf_pval(dns_qpnode_t *n) {
	/* leaf pointers are word-aligned so the tag bits are zero */
	return node_pointer(n);
}
624
625 /*
626 * Get a leaf's integer value
627 */
static inline uint32_t
leaf_ival(dns_qpnode_t *n) {
	/* a leaf's 32-bit word holds its integer value */
	return node32(n);
}
632
633 /*
634 * Create a leaf node from its parts
635 */
static inline dns_qpnode_t
make_leaf(const void *pval, uint32_t ival) {
	dns_qpnode_t leaf = make_node((uintptr_t)pval, ival);
	/* enforce word alignment: a leaf's tag bits must be zero */
	REQUIRE(node_tag(&leaf) == LEAF_TAG);
	return leaf;
}
642
643 /* branch nodes *******************************************************/
644
645 /*
646 * The following function names use plural `twigs` when they work on a
647 * branch's twigs vector as a whole, and singular `twig` when they work on
648 * a particular twig.
649 */
650
651 /*
652 * Get a branch node's index word
653 */
static inline uint64_t
branch_index(dns_qpnode_t *n) {
	/* the whole 64-bit word: tag + bitmap + key offset */
	return node64(n);
}
658
659 /*
660 * Get a reference to a branch node's child twigs.
661 */
static inline dns_qpref_t
branch_twigs_ref(dns_qpnode_t *n) {
	/* a branch's 32-bit word refers to its packed twigs vector */
	return node32(n);
}
666
667 /*
668 * Bit positions in the bitmap come directly from the key. DNS names are
669 * converted to keys using the tables declared at the end of this file.
670 */
671 static inline dns_qpshift_t
672 qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) {
673 if (offset < len) {
674 return key[offset];
675 } else {
676 return SHIFT_NOBYTE;
677 }
678 }
679
680 /*
681 * Extract a branch node's offset field, used to index the key.
682 */
static inline size_t
branch_key_offset(dns_qpnode_t *n) {
	/* the offset is at the top of the word: a shift is enough */
	return (size_t)(branch_index(n) >> SHIFT_OFFSET);
}
687
688 /*
689 * Which bit identifies the twig of this node for this key?
690 */
static inline dns_qpshift_t
branch_keybit(dns_qpnode_t *n, const dns_qpkey_t key, size_t len) {
	/* the key byte at this branch's offset selects the twig */
	return qpkey_bit(key, len, branch_key_offset(n));
}
695
696 /*
697 * Get a pointer to a the first twig of a branch (this also functions
698 * as a pointer to the entire twig vector).
699 */
static inline dns_qpnode_t *
branch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	/* first twig doubles as a pointer to the whole twigs vector */
	return ref_ptr(qpr, branch_twigs_ref(n));
}
704
705 /*
706 * Warm up the cache while calculating which twig we want.
707 */
static inline void
prefetch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	/* warm up the cache while calculating which twig we want */
	__builtin_prefetch(ref_ptr(qpr, branch_twigs_ref(n)));
}
712
713 /* root node **********************************************************/
714
715 /*
716 * Get a pointer to the root node, checking if the trie is empty.
717 */
718 static inline dns_qpnode_t *
719 get_root(dns_qpreadable_t qpr) {
720 dns_qpreader_t *qp = dns_qpreader(qpr);
721 if (qp->root_ref == INVALID_REF) {
722 return NULL;
723 } else {
724 return ref_ptr(qp, qp->root_ref);
725 }
726 }
727
728 /*
729 * When we need to move the root node, we avoid repeating allocation
730 * logistics by making a temporary fake branch node that has
731 * `branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`
732 * just enough to treat the root node as a vector of one twig.
733 */
/* fake branch node whose single twig is the root, so that the root can
 * be moved by the same code that moves ordinary twigs vectors */
#define MOVABLE_ROOT(qp)                                 \
	(&(dns_qpnode_t){                                \
		.biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
		.small = qp->root_ref,                   \
	})
739
740 /***********************************************************************
741 *
742 * bitmap popcount shenanigans
743 */
744
745 /*
746 * How many twigs appear in the vector before the one corresponding to the
747 * given bit? Calculated using popcount of part of the branch's bitmap.
748 *
749 * To calculate a mask that covers the lesser bits in the bitmap,
750 * we subtract 1 to set all lesser bits, and subtract the tag mask
751 * because the type tag is not part of the bitmap.
752 */
753 static inline dns_qpweight_t
754 branch_count_bitmap_before(dns_qpnode_t *n, dns_qpshift_t bit) {
755 uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
756 uint64_t bitmap = branch_index(n) & mask;
757 return (dns_qpweight_t)__builtin_popcountll(bitmap);
758 }
759
760 /*
761 * How many twigs does this branch have?
762 *
763 * The offset is directly after the bitmap so the offset's lesser bits
764 * covers the whole bitmap, and the bitmap's weight is the number of twigs.
765 */
static inline dns_qpweight_t
branch_twigs_size(dns_qpnode_t *n) {
	/* SHIFT_OFFSET is just past the bitmap, so this counts all twigs */
	return branch_count_bitmap_before(n, SHIFT_OFFSET);
}
770
771 /*
772 * Position of a twig within the packed sparse vector.
773 */
static inline dns_qpweight_t
branch_twig_pos(dns_qpnode_t *n, dns_qpshift_t bit) {
	/* position within the packed sparse vector */
	return branch_count_bitmap_before(n, bit);
}
778
779 /*
780 * Get a pointer to the twig for a given bit number.
781 */
782 static inline dns_qpnode_t *
783 branch_twig_ptr(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpshift_t bit) {
784 return ref_ptr(qpr, branch_twigs_ref(n) + branch_twig_pos(n, bit));
785 }
786
787 /*
788 * Is the twig identified by this bit present?
789 */
790 static inline bool
791 branch_has_twig(dns_qpnode_t *n, dns_qpshift_t bit) {
792 return branch_index(n) & (1ULL << bit);
793 }
794
795 /* twig logistics *****************************************************/
796
static inline void
move_twigs(dns_qpnode_t *to, dns_qpnode_t *from, dns_qpweight_t size) {
	/* memmove: source and destination vectors may overlap */
	memmove(to, from, size * sizeof(dns_qpnode_t));
}
801
static inline void
zero_twigs(dns_qpnode_t *twigs, dns_qpweight_t size) {
	/* clear `size` cells of a twigs vector */
	memset(twigs, 0, size * sizeof(dns_qpnode_t));
}
806
807 /***********************************************************************
808 *
809 * packed reader nodes
810 */
811
812 /*
813 * The purpose of these packed reader nodes is to simplify safe memory
814 * reclamation for a multithreaded qp-trie.
815 *
816 * After the `reader` pointer in a qpmulti is replaced, we need to wait
817 * for a grace period before we can reclaim the memory that is no longer
818 * needed by the trie. So we need some kind of structure to hold
819 * pointers to the (logically) detached memory until it is safe to free.
820 * This memory includes the chunks and the `base` arrays.
821 *
822 * Packed reader nodes save us from having to track `dns_qpread_t`
823 * objects as distinct allocations: the packed reader nodes get
 * reclaimed when the chunk containing their cells is reclaimed.
825 * When a real `dns_qpread_t` object is needed, it is allocated on the
826 * stack (it must not live longer than a isc_loop callback) and the
827 * packed reader is unpacked into it.
828 *
829 * Chunks are owned by the current `base` array, so unused chunks are
830 * held there until they are free()d. Old `base` arrays are attached
831 * to packed reader nodes with a refcount. When a chunk is reclaimed,
832 * it is scanned so that `chunk_free()` can call `detach_leaf()` on
833 * any remaining references to leaf objects. Similarly, it calls
834 * `qpbase_unref()` to reclaim old `base` arrays.
835 */
836
/*
 * Two nodes is just enough space for the information needed by
 * readers and for deferred memory reclamation.
 */
#define READER_SIZE 2
842
843 /*
844 * Create a packed reader; space for the reader should have been
845 * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
846 */
static inline void
make_reader(dns_qpnode_t *reader, dns_qpmulti_t *multi) {
	dns_qp_t *qp = &multi->writer;
	/* node 0: tagged multi pointer plus magic number */
	reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
	/* node 1: tagged base-array pointer plus root reference */
	reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
}
853
854 static inline bool
855 reader_valid(dns_qpnode_t *reader) {
856 return reader != NULL && //
857 node_tag(&reader[0]) == READER_TAG &&
858 node_tag(&reader[1]) == READER_TAG &&
859 node32(&reader[0]) == QPREADER_MAGIC;
860 }
861
862 /*
863 * Verify and unpack a reader. We return the `multi` pointer to use in
864 * consistency checks.
865 */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, dns_qpnode_t *reader) {
	INSIST(reader_valid(reader));
	/* node 0 carries the dns_qpmulti_t, node 1 the base array + root */
	dns_qpmulti_t *multi = node_pointer(&reader[0]);
	dns_qpbase_t *base = node_pointer(&reader[1]);
	INSIST(QPMULTI_VALID(multi));
	INSIST(QPBASE_VALID(base));
	*qp = (dns_qpreader_t){
		.magic = QP_MAGIC,
		.uctx = multi->writer.uctx,
		.methods = multi->writer.methods,
		.root_ref = node32(&reader[1]),
		.base = base,
	};
	return multi;
}
882
883 /***********************************************************************
884 *
885 * method invocation helpers
886 */
887
static inline void
attach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	/* user callback: take a reference on the leaf's value object */
	qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
}
893
static inline void
detach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	/* user callback: release a reference on the leaf's value object */
	qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
}
899
static inline size_t
leaf_qpkey(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpkey_t key) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	/* user callback: regenerate the lookup key for this leaf */
	size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
					  leaf_ival(n));
	/* the key buffer is fixed-size; the callback must not overrun it */
	INSIST(len < sizeof(dns_qpkey_t));
	return len;
}
908
static inline char *
triename(dns_qpreadable_t qpr, char *buf, size_t size) {
	dns_qpreader_t *qp = dns_qpreader(qpr);
	/* user callback: fill `buf` with a human-readable trie name */
	qp->methods->triename(qp->uctx, buf, size);
	return buf;
}

/* convenience wrapper with a temporary stack buffer, for log messages */
#define TRIENAME(qp) \
	triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)
918
919 /***********************************************************************
920 *
921 * converting DNS names to trie keys
922 */
923
924 /*
925 * This is a deliberate simplification of the hostname characters,
926 * because it doesn't matter much if we treat a few extra characters
927 * favourably: there is plenty of space in the index word for a
928 * slightly larger bitmap.
929 */
/*
 * Is this byte in one of the two ranges treated favourably by the
 * key conversion? ('-'..'9' covers hyphen, dot, slash, and digits;
 * '_'..'z' covers underscore, backquote, and lowercase letters.)
 */
static inline bool
qp_common_character(uint8_t byte) {
	bool digitish = ('-' <= byte && byte <= '9');
	bool lowerish = ('_' <= byte && byte <= 'z');
	return digitish || lowerish;
}
934
/*
 * Lookup table mapping bytes in DNS names to bit positions, used
 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
 */
extern uint16_t dns_qp_bits_for_byte[];

/*
 * And the reverse, mapping bit positions to characters, so the tests
 * can print diagnostics involving qp-trie keys.
 */
extern uint8_t dns_qp_byte_for_bit[];
946
947 /**********************************************************************/
948