Home | History | Annotate | Line # | Download | only in dns
      1 /*	$NetBSD: qp_p.h,v 1.3 2026/01/29 18:37:49 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * SPDX-License-Identifier: MPL-2.0
      7  *
      8  * This Source Code Form is subject to the terms of the Mozilla Public
      9  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  *
     12  * See the COPYRIGHT file distributed with this work for additional
     13  * information regarding copyright ownership.
     14  */
     15 
     16 /*
     17  * For an overview, see doc/design/qp-trie.md
     18  *
     19  * This private header defines the internal data structures,
     20  */
     21 
     22 #pragma once
     23 
     24 #include <isc/refcount.h>
     25 
     26 #include <dns/qp.h>
     27 
     28 /***********************************************************************
     29  *
     30  *  interior node basics
     31  */
     32 
     33 /*
     34  * A qp-trie node is almost always one of two types: branch or leaf.
     35  * (A third type is used only to anchor the root of a trie; see below.)
     36  *
     37  * A node contains a 64-bit word and a 32-bit word. In order to avoid
     38  * unwanted padding, they are declared as three 32-bit words; this keeps
     39  * the size down to 12 bytes. They are in native endian order, so getting
     40  * the 64-bit part should compile down to an unaligned load.
     41  *
     42  * The node type is identified by the least significant bits of the 64-bit
     43  * word.
     44  *
     45  * In a leaf node:
     46  * - The 64-bit word is used to store a pointer value. (Pointers must be
     47  *   word-aligned so the least significant bits are zero; those bits can
     48  *   then act as a node tag to indicate that this is a leaf. This
     49  *   requirement is enforced by the make_leaf() constructor.)
     50  * - The 32-bit word is used to store an integer value.  Both the
     51  *   pointer and integer values can be retrieved when looking up a key.
     52  *
     53  * In a branch node:
     54  * - The 64-bit word is subdivided into three portions: the least
     55  *   significant bits are the node type (for a branch, 0x1); the
 *   most significant 15 bits are an offset value into the key, and
     57  *   the 47 bits in the middle are a bitmap; see the documentation
     58  *   for the SHIFT_* enum below.
     59  * - The 32-bit word is a reference (dns_qpref_t) to the packed sparse
     60  *   vector of "twigs", i.e. child nodes. A branch node has at least
     61  *   two and at most 47 twigs. (The qp-trie update functions ensure that
     62  *   branches actually branch, i.e. a branch cannot have only one child.)
     63  *
     64  * A third node type, reader nodes, anchors the root of a trie.
     65  * A pair of reader nodes together contain a packed `dns_qpreader_t`.
     66  * See the section on "packed reader nodes" for details.
     67  */
struct dns_qpnode {
#if WORDS_BIGENDIAN
	/* biglo/bighi form the 64-bit word in native endian order */
	uint32_t bighi, biglo, small;
#else
	/* biglo/bighi form the 64-bit word in native endian order */
	uint32_t biglo, bighi, small;
#endif
};
     75 
     76 /*
     77  * The possible values of the node type tag. Type tags must fit in two bits
     78  * for compatibility with 4-byte pointer alignment on 32-bit systems.
     79  */
enum {
	LEAF_TAG = 0,	/* leaf node (pointer alignment keeps these bits 0) */
	BRANCH_TAG = 1, /* branch node */
	READER_TAG = 2, /* reader node */
	TAG_MASK = 3,	/* mask covering tag bits */
};
     86 
     87 /*
     88  * This code does not work on CPUs with large pointers, e.g. CHERI capability
     89  * architectures. When porting to that kind of machine, a `dns_qpnode` should
     90  * be just a `uintptr_t`; a leaf node will contain a single pointer, and a
     91  * branch node will fit in the same space with room to spare.
     92  */
     93 STATIC_ASSERT(sizeof(void *) <= sizeof(uint64_t),
     94 	      "pointers must fit in 64 bits");
     95 
     96 /*
     97  * The 64-bit word in a branch node is comprised of a node type tag, a
     98  * bitmap, and an offset into the key. It is called an "index word" because
     99  * it describes how to access the twigs vector (think "database index").
    100  * The following enum sets up the bit positions of these parts.
    101  *
    102  * The bitmap is just above the type tag. The `dns_qp_bits_for_byte[]` table
    103  * is used to fill in a key so that bit tests can work directly against the
    104  * index word without superfluous masking or shifting; we don't need to
    105  * mask out the bitmap before testing a bit, but we do need to mask the
    106  * bitmap before calling popcount.
    107  *
    108  * The byte offset into the key is at the top of the word, so that it
    109  * can be extracted with just a shift, with no masking needed.
    110  *
    111  * The names are SHIFT_thing because they are dns_qpshift_t values. (See
    112  * below for the various `qp_*` type declarations.)
    113  *
    114  * These values are relatively fixed in practice: SHIFT_NOBYTE needs
    115  * to leave space for the type tag, and the implementation of
    116  * `dns_qpkey_fromname()` depends on the bitmap being large enough.
    117  * The symbolic names avoid mystery numbers in the code.
    118  */
enum {
	SHIFT_NOBYTE = 2,  /* label separator has no byte value */
	SHIFT_BITMAP,	   /* many bits here (up to SHIFT_OFFSET) */
	SHIFT_OFFSET = 49, /* offset of byte in key */
};
    124 
    125 /***********************************************************************
    126  *
    127  *  garbage collector tuning parameters
    128  */
    129 
    130 /*
    131  * A "cell" is a location that can contain a `dns_qpnode_t`, and a "chunk"
    132  * is a moderately large array of cells. A big trie can occupy
    133  * multiple chunks. (Unlike other nodes, a trie's root node lives in
    134  * its `struct dns_qp` instead of being allocated in a cell.)
    135  *
    136  * The qp-trie allocator hands out space for twigs vectors. Allocations are
    137  * made sequentially from one of the chunks; this kind of "sequential
    138  * allocator" is also known as a "bump allocator", so in `struct dns_qp`
    139  * (see below) the allocation chunk is called `bump`.
    140  */
    141 
    142 /*
    143  * Number of cells in a chunk is a power of 2, which must have space for
    144  * a full twigs vector (48 wide). When testing, use a much smaller chunk
    145  * size to make the allocator work harder.
    146  */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define QP_CHUNK_LOG_MIN 7
#define QP_CHUNK_LOG_MAX 7
#else
#define QP_CHUNK_LOG_MIN 3
#define QP_CHUNK_LOG_MAX 12
#endif

STATIC_ASSERT(2 <= QP_CHUNK_LOG_MIN && QP_CHUNK_LOG_MIN <= QP_CHUNK_LOG_MAX,
	      "qp-trie min chunk size is unreasonable");
STATIC_ASSERT(6 <= QP_CHUNK_LOG_MAX && QP_CHUNK_LOG_MAX <= 20,
	      "qp-trie max chunk size is unreasonable");

/* cells per (maximum-size) chunk, and that chunk's size in bytes */
#define QP_CHUNK_SIZE  (1U << QP_CHUNK_LOG_MAX)
#define QP_CHUNK_BYTES (QP_CHUNK_SIZE * sizeof(dns_qpnode_t))

STATIC_ASSERT(QP_SAFETY_MARGIN >= QP_CHUNK_BYTES,
	      "qp-trie safety margin too small");

/*
 * We need a bitfield this big to count how much of a chunk is in use:
 * it needs to count from 0 up to and including `1 << QP_CHUNK_LOG_MAX`.
 */
#define QP_USAGE_BITS (QP_CHUNK_LOG_MAX + 1)

/*
 * A chunk needs to be compacted if it is less full than this threshold.
 * (12% overhead seems reasonable)
 */
#define QP_MAX_FREE (QP_CHUNK_SIZE / 8)
#define QP_MIN_USED (QP_CHUNK_SIZE - QP_MAX_FREE)
    178 
    179 /*
    180  * Compact automatically when we pass this threshold: when there is a lot
    181  * of free space in absolute terms, and when we have freed more than half
    182  * of the space we allocated.
    183  *
    184  * The current compaction algorithm scans the whole trie, so it is important
    185  * to scale the threshold based on the size of the trie to avoid quadratic
    186  * behaviour. XXXFANF find an algorithm that scans less of the trie!
    187  *
    188  * During a modification transaction, when we copy-on-write some twigs we
    189  * count the old copy as "free", because they will be when the transaction
    190  * commits. But they cannot be recovered immediately so they are also
    191  * counted as on hold, and discounted when we decide whether to compact.
    192  */
#define QP_GC_HEURISTIC(qp, free) \
	((free) > QP_CHUNK_SIZE * 4 && (free) > (qp)->used_count / 2)

/* QP_AUTOGC discounts cells that are free but still held for readers */
#define QP_NEEDGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count)
#define QP_AUTOGC(qp) QP_GC_HEURISTIC(qp, (qp)->free_count - (qp)->hold_count)

/*
 * The chunk base and usage arrays are resized geometrically and start off
 * with two entries.
 */
#define GROWTH_FACTOR(size) ((size) + (size) / 2 + 2)
    204 
    205 /*
    206  * Constructors and accessors for dns_qpref_t values, defined here to show
    207  * how the dns_qpref_t, dns_qpchunk_t, dns_qpcell_t types relate to each other
    208  */
    209 
    210 static inline dns_qpref_t
    211 make_ref(dns_qpchunk_t chunk, dns_qpcell_t cell) {
    212 	return QP_CHUNK_SIZE * chunk + cell;
    213 }
    214 
    215 static inline dns_qpchunk_t
    216 ref_chunk(dns_qpref_t ref) {
    217 	return ref / QP_CHUNK_SIZE;
    218 }
    219 
    220 static inline dns_qpcell_t
    221 ref_cell(dns_qpref_t ref) {
    222 	return ref % QP_CHUNK_SIZE;
    223 }
    224 
/*
 * We should not use the `root_ref` in an empty trie, so we set it
 * to an all-ones value that should trigger an obvious bug if it is
 * ever dereferenced. See qp_init() and get_root() below.
 */
#define INVALID_REF ((dns_qpref_t)~0UL)
    231 
    232 /***********************************************************************
    233  *
    234  *  chunk arrays
    235  */
    236 
    237 /*
    238  * A `dns_qp_t` contains two arrays holding information about each chunk.
    239  *
    240  * The `base` array holds pointers to the base of each chunk.
 * The `usage` array holds the allocator's state for each chunk.
    242  *
    243  * The `base` array is used by the hot qp-trie traversal paths. It can
    244  * be shared by multiple versions of a trie, which are tracked with a
    245  * refcount. Old versions of the trie can retain old versions of the
    246  * `base` array.
    247  *
    248  * In multithreaded code, the `usage` array is only used when the
    249  * `dns_qpmulti_t` mutex is held, and there is only one version of
    250  * it in active use (maybe with a snapshot for rollback support).
    251  *
    252  * The two arrays are separate because they have rather different
    253  * access patterns, different lifetimes, and different element sizes.
    254  */
    255 
    256 /*
    257  * For most purposes we don't need to know exactly which cells are
    258  * in use in a chunk, we only need to know how many of them there are.
    259  *
    260  * After we have finished allocating from a chunk, the `used` counter
    261  * is the size we need to know for shrinking the chunk and for
    262  * scanning it to detach leaf values before the chunk is free()d. The
    263  * `free` counter tells us when the chunk needs compacting and when it
    264  * has become empty.
    265  *
    266  * The `exists` flag allows the chunk scanning loops to look at the
    267  * usage array only.
    268  *
    269  * In multithreaded code, we mark chunks as `immutable` when a modify
    270  * transaction is opened. (We don't mark them immutable on commit,
    271  * because the old bump chunk must remain mutable between write
    272  * transactions, but it must become immutable when an update
    273  * transaction is opened.)
    274  *
    275  * There are a few flags used to mark which chunks are still needed by
    276  * snapshots after the chunks have passed their normal reclamation
    277  * phase.
    278  */
typedef struct qp_usage {
	/*% the allocation point, increases monotonically */
	dns_qpcell_t used : QP_USAGE_BITS;
	/*% the actual size of the allocation, in cells */
	dns_qpcell_t capacity : QP_USAGE_BITS;
	/*% count of cells no longer needed, also monotonic */
	dns_qpcell_t free : QP_USAGE_BITS;
	/*% qp->base->ptr[chunk] != NULL */
	bool exists : 1;
	/*% is this chunk shared? [MT] */
	bool immutable : 1;
	/*% already subtracted from multi->*_count [MT] */
	bool discounted : 1;
	/*% is a snapshot using this chunk? [MT] */
	bool snapshot : 1;
	/*% tried to free it but a snapshot needs it [MT] */
	bool snapfree : 1;
	/*% for mark/sweep snapshot flag updates [MT] */
	bool snapmark : 1;
} qp_usage_t;
    299 
    300 /*
    301  * The chunks are owned by the current version of the `base` array.
    302  * When the array is resized, the old version might still be in use by
    303  * concurrent readers, in which case it is free()d later when its
    304  * refcount drops to zero.
    305  *
    306  * A `dns_qpbase_t` counts references from `dns_qp_t` objects and
    307  * from packed readers, but not from `dns_qpread_t` nor from
    308  * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
    309  * would wreck multicore scalability; instead we rely on RCU.
    310  *
    311  * The `usage` array determines when a chunk is no longer needed: old
    312  * chunk pointers in old `base` arrays are ignored. (They can become
    313  * dangling pointers to free memory, but they will never be
    314  * dereferenced.)
    315  *
    316  * We ensure that individual chunk base pointers remain immutable
    317  * after assignment, and they are not cleared until the chunk is
    318  * free()d, after all readers have departed. Slots can be reused, and
    319  * we allow transactions to fill or re-fill empty slots adjacent to
    320  * busy slots that are in use by readers.
    321  */
struct dns_qpbase {
	unsigned int magic; /*% QPBASE_MAGIC */
	/*% counts owners: dns_qp_t objects and packed readers */
	isc_refcount_t refcount;
	/*% chunk base pointers, immutable once assigned */
	dns_qpnode_t *ptr[];
};
    327 
    328 /*
    329  * Chunks that may be in use by readers are reclaimed asynchronously.
    330  * When a transaction commits, immutable chunks that are now empty are
    331  * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
    332  */
typedef struct qp_rcuctx {
	unsigned int magic; /*% QPRCU_MAGIC */
	struct rcu_head rcu_head;
	isc_mem_t *mctx;
	dns_qpmulti_t *multi;
	ISC_LINK(struct qp_rcuctx) link;
	/*% number of entries in the `chunk` array below */
	dns_qpchunk_t count;
	/*% chunks to reclaim after the RCU grace period */
	dns_qpchunk_t chunk[];
} qp_rcuctx_t;
    342 
    343 /*
    344  * Returns true when the base array can be free()d.
    345  */
    346 static inline bool
    347 qpbase_unref(dns_qpreadable_t qpr) {
    348 	dns_qpreader_t *qp = dns_qpreader(qpr);
    349 	return qp->base != NULL &&
    350 	       isc_refcount_decrement(&qp->base->refcount) == 1;
    351 }
    352 
    353 /*
    354  * Now we know about `dns_qpreader_t` and `dns_qpbase_t`,
    355  * here's how we convert a twig reference into a pointer.
    356  */
    357 static inline dns_qpnode_t *
    358 ref_ptr(dns_qpreadable_t qpr, dns_qpref_t ref) {
    359 	dns_qpreader_t *qp = dns_qpreader(qpr);
    360 	return qp->base->ptr[ref_chunk(ref)] + ref_cell(ref);
    361 }
    362 
    363 /***********************************************************************
    364  *
    365  *  main qp-trie structures
    366  */
    367 
/* object identity magics, checked by the *_VALID() assertions below */
#define QP_MAGIC       ISC_MAGIC('t', 'r', 'i', 'e')
#define QPITER_MAGIC   ISC_MAGIC('q', 'p', 'i', 't')
#define QPCHAIN_MAGIC  ISC_MAGIC('q', 'p', 'c', 'h')
#define QPMULTI_MAGIC  ISC_MAGIC('q', 'p', 'm', 'v')
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
#define QPBASE_MAGIC   ISC_MAGIC('q', 'p', 'b', 'p')
#define QPRCU_MAGIC    ISC_MAGIC('q', 'p', 'c', 'b')

#define QP_VALID(qp)	  ISC_MAGIC_VALID(qp, QP_MAGIC)
#define QPITER_VALID(qp)  ISC_MAGIC_VALID(qp, QPITER_MAGIC)
#define QPCHAIN_VALID(qp) ISC_MAGIC_VALID(qp, QPCHAIN_MAGIC)
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
#define QPBASE_VALID(qp)  ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
#define QPRCU_VALID(qp)	  ISC_MAGIC_VALID(qp, QPRCU_MAGIC)
    382 
/*
 * Polymorphic initialization of the `dns_qpreader_t` prefix.
 *
 * The location of the root node is actually a dns_qpref_t, but is
 * declared in DNS_QPREADER_FIELDS as uint32_t to avoid leaking too
 * many internal details into the public API.
 *
 * The trie starts out empty, so `root_ref` is INVALID_REF.
 *
 * The `uctx` and `methods` support callbacks into the user's code.
 * They are constant after initialization.
 */
#define QP_INIT(qp, m, x)                 \
	(*(qp) = (typeof(*(qp))){         \
		 .magic = QP_MAGIC,       \
		 .root_ref = INVALID_REF, \
		 .uctx = x,               \
		 .methods = m,            \
	 })
    400 
    401 /*
    402  * Snapshots have some extra cleanup machinery.
    403  *
    404  * Originally, a snapshot was basically just a `dns_qpread_t`
    405  * allocated on the heap, with the extra behaviour that memory
    406  * reclamation is suppressed for a particular trie while it has any
    407  * snapshots. However that design gets into trouble for a zone with
    408  * frequent updates and many zone transfers.
    409  *
    410  * Instead, each snapshot records which chunks it needs. When a
    411  * snapshot is created, it makes a copy of the `base` array, except
    412  * for chunks that are empty and waiting to be reclaimed. When a
    413  * snapshot is destroyed, we can traverse the list of snapshots to
    414  * accurately mark which chunks are still needed.
    415  *
 * A snapshot's `whence` pointer helps ensure that a `dns_qpsnap_t` is
    417  * not muddled up with the wrong `dns_qpmulti_t`.
    418  *
    419  * A trie's `base` array might have grown after the snapshot was
    420  * created, so it records its own `chunk_max`.
    421  */
struct dns_qpsnap {
	DNS_QPREADER_FIELDS;
	/*% the dns_qpmulti_t this snapshot was taken from */
	dns_qpmulti_t *whence;
	/*% number of base-array slots when the snapshot was made */
	uint32_t chunk_max;
	/*% entry in the owning trie's list of snapshots */
	ISC_LINK(struct dns_qpsnap) link;
};
    428 
    429 /*
    430  * Read-write access to a qp-trie requires extra fields to support the
    431  * allocator and garbage collector.
    432  *
    433  * Bare instances of a `struct dns_qp` are used for stand-alone
    434  * single-threaded tries. For multithreaded access, a `dns_qpmulti_t`
    435  * wraps a `dns_qp_t` with a mutex and other fields that are only needed
    436  * at the start or end of a transaction.
    437  *
    438  * Allocations are made sequentially in the `bump` chunk. A sequence
    439  * of lightweight write transactions can use the same `bump` chunk, so
    440  * its prefix before `fender` is immutable, and the rest is mutable.
    441  *
    442  * To decide when to compact and reclaim space, QP_MAX_GARBAGE() examines
    443  * the values of `used_count`, `free_count`, and `hold_count`. The
    444  * `hold_count` tracks nodes that need to be retained while readers are
    445  * using them; they are free but cannot be reclaimed until the transaction
    446  * has committed, so the `hold_count` is discounted from QP_MAX_GARBAGE()
    447  * during a transaction.
    448  *
    449  * There are some flags that alter the behaviour of write transactions.
    450  *
    451  *  - The `transaction_mode` indicates whether the current transaction is a
    452  *    light write or a heavy update, or (between transactions) the previous
    453  *    transaction's mode, because the setup for the next transaction
    454  *    depends on how the previous one committed. The mode is set at the
    455  *    start of each transaction. It is QP_NONE in a single-threaded qp-trie
    456  *    to detect if part of a `dns_qpmulti_t` is passed to dns_qp_destroy().
    457  *
    458  *  - The `compact_all` flag is used when every node in the trie should be
 *    copied. (Usually compaction aims to avoid moving nodes out of
    460  *    unfragmented chunks.) It is used when compaction is explicitly
    461  *    requested via `dns_qp_compact()`, and as an emergency mechanism if
    462  *    normal compaction failed to clear the QP_MAX_GARBAGE() condition.
 *    (This emergency is a bug even though we have a rescue mechanism.)
    464  *
    465  *  - When a qp-trie is destroyed while it has pending cleanup work, its
    466  *    `destroy` flag is set so that it is destroyed by the reclaim worker.
    467  *    (Because items cannot be removed from the middle of the cleanup list.)
    468  *
    469  *  - When built with fuzzing support, we can use mprotect() and munmap()
    470  *    to ensure that incorrect memory accesses cause fatal errors. The
    471  *    `write_protect` flag must be set straight after the `dns_qpmulti_t`
    472  *    is created, then left unchanged.
    473  *
    474  * Some of the dns_qp_t fields are only needed for multithreaded transactions
    475  * (marked [MT] below) but the same code paths are also used for single-
    476  * threaded writes.
    477  */
struct dns_qp {
	DNS_QPREADER_FIELDS;
	/*% memory context (const) */
	isc_mem_t *mctx;
	/*% array of per-chunk allocation counters */
	qp_usage_t *usage;
	/*% number of slots in the `base->ptr` and `usage` arrays */
	dns_qpchunk_t chunk_max;
	/*% which chunk is used for allocations */
	dns_qpchunk_t bump;
	/*% nodes in the `bump` chunk below `fender` are read only [MT] */
	dns_qpcell_t fender;
	/*% number of leaf nodes */
	dns_qpcell_t leaf_count;
	/*% total of all usage[] counters */
	dns_qpcell_t used_count, free_count;
	/*% free cells that cannot be recovered right now */
	dns_qpcell_t hold_count;
	/*% capacity of last allocated chunk, for exponential chunk growth */
	dns_qpcell_t chunk_capacity;
	/*% what kind of transaction was most recently started;
	 *  QP_NONE in a single-threaded qp-trie [MT] */
	enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
	/*% compact the entire trie [MT] */
	bool compact_all : 1;
	/*% optionally when compiled with fuzzing support [MT] */
	bool write_protect : 1;
};
    505 
    506 /*
    507  * Concurrent access to a qp-trie.
    508  *
    509  * The `reader` pointer provides wait-free access to the current version
    510  * of the trie. See the "packed reader nodes" section below for a
    511  * description of what it points to.
    512  *
    513  * The main object under the protection of the mutex is the `writer`
    514  * containing all the allocator state. There can be a backup copy when
    515  * we want to be able to rollback an update transaction.
    516  *
    517  * There is a `reader_ref` which corresponds to the `reader` pointer
    518  * (`ref_ptr(multi->reader_ref) == multi->reader`). The `reader_ref` is
    519  * necessary when freeing the space used by the reader, because there
    520  * isn't a good way to recover a dns_qpref_t from a dns_qpnode_t pointer.
    521  *
    522  * There is a per-trie list of snapshots that is used for reclaiming
    523  * memory when a snapshot is destroyed.
    524  *
    525  * Finally, we maintain a global list of `dns_qpmulti_t` objects that
    526  * need asynchronous safe memory recovery.
    527  */
struct dns_qpmulti {
	uint32_t magic; /*% QPMULTI_MAGIC */
	/*% RCU-protected pointer to current packed reader */
	dns_qpnode_t *reader;
	/*% the mutex protects the rest of this structure */
	isc_mutex_t mutex;
	/*% ref_ptr(writer, reader_ref) == reader */
	dns_qpref_t reader_ref;
	/*% the main working structure */
	dns_qp_t writer;
	/*% saved allocator state to support rollback */
	dns_qp_t *rollback;
	/*% all snapshots of this trie */
	ISC_LIST(dns_qpsnap_t) snapshots;
	/*% refcount for memory reclamation */
	isc_refcount_t references;
};
    545 
    546 /***********************************************************************
    547  *
    548  *  interior node constructors and accessors
    549  */
    550 
    551 /*
    552  * See the comments under "interior node basics" above, which explain
    553  * the layout of nodes as implemented by the following functions.
    554  *
    555  * These functions are (mostly) constructors and getters. Imagine how
    556  * much less code there would be if C had sum types with control over
    557  * the layout...
    558  */
    559 
    560 /*
    561  * Get the 64-bit word of a node.
    562  */
    563 static inline uint64_t
    564 node64(dns_qpnode_t *n) {
    565 	uint64_t lo = n->biglo;
    566 	uint64_t hi = n->bighi;
    567 	return lo | (hi << 32);
    568 }
    569 
/*
 * Get the 32-bit word of a node (a leaf's ival or a branch's twigs ref).
 */
static inline uint32_t
node32(dns_qpnode_t *n) {
	return n->small;
}
    577 
    578 /*
    579  * Create a node from its parts
    580  */
    581 static inline dns_qpnode_t
    582 make_node(uint64_t big, uint32_t small) {
    583 	return (dns_qpnode_t){
    584 		.biglo = (uint32_t)(big),
    585 		.bighi = (uint32_t)(big >> 32),
    586 		.small = small,
    587 	};
    588 }
    589 
    590 /*
    591  * Extract a pointer from a node's 64 bit word. The double cast is to avoid
    592  * a warning about mismatched pointer/integer sizes on 32 bit systems.
    593  */
    594 static inline void *
    595 node_pointer(dns_qpnode_t *n) {
    596 	return (void *)(uintptr_t)(node64(n) & ~TAG_MASK);
    597 }
    598 
    599 /*
    600  * Examine a node's tag bits
    601  */
    602 static inline uint32_t
    603 node_tag(dns_qpnode_t *n) {
    604 	return n->biglo & TAG_MASK;
    605 }
    606 
    607 /*
    608  * simplified for the hot path
    609  */
    610 static inline bool
    611 is_branch(dns_qpnode_t *n) {
    612 	return n->biglo & BRANCH_TAG;
    613 }
    614 
    615 /* leaf nodes *********************************************************/
    616 
    617 /*
    618  * Get a leaf's pointer value.
    619  */
static inline void *
leaf_pval(dns_qpnode_t *n) {
	/* leaf tag bits are zero, so the masked word is the whole pointer */
	return node_pointer(n);
}
    624 
    625 /*
    626  * Get a leaf's integer value
    627  */
static inline uint32_t
leaf_ival(dns_qpnode_t *n) {
	/* a leaf's 32-bit word is the caller's integer value */
	return node32(n);
}
    632 
    633 /*
    634  * Create a leaf node from its parts
    635  */
    636 static inline dns_qpnode_t
    637 make_leaf(const void *pval, uint32_t ival) {
    638 	dns_qpnode_t leaf = make_node((uintptr_t)pval, ival);
    639 	REQUIRE(node_tag(&leaf) == LEAF_TAG);
    640 	return leaf;
    641 }
    642 
    643 /* branch nodes *******************************************************/
    644 
    645 /*
    646  * The following function names use plural `twigs` when they work on a
    647  * branch's twigs vector as a whole, and singular `twig` when they work on
    648  * a particular twig.
    649  */
    650 
    651 /*
    652  * Get a branch node's index word
    653  */
static inline uint64_t
branch_index(dns_qpnode_t *n) {
	/* index word: type tag, bitmap, and key offset (see SHIFT_* enum) */
	return node64(n);
}
    658 
    659 /*
    660  * Get a reference to a branch node's child twigs.
    661  */
static inline dns_qpref_t
branch_twigs_ref(dns_qpnode_t *n) {
	/* a branch's 32-bit word refers to its twigs vector */
	return node32(n);
}
    666 
    667 /*
    668  * Bit positions in the bitmap come directly from the key. DNS names are
    669  * converted to keys using the tables declared at the end of this file.
    670  */
    671 static inline dns_qpshift_t
    672 qpkey_bit(const dns_qpkey_t key, size_t len, size_t offset) {
    673 	if (offset < len) {
    674 		return key[offset];
    675 	} else {
    676 		return SHIFT_NOBYTE;
    677 	}
    678 }
    679 
    680 /*
    681  * Extract a branch node's offset field, used to index the key.
    682  */
    683 static inline size_t
    684 branch_key_offset(dns_qpnode_t *n) {
    685 	return (size_t)(branch_index(n) >> SHIFT_OFFSET);
    686 }
    687 
    688 /*
    689  * Which bit identifies the twig of this node for this key?
    690  */
    691 static inline dns_qpshift_t
    692 branch_keybit(dns_qpnode_t *n, const dns_qpkey_t key, size_t len) {
    693 	return qpkey_bit(key, len, branch_key_offset(n));
    694 }
    695 
    696 /*
    697  * Get a pointer to a the first twig of a branch (this also functions
    698  * as a pointer to the entire twig vector).
    699  */
    700 static inline dns_qpnode_t *
    701 branch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
    702 	return ref_ptr(qpr, branch_twigs_ref(n));
    703 }
    704 
    705 /*
    706  * Warm up the cache while calculating which twig we want.
    707  */
    708 static inline void
    709 prefetch_twigs(dns_qpreadable_t qpr, dns_qpnode_t *n) {
    710 	__builtin_prefetch(ref_ptr(qpr, branch_twigs_ref(n)));
    711 }
    712 
    713 /* root node **********************************************************/
    714 
    715 /*
    716  * Get a pointer to the root node, checking if the trie is empty.
    717  */
    718 static inline dns_qpnode_t *
    719 get_root(dns_qpreadable_t qpr) {
    720 	dns_qpreader_t *qp = dns_qpreader(qpr);
    721 	if (qp->root_ref == INVALID_REF) {
    722 		return NULL;
    723 	} else {
    724 		return ref_ptr(qp, qp->root_ref);
    725 	}
    726 }
    727 
/*
 * When we need to move the root node, we avoid repeating allocation
 * logistics by making a temporary fake branch node that has
 *	`branch_twigs_size() == 1 && branch_twigs_ref() == root_ref`
 * just enough to treat the root node as a vector of one twig.
 * (The bitmap contains only the SHIFT_NOBYTE bit, so its weight is 1.)
 */
#define MOVABLE_ROOT(qp)                                   \
	(&(dns_qpnode_t){                                  \
		.biglo = BRANCH_TAG | (1 << SHIFT_NOBYTE), \
		.small = qp->root_ref,                     \
	})
    739 
    740 /***********************************************************************
    741  *
    742  *  bitmap popcount shenanigans
    743  */
    744 
    745 /*
    746  * How many twigs appear in the vector before the one corresponding to the
    747  * given bit? Calculated using popcount of part of the branch's bitmap.
    748  *
    749  * To calculate a mask that covers the lesser bits in the bitmap,
    750  * we subtract 1 to set all lesser bits, and subtract the tag mask
    751  * because the type tag is not part of the bitmap.
    752  */
    753 static inline dns_qpweight_t
    754 branch_count_bitmap_before(dns_qpnode_t *n, dns_qpshift_t bit) {
    755 	uint64_t mask = (1ULL << bit) - 1 - TAG_MASK;
    756 	uint64_t bitmap = branch_index(n) & mask;
    757 	return (dns_qpweight_t)__builtin_popcountll(bitmap);
    758 }
    759 
    760 /*
    761  * How many twigs does this branch have?
    762  *
    763  * The offset is directly after the bitmap so the offset's lesser bits
    764  * covers the whole bitmap, and the bitmap's weight is the number of twigs.
    765  */
static inline dns_qpweight_t
branch_twigs_size(dns_qpnode_t *n) {
	/* the bits below the offset field are exactly the whole bitmap */
	return branch_count_bitmap_before(n, SHIFT_OFFSET);
}
    770 
    771 /*
    772  * Position of a twig within the packed sparse vector.
    773  */
    774 static inline dns_qpweight_t
    775 branch_twig_pos(dns_qpnode_t *n, dns_qpshift_t bit) {
    776 	return branch_count_bitmap_before(n, bit);
    777 }
    778 
    779 /*
    780  * Get a pointer to the twig for a given bit number.
    781  */
    782 static inline dns_qpnode_t *
    783 branch_twig_ptr(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpshift_t bit) {
    784 	return ref_ptr(qpr, branch_twigs_ref(n) + branch_twig_pos(n, bit));
    785 }
    786 
    787 /*
    788  * Is the twig identified by this bit present?
    789  */
    790 static inline bool
    791 branch_has_twig(dns_qpnode_t *n, dns_qpshift_t bit) {
    792 	return branch_index(n) & (1ULL << bit);
    793 }
    794 
    795 /* twig logistics *****************************************************/
    796 
    797 static inline void
    798 move_twigs(dns_qpnode_t *to, dns_qpnode_t *from, dns_qpweight_t size) {
    799 	memmove(to, from, size * sizeof(dns_qpnode_t));
    800 }
    801 
    802 static inline void
    803 zero_twigs(dns_qpnode_t *twigs, dns_qpweight_t size) {
    804 	memset(twigs, 0, size * sizeof(dns_qpnode_t));
    805 }
    806 
    807 /***********************************************************************
    808  *
    809  *  packed reader nodes
    810  */
    811 
    812 /*
    813  * The purpose of these packed reader nodes is to simplify safe memory
    814  * reclamation for a multithreaded qp-trie.
    815  *
    816  * After the `reader` pointer in a qpmulti is replaced, we need to wait
    817  * for a grace period before we can reclaim the memory that is no longer
    818  * needed by the trie. So we need some kind of structure to hold
    819  * pointers to the (logically) detached memory until it is safe to free.
    820  * This memory includes the chunks and the `base` arrays.
    821  *
    822  * Packed reader nodes save us from having to track `dns_qpread_t`
    823  * objects as distinct allocations: the packed reader nodes get
    824  * reclaimed when the the chunk containing their cells is reclaimed.
    825  * When a real `dns_qpread_t` object is needed, it is allocated on the
    826  * stack (it must not live longer than a isc_loop callback) and the
    827  * packed reader is unpacked into it.
    828  *
    829  * Chunks are owned by the current `base` array, so unused chunks are
    830  * held there until they are free()d. Old `base` arrays are attached
    831  * to packed reader nodes with a refcount. When a chunk is reclaimed,
    832  * it is scanned so that `chunk_free()` can call `detach_leaf()` on
    833  * any remaining references to leaf objects. Similarly, it calls
    834  * `qpbase_unref()` to reclaim old `base` arrays.
    835  */
    836 
    837 /*
    838  * Two nodes is just enough space for the information needed by
    839  * readers and for deferred memory reclamation.
    840  */
    841 #define READER_SIZE 2
    842 
    843 /*
    844  * Create a packed reader; space for the reader should have been
    845  * allocated using `alloc_twigs(&multi->writer, READER_SIZE)`.
    846  */
    847 static inline void
    848 make_reader(dns_qpnode_t *reader, dns_qpmulti_t *multi) {
    849 	dns_qp_t *qp = &multi->writer;
    850 	reader[0] = make_node(READER_TAG | (uintptr_t)multi, QPREADER_MAGIC);
    851 	reader[1] = make_node(READER_TAG | (uintptr_t)qp->base, qp->root_ref);
    852 }
    853 
    854 static inline bool
    855 reader_valid(dns_qpnode_t *reader) {
    856 	return reader != NULL && //
    857 	       node_tag(&reader[0]) == READER_TAG &&
    858 	       node_tag(&reader[1]) == READER_TAG &&
    859 	       node32(&reader[0]) == QPREADER_MAGIC;
    860 }
    861 
    862 /*
    863  * Verify and unpack a reader. We return the `multi` pointer to use in
    864  * consistency checks.
    865  */
static inline dns_qpmulti_t *
unpack_reader(dns_qpreader_t *qp, dns_qpnode_t *reader) {
	/* validate the packed node pair before dereferencing anything */
	INSIST(reader_valid(reader));
	/* node 0 holds the owning multi; node 1 holds the old base array */
	dns_qpmulti_t *multi = node_pointer(&reader[0]);
	dns_qpbase_t *base = node_pointer(&reader[1]);
	INSIST(QPMULTI_VALID(multi));
	INSIST(QPBASE_VALID(base));
	/*
	 * Whole-struct assignment: members not named here are zeroed.
	 * The root ref was saved in node 1's 32-bit word by make_reader().
	 */
	*qp = (dns_qpreader_t){
		.magic = QP_MAGIC,
		.uctx = multi->writer.uctx,
		.methods = multi->writer.methods,
		.root_ref = node32(&reader[1]),
		.base = base,
	};
	return multi;
}
    882 
    883 /***********************************************************************
    884  *
    885  *  method invocation helpers
    886  */
    887 
    888 static inline void
    889 attach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
    890 	dns_qpreader_t *qp = dns_qpreader(qpr);
    891 	qp->methods->attach(qp->uctx, leaf_pval(n), leaf_ival(n));
    892 }
    893 
    894 static inline void
    895 detach_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n) {
    896 	dns_qpreader_t *qp = dns_qpreader(qpr);
    897 	qp->methods->detach(qp->uctx, leaf_pval(n), leaf_ival(n));
    898 }
    899 
    900 static inline size_t
    901 leaf_qpkey(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpkey_t key) {
    902 	dns_qpreader_t *qp = dns_qpreader(qpr);
    903 	size_t len = qp->methods->makekey(key, qp->uctx, leaf_pval(n),
    904 					  leaf_ival(n));
    905 	INSIST(len < sizeof(dns_qpkey_t));
    906 	return len;
    907 }
    908 
    909 static inline char *
    910 triename(dns_qpreadable_t qpr, char *buf, size_t size) {
    911 	dns_qpreader_t *qp = dns_qpreader(qpr);
    912 	qp->methods->triename(qp->uctx, buf, size);
    913 	return buf;
    914 }
    915 
/*
 * Call triename() with a compound-literal scratch buffer of the
 * maximum name size, so callers need not declare a buffer themselves.
 */
#define TRIENAME(qp) \
	triename(qp, (char[DNS_QP_TRIENAME_MAX]){}, DNS_QP_TRIENAME_MAX)
    918 
    919 /***********************************************************************
    920  *
    921  *  converting DNS names to trie keys
    922  */
    923 
    924 /*
    925  * This is a deliberate simplification of the hostname characters,
    926  * because it doesn't matter much if we treat a few extra characters
    927  * favourably: there is plenty of space in the index word for a
    928  * slightly larger bitmap.
    929  */
static inline bool
qp_common_character(uint8_t byte) {
	/* covers '-' '.' '/' and digits */
	bool digit_ish = byte >= '-' && byte <= '9';
	/* covers '_' '`' and lowercase letters */
	bool alpha_ish = byte >= '_' && byte <= 'z';
	return digit_ish || alpha_ish;
}
    934 
    935 /*
    936  * Lookup table mapping bytes in DNS names to bit positions, used
    937  * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
    938  */
    939 extern uint16_t dns_qp_bits_for_byte[];
    940 
    941 /*
    942  * And the reverse, mapping bit positions to characters, so the tests
    943  * can print diagnostics involving qp-trie keys.
    944  */
    945 extern uint8_t dns_qp_byte_for_bit[];
    946 
    947 /**********************************************************************/
    948