Home | History | Annotate | Line # | Download | only in dns
      1 /*	$NetBSD: qp.c,v 1.5 2026/01/29 18:37:49 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * SPDX-License-Identifier: MPL-2.0
      7  *
      8  * This Source Code Form is subject to the terms of the Mozilla Public
      9  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  *
     12  * See the COPYRIGHT file distributed with this work for additional
     13  * information regarding copyright ownership.
     14  */
     15 
     16 /*
     17  * For an overview, see doc/design/qp-trie.md
     18  */
     19 
     20 #include <inttypes.h>
     21 #include <stdbool.h>
     22 #include <stddef.h>
     23 #include <stdint.h>
     24 #include <string.h>
     25 
     26 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
     27 #include <sys/mman.h>
     28 #include <unistd.h>
     29 #endif
     30 
     31 #include <isc/atomic.h>
     32 #include <isc/buffer.h>
     33 #include <isc/magic.h>
     34 #include <isc/mem.h>
     35 #include <isc/mutex.h>
     36 #include <isc/refcount.h>
     37 #include <isc/result.h>
     38 #include <isc/rwlock.h>
     39 #include <isc/tid.h>
     40 #include <isc/time.h>
     41 #include <isc/types.h>
     42 #include <isc/urcu.h>
     43 #include <isc/util.h>
     44 
     45 #include <dns/fixedname.h>
     46 #include <dns/log.h>
     47 #include <dns/name.h>
     48 #include <dns/qp.h>
     49 #include <dns/types.h>
     50 
     51 #include "qp_p.h"
     52 
     53 #ifndef DNS_QP_LOG_STATS
     54 #define DNS_QP_LOG_STATS 0
     55 #endif
     56 #ifndef DNS_QP_TRACE
     57 #define DNS_QP_TRACE 0
     58 #endif
     59 
     60 /*
     61  * very basic garbage collector statistics
     62  *
     63  * XXXFANF for now we're logging GC times, but ideally we should
     64  * accumulate stats more quietly and report via the statschannel
     65  */
     66 #ifdef _LP64
     67 static atomic_uint_fast64_t compact_time;
     68 static atomic_uint_fast64_t recycle_time;
     69 static atomic_uint_fast64_t rollback_time;
     70 #define ISC_QP_ADD(v, a) atomic_fetch_add_relaxed(&(v), (a))
     71 #define ISC_QP_GET(v) atomic_load_relaxed(&(v))
     72 #else
     73 static uint64_t compact_time;
     74 static uint64_t recycle_time;
     75 static uint64_t rollback_time;
     76 static isc_mutex_t qp_mutex = PTHREAD_MUTEX_INITIALIZER;
     77 #define ISC_QP_ADD(v, a) \
     78 	({ \
     79 		isc_mutex_lock(&qp_mutex); \
     80 		uint64_t x = (v) + (a); \
     81 		isc_mutex_unlock(&qp_mutex); \
     82 		x; \
     83 	})
     84 #define ISC_QP_GET(v) \
     85 	({ \
     86 		isc_mutex_lock(&qp_mutex); \
     87 		uint64_t x = (v); \
     88 		isc_mutex_unlock(&qp_mutex); \
     89 		x; \
     90 	})
     91 #endif
     92 
     93 
     94 /* for LOG_STATS() format strings */
     95 #define PRItime " %" PRIu64 " ns "
     96 
     97 #if DNS_QP_LOG_STATS
     98 #define LOG_STATS(...)                                                      \
     99 	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \
    100 		      ISC_LOG_DEBUG(1), __VA_ARGS__)
    101 #else
    102 #define LOG_STATS(...)
    103 #endif
    104 
    105 #if DNS_QP_TRACE
    106 /*
    107  * TRACE is generally used in allocation-related functions so it doesn't
    108  * trace very high-frequency ops
    109  */
    110 #define TRACE(fmt, ...)                                                        \
    111 	do {                                                                   \
    112 		if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) {            \
    113 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,      \
    114 				      DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7),      \
    115 				      "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \
    116 				      __FILE__, __LINE__, __func__, qp,        \
    117 				      qp ? TRIENAME(qp) : "(null)", isc_tid(), \
    118 				      ##__VA_ARGS__);                          \
    119 		}                                                              \
    120 	} while (0)
    121 #else
    122 #define TRACE(...)
    123 #endif
    124 
    125 #if DNS_QPMULTI_TRACE
    126 ISC_REFCOUNT_STATIC_TRACE_DECL(dns_qpmulti);
    127 #define dns_qpmulti_ref(ptr) dns_qpmulti__ref(ptr, __func__, __FILE__, __LINE__)
    128 #define dns_qpmulti_unref(ptr) \
    129 	dns_qpmulti__unref(ptr, __func__, __FILE__, __LINE__)
    130 #define dns_qpmulti_attach(ptr, ptrp) \
    131 	dns_qpmulti__attach(ptr, ptrp, __func__, __FILE__, __LINE__)
    132 #define dns_qpmulti_detach(ptrp) \
    133 	dns_qpmulti__detach(ptrp, __func__, __FILE__, __LINE__)
    134 #else
    135 ISC_REFCOUNT_STATIC_DECL(dns_qpmulti);
    136 #endif
    137 
    138 /***********************************************************************
    139  *
    140  *  converting DNS names to trie keys
    141  */
    142 
    143 /*
    144  * Number of distinct byte values, i.e. 256
    145  */
    146 #define BYTE_VALUES (UINT8_MAX + 1)
    147 
    148 /*
    149  * Lookup table mapping bytes in DNS names to bit positions, used
    150  * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
    151  *
    152  * Each element holds one or two bit positions, bit_one in the
    153  * lower half and bit_two in the upper half.
    154  *
    155  * For common hostname characters, bit_two is zero (which cannot
    156  * be a valid bit position).
    157  *
    158  * For others, bit_one is the escape bit, and bit_two is the
    159  * position of the character within the escaped range.
    160  */
    161 uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 };
    162 
    163 /*
    164  * And the reverse, mapping bit positions to characters, so the tests
    165  * can print diagnostics involving qp-trie keys.
    166  *
    167  * This table only handles the first bit in an escape sequence; we
    168  * arrange that we can calculate the byte value for both bits by
    169  * adding the the second bit to the first bit's byte value.
    170  */
    171 uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 };
    172 
    173 /*
    174  * Fill in the lookup tables at program startup. (It doesn't matter
    175  * when this is initialized relative to other startup code.)
    176  */
    177 static void
    178 initialize_bits_for_byte(void) ISC_CONSTRUCTOR;
    179 
    180 /*
    181  * The bit positions for bytes inside labels have to be between
    182  * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.)
    183  *
    184  * Each byte range in between common hostname characters has a different
    185  * escape character, to preserve the correct lexical order.
    186  *
    187  * Escaped byte ranges mostly fit into the space available in the
    188  * bitmap, except for those above 'z' (which is mostly bytes with the
    189  * top bit set). So, when we reach the end of the bitmap we roll over
    190  * to the next escape character.
    191  *
    192  * After filling the table we ensure that the bit positions for
    193  * hostname characters and escape characters all fit.
    194  */
    195 static void
    196 initialize_bits_for_byte(void) {
    197 	/* zero common character marker not a valid shift position */
    198 	INSIST(0 < SHIFT_BITMAP);
    199 	/* first bit is common byte or escape byte */
    200 	dns_qpshift_t bit_one = SHIFT_BITMAP;
    201 	/* second bit is position in escaped range */
    202 	dns_qpshift_t bit_two = SHIFT_BITMAP;
    203 	bool escaping = true;
    204 
    205 	for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) {
    206 		if (qp_common_character(byte)) {
    207 			escaping = false;
    208 			bit_one++;
    209 			dns_qp_byte_for_bit[bit_one] = byte;
    210 			dns_qp_bits_for_byte[byte] = bit_one;
    211 		} else if ('A' <= byte && byte <= 'Z') {
    212 			/* map upper case to lower case */
    213 			dns_qpshift_t after_esc = bit_one + 1;
    214 			dns_qpshift_t skip_punct = 'a' - '_';
    215 			dns_qpshift_t letter = byte - 'A';
    216 			dns_qpshift_t bit = after_esc + skip_punct + letter;
    217 			dns_qp_bits_for_byte[byte] = bit;
    218 			/* to simplify reverse conversion */
    219 			bit_two++;
    220 		} else {
    221 			/* non-hostname characters need to be escaped */
    222 			if (!escaping || bit_two >= SHIFT_OFFSET) {
    223 				escaping = true;
    224 				bit_one++;
    225 				dns_qp_byte_for_bit[bit_one] = byte;
    226 				bit_two = SHIFT_BITMAP;
    227 			}
    228 			dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one;
    229 			bit_two++;
    230 		}
    231 	}
    232 	ENSURE(bit_one < SHIFT_OFFSET);
    233 }
    234 
    235 /*
    236  * Convert a DNS name into a trie lookup key.
    237  *
    238  * Returns the length of the key.
    239  *
    240  * For performance we get our hands dirty in the guts of the name.
    241  *
    242  * We don't worry about the distinction between absolute and relative
    243  * names. When the trie is only used with absolute names, the first byte
    244  * of the key will always be SHIFT_NOBYTE and it will always be skipped
    245  * when traversing the trie. So keeping the root label costs little, and
    246  * it allows us to support tries of relative names too. In fact absolute
    247  * and relative names can be mixed in the same trie without causing
    248  * confusion, because the presence or absence of the initial
    249  * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing
    250  * dot in a zone file).
    251  */
    252 size_t
    253 dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) {
    254 	size_t len, label;
    255 	dns_fixedname_t fixed;
    256 
    257 	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
    258 
    259 	if (name->labels == 0) {
    260 		key[0] = SHIFT_NOBYTE;
    261 		return 0;
    262 	}
    263 
    264 	if (name->offsets == NULL) {
    265 		dns_name_t *clone = dns_fixedname_initname(&fixed);
    266 		dns_name_clone(name, clone);
    267 		name = clone;
    268 	}
    269 
    270 	len = 0;
    271 	label = name->labels;
    272 	while (label-- > 0) {
    273 		const uint8_t *ldata = name->ndata + name->offsets[label];
    274 		size_t label_len = *ldata++;
    275 		while (label_len-- > 0) {
    276 			uint16_t bits = dns_qp_bits_for_byte[*ldata++];
    277 			key[len++] = bits & 0xFF;	/* bit_one */
    278 			if ((bits >> 8) != 0) {		/* escape? */
    279 				key[len++] = bits >> 8; /* bit_two */
    280 			}
    281 		}
    282 		/* label terminator */
    283 		key[len++] = SHIFT_NOBYTE;
    284 	}
    285 	/* mark end with a double NOBYTE */
    286 	key[len] = SHIFT_NOBYTE;
    287 	ENSURE(len < sizeof(dns_qpkey_t));
    288 	return len;
    289 }
    290 
    291 void
    292 dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) {
    293 	size_t locs[DNS_NAME_MAXLABELS];
    294 	size_t loc = 0, opos = 0;
    295 	size_t offset;
    296 
    297 	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
    298 	REQUIRE(name->buffer != NULL);
    299 	REQUIRE(name->offsets != NULL);
    300 
    301 	dns_name_reset(name);
    302 
    303 	if (keylen == 0) {
    304 		return;
    305 	}
    306 
    307 	/* Scan the key looking for label boundaries */
    308 	for (offset = 0; offset <= keylen; offset++) {
    309 		INSIST(key[offset] >= SHIFT_NOBYTE &&
    310 		       key[offset] < SHIFT_OFFSET);
    311 		INSIST(loc < DNS_NAME_MAXLABELS);
    312 		if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) {
    313 			if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE)
    314 			{
    315 				locs[loc] = offset + 1;
    316 				goto scanned;
    317 			}
    318 			locs[loc++] = offset + 1;
    319 		} else if (offset == 0) {
    320 			/* This happens for a relative name */
    321 			locs[loc++] = offset;
    322 		}
    323 	}
    324 	UNREACHABLE();
    325 scanned:
    326 
    327 	/*
    328 	 * In the key the labels are encoded in reverse order, so
    329 	 * we step backward through the label boundaries, then forward
    330 	 * through the labels, to create the DNS wire format data.
    331 	 */
    332 	name->labels = loc;
    333 	while (loc-- > 0) {
    334 		uint8_t len = 0, *lenp = NULL;
    335 
    336 		/* Add a length byte to the name data and set an offset */
    337 		lenp = isc_buffer_used(name->buffer);
    338 		isc_buffer_putuint8(name->buffer, 0);
    339 		name->offsets[opos++] = name->length++;
    340 
    341 		/* Convert from escaped byte ranges to ASCII */
    342 		for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) {
    343 			uint8_t bit = qpkey_bit(key, keylen, offset);
    344 			uint8_t byte = dns_qp_byte_for_bit[bit];
    345 			if (qp_common_character(byte)) {
    346 				isc_buffer_putuint8(name->buffer, byte);
    347 			} else {
    348 				byte += key[++offset] - SHIFT_BITMAP;
    349 				isc_buffer_putuint8(name->buffer, byte);
    350 			}
    351 			len++;
    352 		}
    353 
    354 		name->length += len;
    355 		*lenp = len;
    356 	}
    357 
    358 	/* Add a root label for absolute names */
    359 	if (key[0] == SHIFT_NOBYTE) {
    360 		name->attributes.absolute = true;
    361 		isc_buffer_putuint8(name->buffer, 0);
    362 		name->offsets[opos++] = name->length++;
    363 		name->labels++;
    364 	}
    365 
    366 	name->ndata = isc_buffer_base(name->buffer);
    367 }
    368 
    369 /*
    370  * Sentinel value for equal keys
    371  */
    372 #define QPKEY_EQUAL (~(size_t)0)
    373 
    374 /*
    375  * Compare two keys and return the offset where they differ.
    376  *
    377  * This offset is used to work out where a trie search diverged: when one
    378  * of the keys is in the trie and one is not, the common prefix (up to the
    379  * offset) is the part of the unknown key that exists in the trie. This
    380  * matters for adding new keys or finding neighbours of missing keys.
    381  *
    382  * When the keys are different lengths it is possible (but unwise) for
    383  * the longer key to be the same as the shorter key but with superfluous
    384  * trailing SHIFT_NOBYTE elements. This makes the keys equal for the
    385  * purpose of traversing the trie.
    386  */
    387 static size_t
    388 qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a,
    389 	      const dns_qpkey_t key_b, const size_t keylen_b) {
    390 	size_t keylen = ISC_MAX(keylen_a, keylen_b);
    391 	for (size_t offset = 0; offset < keylen; offset++) {
    392 		if (qpkey_bit(key_a, keylen_a, offset) !=
    393 		    qpkey_bit(key_b, keylen_b, offset))
    394 		{
    395 			return offset;
    396 		}
    397 	}
    398 	return QPKEY_EQUAL;
    399 }
    400 
    401 /***********************************************************************
    402  *
    403  *  allocator wrappers
    404  */
    405 
    406 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    407 
    408 /*
    409  * Optionally (for debugging) during a copy-on-write transaction, use
    410  * memory protection to ensure that the shared chunks are not modified.
    411  * Once a chunk becomes shared, it remains read-only until it is freed.
    412  * POSIX says we have to use mmap() to get an allocation that we can
    413  * definitely pass to mprotect().
    414  */
    415 
    416 static size_t
    417 chunk_size_raw(void) {
    418 	size_t size = (size_t)sysconf(_SC_PAGE_SIZE);
    419 	return ISC_MAX(size, QP_CHUNK_BYTES);
    420 }
    421 
    422 static void *
    423 chunk_get_raw(dns_qp_t *qp) {
    424 	if (qp->write_protect) {
    425 		size_t size = chunk_size_raw();
    426 		void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
    427 				 MAP_ANON | MAP_PRIVATE, -1, 0);
    428 		RUNTIME_CHECK(ptr != MAP_FAILED);
    429 		return ptr;
    430 	} else {
    431 		return isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES);
    432 	}
    433 }
    434 
    435 static void
    436 chunk_free_raw(dns_qp_t *qp, void *ptr) {
    437 	if (qp->write_protect) {
    438 		RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0);
    439 	} else {
    440 		isc_mem_free(qp->mctx, ptr);
    441 	}
    442 }
    443 
    444 static void *
    445 chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) {
    446 	if (qp->write_protect) {
    447 		return ptr;
    448 	} else {
    449 		return isc_mem_reallocate(qp->mctx, ptr, bytes);
    450 	}
    451 }
    452 
    453 static void
    454 write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) {
    455 	if (qp->write_protect) {
    456 		/* see transaction_open() wrt this special case */
    457 		if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) {
    458 			return;
    459 		}
    460 		TRACE("chunk %u", chunk);
    461 		void *ptr = qp->base->ptr[chunk];
    462 		size_t size = chunk_size_raw();
    463 		RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0);
    464 	}
    465 }
    466 
    467 #else
    468 
    469 #define chunk_get_raw(qp, size) isc_mem_allocate(qp->mctx, size)
    470 #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr)
    471 
    472 #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size)
    473 
    474 #define write_protect(qp, chunk)
    475 
    476 #endif
    477 
    478 /***********************************************************************
    479  *
    480  *  allocator
    481  */
    482 
    483 /*
    484  * When we reuse the bump chunk across multiple write transactions,
    485  * it can have an immutable prefix and a mutable suffix.
    486  */
    487 static inline bool
    488 cells_immutable(dns_qp_t *qp, dns_qpref_t ref) {
    489 	dns_qpchunk_t chunk = ref_chunk(ref);
    490 	dns_qpcell_t cell = ref_cell(ref);
    491 	if (chunk == qp->bump) {
    492 		return cell < qp->fender;
    493 	} else {
    494 		return qp->usage[chunk].immutable;
    495 	}
    496 }
    497 
    498 /*
    499  * Find the next power that is both bigger than size and prev_capacity,
    500  * but still within the chunk min and max sizes.
    501  */
    502 static dns_qpcell_t
    503 next_capacity(uint32_t prev_capacity, uint32_t size) {
    504 	/*
    505 	 * Unfortunately builtin_clz is undefined for 0. We work around this
    506 	 * issue by flooring the request size at 2.
    507 	 */
    508 	size = ISC_MAX3(size, prev_capacity, 2u);
    509 	uint32_t log2 = 32u - __builtin_clz(size - 1u);
    510 
    511 	return 1U << ISC_CLAMP(log2, QP_CHUNK_LOG_MIN, QP_CHUNK_LOG_MAX);
    512 }
    513 
    514 /*
    515  * Create a fresh bump chunk and allocate some twigs from it.
    516  */
    517 static dns_qpref_t
    518 chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) {
    519 	INSIST(qp->base->ptr[chunk] == NULL);
    520 	INSIST(qp->usage[chunk].used == 0);
    521 	INSIST(qp->usage[chunk].free == 0);
    522 	INSIST(qp->chunk_capacity <= QP_CHUNK_SIZE);
    523 
    524 	qp->chunk_capacity = next_capacity(qp->chunk_capacity * 2u, size);
    525 	qp->base->ptr[chunk] =
    526 		chunk_get_raw(qp, qp->chunk_capacity * sizeof(dns_qpnode_t));
    527 
    528 	qp->usage[chunk] = (qp_usage_t){ .exists = true,
    529 					 .used = size,
    530 					 .capacity = qp->chunk_capacity };
    531 	qp->used_count += size;
    532 	qp->bump = chunk;
    533 	qp->fender = 0;
    534 
    535 	if (qp->write_protect) {
    536 		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
    537 	}
    538 	return make_ref(chunk, 0);
    539 }
    540 
    541 /*
    542  * This is used to grow the chunk arrays when they fill up. If the old
    543  * base array is in use by readers, we must make a clone, otherwise we
    544  * can reallocate in place.
    545  *
    546  * The isc_refcount_init() and qpbase_unref() in this function are a pair.
    547  */
    548 static void
    549 realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) {
    550 	size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max;
    551 	size_t newptrs = sizeof(qp->base->ptr[0]) * newmax;
    552 	size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax);
    553 
    554 	if (qp->base == NULL || qpbase_unref(qp)) {
    555 		qp->base = isc_mem_reallocate(qp->mctx, qp->base, size);
    556 	} else {
    557 		dns_qpbase_t *oldbase = qp->base;
    558 		qp->base = isc_mem_allocate(qp->mctx, size);
    559 		memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs);
    560 	}
    561 	memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs);
    562 	isc_refcount_init(&qp->base->refcount, 1);
    563 	qp->base->magic = QPBASE_MAGIC;
    564 
    565 	/* usage array is exclusive to the writer */
    566 	size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max;
    567 	size_t newusage = sizeof(qp->usage[0]) * newmax;
    568 	qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage);
    569 	memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage);
    570 
    571 	qp->chunk_max = newmax;
    572 
    573 	TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max);
    574 }
    575 
    576 /*
    577  * There was no space in the bump chunk, so find a place to put a fresh
    578  * chunk in the chunk arrays, then allocate some twigs from it.
    579  */
    580 static dns_qpref_t
    581 alloc_slow(dns_qp_t *qp, dns_qpweight_t size) {
    582 	dns_qpchunk_t chunk;
    583 
    584 	for (chunk = 0; chunk < qp->chunk_max; chunk++) {
    585 		if (!qp->usage[chunk].exists) {
    586 			return chunk_alloc(qp, chunk, size);
    587 		}
    588 	}
    589 	ENSURE(chunk == qp->chunk_max);
    590 	realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk));
    591 	return chunk_alloc(qp, chunk, size);
    592 }
    593 
    594 /*
    595  * Ensure we are using a fresh bump chunk.
    596  */
    597 static void
    598 alloc_reset(dns_qp_t *qp) {
    599 	(void)alloc_slow(qp, 0);
    600 }
    601 
    602 /*
    603  * Allocate some fresh twigs. This is the bump allocator fast path.
    604  */
    605 static inline dns_qpref_t
    606 alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) {
    607 	dns_qpchunk_t chunk = qp->bump;
    608 	dns_qpcell_t cell = qp->usage[chunk].used;
    609 
    610 	if (cell + size <= qp->usage[chunk].capacity) {
    611 		qp->usage[chunk].used += size;
    612 		qp->used_count += size;
    613 		return make_ref(chunk, cell);
    614 	} else {
    615 		return alloc_slow(qp, size);
    616 	}
    617 }
    618 
    619 /*
    620  * Record that some twigs are no longer being used, and if possible
    621  * zero them to ensure that there isn't a spurious double detach when
    622  * the chunk is later recycled.
    623  *
    624  * Returns true if the twigs were immediately destroyed.
    625  *
    626  * NOTE: the caller is responsible for attaching or detaching any
    627  * leaves as required.
    628  */
    629 static inline bool
    630 free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
    631 	dns_qpchunk_t chunk = ref_chunk(twigs);
    632 
    633 	qp->free_count += size;
    634 	qp->usage[chunk].free += size;
    635 	ENSURE(qp->free_count <= qp->used_count);
    636 	ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used);
    637 
    638 	if (cells_immutable(qp, twigs)) {
    639 		qp->hold_count += size;
    640 		ENSURE(qp->free_count >= qp->hold_count);
    641 		return false;
    642 	} else {
    643 		zero_twigs(ref_ptr(qp, twigs), size);
    644 		return true;
    645 	}
    646 }
    647 
    648 /*
    649  * When some twigs have been copied, and free_twigs() could not
    650  * immediately destroy the old copy, we need to update the refcount
    651  * on any leaves that were duplicated.
    652  */
    653 static void
    654 attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) {
    655 	for (dns_qpweight_t pos = 0; pos < size; pos++) {
    656 		if (node_tag(&twigs[pos]) == LEAF_TAG) {
    657 			attach_leaf(qp, &twigs[pos]);
    658 		}
    659 	}
    660 }
    661 
    662 /***********************************************************************
    663  *
    664  *  chunk reclamation
    665  */
    666 
    667 /*
    668  * Is any of this chunk still in use?
    669  */
    670 static inline dns_qpcell_t
    671 chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) {
    672 	return qp->usage[chunk].used - qp->usage[chunk].free;
    673 }
    674 
    675 /*
    676  * We remove each empty chunk from the total counts when the chunk is
    677  * freed, or when it is scheduled for safe memory reclamation. We check
    678  * the chunk's phase to avoid discounting it twice in the latter case.
    679  */
    680 static void
    681 chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) {
    682 	if (qp->usage[chunk].discounted) {
    683 		return;
    684 	}
    685 	INSIST(qp->used_count >= qp->usage[chunk].used);
    686 	INSIST(qp->free_count >= qp->usage[chunk].free);
    687 	qp->used_count -= qp->usage[chunk].used;
    688 	qp->free_count -= qp->usage[chunk].free;
    689 	qp->usage[chunk].discounted = true;
    690 }
    691 
    692 /*
    693  * When a chunk is being recycled, we need to detach any leaves that
    694  * remain, and free any `base` arrays that have been marked as unused.
    695  */
    696 static void
    697 chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) {
    698 	if (qp->write_protect) {
    699 		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
    700 	}
    701 
    702 	dns_qpnode_t *n = qp->base->ptr[chunk];
    703 	for (dns_qpcell_t count = qp->usage[chunk].used; count > 0;
    704 	     count--, n++)
    705 	{
    706 		if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) {
    707 			detach_leaf(qp, n);
    708 		} else if (count > 1 && reader_valid(n)) {
    709 			dns_qpreader_t qpr;
    710 			unpack_reader(&qpr, n);
    711 			/* pairs with dns_qpmulti_commit() */
    712 			if (qpbase_unref(&qpr)) {
    713 				isc_mem_free(qp->mctx, qpr.base);
    714 			}
    715 		}
    716 	}
    717 	chunk_discount(qp, chunk);
    718 	chunk_free_raw(qp, qp->base->ptr[chunk]);
    719 	qp->base->ptr[chunk] = NULL;
    720 	qp->usage[chunk] = (qp_usage_t){};
    721 }
    722 
    723 /*
    724  * Free any chunks that we can while a trie is in use.
    725  */
    726 static void
    727 recycle(dns_qp_t *qp) {
    728 	unsigned int nfree = 0;
    729 
    730 	isc_nanosecs_t start = isc_time_monotonic();
    731 
    732 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
    733 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
    734 		    qp->usage[chunk].exists && !qp->usage[chunk].immutable)
    735 		{
    736 			chunk_free(qp, chunk);
    737 			nfree++;
    738 		}
    739 	}
    740 
    741 	isc_nanosecs_t time = isc_time_monotonic() - start;
    742 	ISC_QP_ADD(recycle_time, time);
    743 
    744 	if (nfree > 0) {
    745 		LOG_STATS("qp recycle" PRItime "free %u chunks", time, nfree);
    746 		LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u",
    747 			  qp->leaf_count, qp->used_count - qp->free_count,
    748 			  qp->used_count, qp->free_count, qp->hold_count);
    749 	}
    750 }
    751 
    752 /*
    753  * asynchronous cleanup
    754  */
    755 static void
    756 reclaim_chunks_cb(struct rcu_head *arg) {
    757 	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
    758 	REQUIRE(QPRCU_VALID(rcuctx));
    759 	dns_qpmulti_t *multi = rcuctx->multi;
    760 	REQUIRE(QPMULTI_VALID(multi));
    761 
    762 	LOCK(&multi->mutex);
    763 	dns_qp_t *qp = &multi->writer;
    764 
    765 	/*
    766 	 * If chunk_max is zero, chunks have already been freed.
    767 	 */
    768 	if (qp->chunk_max != 0) {
    769 		unsigned int free = 0;
    770 		isc_nanosecs_t start = isc_time_monotonic();
    771 
    772 		INSIST(QP_VALID(qp));
    773 
    774 		for (unsigned int i = 0; i < rcuctx->count; i++) {
    775 			dns_qpchunk_t chunk = rcuctx->chunk[i];
    776 			if (qp->usage[chunk].snapshot) {
    777 				/* clean up when snapshot is destroyed */
    778 				qp->usage[chunk].snapfree = true;
    779 			} else {
    780 				chunk_free(qp, chunk);
    781 				free++;
    782 			}
    783 		}
    784 
    785 		isc_nanosecs_t time = isc_time_monotonic() - start;
    786 		recycle_time += time;
    787 
    788 		if (free > 0) {
    789 			LOG_STATS("qp reclaim" PRItime "free %u chunks", time,
    790 				  free);
    791 			LOG_STATS(
    792 				"qp reclaim leaf %u live %u used %u free %u "
    793 				"hold %u",
    794 				qp->leaf_count, qp->used_count - qp->free_count,
    795 				qp->used_count, qp->free_count, qp->hold_count);
    796 		}
    797 	}
    798 
    799 	UNLOCK(&multi->mutex);
    800 
    801 	dns_qpmulti_detach(&multi);
    802 	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
    803 			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
    804 }
    805 
    806 /*
    807  * At the end of a transaction, schedule empty but immutable chunks
    808  * for reclamation later.
    809  */
    810 static void
    811 reclaim_chunks(dns_qpmulti_t *multi) {
    812 	dns_qp_t *qp = &multi->writer;
    813 
    814 	unsigned int count = 0;
    815 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
    816 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
    817 		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
    818 		    !qp->usage[chunk].discounted)
    819 		{
    820 			count++;
    821 		}
    822 	}
    823 
    824 	if (count == 0) {
    825 		return;
    826 	}
    827 
    828 	qp_rcuctx_t *rcuctx =
    829 		isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count));
    830 	*rcuctx = (qp_rcuctx_t){
    831 		.magic = QPRCU_MAGIC,
    832 		.multi = multi,
    833 		.count = count,
    834 	};
    835 	isc_mem_attach(qp->mctx, &rcuctx->mctx);
    836 
    837 	unsigned int i = 0;
    838 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
    839 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
    840 		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
    841 		    !qp->usage[chunk].discounted)
    842 		{
    843 			rcuctx->chunk[i++] = chunk;
    844 			chunk_discount(qp, chunk);
    845 		}
    846 	}
    847 
    848 	/*
    849 	 * Reference the qpmulti object to keep it from being
    850 	 * freed until reclaim_chunks_cb() runs.
    851 	 */
    852 	dns_qpmulti_ref(multi);
    853 	call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb);
    854 
    855 	LOG_STATS("qp will reclaim %u chunks", count);
    856 }
    857 
    858 /*
    859  * When a snapshot is destroyed, clean up chunks that need free()ing
    860  * and are not used by any remaining snapshots.
    861  */
    862 static void
    863 marksweep_chunks(dns_qpmulti_t *multi) {
    864 	unsigned int nfree = 0;
    865 
    866 	isc_nanosecs_t start = isc_time_monotonic();
    867 
    868 	dns_qp_t *qpw = &multi->writer;
    869 
    870 	for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL;
    871 	     qps = ISC_LIST_NEXT(qps, link))
    872 	{
    873 		for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) {
    874 			if (qps->base->ptr[chunk] != NULL) {
    875 				INSIST(qps->base->ptr[chunk] ==
    876 				       qpw->base->ptr[chunk]);
    877 				qpw->usage[chunk].snapmark = true;
    878 			}
    879 		}
    880 	}
    881 
    882 	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
    883 		qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark;
    884 		qpw->usage[chunk].snapmark = false;
    885 		if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) {
    886 			chunk_free(qpw, chunk);
    887 			nfree++;
    888 		}
    889 	}
    890 
    891 	isc_nanosecs_t time = isc_time_monotonic() - start;
    892 	ISC_QP_ADD(recycle_time, time);
    893 
    894 	if (nfree > 0) {
    895 		LOG_STATS("qp marksweep" PRItime "free %u chunks", time, nfree);
    896 		LOG_STATS(
    897 			"qp marksweep leaf %u live %u used %u free %u hold %u",
    898 			qpw->leaf_count, qpw->used_count - qpw->free_count,
    899 			qpw->used_count, qpw->free_count, qpw->hold_count);
    900 	}
    901 }
    902 
    903 /***********************************************************************
    904  *
    905  *  garbage collector
    906  */
    907 
    908 /*
    909  * Move a branch node's twigs to the `bump` chunk, for copy-on-write
    910  * or for garbage collection. We don't update the node in place
    911  * because `compact_recursive()` does not ensure the node itself is
    912  * mutable until after it discovers evacuation was necessary.
    913  *
    914  * If free_twigs() could not immediately destroy the old twigs, we have
    915  * to re-attach to any leaves.
    916  */
    917 static dns_qpref_t
    918 evacuate(dns_qp_t *qp, dns_qpnode_t *n) {
    919 	dns_qpweight_t size = branch_twigs_size(n);
    920 	dns_qpref_t old_ref = branch_twigs_ref(n);
    921 	dns_qpref_t new_ref = alloc_twigs(qp, size);
    922 	dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref);
    923 	dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref);
    924 
    925 	move_twigs(new_twigs, old_twigs, size);
    926 	if (!free_twigs(qp, old_ref, size)) {
    927 		attach_twigs(qp, new_twigs, size);
    928 	}
    929 
    930 	return new_ref;
    931 }
    932 
    933 /*
    934  * Immutable nodes need copy-on-write. As we walk down the trie finding the
    935  * right place to modify, make_root_mutable() and make_twigs_mutable()
    936  * are called to ensure that immutable nodes on the path from the root are
    937  * copied to a mutable chunk.
    938  */
    939 
    940 static inline dns_qpnode_t *
    941 make_root_mutable(dns_qp_t *qp) {
    942 	if (cells_immutable(qp, qp->root_ref)) {
    943 		qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp));
    944 	}
    945 	return ref_ptr(qp, qp->root_ref);
    946 }
    947 
    948 static inline void
    949 make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) {
    950 	if (cells_immutable(qp, branch_twigs_ref(n))) {
    951 		*n = make_node(branch_index(n), evacuate(qp, n));
    952 	}
    953 }
    954 
    955 /*
    956  * Compact the trie by traversing the whole thing recursively, copying
    957  * bottom-up as required. The aim is to avoid evacuation as much as
    958  * possible, but when parts of the trie are immutable, we need to evacuate
    959  * the paths from the root to the parts of the trie that occupy
    960  * fragmented chunks.
    961  *
    962  * Without the QP_MIN_USED check, the algorithm will leave the trie
    963  * unchanged. If the children are all leaves, the loop changes nothing,
    964  * so we will return this node's original ref. If all of the children
    965  * that are branches did not need moving, again, the loop changes
    966  * nothing. So the evacuation check is the only place that the
    967  * algorithm introduces ref changes, that then bubble up towards the
    968  * root through the logic inside the loop.
    969  */
    970 static dns_qpref_t
    971 compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) {
    972 	dns_qpweight_t size = branch_twigs_size(parent);
    973 	dns_qpref_t twigs_ref = branch_twigs_ref(parent);
    974 	dns_qpchunk_t chunk = ref_chunk(twigs_ref);
    975 
    976 	if (qp->compact_all ||
    977 	    (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED))
    978 	{
    979 		twigs_ref = evacuate(qp, parent);
    980 	}
    981 	bool immutable = cells_immutable(qp, twigs_ref);
    982 	for (dns_qpweight_t pos = 0; pos < size; pos++) {
    983 		dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos;
    984 		if (!is_branch(child)) {
    985 			continue;
    986 		}
    987 		dns_qpref_t old_grandtwigs = branch_twigs_ref(child);
    988 		dns_qpref_t new_grandtwigs = compact_recursive(qp, child);
    989 		if (old_grandtwigs == new_grandtwigs) {
    990 			continue;
    991 		}
    992 		if (immutable) {
    993 			twigs_ref = evacuate(qp, parent);
    994 			/* the twigs have moved */
    995 			child = ref_ptr(qp, twigs_ref) + pos;
    996 			immutable = false;
    997 		}
    998 		*child = make_node(branch_index(child), new_grandtwigs);
    999 	}
   1000 	return twigs_ref;
   1001 }
   1002 
   1003 static void
   1004 compact(dns_qp_t *qp) {
   1005 	LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u",
   1006 		  qp->leaf_count, qp->used_count - qp->free_count,
   1007 		  qp->used_count, qp->free_count, qp->hold_count);
   1008 
   1009 	isc_nanosecs_t start = isc_time_monotonic();
   1010 
   1011 	if (qp->usage[qp->bump].free > QP_MAX_FREE) {
   1012 		alloc_reset(qp);
   1013 	}
   1014 
   1015 	if (qp->leaf_count > 0) {
   1016 		qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp));
   1017 	}
   1018 	qp->compact_all = false;
   1019 
   1020 	isc_nanosecs_t time = isc_time_monotonic() - start;
   1021 	ISC_QP_ADD(compact_time, time);
   1022 
   1023 	LOG_STATS("qp compact" PRItime
   1024 		  "leaf %u live %u used %u free %u hold %u",
   1025 		  time, qp->leaf_count, qp->used_count - qp->free_count,
   1026 		  qp->used_count, qp->free_count, qp->hold_count);
   1027 }
   1028 
   1029 void
   1030 dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) {
   1031 	REQUIRE(QP_VALID(qp));
   1032 	if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) {
   1033 		return;
   1034 	}
   1035 	if (mode == DNS_QPGC_ALL) {
   1036 		alloc_reset(qp);
   1037 		qp->compact_all = true;
   1038 	}
   1039 	compact(qp);
   1040 	recycle(qp);
   1041 }
   1042 
   1043 /*
   1044  * Free some twigs and (if they were destroyed immediately so that the
   1045  * result from QP_MAX_GARBAGE can change) compact the trie if necessary.
   1046  *
   1047  * This is called by the trie modification API entry points. The
   1048  * free_twigs() function requires the caller to attach or detach any
   1049  * leaves as necessary. Callers of squash_twigs() satisfy this
   1050  * requirement by calling make_twigs_mutable().
   1051  *
   1052  * Aside: In typical garbage collectors, compaction is triggered when
   1053  * the allocator runs out of space. But that is because typical garbage
   1054  * collectors do not know how much memory can be recovered, so they must
   1055  * find out by scanning the heap. The qp-trie code was originally
   1056  * designed to use malloc() and free(), so it has more information about
   1057  * when garbage collection might be worthwhile. Hence we can trigger
   1058  * collection when garbage passes a threshold.
   1059  *
   1060  * XXXFANF: If we need to avoid latency outliers caused by compaction in
   1061  * write transactions, we can check qp->transaction_mode here.
   1062  */
   1063 static inline bool
   1064 squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
   1065 	bool destroyed = free_twigs(qp, twigs, size);
   1066 	if (destroyed && QP_AUTOGC(qp)) {
   1067 		compact(qp);
   1068 		recycle(qp);
   1069 		/*
   1070 		 * This shouldn't happen if the garbage collector is
   1071 		 * working correctly. We can recover at the cost of some
   1072 		 * time and space, but recovery should be cheaper than
   1073 		 * letting compact+recycle fail repeatedly.
   1074 		 */
   1075 		if (QP_AUTOGC(qp)) {
   1076 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
   1077 				      DNS_LOGMODULE_QP, ISC_LOG_NOTICE,
   1078 				      "qp %p uctx \"%s\" compact/recycle "
   1079 				      "failed to recover any space, "
   1080 				      "scheduling a full compaction",
   1081 				      qp, TRIENAME(qp));
   1082 			qp->compact_all = true;
   1083 		}
   1084 	}
   1085 	return destroyed;
   1086 }
   1087 
   1088 /***********************************************************************
   1089  *
   1090  *  public accessors for memory management internals
   1091  */
   1092 
   1093 dns_qp_memusage_t
   1094 dns_qp_memusage(dns_qp_t *qp) {
   1095 	REQUIRE(QP_VALID(qp));
   1096 
   1097 	dns_qp_memusage_t memusage = {
   1098 		.uctx = qp->uctx,
   1099 		.leaves = qp->leaf_count,
   1100 		.live = qp->used_count - qp->free_count,
   1101 		.used = qp->used_count,
   1102 		.hold = qp->hold_count,
   1103 		.free = qp->free_count,
   1104 		.node_size = sizeof(dns_qpnode_t),
   1105 		.fragmented = QP_NEEDGC(qp),
   1106 	};
   1107 
   1108 	size_t chunk_usage_bytes = 0;
   1109 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1110 		if (qp->base->ptr[chunk] != NULL) {
   1111 			chunk_usage_bytes += qp->usage[chunk].capacity;
   1112 			memusage.chunk_count += 1;
   1113 		}
   1114 	}
   1115 
   1116 	/*
   1117 	 * XXXFANF does not subtract chunks that have been shrunk,
   1118 	 * and does not count unreclaimed dns_qpbase_t objects
   1119 	 */
   1120 	memusage.bytes = chunk_usage_bytes +
   1121 			 qp->chunk_max * sizeof(qp->base->ptr[0]) +
   1122 			 qp->chunk_max * sizeof(qp->usage[0]);
   1123 
   1124 	return memusage;
   1125 }
   1126 
   1127 dns_qp_memusage_t
   1128 dns_qpmulti_memusage(dns_qpmulti_t *multi) {
   1129 	REQUIRE(QPMULTI_VALID(multi));
   1130 	LOCK(&multi->mutex);
   1131 
   1132 	dns_qp_t *qp = &multi->writer;
   1133 	INSIST(QP_VALID(qp));
   1134 
   1135 	dns_qp_memusage_t memusage = dns_qp_memusage(qp);
   1136 
   1137 	if (qp->transaction_mode == QP_UPDATE && qp->usage != NULL) {
   1138 		memusage.bytes -= qp->usage[qp->bump].capacity;
   1139 		memusage.bytes += qp->usage[qp->bump].used *
   1140 				  sizeof(dns_qpnode_t);
   1141 	}
   1142 
   1143 	UNLOCK(&multi->mutex);
   1144 	return memusage;
   1145 }
   1146 
   1147 void
   1148 dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p,
   1149 	      isc_nanosecs_t *rollback_p) {
   1150 	*compact_p = ISC_QP_GET(compact_time);
   1151 	*recycle_p = ISC_QP_GET(recycle_time);
   1152 	*rollback_p = ISC_QP_GET(rollback_time);
   1153 }
   1154 
   1155 /***********************************************************************
   1156  *
   1157  *  read-write transactions
   1158  */
   1159 
   1160 static dns_qp_t *
   1161 transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1162 	REQUIRE(QPMULTI_VALID(multi));
   1163 	REQUIRE(qptp != NULL && *qptp == NULL);
   1164 
   1165 	LOCK(&multi->mutex);
   1166 
   1167 	dns_qp_t *qp = &multi->writer;
   1168 	INSIST(QP_VALID(qp));
   1169 
   1170 	/*
   1171 	 * Mark existing chunks as immutable.
   1172 	 *
   1173 	 * Aside: The bump chunk is special: in a series of write
   1174 	 * transactions the bump chunk is reused; the first part (up
   1175 	 * to fender) is immutable, the rest mutable. But we set its
   1176 	 * immutable flag so that when the bump chunk fills up, the
   1177 	 * first part continues to be treated as immutable. (And the
   1178 	 * rest of the chunk too, but that's OK.)
   1179 	 */
   1180 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1181 		if (qp->usage[chunk].exists) {
   1182 			qp->usage[chunk].immutable = true;
   1183 			write_protect(qp, chunk);
   1184 		}
   1185 	}
   1186 
   1187 	/*
   1188 	 * Ensure QP_AUTOGC() ignores free space in immutable chunks.
   1189 	 */
   1190 	qp->hold_count = qp->free_count;
   1191 
   1192 	*qptp = qp;
   1193 	return qp;
   1194 }
   1195 
   1196 /*
   1197  * a write is light
   1198  *
   1199  * We need to ensure we allocate from a fresh chunk if the last transaction
   1200  * shrunk the bump chunk; but usually in a sequence of write transactions
   1201  * we just put `fender` at the point where we started this generation.
   1202  *
   1203  * (Aside: Instead of keeping the previous transaction's mode, I
   1204  * considered forcing allocation into the slow path by fiddling with
   1205  * the bump chunk's usage counters. But that is troublesome because
   1206  * `chunk_free()` needs to know how much of the chunk to scan.)
   1207  */
   1208 void
   1209 dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1210 	dns_qp_t *qp = transaction_open(multi, qptp);
   1211 	TRACE("");
   1212 
   1213 	if (qp->transaction_mode == QP_WRITE) {
   1214 		qp->fender = qp->usage[qp->bump].used;
   1215 	} else {
   1216 		alloc_reset(qp);
   1217 	}
   1218 	qp->transaction_mode = QP_WRITE;
   1219 }
   1220 
   1221 /*
   1222  * an update is heavier
   1223  *
   1224  * We always reset the allocator to the start of a fresh chunk,
   1225  * because the previous transaction was probably an update that shrunk
   1226  * the bump chunk. It simplifies rollback because `fender` is always zero.
   1227  *
   1228  * To rollback a transaction, we need to reset all the allocation
   1229  * counters to their previous state, in particular we need to un-free
   1230  * any nodes that were copied to make them mutable. This means we need
   1231  * to make a copy of basically the whole `dns_qp_t writer`: everything
   1232  * but the chunks holding the trie nodes.
   1233  *
   1234  * We do most of the transaction setup before creating the rollback
   1235  * state so that after rollback we have a correct idea of which chunks
   1236  * are immutable, and so we have the correct transaction mode to make
   1237  * the next transaction allocate a new bump chunk. The exception is
   1238  * resetting the allocator, which we do after creating the rollback
   1239  * state; if this transaction is rolled back then the next transaction
   1240  * will start from the rollback state and also reset the allocator as
   1241  * one of its first actions.
   1242  */
   1243 void
   1244 dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1245 	dns_qp_t *qp = transaction_open(multi, qptp);
   1246 	TRACE("");
   1247 
   1248 	qp->transaction_mode = QP_UPDATE;
   1249 
   1250 	dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback));
   1251 	memmove(rollback, qp, sizeof(*rollback));
   1252 	/* can be uninitialized on the first transaction */
   1253 	if (rollback->base != NULL) {
   1254 		INSIST(QPBASE_VALID(rollback->base));
   1255 		INSIST(qp->usage != NULL && qp->chunk_max > 0);
   1256 		/* paired with either _commit() or _rollback() */
   1257 		isc_refcount_increment(&rollback->base->refcount);
   1258 		size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max;
   1259 		rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes);
   1260 		memmove(rollback->usage, qp->usage, usage_bytes);
   1261 	}
   1262 	INSIST(multi->rollback == NULL);
   1263 	multi->rollback = rollback;
   1264 
   1265 	alloc_reset(qp);
   1266 }
   1267 
   1268 void
   1269 dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1270 	REQUIRE(QPMULTI_VALID(multi));
   1271 	REQUIRE(qptp != NULL && *qptp == &multi->writer);
   1272 	REQUIRE(multi->writer.transaction_mode == QP_WRITE ||
   1273 		multi->writer.transaction_mode == QP_UPDATE);
   1274 
   1275 	dns_qp_t *qp = *qptp;
   1276 	TRACE("");
   1277 
   1278 	if (qp->transaction_mode == QP_UPDATE) {
   1279 		INSIST(multi->rollback != NULL);
   1280 		/* paired with dns_qpmulti_update() */
   1281 		if (qpbase_unref(multi->rollback)) {
   1282 			isc_mem_free(qp->mctx, multi->rollback->base);
   1283 		}
   1284 		if (multi->rollback->usage != NULL) {
   1285 			isc_mem_free(qp->mctx, multi->rollback->usage);
   1286 		}
   1287 		isc_mem_free(qp->mctx, multi->rollback);
   1288 	}
   1289 	INSIST(multi->rollback == NULL);
   1290 
   1291 	/* not the first commit? */
   1292 	if (multi->reader_ref != INVALID_REF) {
   1293 		INSIST(cells_immutable(qp, multi->reader_ref));
   1294 		free_twigs(qp, multi->reader_ref, READER_SIZE);
   1295 	}
   1296 
   1297 	if (qp->transaction_mode == QP_UPDATE) {
   1298 		/* minimize memory overhead */
   1299 		compact(qp);
   1300 		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
   1301 		qp->base->ptr[qp->bump] = chunk_shrink_raw(
   1302 			qp, qp->base->ptr[qp->bump],
   1303 			qp->usage[qp->bump].used * sizeof(dns_qpnode_t));
   1304 	} else {
   1305 		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
   1306 	}
   1307 
   1308 	/* anchor a new version of the trie */
   1309 	dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref);
   1310 	make_reader(reader, multi);
   1311 	/* paired with chunk_free() */
   1312 	isc_refcount_increment(&qp->base->refcount);
   1313 
   1314 	rcu_assign_pointer(multi->reader, reader); /* COMMIT */
   1315 
   1316 	/* clean up what we can right now */
   1317 	if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) {
   1318 		recycle(qp);
   1319 	}
   1320 
   1321 	/* schedule the rest for later */
   1322 	reclaim_chunks(multi);
   1323 
   1324 	*qptp = NULL;
   1325 	UNLOCK(&multi->mutex);
   1326 }
   1327 
   1328 /*
   1329  * Throw away everything that was allocated during this transaction.
   1330  */
   1331 void
   1332 dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1333 	/*
   1334 	 * nfree is only used when logging stats, hence the attribute.
   1335 	 */
   1336 	unsigned int nfree ISC_ATTR_UNUSED = 0;
   1337 
   1338 	REQUIRE(QPMULTI_VALID(multi));
   1339 	REQUIRE(multi->writer.transaction_mode == QP_UPDATE);
   1340 	REQUIRE(qptp != NULL && *qptp == &multi->writer);
   1341 
   1342 	dns_qp_t *qp = *qptp;
   1343 	TRACE("");
   1344 
   1345 	isc_nanosecs_t start = isc_time_monotonic();
   1346 
   1347 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1348 		if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable)
   1349 		{
   1350 			chunk_free(qp, chunk);
   1351 			/*
   1352 			 * we need to clear its base pointer in the rollback
   1353 			 * trie, in case the arrays were resized
   1354 			 */
   1355 			if (chunk < multi->rollback->chunk_max) {
   1356 				INSIST(!multi->rollback->usage[chunk].exists);
   1357 				multi->rollback->base->ptr[chunk] = NULL;
   1358 			}
   1359 			nfree++;
   1360 		}
   1361 	}
   1362 
   1363 	/*
   1364 	 * multi->rollback->base and multi->writer->base are the same,
   1365 	 * unless there was a realloc_chunk_arrays() during the transaction
   1366 	 */
   1367 	if (qpbase_unref(qp)) {
   1368 		/* paired with dns_qpmulti_update() */
   1369 		isc_mem_free(qp->mctx, qp->base);
   1370 	}
   1371 	isc_mem_free(qp->mctx, qp->usage);
   1372 
   1373 	/* reset allocator state */
   1374 	INSIST(multi->rollback != NULL);
   1375 	memmove(qp, multi->rollback, sizeof(*qp));
   1376 	isc_mem_free(qp->mctx, multi->rollback);
   1377 	INSIST(multi->rollback == NULL);
   1378 
   1379 	isc_nanosecs_t time = isc_time_monotonic() - start;
   1380 	ISC_QP_ADD(rollback_time, time);
   1381 
   1382 	LOG_STATS("qp rollback" PRItime "free %u chunks", time, nfree);
   1383 
   1384 	*qptp = NULL;
   1385 	UNLOCK(&multi->mutex);
   1386 }
   1387 
   1388 /***********************************************************************
   1389  *
   1390  *  read-only transactions
   1391  */
   1392 
   1393 static dns_qpmulti_t *
   1394 reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) {
   1395 	dns_qpreader_t *qp = dns_qpreader(qpr);
   1396 	dns_qpnode_t *reader = rcu_dereference(multi->reader);
   1397 	if (reader == NULL) {
   1398 		QP_INIT(qp, multi->writer.methods, multi->writer.uctx);
   1399 	} else {
   1400 		multi = unpack_reader(qp, reader);
   1401 	}
   1402 	return multi;
   1403 }
   1404 
   1405 /*
   1406  * a query is light
   1407  */
   1408 
   1409 void
   1410 dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) {
   1411 	REQUIRE(QPMULTI_VALID(multi));
   1412 	REQUIRE(qp != NULL);
   1413 
   1414 	qp->tid = isc_tid();
   1415 	rcu_read_lock();
   1416 
   1417 	dns_qpmulti_t *whence = reader_open(multi, qp);
   1418 	INSIST(whence == multi);
   1419 }
   1420 
   1421 void
   1422 dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) {
   1423 	REQUIRE(QPMULTI_VALID(multi));
   1424 	REQUIRE(QP_VALID(qp));
   1425 	REQUIRE(qp->tid == isc_tid());
   1426 	*qp = (dns_qpread_t){};
   1427 	rcu_read_unlock();
   1428 }
   1429 
   1430 /*
   1431  * a snapshot is heavy
   1432  */
   1433 
   1434 void
   1435 dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
   1436 	REQUIRE(QPMULTI_VALID(multi));
   1437 	REQUIRE(qpsp != NULL && *qpsp == NULL);
   1438 
   1439 	rcu_read_lock();
   1440 
   1441 	LOCK(&multi->mutex);
   1442 
   1443 	dns_qp_t *qpw = &multi->writer;
   1444 	size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) +
   1445 		       sizeof(qpw->base->ptr[0]) * qpw->chunk_max;
   1446 	dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes);
   1447 
   1448 	qps->whence = reader_open(multi, qps);
   1449 	INSIST(qps->whence == multi);
   1450 
   1451 	/* not a separate allocation */
   1452 	qps->base = (dns_qpbase_t *)(qps + 1);
   1453 	isc_refcount_init(&qps->base->refcount, 0);
   1454 
   1455 	/*
   1456 	 * only copy base pointers of chunks we need, so we can
   1457 	 * reclaim unused memory in dns_qpsnap_destroy()
   1458 	 */
   1459 	qps->chunk_max = qpw->chunk_max;
   1460 	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
   1461 		if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) {
   1462 			qpw->usage[chunk].snapshot = true;
   1463 			qps->base->ptr[chunk] = qpw->base->ptr[chunk];
   1464 		} else {
   1465 			qps->base->ptr[chunk] = NULL;
   1466 		}
   1467 	}
   1468 	ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link);
   1469 
   1470 	*qpsp = qps;
   1471 	UNLOCK(&multi->mutex);
   1472 
   1473 	rcu_read_unlock();
   1474 }
   1475 
   1476 void
   1477 dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
   1478 	REQUIRE(QPMULTI_VALID(multi));
   1479 	REQUIRE(qpsp != NULL && *qpsp != NULL);
   1480 
   1481 	LOCK(&multi->mutex);
   1482 
   1483 	dns_qpsnap_t *qp = *qpsp;
   1484 
   1485 	/* make sure the API is being used correctly */
   1486 	REQUIRE(qp->whence == multi);
   1487 
   1488 	ISC_LIST_UNLINK(multi->snapshots, qp, link);
   1489 
   1490 	/*
   1491 	 * eagerly reclaim chunks that are now unused, so that memory does
   1492 	 * not accumulate when a trie has a lot of updates and snapshots
   1493 	 */
   1494 	marksweep_chunks(multi);
   1495 
   1496 	isc_mem_free(multi->writer.mctx, qp);
   1497 
   1498 	*qpsp = NULL;
   1499 	UNLOCK(&multi->mutex);
   1500 }
   1501 
   1502 /***********************************************************************
   1503  *
   1504  *  constructors, destructors
   1505  */
   1506 
   1507 void
   1508 dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
   1509 	      dns_qp_t **qptp) {
   1510 	REQUIRE(qptp != NULL && *qptp == NULL);
   1511 
   1512 	dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp));
   1513 	QP_INIT(qp, methods, uctx);
   1514 	isc_mem_attach(mctx, &qp->mctx);
   1515 	alloc_reset(qp);
   1516 	TRACE("");
   1517 	*qptp = qp;
   1518 }
   1519 
   1520 void
   1521 dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
   1522 		   dns_qpmulti_t **qpmp) {
   1523 	REQUIRE(qpmp != NULL && *qpmp == NULL);
   1524 
   1525 	dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi));
   1526 	*multi = (dns_qpmulti_t){ .magic = QPMULTI_MAGIC,
   1527 				  .reader_ref = INVALID_REF,
   1528 				  .references = ISC_REFCOUNT_INITIALIZER(1) };
   1529 	isc_mutex_init(&multi->mutex);
   1530 	ISC_LIST_INIT(multi->snapshots);
   1531 
   1532 	/*
   1533 	 * Do not waste effort allocating a bump chunk that will be thrown
   1534 	 * away when a transaction is opened. dns_qpmulti_update() always
   1535 	 * allocates; to ensure dns_qpmulti_write() does too, pretend the
   1536 	 * previous transaction was an update
   1537 	 */
   1538 	dns_qp_t *qp = &multi->writer;
   1539 	QP_INIT(qp, methods, uctx);
   1540 	isc_mem_attach(mctx, &qp->mctx);
   1541 	qp->transaction_mode = QP_UPDATE;
   1542 	TRACE("");
   1543 	*qpmp = multi;
   1544 }
   1545 
   1546 static void
   1547 destroy_guts(dns_qp_t *qp) {
   1548 	if (qp->chunk_max == 0) {
   1549 		return;
   1550 	}
   1551 
   1552 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1553 		if (qp->base->ptr[chunk] != NULL) {
   1554 			chunk_free(qp, chunk);
   1555 		}
   1556 	}
   1557 	qp->chunk_max = 0;
   1558 	ENSURE(qp->used_count == 0);
   1559 	ENSURE(qp->free_count == 0);
   1560 	ENSURE(isc_refcount_current(&qp->base->refcount) == 1);
   1561 	isc_mem_free(qp->mctx, qp->base);
   1562 	isc_mem_free(qp->mctx, qp->usage);
   1563 	qp->magic = 0;
   1564 }
   1565 
   1566 void
   1567 dns_qp_destroy(dns_qp_t **qptp) {
   1568 	REQUIRE(qptp != NULL);
   1569 	REQUIRE(QP_VALID(*qptp));
   1570 
   1571 	dns_qp_t *qp = *qptp;
   1572 	*qptp = NULL;
   1573 
   1574 	/* do not try to destroy part of a dns_qpmulti_t */
   1575 	REQUIRE(qp->transaction_mode == QP_NONE);
   1576 
   1577 	TRACE("");
   1578 	destroy_guts(qp);
   1579 	isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp));
   1580 }
   1581 
   1582 static void
   1583 qpmulti_free_mem(dns_qpmulti_t *multi) {
   1584 	REQUIRE(QPMULTI_VALID(multi));
   1585 
   1586 	/* reassure thread sanitizer */
   1587 	LOCK(&multi->mutex);
   1588 	dns_qp_t *qp = &multi->writer;
   1589 	UNLOCK(&multi->mutex);
   1590 
   1591 	isc_mutex_destroy(&multi->mutex);
   1592 	isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
   1593 }
   1594 
   1595 #if QPMULTI_TRACE
   1596 ISC_REFCOUNT_STATIC_TRACE_IMPL(dns_qpmulti, qpmulti_free_mem)
   1597 #else
   1598 ISC_REFCOUNT_STATIC_IMPL(dns_qpmulti, qpmulti_free_mem)
   1599 #endif
   1600 
   1601 static void
   1602 qpmulti_destroy_guts_cb(struct rcu_head *arg) {
   1603 	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
   1604 	REQUIRE(QPRCU_VALID(rcuctx));
   1605 	/* only nonzero for reclaim_chunks_cb() */
   1606 	REQUIRE(rcuctx->count == 0);
   1607 
   1608 	dns_qpmulti_t *multi = rcuctx->multi;
   1609 	REQUIRE(QPMULTI_VALID(multi));
   1610 
   1611 	/* reassure thread sanitizer */
   1612 	LOCK(&multi->mutex);
   1613 
   1614 	dns_qp_t *qp = &multi->writer;
   1615 	REQUIRE(QP_VALID(qp));
   1616 
   1617 	destroy_guts(qp);
   1618 
   1619 	UNLOCK(&multi->mutex);
   1620 
   1621 	dns_qpmulti_detach(&multi);
   1622 	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
   1623 			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
   1624 }
   1625 
   1626 void
   1627 dns_qpmulti_destroy(dns_qpmulti_t **qpmp) {
   1628 	dns_qp_t *qp = NULL;
   1629 	dns_qpmulti_t *multi = NULL;
   1630 	qp_rcuctx_t *rcuctx = NULL;
   1631 
   1632 	REQUIRE(qpmp != NULL);
   1633 	REQUIRE(QPMULTI_VALID(*qpmp));
   1634 
   1635 	multi = *qpmp;
   1636 	qp = &multi->writer;
   1637 	*qpmp = NULL;
   1638 
   1639 	REQUIRE(QP_VALID(qp));
   1640 	REQUIRE(multi->rollback == NULL);
   1641 	REQUIRE(ISC_LIST_EMPTY(multi->snapshots));
   1642 
   1643 	rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0));
   1644 	*rcuctx = (qp_rcuctx_t){
   1645 		.magic = QPRCU_MAGIC,
   1646 		.multi = multi,
   1647 	};
   1648 	isc_mem_attach(qp->mctx, &rcuctx->mctx);
   1649 	call_rcu(&rcuctx->rcu_head, qpmulti_destroy_guts_cb);
   1650 }
   1651 
   1652 /***********************************************************************
   1653  *
   1654  *  modification
   1655  */
   1656 
   1657 isc_result_t
   1658 dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) {
   1659 	dns_qpref_t new_ref, old_ref;
   1660 	dns_qpnode_t new_leaf, old_node;
   1661 	dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL;
   1662 	dns_qpshift_t new_bit, old_bit;
   1663 	dns_qpweight_t old_size, new_size;
   1664 	dns_qpkey_t new_key, old_key;
   1665 	size_t new_keylen, old_keylen;
   1666 	size_t offset;
   1667 	uint64_t index;
   1668 	dns_qpshift_t bit;
   1669 	dns_qpweight_t pos;
   1670 	dns_qpnode_t *n = NULL;
   1671 
   1672 	REQUIRE(QP_VALID(qp));
   1673 
   1674 	new_leaf = make_leaf(pval, ival);
   1675 	new_keylen = leaf_qpkey(qp, &new_leaf, new_key);
   1676 
   1677 	/* first leaf in an empty trie? */
   1678 	if (qp->leaf_count == 0) {
   1679 		new_ref = alloc_twigs(qp, 1);
   1680 		new_twigs = ref_ptr(qp, new_ref);
   1681 		*new_twigs = new_leaf;
   1682 		attach_leaf(qp, new_twigs);
   1683 		qp->leaf_count++;
   1684 		qp->root_ref = new_ref;
   1685 		return ISC_R_SUCCESS;
   1686 	}
   1687 
   1688 	/*
   1689 	 * We need to keep searching down to a leaf even if our key is
   1690 	 * missing from this branch. It doesn't matter which twig we
   1691 	 * choose since the keys are all the same up to this node's
   1692 	 * offset. Note that if we simply use branch_twig_pos(n, bit)
   1693 	 * we may get an out-of-bounds access if our bit is greater
   1694 	 * than all the set bits in the node.
   1695 	 */
   1696 	n = ref_ptr(qp, qp->root_ref);
   1697 	while (is_branch(n)) {
   1698 		prefetch_twigs(qp, n);
   1699 		dns_qpref_t ref = branch_twigs_ref(n);
   1700 		bit = branch_keybit(n, new_key, new_keylen);
   1701 		pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0;
   1702 		n = ref_ptr(qp, ref + pos);
   1703 	}
   1704 
   1705 	/* do the keys differ, and if so, where? */
   1706 	old_keylen = leaf_qpkey(qp, n, old_key);
   1707 	offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen);
   1708 	if (offset == QPKEY_EQUAL) {
   1709 		return ISC_R_EXISTS;
   1710 	}
   1711 	new_bit = qpkey_bit(new_key, new_keylen, offset);
   1712 	old_bit = qpkey_bit(old_key, old_keylen, offset);
   1713 
   1714 	/* find where to insert a branch or grow an existing branch. */
   1715 	n = make_root_mutable(qp);
   1716 	while (is_branch(n)) {
   1717 		prefetch_twigs(qp, n);
   1718 		if (offset < branch_key_offset(n)) {
   1719 			goto newbranch;
   1720 		}
   1721 		if (offset == branch_key_offset(n)) {
   1722 			goto growbranch;
   1723 		}
   1724 		make_twigs_mutable(qp, n);
   1725 		bit = branch_keybit(n, new_key, new_keylen);
   1726 		INSIST(branch_has_twig(n, bit));
   1727 		n = branch_twig_ptr(qp, n, bit);
   1728 	}
   1729 	/* fall through */
   1730 
   1731 newbranch:
   1732 	new_ref = alloc_twigs(qp, 2);
   1733 	new_twigs = ref_ptr(qp, new_ref);
   1734 
   1735 	/* save before overwriting. */
   1736 	old_node = *n;
   1737 
   1738 	/* new branch node takes old node's place */
   1739 	index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) |
   1740 		((uint64_t)offset << SHIFT_OFFSET);
   1741 	*n = make_node(index, new_ref);
   1742 
   1743 	/* populate twigs */
   1744 	new_twigs[old_bit > new_bit] = old_node;
   1745 	new_twigs[new_bit > old_bit] = new_leaf;
   1746 
   1747 	attach_leaf(qp, &new_leaf);
   1748 	qp->leaf_count++;
   1749 
   1750 	return ISC_R_SUCCESS;
   1751 
   1752 growbranch:
   1753 	INSIST(!branch_has_twig(n, new_bit));
   1754 
   1755 	/* locate twigs vectors */
   1756 	old_size = branch_twigs_size(n);
   1757 	new_size = old_size + 1;
   1758 	old_ref = branch_twigs_ref(n);
   1759 	new_ref = alloc_twigs(qp, new_size);
   1760 	old_twigs = ref_ptr(qp, old_ref);
   1761 	new_twigs = ref_ptr(qp, new_ref);
   1762 
   1763 	/* embiggen branch node */
   1764 	index = branch_index(n) | (1ULL << new_bit);
   1765 	*n = make_node(index, new_ref);
   1766 
   1767 	/* embiggen twigs vector */
   1768 	pos = branch_twig_pos(n, new_bit);
   1769 	move_twigs(new_twigs, old_twigs, pos);
   1770 	new_twigs[pos] = new_leaf;
   1771 	move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos);
   1772 
   1773 	if (squash_twigs(qp, old_ref, old_size)) {
   1774 		/* old twigs destroyed, only attach to new leaf */
   1775 		attach_leaf(qp, &new_leaf);
   1776 	} else {
   1777 		/* old twigs duplicated, attach to all leaves */
   1778 		attach_twigs(qp, new_twigs, new_size);
   1779 	}
   1780 	qp->leaf_count++;
   1781 
   1782 	return ISC_R_SUCCESS;
   1783 }
   1784 
   1785 isc_result_t
   1786 dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key,
   1787 		 size_t search_keylen, void **pval_r, uint32_t *ival_r) {
   1788 	REQUIRE(QP_VALID(qp));
   1789 	REQUIRE(search_keylen < sizeof(dns_qpkey_t));
   1790 
   1791 	if (get_root(qp) == NULL) {
   1792 		return ISC_R_NOTFOUND;
   1793 	}
   1794 
   1795 	dns_qpshift_t bit = 0; /* suppress warning */
   1796 	dns_qpnode_t *parent = NULL;
   1797 	dns_qpnode_t *n = make_root_mutable(qp);
   1798 	while (is_branch(n)) {
   1799 		prefetch_twigs(qp, n);
   1800 		bit = branch_keybit(n, search_key, search_keylen);
   1801 		if (!branch_has_twig(n, bit)) {
   1802 			return ISC_R_NOTFOUND;
   1803 		}
   1804 		make_twigs_mutable(qp, n);
   1805 		parent = n;
   1806 		n = branch_twig_ptr(qp, n, bit);
   1807 	}
   1808 
   1809 	dns_qpkey_t found_key;
   1810 	size_t found_keylen = leaf_qpkey(qp, n, found_key);
   1811 	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
   1812 	    QPKEY_EQUAL)
   1813 	{
   1814 		return ISC_R_NOTFOUND;
   1815 	}
   1816 
   1817 	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   1818 	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   1819 	detach_leaf(qp, n);
   1820 	qp->leaf_count--;
   1821 
   1822 	/* trie becomes empty */
   1823 	if (qp->leaf_count == 0) {
   1824 		INSIST(parent == NULL);
   1825 		INSIST(n == get_root(qp));
   1826 		free_twigs(qp, qp->root_ref, 1);
   1827 		qp->root_ref = INVALID_REF;
   1828 		return ISC_R_SUCCESS;
   1829 	}
   1830 
   1831 	/* step back to parent node */
   1832 	n = parent;
   1833 	parent = NULL;
   1834 
   1835 	INSIST(bit != 0);
   1836 	dns_qpweight_t size = branch_twigs_size(n);
   1837 	dns_qpweight_t pos = branch_twig_pos(n, bit);
   1838 	dns_qpref_t ref = branch_twigs_ref(n);
   1839 	dns_qpnode_t *twigs = ref_ptr(qp, ref);
   1840 
   1841 	if (size == 2) {
   1842 		/*
   1843 		 * move the other twig to the parent branch.
   1844 		 */
   1845 		*n = twigs[!pos];
   1846 		squash_twigs(qp, ref, 2);
   1847 	} else {
   1848 		/*
   1849 		 * shrink the twigs in place, to avoid using the bump
   1850 		 * chunk too fast - the gc will clean up after us
   1851 		 */
   1852 		*n = make_node(branch_index(n) & ~(1ULL << bit), ref);
   1853 		move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1);
   1854 		squash_twigs(qp, ref + size - 1, 1);
   1855 	}
   1856 
   1857 	return ISC_R_SUCCESS;
   1858 }
   1859 
   1860 isc_result_t
   1861 dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r,
   1862 		  uint32_t *ival_r) {
   1863 	dns_qpkey_t key;
   1864 	size_t keylen = dns_qpkey_fromname(key, name);
   1865 	return dns_qp_deletekey(qp, key, keylen, pval_r, ival_r);
   1866 }
   1867 
   1868 /***********************************************************************
   1869  *  chains
   1870  */
   1871 static void
   1872 maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) {
   1873 	dns_qpkey_t key;
   1874 	size_t len;
   1875 
   1876 	if (name == NULL) {
   1877 		return;
   1878 	}
   1879 
   1880 	dns_name_reset(name);
   1881 	len = leaf_qpkey(qp, node, key);
   1882 	dns_qpkey_toname(key, len, name);
   1883 }
   1884 
   1885 void
   1886 dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) {
   1887 	dns_qpreader_t *qp = dns_qpreader(qpr);
   1888 	REQUIRE(QP_VALID(qp));
   1889 	REQUIRE(chain != NULL);
   1890 
   1891 	*chain = (dns_qpchain_t){
   1892 		.magic = QPCHAIN_MAGIC,
   1893 		.qp = qp,
   1894 	};
   1895 }
   1896 
   1897 unsigned int
   1898 dns_qpchain_length(dns_qpchain_t *chain) {
   1899 	REQUIRE(QPCHAIN_VALID(chain));
   1900 
   1901 	return chain->len;
   1902 }
   1903 
   1904 void
   1905 dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name,
   1906 		 void **pval_r, uint32_t *ival_r) {
   1907 	dns_qpnode_t *node = NULL;
   1908 
   1909 	REQUIRE(QPCHAIN_VALID(chain));
   1910 	REQUIRE(level < chain->len);
   1911 
   1912 	node = chain->chain[level].node;
   1913 	maybe_set_name(chain->qp, node, name);
   1914 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
   1915 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
   1916 }
   1917 
   1918 /***********************************************************************
   1919  *  iterators
   1920  */
   1921 
   1922 void
   1923 dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) {
   1924 	dns_qpreader_t *qp = dns_qpreader(qpr);
   1925 	REQUIRE(QP_VALID(qp));
   1926 	REQUIRE(qpi != NULL);
   1927 	*qpi = (dns_qpiter_t){
   1928 		.qp = qp,
   1929 		.magic = QPITER_MAGIC,
   1930 	};
   1931 }
   1932 
   1933 /*
   1934  * are we at the last twig in this branch (in whichever direction
   1935  * we're iterating)?
   1936  */
   1937 static bool
   1938 last_twig(dns_qpiter_t *qpi, bool forward) {
   1939 	dns_qpweight_t pos = 0, max = 0;
   1940 	if (qpi->sp > 0) {
   1941 		dns_qpnode_t *child = qpi->stack[qpi->sp];
   1942 		dns_qpnode_t *parent = qpi->stack[qpi->sp - 1];
   1943 		pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent));
   1944 		if (forward) {
   1945 			max = branch_twigs_size(parent) - 1;
   1946 		}
   1947 	}
   1948 	return pos == max;
   1949 }
   1950 
   1951 /*
   1952  * move a QP iterator forward or back to the next or previous leaf.
   1953  * note: this function can go wrong when the iterator refers to
   1954  * a mutable view of the trie which is altered while iterating
   1955  */
   1956 static isc_result_t
   1957 iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   1958 	uint32_t *ival_r) {
   1959 	dns_qpnode_t *node = NULL;
   1960 	bool initial_branch = true;
   1961 
   1962 	REQUIRE(QPITER_VALID(qpi));
   1963 
   1964 	dns_qpreader_t *qp = qpi->qp;
   1965 
   1966 	REQUIRE(QP_VALID(qp));
   1967 
   1968 	node = get_root(qp);
   1969 	if (node == NULL) {
   1970 		return ISC_R_NOMORE;
   1971 	}
   1972 
   1973 	do {
   1974 		if (qpi->stack[qpi->sp] == NULL) {
   1975 			/* newly initialized iterator: use the root node */
   1976 			INSIST(qpi->sp == 0);
   1977 			qpi->stack[0] = node;
   1978 		} else if (!initial_branch) {
   1979 			/*
   1980 			 * in a prior loop, we reached a branch; from
   1981 			 * here we just need to get the highest or lowest
   1982 			 * leaf in the subtree; we don't need to bother
   1983 			 * stepping forward or backward through twigs
   1984 			 * anymore.
   1985 			 */
   1986 			INSIST(qpi->sp > 0);
   1987 		} else if (last_twig(qpi, forward)) {
   1988 			/*
   1989 			 * we've stepped to the end (or the beginning,
   1990 			 * if we're iterating backwards) of a set of twigs.
   1991 			 */
   1992 			if (qpi->sp == 0) {
   1993 				/*
   1994 				 * we've finished iterating. reinitialize
   1995 				 * the iterator, then return ISC_R_NOMORE.
   1996 				 */
   1997 				dns_qpiter_init(qpi->qp, qpi);
   1998 				return ISC_R_NOMORE;
   1999 			}
   2000 
   2001 			/*
   2002 			 * pop the stack, and resume at the parent branch.
   2003 			 */
   2004 			qpi->stack[qpi->sp] = NULL;
   2005 			qpi->sp--;
   2006 			continue;
   2007 		} else {
   2008 			/*
   2009 			 * there are more twigs in the current branch,
   2010 			 * so step the node pointer forward (or back).
   2011 			 */
   2012 			qpi->stack[qpi->sp] += (forward ? 1 : -1);
   2013 			node = qpi->stack[qpi->sp];
   2014 		}
   2015 
   2016 		/*
   2017 		 * if we're at a branch now, we loop down to the
   2018 		 * left- or rightmost leaf.
   2019 		 */
   2020 		if (is_branch(node)) {
   2021 			qpi->sp++;
   2022 			INSIST(qpi->sp < DNS_QP_MAXKEY);
   2023 			node = ref_ptr(qp, branch_twigs_ref(node)) +
   2024 			       (forward ? 0 : branch_twigs_size(node) - 1);
   2025 			qpi->stack[qpi->sp] = node;
   2026 			initial_branch = false;
   2027 		}
   2028 	} while (is_branch(node));
   2029 
   2030 	/* we're at a leaf: return its data to the caller */
   2031 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
   2032 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
   2033 	maybe_set_name(qpi->qp, node, name);
   2034 	return ISC_R_SUCCESS;
   2035 }
   2036 
   2037 isc_result_t
   2038 dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   2039 		uint32_t *ival_r) {
   2040 	return iterate(true, qpi, name, pval_r, ival_r);
   2041 }
   2042 
   2043 isc_result_t
   2044 dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   2045 		uint32_t *ival_r) {
   2046 	return iterate(false, qpi, name, pval_r, ival_r);
   2047 }
   2048 
   2049 isc_result_t
   2050 dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   2051 		   uint32_t *ival_r) {
   2052 	dns_qpnode_t *node = NULL;
   2053 
   2054 	REQUIRE(QPITER_VALID(qpi));
   2055 
   2056 	node = qpi->stack[qpi->sp];
   2057 	if (node == NULL || is_branch(node)) {
   2058 		return ISC_R_FAILURE;
   2059 	}
   2060 
   2061 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
   2062 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
   2063 	maybe_set_name(qpi->qp, node, name);
   2064 	return ISC_R_SUCCESS;
   2065 }
   2066 
   2067 /***********************************************************************
   2068  *
   2069  *  search
   2070  */
   2071 
   2072 isc_result_t
   2073 dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key,
   2074 	      size_t search_keylen, void **pval_r, uint32_t *ival_r) {
   2075 	dns_qpreader_t *qp = dns_qpreader(qpr);
   2076 	dns_qpkey_t found_key;
   2077 	size_t found_keylen;
   2078 	dns_qpshift_t bit;
   2079 	dns_qpnode_t *n = NULL;
   2080 
   2081 	REQUIRE(QP_VALID(qp));
   2082 	REQUIRE(search_keylen < sizeof(dns_qpkey_t));
   2083 
   2084 	n = get_root(qp);
   2085 	if (n == NULL) {
   2086 		return ISC_R_NOTFOUND;
   2087 	}
   2088 
   2089 	while (is_branch(n)) {
   2090 		prefetch_twigs(qp, n);
   2091 		bit = branch_keybit(n, search_key, search_keylen);
   2092 		if (!branch_has_twig(n, bit)) {
   2093 			return ISC_R_NOTFOUND;
   2094 		}
   2095 		n = branch_twig_ptr(qp, n, bit);
   2096 	}
   2097 
   2098 	found_keylen = leaf_qpkey(qp, n, found_key);
   2099 	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
   2100 	    QPKEY_EQUAL)
   2101 	{
   2102 		return ISC_R_NOTFOUND;
   2103 	}
   2104 
   2105 	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   2106 	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   2107 	return ISC_R_SUCCESS;
   2108 }
   2109 
   2110 isc_result_t
   2111 dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r,
   2112 	       uint32_t *ival_r) {
   2113 	dns_qpkey_t key;
   2114 	size_t keylen = dns_qpkey_fromname(key, name);
   2115 	return dns_qp_getkey(qpr, key, keylen, pval_r, ival_r);
   2116 }
   2117 
   2118 static inline void
   2119 add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) {
   2120 	/* prevent duplication */
   2121 	if (chain->len != 0 && chain->chain[chain->len - 1].node == node) {
   2122 		return;
   2123 	}
   2124 	chain->chain[chain->len].node = node;
   2125 	chain->chain[chain->len].offset = offset;
   2126 	chain->len++;
   2127 	INSIST(chain->len <= DNS_NAME_MAXLABELS);
   2128 }
   2129 
   2130 static inline void
   2131 prevleaf(dns_qpiter_t *it) {
   2132 	isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL);
   2133 	if (result == ISC_R_NOMORE) {
   2134 		result = dns_qpiter_prev(it, NULL, NULL, NULL);
   2135 	}
   2136 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
   2137 }
   2138 
   2139 static inline void
   2140 greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) {
   2141 	while (is_branch(n)) {
   2142 		dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) -
   2143 				  1;
   2144 		iter->stack[++iter->sp] = n;
   2145 		n = ref_ptr(qpr, ref);
   2146 	}
   2147 	iter->stack[++iter->sp] = n;
   2148 }
   2149 
   2150 static inline dns_qpnode_t *
   2151 anyleaf(dns_qpreader_t *qp, dns_qpnode_t *n) {
   2152 	while (is_branch(n)) {
   2153 		n = branch_twigs(qp, n);
   2154 	}
   2155 	return n;
   2156 }
   2157 
   2158 static inline int
   2159 twig_offset(dns_qpnode_t *n, dns_qpshift_t sbit, dns_qpshift_t kbit,
   2160 	    dns_qpshift_t fbit) {
   2161 	dns_qpweight_t pos = branch_twig_pos(n, sbit);
   2162 	if (branch_has_twig(n, sbit)) {
   2163 		return pos - (kbit < fbit);
   2164 	}
   2165 	return pos - 1;
   2166 }
   2167 
   2168 /*
   2169  * If dns_qp_lookup() was passed an iterator, we want it to point at the
   2170  * matching name in the case of an exact match, or at the predecessor name
   2171  * for a non-exact match.
   2172  *
   2173  * If there is an exact match, then there is nothing to be done. Otherwise,
   2174  * we pop up the iterator stack until we find a parent branch with an offset
   2175  * that is before the position where the search key differs from the found key.
   2176  * From there we can step to the leaf that is the predecessor of the searched
   2177  * name.
   2178  *
   2179  * Requires the iterator to be pointing at a leaf node.
   2180  */
   2181 static void
   2182 fix_iterator(dns_qpreader_t *qp, dns_qpiter_t *it, dns_qpkey_t key,
   2183 	     size_t len) {
   2184 	dns_qpnode_t *n = it->stack[it->sp];
   2185 
   2186 	REQUIRE(!is_branch(n));
   2187 
   2188 	dns_qpkey_t found;
   2189 	size_t foundlen = leaf_qpkey(qp, n, found);
   2190 	size_t to = qpkey_compare(key, len, found, foundlen);
   2191 
   2192 	/* If the keys are equal, the iterator is already at the right node. */
   2193 	if (to == QPKEY_EQUAL) {
   2194 		return;
   2195 	}
   2196 
   2197 	/*
   2198 	 * Special case: if the key differs even before the root
   2199 	 * key offset, it means the name desired either precedes or
   2200 	 * follows the entire range of names in the database, and
   2201 	 * popping up the stack won't help us, so just move the
   2202 	 * iterator one step back from the origin and return.
   2203 	 */
   2204 	if (to < branch_key_offset(it->stack[0])) {
   2205 		dns_qpiter_init(qp, it);
   2206 		prevleaf(it);
   2207 		return;
   2208 	}
   2209 
   2210 	/*
   2211 	 * As long as the branch offset point is after the point where the
   2212 	 * key differs, we need to branch up and find a better node.
   2213 	 */
   2214 	while (it->sp > 0) {
   2215 		dns_qpnode_t *b = it->stack[it->sp - 1];
   2216 		if (branch_key_offset(b) < to) {
   2217 			break;
   2218 		}
   2219 		it->sp--;
   2220 	}
   2221 	n = it->stack[it->sp];
   2222 
   2223 	/*
   2224 	 * Either we are now at the correct branch, or we are at the
   2225 	 * first unmatched node. Determine the bit position for the
   2226 	 * twig we need (sbit).
   2227 	 */
   2228 	dns_qpshift_t kbit = qpkey_bit(key, len, to);
   2229 	dns_qpshift_t fbit = qpkey_bit(found, foundlen, to);
   2230 	dns_qpshift_t sbit = 0;
   2231 
   2232 	if (is_branch(n) && branch_key_offset(n) == to) {
   2233 		/* We are on the correct branch now. */
   2234 		sbit = kbit;
   2235 	} else if (it->sp == 0) {
   2236 		/*
   2237 		 * We are on the root branch, popping up the stack won't
   2238 		 * help us, so just move the iterator one step back from the
   2239 		 * origin and return.
   2240 		 */
   2241 		dns_qpiter_init(qp, it);
   2242 		prevleaf(it);
   2243 		return;
   2244 	} else {
   2245 		/* We are at the first unmatched node, pop up the stack. */
   2246 		n = it->stack[--it->sp];
   2247 		sbit = qpkey_bit(key, len, branch_key_offset(n));
   2248 	}
   2249 
   2250 	INSIST(is_branch(n));
   2251 
   2252 	prefetch_twigs(qp, n);
   2253 	dns_qpnode_t *twigs = branch_twigs(qp, n);
   2254 	int toff = twig_offset(n, sbit, kbit, fbit);
   2255 	if (toff >= 0) {
   2256 		/*
   2257 		 * The name we want would've been after some twig in
   2258 		 * this branch. Walk down from that twig to the
   2259 		 * highest leaf in its subtree to get the predecessor.
   2260 		 */
   2261 		greatest_leaf(qp, twigs + toff, it);
   2262 	} else {
   2263 		/*
   2264 		 * Every leaf below this node is greater than the one we
   2265 		 * wanted, so the previous leaf is the predecessor.
   2266 		 */
   2267 		prevleaf(it);
   2268 	}
   2269 }
   2270 
   2271 /*
   2272  * When searching for a requested name in dns_qp_lookup(), we might add
   2273  * a leaf node to the chain, then subsequently determine that it was a
   2274  * dead end. When this happens, the chain can be left holding a node
   2275  * that is *not* an ancestor of the requested name. We correct for that
   2276  * here.
   2277  */
   2278 static void
   2279 fix_chain(dns_qpchain_t *chain, size_t offset) {
   2280 	while (chain->len > 0 && chain->chain[chain->len - 1].offset >= offset)
   2281 	{
   2282 		chain->len--;
   2283 		chain->chain[chain->len].node = NULL;
   2284 		chain->chain[chain->len].offset = 0;
   2285 	}
   2286 }
   2287 
   2288 isc_result_t
   2289 dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name,
   2290 	      dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain,
   2291 	      void **pval_r, uint32_t *ival_r) {
   2292 	dns_qpreader_t *qp = dns_qpreader(qpr);
   2293 	dns_qpkey_t search, found;
   2294 	size_t searchlen, foundlen;
   2295 	size_t offset = 0;
   2296 	dns_qpnode_t *n = NULL;
   2297 	dns_qpshift_t bit = SHIFT_NOBYTE;
   2298 	dns_qpchain_t oc;
   2299 	dns_qpiter_t it;
   2300 	bool matched = false;
   2301 	bool setiter = true;
   2302 
   2303 	REQUIRE(QP_VALID(qp));
   2304 	REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
   2305 
   2306 	searchlen = dns_qpkey_fromname(search, name);
   2307 
   2308 	if (chain == NULL) {
   2309 		chain = &oc;
   2310 	}
   2311 	if (iter == NULL) {
   2312 		iter = &it;
   2313 		setiter = false;
   2314 	}
   2315 	dns_qpchain_init(qp, chain);
   2316 	dns_qpiter_init(qp, iter);
   2317 
   2318 	n = get_root(qp);
   2319 	if (n == NULL) {
   2320 		return ISC_R_NOTFOUND;
   2321 	}
   2322 	iter->stack[0] = n;
   2323 
   2324 	/*
   2325 	 * Like `dns_qp_insert()`, we must find a leaf. However, we don't make a
   2326 	 * second pass: instead, we keep track of any leaves with shorter keys
   2327 	 * that we discover along the way. (In general, qp-trie searches can be
   2328 	 * one-pass, by recording their traversal, or two-pass, for less stack
   2329 	 * memory usage.)
   2330 	 */
   2331 	while (is_branch(n)) {
   2332 		prefetch_twigs(qp, n);
   2333 
   2334 		offset = branch_key_offset(n);
   2335 		bit = qpkey_bit(search, searchlen, offset);
   2336 		dns_qpnode_t *twigs = branch_twigs(qp, n);
   2337 
   2338 		/*
   2339 		 * A shorter key that can be a parent domain always has a
   2340 		 * leaf node at SHIFT_NOBYTE (indicating end of its key)
   2341 		 * where our search key has a normal character immediately
   2342 		 * after a label separator.
   2343 		 *
   2344 		 * Note 1: It is OK if `off - 1` underflows: it will
   2345 		 * become SIZE_MAX, which is greater than `searchlen`, so
   2346 		 * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we
   2347 		 * want when `off == 0`.
   2348 		 *
   2349 		 * Note 2: If SHIFT_NOBYTE twig is present, it will always
   2350 		 * be in position 0, the first location in 'twigs'.
   2351 		 */
   2352 		if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) &&
   2353 		    qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE &&
   2354 		    !is_branch(twigs))
   2355 		{
   2356 			add_link(chain, twigs, offset);
   2357 		}
   2358 
   2359 		matched = branch_has_twig(n, bit);
   2360 		if (matched) {
   2361 			/*
   2362 			 * found a match: if it's a branch, we keep
   2363 			 * searching, and if it's a leaf, we drop out of
   2364 			 * the loop.
   2365 			 */
   2366 			n = branch_twig_ptr(qp, n, bit);
   2367 		} else {
   2368 			/*
   2369 			 * this branch is a dead end, and the predecessor
   2370 			 * doesn't matter. now we just need to find a leaf
   2371 			 * to end on so that qpkey_leaf() will work below.
   2372 			 */
   2373 			n = anyleaf(qp, twigs);
   2374 		}
   2375 
   2376 		iter->stack[++iter->sp] = n;
   2377 	}
   2378 
   2379 	if (setiter) {
   2380 		/*
   2381 		 * we found a leaf, but it might not be the leaf we wanted.
   2382 		 * if it isn't, and if the caller passed us an iterator,
   2383 		 * then we might need to reposition it.
   2384 		 */
   2385 		fix_iterator(qp, iter, search, searchlen);
   2386 		n = iter->stack[iter->sp];
   2387 	}
   2388 
   2389 	/* at this point, n can only be a leaf node */
   2390 	INSIST(!is_branch(n));
   2391 
   2392 	foundlen = leaf_qpkey(qp, n, found);
   2393 	offset = qpkey_compare(search, searchlen, found, foundlen);
   2394 
   2395 	/* the search ended with an exact or partial match */
   2396 	if (offset == QPKEY_EQUAL || offset == foundlen) {
   2397 		isc_result_t result = ISC_R_SUCCESS;
   2398 
   2399 		if (offset == foundlen) {
   2400 			fix_chain(chain, offset);
   2401 			result = DNS_R_PARTIALMATCH;
   2402 		}
   2403 		add_link(chain, n, offset);
   2404 
   2405 		SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   2406 		SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   2407 		maybe_set_name(qp, n, foundname);
   2408 		return result;
   2409 	}
   2410 
   2411 	/*
   2412 	 * the requested name was not found, but if an ancestor
   2413 	 * was, we can retrieve that from the chain.
   2414 	 */
   2415 	int len = chain->len;
   2416 	while (len-- > 0) {
   2417 		if (offset >= chain->chain[len].offset) {
   2418 			n = chain->chain[len].node;
   2419 			SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   2420 			SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   2421 			maybe_set_name(qp, n, foundname);
   2422 			return DNS_R_PARTIALMATCH;
   2423 		} else {
   2424 			/*
   2425 			 * oops, during the search we found and added
   2426 			 * a leaf that's longer than the requested
   2427 			 * name; remove it from the chain.
   2428 			 */
   2429 			chain->len--;
   2430 		}
   2431 	}
   2432 
   2433 	/* nothing was found at all */
   2434 	return ISC_R_NOTFOUND;
   2435 }
   2436 
   2437 /**********************************************************************/
   2438