Home | History | Annotate | Line # | Download | only in dns
qp.c revision 1.3
      1  1.3  christos /*	$NetBSD: qp.c,v 1.3 2025/01/27 02:16:05 christos Exp $	*/
      2  1.1  christos 
      3  1.1  christos /*
      4  1.1  christos  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  1.1  christos  *
      6  1.1  christos  * SPDX-License-Identifier: MPL-2.0
      7  1.1  christos  *
      8  1.1  christos  * This Source Code Form is subject to the terms of the Mozilla Public
      9  1.1  christos  * License, v. 2.0. If a copy of the MPL was not distributed with this
     10  1.1  christos  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
     11  1.1  christos  *
     12  1.1  christos  * See the COPYRIGHT file distributed with this work for additional
     13  1.1  christos  * information regarding copyright ownership.
     14  1.1  christos  */
     15  1.1  christos 
     16  1.1  christos /*
     17  1.1  christos  * For an overview, see doc/design/qp-trie.md
     18  1.1  christos  */
     19  1.1  christos 
     20  1.1  christos #include <inttypes.h>
     21  1.1  christos #include <stdbool.h>
     22  1.1  christos #include <stddef.h>
     23  1.1  christos #include <stdint.h>
     24  1.1  christos #include <string.h>
     25  1.1  christos 
     26  1.1  christos #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
     27  1.1  christos #include <sys/mman.h>
     28  1.1  christos #include <unistd.h>
     29  1.1  christos #endif
     30  1.1  christos 
     31  1.1  christos #include <isc/atomic.h>
     32  1.1  christos #include <isc/buffer.h>
     33  1.1  christos #include <isc/magic.h>
     34  1.1  christos #include <isc/mem.h>
     35  1.1  christos #include <isc/mutex.h>
     36  1.1  christos #include <isc/refcount.h>
     37  1.1  christos #include <isc/result.h>
     38  1.1  christos #include <isc/rwlock.h>
     39  1.1  christos #include <isc/tid.h>
     40  1.1  christos #include <isc/time.h>
     41  1.1  christos #include <isc/types.h>
     42  1.1  christos #include <isc/urcu.h>
     43  1.1  christos #include <isc/util.h>
     44  1.1  christos 
     45  1.1  christos #include <dns/fixedname.h>
     46  1.1  christos #include <dns/log.h>
     47  1.1  christos #include <dns/name.h>
     48  1.1  christos #include <dns/qp.h>
     49  1.1  christos #include <dns/types.h>
     50  1.1  christos 
     51  1.1  christos #include "qp_p.h"
     52  1.1  christos 
     53  1.1  christos #ifndef DNS_QP_LOG_STATS
     54  1.1  christos #define DNS_QP_LOG_STATS 1
     55  1.1  christos #endif
     56  1.1  christos #ifndef DNS_QP_TRACE
     57  1.1  christos #define DNS_QP_TRACE 0
     58  1.1  christos #endif
     59  1.1  christos 
     60  1.1  christos /*
     61  1.1  christos  * very basic garbage collector statistics
     62  1.1  christos  *
     63  1.1  christos  * XXXFANF for now we're logging GC times, but ideally we should
     64  1.1  christos  * accumulate stats more quietly and report via the statschannel
     65  1.1  christos  */
     66  1.3  christos #ifdef _LP64
     67  1.1  christos static atomic_uint_fast64_t compact_time;
     68  1.1  christos static atomic_uint_fast64_t recycle_time;
     69  1.1  christos static atomic_uint_fast64_t rollback_time;
     70  1.3  christos #define ISC_QP_ADD(v, a) atomic_fetch_add_relaxed(&(v), (a))
     71  1.3  christos #define ISC_QP_GET(v) atomic_load_relaxed(v)
     72  1.3  christos #else
     73  1.3  christos static uint64_t compact_time;
     74  1.3  christos static uint64_t recycle_time;
     75  1.3  christos static uint64_t rollback_time;
     76  1.3  christos static isc_mutex_t qp_mutex = PTHREAD_MUTEX_INITIALIZER;
     77  1.3  christos #define ISC_QP_ADD(v, a) \
     78  1.3  christos 	({ \
     79  1.3  christos 		isc_mutex_lock(&qp_mutex); \
     80  1.3  christos 		uint64_t x = (v) + (a); \
     81  1.3  christos 		isc_mutex_unlock(&qp_mutex); \
     82  1.3  christos 		x; \
     83  1.3  christos 	})
     84  1.3  christos #define ISC_QP_GET(v) \
     85  1.3  christos 	({ \
     86  1.3  christos 		isc_mutex_lock(&qp_mutex); \
     87  1.3  christos 		uint64_t x = (v); \
     88  1.3  christos 		isc_mutex_unlock(&qp_mutex); \
     89  1.3  christos 		x; \
     90  1.3  christos 	})
     91  1.3  christos #endif
     92  1.3  christos 
     93  1.1  christos 
     94  1.1  christos /* for LOG_STATS() format strings */
     95  1.1  christos #define PRItime " %" PRIu64 " ns "
     96  1.1  christos 
     97  1.1  christos #if DNS_QP_LOG_STATS
     98  1.1  christos #define LOG_STATS(...)                                                      \
     99  1.1  christos 	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \
    100  1.1  christos 		      ISC_LOG_DEBUG(1), __VA_ARGS__)
    101  1.1  christos #else
    102  1.1  christos #define LOG_STATS(...)
    103  1.1  christos #endif
    104  1.1  christos 
    105  1.1  christos #if DNS_QP_TRACE
    106  1.1  christos /*
    107  1.1  christos  * TRACE is generally used in allocation-related functions so it doesn't
    108  1.1  christos  * trace very high-frequency ops
    109  1.1  christos  */
    110  1.1  christos #define TRACE(fmt, ...)                                                        \
    111  1.1  christos 	do {                                                                   \
    112  1.1  christos 		if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) {            \
    113  1.1  christos 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,      \
    114  1.1  christos 				      DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7),      \
    115  1.1  christos 				      "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \
    116  1.1  christos 				      __FILE__, __LINE__, __func__, qp,        \
    117  1.1  christos 				      qp ? TRIENAME(qp) : "(null)", isc_tid(), \
    118  1.1  christos 				      ##__VA_ARGS__);                          \
    119  1.1  christos 		}                                                              \
    120  1.1  christos 	} while (0)
    121  1.1  christos #else
    122  1.1  christos #define TRACE(...)
    123  1.1  christos #endif
    124  1.1  christos 
    125  1.1  christos /***********************************************************************
    126  1.1  christos  *
    127  1.1  christos  *  converting DNS names to trie keys
    128  1.1  christos  */
    129  1.1  christos 
    130  1.1  christos /*
    131  1.1  christos  * Number of distinct byte values, i.e. 256
    132  1.1  christos  */
    133  1.1  christos #define BYTE_VALUES (UINT8_MAX + 1)
    134  1.1  christos 
    135  1.1  christos /*
    136  1.1  christos  * Lookup table mapping bytes in DNS names to bit positions, used
    137  1.1  christos  * by dns_qpkey_fromname() to convert DNS names to qp-trie keys.
    138  1.1  christos  *
    139  1.1  christos  * Each element holds one or two bit positions, bit_one in the
    140  1.1  christos  * lower half and bit_two in the upper half.
    141  1.1  christos  *
    142  1.1  christos  * For common hostname characters, bit_two is zero (which cannot
    143  1.1  christos  * be a valid bit position).
    144  1.1  christos  *
    145  1.1  christos  * For others, bit_one is the escape bit, and bit_two is the
    146  1.1  christos  * position of the character within the escaped range.
    147  1.1  christos  */
    148  1.1  christos uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 };
    149  1.1  christos 
    150  1.1  christos /*
    151  1.1  christos  * And the reverse, mapping bit positions to characters, so the tests
    152  1.1  christos  * can print diagnostics involving qp-trie keys.
    153  1.1  christos  *
    154  1.1  christos  * This table only handles the first bit in an escape sequence; we
    155  1.1  christos  * arrange that we can calculate the byte value for both bits by
    156  1.1  christos  * adding the the second bit to the first bit's byte value.
    157  1.1  christos  */
    158  1.1  christos uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 };
    159  1.1  christos 
    160  1.1  christos /*
    161  1.1  christos  * Fill in the lookup tables at program startup. (It doesn't matter
    162  1.1  christos  * when this is initialized relative to other startup code.)
    163  1.1  christos  */
    164  1.1  christos static void
    165  1.1  christos initialize_bits_for_byte(void) ISC_CONSTRUCTOR;
    166  1.1  christos 
    167  1.1  christos /*
    168  1.1  christos  * The bit positions for bytes inside labels have to be between
    169  1.1  christos  * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.)
    170  1.1  christos  *
    171  1.1  christos  * Each byte range in between common hostname characters has a different
    172  1.1  christos  * escape character, to preserve the correct lexical order.
    173  1.1  christos  *
    174  1.1  christos  * Escaped byte ranges mostly fit into the space available in the
    175  1.1  christos  * bitmap, except for those above 'z' (which is mostly bytes with the
    176  1.1  christos  * top bit set). So, when we reach the end of the bitmap we roll over
    177  1.1  christos  * to the next escape character.
    178  1.1  christos  *
    179  1.1  christos  * After filling the table we ensure that the bit positions for
    180  1.1  christos  * hostname characters and escape characters all fit.
    181  1.1  christos  */
    182  1.1  christos static void
    183  1.1  christos initialize_bits_for_byte(void) {
    184  1.1  christos 	/* zero common character marker not a valid shift position */
    185  1.1  christos 	INSIST(0 < SHIFT_BITMAP);
    186  1.1  christos 	/* first bit is common byte or escape byte */
    187  1.1  christos 	dns_qpshift_t bit_one = SHIFT_BITMAP;
    188  1.1  christos 	/* second bit is position in escaped range */
    189  1.1  christos 	dns_qpshift_t bit_two = SHIFT_BITMAP;
    190  1.1  christos 	bool escaping = true;
    191  1.1  christos 
    192  1.1  christos 	for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) {
    193  1.1  christos 		if (qp_common_character(byte)) {
    194  1.1  christos 			escaping = false;
    195  1.1  christos 			bit_one++;
    196  1.1  christos 			dns_qp_byte_for_bit[bit_one] = byte;
    197  1.1  christos 			dns_qp_bits_for_byte[byte] = bit_one;
    198  1.1  christos 		} else if ('A' <= byte && byte <= 'Z') {
    199  1.1  christos 			/* map upper case to lower case */
    200  1.1  christos 			dns_qpshift_t after_esc = bit_one + 1;
    201  1.1  christos 			dns_qpshift_t skip_punct = 'a' - '_';
    202  1.1  christos 			dns_qpshift_t letter = byte - 'A';
    203  1.1  christos 			dns_qpshift_t bit = after_esc + skip_punct + letter;
    204  1.1  christos 			dns_qp_bits_for_byte[byte] = bit;
    205  1.1  christos 			/* to simplify reverse conversion */
    206  1.1  christos 			bit_two++;
    207  1.1  christos 		} else {
    208  1.1  christos 			/* non-hostname characters need to be escaped */
    209  1.1  christos 			if (!escaping || bit_two >= SHIFT_OFFSET) {
    210  1.1  christos 				escaping = true;
    211  1.1  christos 				bit_one++;
    212  1.1  christos 				dns_qp_byte_for_bit[bit_one] = byte;
    213  1.1  christos 				bit_two = SHIFT_BITMAP;
    214  1.1  christos 			}
    215  1.1  christos 			dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one;
    216  1.1  christos 			bit_two++;
    217  1.1  christos 		}
    218  1.1  christos 	}
    219  1.1  christos 	ENSURE(bit_one < SHIFT_OFFSET);
    220  1.1  christos }
    221  1.1  christos 
    222  1.1  christos /*
    223  1.1  christos  * Convert a DNS name into a trie lookup key.
    224  1.1  christos  *
    225  1.1  christos  * Returns the length of the key.
    226  1.1  christos  *
    227  1.1  christos  * For performance we get our hands dirty in the guts of the name.
    228  1.1  christos  *
    229  1.1  christos  * We don't worry about the distinction between absolute and relative
    230  1.1  christos  * names. When the trie is only used with absolute names, the first byte
    231  1.1  christos  * of the key will always be SHIFT_NOBYTE and it will always be skipped
    232  1.1  christos  * when traversing the trie. So keeping the root label costs little, and
    233  1.1  christos  * it allows us to support tries of relative names too. In fact absolute
    234  1.1  christos  * and relative names can be mixed in the same trie without causing
    235  1.1  christos  * confusion, because the presence or absence of the initial
    236  1.1  christos  * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing
    237  1.1  christos  * dot in a zone file).
    238  1.1  christos  */
    239  1.1  christos size_t
    240  1.1  christos dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) {
    241  1.1  christos 	size_t len, label;
    242  1.1  christos 	dns_fixedname_t fixed;
    243  1.1  christos 
    244  1.1  christos 	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
    245  1.1  christos 
    246  1.1  christos 	if (name->labels == 0) {
    247  1.1  christos 		key[0] = SHIFT_NOBYTE;
    248  1.1  christos 		return 0;
    249  1.1  christos 	}
    250  1.1  christos 
    251  1.1  christos 	if (name->offsets == NULL) {
    252  1.1  christos 		dns_name_t *clone = dns_fixedname_initname(&fixed);
    253  1.1  christos 		dns_name_clone(name, clone);
    254  1.1  christos 		name = clone;
    255  1.1  christos 	}
    256  1.1  christos 
    257  1.1  christos 	len = 0;
    258  1.1  christos 	label = name->labels;
    259  1.1  christos 	while (label-- > 0) {
    260  1.1  christos 		const uint8_t *ldata = name->ndata + name->offsets[label];
    261  1.1  christos 		size_t label_len = *ldata++;
    262  1.1  christos 		while (label_len-- > 0) {
    263  1.1  christos 			uint16_t bits = dns_qp_bits_for_byte[*ldata++];
    264  1.1  christos 			key[len++] = bits & 0xFF;	/* bit_one */
    265  1.1  christos 			if ((bits >> 8) != 0) {		/* escape? */
    266  1.1  christos 				key[len++] = bits >> 8; /* bit_two */
    267  1.1  christos 			}
    268  1.1  christos 		}
    269  1.1  christos 		/* label terminator */
    270  1.1  christos 		key[len++] = SHIFT_NOBYTE;
    271  1.1  christos 	}
    272  1.1  christos 	/* mark end with a double NOBYTE */
    273  1.1  christos 	key[len] = SHIFT_NOBYTE;
    274  1.1  christos 	ENSURE(len < sizeof(dns_qpkey_t));
    275  1.1  christos 	return len;
    276  1.1  christos }
    277  1.1  christos 
    278  1.1  christos void
    279  1.1  christos dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) {
    280  1.1  christos 	size_t locs[DNS_NAME_MAXLABELS];
    281  1.1  christos 	size_t loc = 0, opos = 0;
    282  1.1  christos 	size_t offset;
    283  1.1  christos 
    284  1.1  christos 	REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
    285  1.1  christos 	REQUIRE(name->buffer != NULL);
    286  1.1  christos 	REQUIRE(name->offsets != NULL);
    287  1.1  christos 
    288  1.1  christos 	dns_name_reset(name);
    289  1.1  christos 
    290  1.1  christos 	if (keylen == 0) {
    291  1.1  christos 		return;
    292  1.1  christos 	}
    293  1.1  christos 
    294  1.1  christos 	/* Scan the key looking for label boundaries */
    295  1.1  christos 	for (offset = 0; offset <= keylen; offset++) {
    296  1.1  christos 		INSIST(key[offset] >= SHIFT_NOBYTE &&
    297  1.1  christos 		       key[offset] < SHIFT_OFFSET);
    298  1.1  christos 		INSIST(loc < DNS_NAME_MAXLABELS);
    299  1.1  christos 		if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) {
    300  1.1  christos 			if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE)
    301  1.1  christos 			{
    302  1.1  christos 				locs[loc] = offset + 1;
    303  1.1  christos 				goto scanned;
    304  1.1  christos 			}
    305  1.1  christos 			locs[loc++] = offset + 1;
    306  1.1  christos 		} else if (offset == 0) {
    307  1.1  christos 			/* This happens for a relative name */
    308  1.1  christos 			locs[loc++] = offset;
    309  1.1  christos 		}
    310  1.1  christos 	}
    311  1.1  christos 	UNREACHABLE();
    312  1.1  christos scanned:
    313  1.1  christos 
    314  1.1  christos 	/*
    315  1.1  christos 	 * In the key the labels are encoded in reverse order, so
    316  1.1  christos 	 * we step backward through the label boundaries, then forward
    317  1.1  christos 	 * through the labels, to create the DNS wire format data.
    318  1.1  christos 	 */
    319  1.1  christos 	name->labels = loc;
    320  1.1  christos 	while (loc-- > 0) {
    321  1.1  christos 		uint8_t len = 0, *lenp = NULL;
    322  1.1  christos 
    323  1.1  christos 		/* Add a length byte to the name data and set an offset */
    324  1.1  christos 		lenp = isc_buffer_used(name->buffer);
    325  1.1  christos 		isc_buffer_putuint8(name->buffer, 0);
    326  1.1  christos 		name->offsets[opos++] = name->length++;
    327  1.1  christos 
    328  1.1  christos 		/* Convert from escaped byte ranges to ASCII */
    329  1.1  christos 		for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) {
    330  1.1  christos 			uint8_t bit = qpkey_bit(key, keylen, offset);
    331  1.1  christos 			uint8_t byte = dns_qp_byte_for_bit[bit];
    332  1.1  christos 			if (qp_common_character(byte)) {
    333  1.1  christos 				isc_buffer_putuint8(name->buffer, byte);
    334  1.1  christos 			} else {
    335  1.1  christos 				byte += key[++offset] - SHIFT_BITMAP;
    336  1.1  christos 				isc_buffer_putuint8(name->buffer, byte);
    337  1.1  christos 			}
    338  1.1  christos 			len++;
    339  1.1  christos 		}
    340  1.1  christos 
    341  1.1  christos 		name->length += len;
    342  1.1  christos 		*lenp = len;
    343  1.1  christos 	}
    344  1.1  christos 
    345  1.1  christos 	/* Add a root label for absolute names */
    346  1.1  christos 	if (key[0] == SHIFT_NOBYTE) {
    347  1.1  christos 		name->attributes.absolute = true;
    348  1.1  christos 		isc_buffer_putuint8(name->buffer, 0);
    349  1.1  christos 		name->offsets[opos++] = name->length++;
    350  1.1  christos 		name->labels++;
    351  1.1  christos 	}
    352  1.1  christos 
    353  1.1  christos 	name->ndata = isc_buffer_base(name->buffer);
    354  1.1  christos }
    355  1.1  christos 
    356  1.1  christos /*
    357  1.1  christos  * Sentinel value for equal keys
    358  1.1  christos  */
    359  1.1  christos #define QPKEY_EQUAL (~(size_t)0)
    360  1.1  christos 
    361  1.1  christos /*
    362  1.1  christos  * Compare two keys and return the offset where they differ.
    363  1.1  christos  *
    364  1.1  christos  * This offset is used to work out where a trie search diverged: when one
    365  1.1  christos  * of the keys is in the trie and one is not, the common prefix (up to the
    366  1.1  christos  * offset) is the part of the unknown key that exists in the trie. This
    367  1.1  christos  * matters for adding new keys or finding neighbours of missing keys.
    368  1.1  christos  *
    369  1.1  christos  * When the keys are different lengths it is possible (but unwise) for
    370  1.1  christos  * the longer key to be the same as the shorter key but with superfluous
    371  1.1  christos  * trailing SHIFT_NOBYTE elements. This makes the keys equal for the
    372  1.1  christos  * purpose of traversing the trie.
    373  1.1  christos  */
    374  1.1  christos static size_t
    375  1.1  christos qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a,
    376  1.1  christos 	      const dns_qpkey_t key_b, const size_t keylen_b) {
    377  1.1  christos 	size_t keylen = ISC_MAX(keylen_a, keylen_b);
    378  1.1  christos 	for (size_t offset = 0; offset < keylen; offset++) {
    379  1.1  christos 		if (qpkey_bit(key_a, keylen_a, offset) !=
    380  1.1  christos 		    qpkey_bit(key_b, keylen_b, offset))
    381  1.1  christos 		{
    382  1.1  christos 			return offset;
    383  1.1  christos 		}
    384  1.1  christos 	}
    385  1.1  christos 	return QPKEY_EQUAL;
    386  1.1  christos }
    387  1.1  christos 
    388  1.1  christos /***********************************************************************
    389  1.1  christos  *
    390  1.1  christos  *  allocator wrappers
    391  1.1  christos  */
    392  1.1  christos 
    393  1.1  christos #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    394  1.1  christos 
    395  1.1  christos /*
    396  1.1  christos  * Optionally (for debugging) during a copy-on-write transaction, use
    397  1.1  christos  * memory protection to ensure that the shared chunks are not modified.
    398  1.1  christos  * Once a chunk becomes shared, it remains read-only until it is freed.
    399  1.1  christos  * POSIX says we have to use mmap() to get an allocation that we can
    400  1.1  christos  * definitely pass to mprotect().
    401  1.1  christos  */
    402  1.1  christos 
    403  1.1  christos static size_t
    404  1.1  christos chunk_size_raw(void) {
    405  1.1  christos 	size_t size = (size_t)sysconf(_SC_PAGE_SIZE);
    406  1.1  christos 	return ISC_MAX(size, QP_CHUNK_BYTES);
    407  1.1  christos }
    408  1.1  christos 
    409  1.1  christos static void *
    410  1.1  christos chunk_get_raw(dns_qp_t *qp) {
    411  1.1  christos 	if (qp->write_protect) {
    412  1.1  christos 		size_t size = chunk_size_raw();
    413  1.1  christos 		void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
    414  1.1  christos 				 MAP_ANON | MAP_PRIVATE, -1, 0);
    415  1.1  christos 		RUNTIME_CHECK(ptr != MAP_FAILED);
    416  1.1  christos 		return ptr;
    417  1.1  christos 	} else {
    418  1.1  christos 		return isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES);
    419  1.1  christos 	}
    420  1.1  christos }
    421  1.1  christos 
    422  1.1  christos static void
    423  1.1  christos chunk_free_raw(dns_qp_t *qp, void *ptr) {
    424  1.1  christos 	if (qp->write_protect) {
    425  1.1  christos 		RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0);
    426  1.1  christos 	} else {
    427  1.1  christos 		isc_mem_free(qp->mctx, ptr);
    428  1.1  christos 	}
    429  1.1  christos }
    430  1.1  christos 
    431  1.1  christos static void *
    432  1.1  christos chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) {
    433  1.1  christos 	if (qp->write_protect) {
    434  1.1  christos 		return ptr;
    435  1.1  christos 	} else {
    436  1.1  christos 		return isc_mem_reallocate(qp->mctx, ptr, bytes);
    437  1.1  christos 	}
    438  1.1  christos }
    439  1.1  christos 
    440  1.1  christos static void
    441  1.1  christos write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) {
    442  1.1  christos 	if (qp->write_protect) {
    443  1.1  christos 		/* see transaction_open() wrt this special case */
    444  1.1  christos 		if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) {
    445  1.1  christos 			return;
    446  1.1  christos 		}
    447  1.1  christos 		TRACE("chunk %u", chunk);
    448  1.1  christos 		void *ptr = qp->base->ptr[chunk];
    449  1.1  christos 		size_t size = chunk_size_raw();
    450  1.1  christos 		RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0);
    451  1.1  christos 	}
    452  1.1  christos }
    453  1.1  christos 
    454  1.1  christos #else
    455  1.1  christos 
    456  1.1  christos #define chunk_get_raw(qp)	isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES)
    457  1.1  christos #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr)
    458  1.1  christos 
    459  1.1  christos #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size)
    460  1.1  christos 
    461  1.1  christos #define write_protect(qp, chunk)
    462  1.1  christos 
    463  1.1  christos #endif
    464  1.1  christos 
    465  1.1  christos /***********************************************************************
    466  1.1  christos  *
    467  1.1  christos  *  allocator
    468  1.1  christos  */
    469  1.1  christos 
    470  1.1  christos /*
    471  1.1  christos  * When we reuse the bump chunk across multiple write transactions,
    472  1.1  christos  * it can have an immutable prefix and a mutable suffix.
    473  1.1  christos  */
    474  1.1  christos static inline bool
    475  1.1  christos cells_immutable(dns_qp_t *qp, dns_qpref_t ref) {
    476  1.1  christos 	dns_qpchunk_t chunk = ref_chunk(ref);
    477  1.1  christos 	dns_qpcell_t cell = ref_cell(ref);
    478  1.1  christos 	if (chunk == qp->bump) {
    479  1.1  christos 		return cell < qp->fender;
    480  1.1  christos 	} else {
    481  1.1  christos 		return qp->usage[chunk].immutable;
    482  1.1  christos 	}
    483  1.1  christos }
    484  1.1  christos 
    485  1.1  christos /*
    486  1.1  christos  * Create a fresh bump chunk and allocate some twigs from it.
    487  1.1  christos  */
    488  1.1  christos static dns_qpref_t
    489  1.1  christos chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) {
    490  1.1  christos 	INSIST(qp->base->ptr[chunk] == NULL);
    491  1.1  christos 	INSIST(qp->usage[chunk].used == 0);
    492  1.1  christos 	INSIST(qp->usage[chunk].free == 0);
    493  1.1  christos 
    494  1.1  christos 	qp->base->ptr[chunk] = chunk_get_raw(qp);
    495  1.1  christos 	qp->usage[chunk] = (qp_usage_t){ .exists = true, .used = size };
    496  1.1  christos 	qp->used_count += size;
    497  1.1  christos 	qp->bump = chunk;
    498  1.1  christos 	qp->fender = 0;
    499  1.1  christos 
    500  1.1  christos 	if (qp->write_protect) {
    501  1.1  christos 		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
    502  1.1  christos 	}
    503  1.1  christos 	return make_ref(chunk, 0);
    504  1.1  christos }
    505  1.1  christos 
    506  1.1  christos /*
    507  1.1  christos  * This is used to grow the chunk arrays when they fill up. If the old
    508  1.1  christos  * base array is in use by readers, we must make a clone, otherwise we
    509  1.1  christos  * can reallocate in place.
    510  1.1  christos  *
    511  1.1  christos  * The isc_refcount_init() and qpbase_unref() in this function are a pair.
    512  1.1  christos  */
    513  1.1  christos static void
    514  1.1  christos realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) {
    515  1.1  christos 	size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max;
    516  1.1  christos 	size_t newptrs = sizeof(qp->base->ptr[0]) * newmax;
    517  1.1  christos 	size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax);
    518  1.1  christos 
    519  1.1  christos 	if (qp->base == NULL || qpbase_unref(qp)) {
    520  1.1  christos 		qp->base = isc_mem_reallocate(qp->mctx, qp->base, size);
    521  1.1  christos 	} else {
    522  1.1  christos 		dns_qpbase_t *oldbase = qp->base;
    523  1.1  christos 		qp->base = isc_mem_allocate(qp->mctx, size);
    524  1.1  christos 		memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs);
    525  1.1  christos 	}
    526  1.1  christos 	memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs);
    527  1.1  christos 	isc_refcount_init(&qp->base->refcount, 1);
    528  1.1  christos 	qp->base->magic = QPBASE_MAGIC;
    529  1.1  christos 
    530  1.1  christos 	/* usage array is exclusive to the writer */
    531  1.1  christos 	size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max;
    532  1.1  christos 	size_t newusage = sizeof(qp->usage[0]) * newmax;
    533  1.1  christos 	qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage);
    534  1.1  christos 	memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage);
    535  1.1  christos 
    536  1.1  christos 	qp->chunk_max = newmax;
    537  1.1  christos 
    538  1.1  christos 	TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max);
    539  1.1  christos }
    540  1.1  christos 
    541  1.1  christos /*
    542  1.1  christos  * There was no space in the bump chunk, so find a place to put a fresh
    543  1.1  christos  * chunk in the chunk arrays, then allocate some twigs from it.
    544  1.1  christos  */
    545  1.1  christos static dns_qpref_t
    546  1.1  christos alloc_slow(dns_qp_t *qp, dns_qpweight_t size) {
    547  1.1  christos 	dns_qpchunk_t chunk;
    548  1.1  christos 
    549  1.1  christos 	for (chunk = 0; chunk < qp->chunk_max; chunk++) {
    550  1.1  christos 		if (!qp->usage[chunk].exists) {
    551  1.1  christos 			return chunk_alloc(qp, chunk, size);
    552  1.1  christos 		}
    553  1.1  christos 	}
    554  1.1  christos 	ENSURE(chunk == qp->chunk_max);
    555  1.1  christos 	realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk));
    556  1.1  christos 	return chunk_alloc(qp, chunk, size);
    557  1.1  christos }
    558  1.1  christos 
    559  1.1  christos /*
    560  1.1  christos  * Ensure we are using a fresh bump chunk.
    561  1.1  christos  */
    562  1.1  christos static void
    563  1.1  christos alloc_reset(dns_qp_t *qp) {
    564  1.1  christos 	(void)alloc_slow(qp, 0);
    565  1.1  christos }
    566  1.1  christos 
    567  1.1  christos /*
    568  1.1  christos  * Allocate some fresh twigs. This is the bump allocator fast path.
    569  1.1  christos  */
    570  1.1  christos static inline dns_qpref_t
    571  1.1  christos alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) {
    572  1.1  christos 	dns_qpchunk_t chunk = qp->bump;
    573  1.1  christos 	dns_qpcell_t cell = qp->usage[chunk].used;
    574  1.1  christos 
    575  1.1  christos 	if (cell + size <= QP_CHUNK_SIZE) {
    576  1.1  christos 		qp->usage[chunk].used += size;
    577  1.1  christos 		qp->used_count += size;
    578  1.1  christos 		return make_ref(chunk, cell);
    579  1.1  christos 	} else {
    580  1.1  christos 		return alloc_slow(qp, size);
    581  1.1  christos 	}
    582  1.1  christos }
    583  1.1  christos 
    584  1.1  christos /*
    585  1.1  christos  * Record that some twigs are no longer being used, and if possible
    586  1.1  christos  * zero them to ensure that there isn't a spurious double detach when
    587  1.1  christos  * the chunk is later recycled.
    588  1.1  christos  *
    589  1.1  christos  * Returns true if the twigs were immediately destroyed.
    590  1.1  christos  *
    591  1.1  christos  * NOTE: the caller is responsible for attaching or detaching any
    592  1.1  christos  * leaves as required.
    593  1.1  christos  */
    594  1.1  christos static inline bool
    595  1.1  christos free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
    596  1.1  christos 	dns_qpchunk_t chunk = ref_chunk(twigs);
    597  1.1  christos 
    598  1.1  christos 	qp->free_count += size;
    599  1.1  christos 	qp->usage[chunk].free += size;
    600  1.1  christos 	ENSURE(qp->free_count <= qp->used_count);
    601  1.1  christos 	ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used);
    602  1.1  christos 
    603  1.1  christos 	if (cells_immutable(qp, twigs)) {
    604  1.1  christos 		qp->hold_count += size;
    605  1.1  christos 		ENSURE(qp->free_count >= qp->hold_count);
    606  1.1  christos 		return false;
    607  1.1  christos 	} else {
    608  1.1  christos 		zero_twigs(ref_ptr(qp, twigs), size);
    609  1.1  christos 		return true;
    610  1.1  christos 	}
    611  1.1  christos }
    612  1.1  christos 
    613  1.1  christos /*
    614  1.1  christos  * When some twigs have been copied, and free_twigs() could not
    615  1.1  christos  * immediately destroy the old copy, we need to update the refcount
    616  1.1  christos  * on any leaves that were duplicated.
    617  1.1  christos  */
    618  1.1  christos static void
    619  1.1  christos attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) {
    620  1.1  christos 	for (dns_qpweight_t pos = 0; pos < size; pos++) {
    621  1.1  christos 		if (node_tag(&twigs[pos]) == LEAF_TAG) {
    622  1.1  christos 			attach_leaf(qp, &twigs[pos]);
    623  1.1  christos 		}
    624  1.1  christos 	}
    625  1.1  christos }
    626  1.1  christos 
    627  1.1  christos /***********************************************************************
    628  1.1  christos  *
    629  1.1  christos  *  chunk reclamation
    630  1.1  christos  */
    631  1.1  christos 
    632  1.1  christos /*
    633  1.1  christos  * Is any of this chunk still in use?
    634  1.1  christos  */
    635  1.1  christos static inline dns_qpcell_t
    636  1.1  christos chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) {
    637  1.1  christos 	return qp->usage[chunk].used - qp->usage[chunk].free;
    638  1.1  christos }
    639  1.1  christos 
    640  1.1  christos /*
    641  1.1  christos  * We remove each empty chunk from the total counts when the chunk is
    642  1.1  christos  * freed, or when it is scheduled for safe memory reclamation. We check
    643  1.1  christos  * the chunk's phase to avoid discounting it twice in the latter case.
    644  1.1  christos  */
    645  1.1  christos static void
    646  1.1  christos chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) {
    647  1.1  christos 	if (qp->usage[chunk].discounted) {
    648  1.1  christos 		return;
    649  1.1  christos 	}
    650  1.1  christos 	INSIST(qp->used_count >= qp->usage[chunk].used);
    651  1.1  christos 	INSIST(qp->free_count >= qp->usage[chunk].free);
    652  1.1  christos 	qp->used_count -= qp->usage[chunk].used;
    653  1.1  christos 	qp->free_count -= qp->usage[chunk].free;
    654  1.1  christos 	qp->usage[chunk].discounted = true;
    655  1.1  christos }
    656  1.1  christos 
    657  1.1  christos /*
    658  1.1  christos  * When a chunk is being recycled, we need to detach any leaves that
    659  1.1  christos  * remain, and free any `base` arrays that have been marked as unused.
    660  1.1  christos  */
    661  1.1  christos static void
    662  1.1  christos chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) {
    663  1.1  christos 	if (qp->write_protect) {
    664  1.1  christos 		TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]);
    665  1.1  christos 	}
    666  1.1  christos 
    667  1.1  christos 	dns_qpnode_t *n = qp->base->ptr[chunk];
    668  1.1  christos 	for (dns_qpcell_t count = qp->usage[chunk].used; count > 0;
    669  1.1  christos 	     count--, n++)
    670  1.1  christos 	{
    671  1.1  christos 		if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) {
    672  1.1  christos 			detach_leaf(qp, n);
    673  1.1  christos 		} else if (count > 1 && reader_valid(n)) {
    674  1.1  christos 			dns_qpreader_t qpr;
    675  1.1  christos 			unpack_reader(&qpr, n);
    676  1.1  christos 			/* pairs with dns_qpmulti_commit() */
    677  1.1  christos 			if (qpbase_unref(&qpr)) {
    678  1.1  christos 				isc_mem_free(qp->mctx, qpr.base);
    679  1.1  christos 			}
    680  1.1  christos 		}
    681  1.1  christos 	}
    682  1.1  christos 	chunk_discount(qp, chunk);
    683  1.1  christos 	chunk_free_raw(qp, qp->base->ptr[chunk]);
    684  1.1  christos 	qp->base->ptr[chunk] = NULL;
    685  1.1  christos 	qp->usage[chunk] = (qp_usage_t){};
    686  1.1  christos }
    687  1.1  christos 
    688  1.1  christos /*
    689  1.1  christos  * Free any chunks that we can while a trie is in use.
    690  1.1  christos  */
    691  1.1  christos static void
    692  1.1  christos recycle(dns_qp_t *qp) {
    693  1.1  christos 	unsigned int free = 0;
    694  1.1  christos 
    695  1.1  christos 	isc_nanosecs_t start = isc_time_monotonic();
    696  1.1  christos 
    697  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
    698  1.1  christos 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
    699  1.1  christos 		    qp->usage[chunk].exists && !qp->usage[chunk].immutable)
    700  1.1  christos 		{
    701  1.1  christos 			chunk_free(qp, chunk);
    702  1.1  christos 			free++;
    703  1.1  christos 		}
    704  1.1  christos 	}
    705  1.1  christos 
    706  1.1  christos 	isc_nanosecs_t time = isc_time_monotonic() - start;
    707  1.3  christos 	ISC_QP_ADD(recycle_time, time);
    708  1.1  christos 
    709  1.1  christos 	if (free > 0) {
    710  1.1  christos 		LOG_STATS("qp recycle" PRItime "free %u chunks", time, free);
    711  1.1  christos 		LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u",
    712  1.1  christos 			  qp->leaf_count, qp->used_count - qp->free_count,
    713  1.1  christos 			  qp->used_count, qp->free_count, qp->hold_count);
    714  1.1  christos 	}
    715  1.1  christos }
    716  1.1  christos 
    717  1.1  christos /*
    718  1.1  christos  * asynchronous cleanup
    719  1.1  christos  */
    720  1.1  christos static void
    721  1.1  christos reclaim_chunks_cb(struct rcu_head *arg) {
    722  1.1  christos 	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
    723  1.1  christos 	REQUIRE(QPRCU_VALID(rcuctx));
    724  1.1  christos 	dns_qpmulti_t *multi = rcuctx->multi;
    725  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
    726  1.1  christos 
    727  1.1  christos 	LOCK(&multi->mutex);
    728  1.1  christos 
    729  1.1  christos 	dns_qp_t *qp = &multi->writer;
    730  1.1  christos 	REQUIRE(QP_VALID(qp));
    731  1.1  christos 
    732  1.1  christos 	unsigned int free = 0;
    733  1.1  christos 	isc_nanosecs_t start = isc_time_monotonic();
    734  1.1  christos 
    735  1.1  christos 	for (unsigned int i = 0; i < rcuctx->count; i++) {
    736  1.1  christos 		dns_qpchunk_t chunk = rcuctx->chunk[i];
    737  1.1  christos 		if (qp->usage[chunk].snapshot) {
    738  1.1  christos 			/* cleanup when snapshot is destroyed */
    739  1.1  christos 			qp->usage[chunk].snapfree = true;
    740  1.1  christos 		} else {
    741  1.1  christos 			chunk_free(qp, chunk);
    742  1.1  christos 			free++;
    743  1.1  christos 		}
    744  1.1  christos 	}
    745  1.1  christos 
    746  1.1  christos 	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
    747  1.1  christos 			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
    748  1.1  christos 
    749  1.1  christos 	isc_nanosecs_t time = isc_time_monotonic() - start;
    750  1.3  christos 	ISC_QP_ADD(recycle_time, time);
    751  1.1  christos 
    752  1.1  christos 	if (free > 0) {
    753  1.1  christos 		LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free);
    754  1.1  christos 		LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u",
    755  1.1  christos 			  qp->leaf_count, qp->used_count - qp->free_count,
    756  1.1  christos 			  qp->used_count, qp->free_count, qp->hold_count);
    757  1.1  christos 	}
    758  1.1  christos 
    759  1.1  christos 	UNLOCK(&multi->mutex);
    760  1.1  christos }
    761  1.1  christos 
    762  1.1  christos /*
    763  1.1  christos  * At the end of a transaction, schedule empty but immutable chunks
    764  1.1  christos  * for reclamation later.
    765  1.1  christos  */
    766  1.1  christos static void
    767  1.1  christos reclaim_chunks(dns_qpmulti_t *multi) {
    768  1.1  christos 	dns_qp_t *qp = &multi->writer;
    769  1.1  christos 
    770  1.1  christos 	unsigned int count = 0;
    771  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
    772  1.1  christos 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
    773  1.1  christos 		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
    774  1.1  christos 		    !qp->usage[chunk].discounted)
    775  1.1  christos 		{
    776  1.1  christos 			count++;
    777  1.1  christos 		}
    778  1.1  christos 	}
    779  1.1  christos 
    780  1.1  christos 	if (count == 0) {
    781  1.1  christos 		return;
    782  1.1  christos 	}
    783  1.1  christos 
    784  1.1  christos 	qp_rcuctx_t *rcuctx =
    785  1.1  christos 		isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count));
    786  1.1  christos 	*rcuctx = (qp_rcuctx_t){
    787  1.1  christos 		.magic = QPRCU_MAGIC,
    788  1.1  christos 		.multi = multi,
    789  1.1  christos 		.count = count,
    790  1.1  christos 	};
    791  1.1  christos 	isc_mem_attach(qp->mctx, &rcuctx->mctx);
    792  1.1  christos 
    793  1.1  christos 	unsigned int i = 0;
    794  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
    795  1.1  christos 		if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
    796  1.1  christos 		    qp->usage[chunk].exists && qp->usage[chunk].immutable &&
    797  1.1  christos 		    !qp->usage[chunk].discounted)
    798  1.1  christos 		{
    799  1.1  christos 			rcuctx->chunk[i++] = chunk;
    800  1.1  christos 			chunk_discount(qp, chunk);
    801  1.1  christos 		}
    802  1.1  christos 	}
    803  1.1  christos 
    804  1.1  christos 	call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb);
    805  1.1  christos 
    806  1.1  christos 	LOG_STATS("qp will reclaim %u chunks", count);
    807  1.1  christos }
    808  1.1  christos 
    809  1.1  christos /*
    810  1.1  christos  * When a snapshot is destroyed, clean up chunks that need free()ing
    811  1.1  christos  * and are not used by any remaining snapshots.
    812  1.1  christos  */
    813  1.1  christos static void
    814  1.1  christos marksweep_chunks(dns_qpmulti_t *multi) {
    815  1.1  christos 	unsigned int free = 0;
    816  1.1  christos 
    817  1.1  christos 	isc_nanosecs_t start = isc_time_monotonic();
    818  1.1  christos 
    819  1.1  christos 	dns_qp_t *qpw = &multi->writer;
    820  1.1  christos 
    821  1.1  christos 	for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL;
    822  1.1  christos 	     qps = ISC_LIST_NEXT(qps, link))
    823  1.1  christos 	{
    824  1.1  christos 		for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) {
    825  1.1  christos 			if (qps->base->ptr[chunk] != NULL) {
    826  1.1  christos 				INSIST(qps->base->ptr[chunk] ==
    827  1.1  christos 				       qpw->base->ptr[chunk]);
    828  1.1  christos 				qpw->usage[chunk].snapmark = true;
    829  1.1  christos 			}
    830  1.1  christos 		}
    831  1.1  christos 	}
    832  1.1  christos 
    833  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
    834  1.1  christos 		qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark;
    835  1.1  christos 		qpw->usage[chunk].snapmark = false;
    836  1.1  christos 		if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) {
    837  1.1  christos 			chunk_free(qpw, chunk);
    838  1.1  christos 			free++;
    839  1.1  christos 		}
    840  1.1  christos 	}
    841  1.1  christos 
    842  1.1  christos 	isc_nanosecs_t time = isc_time_monotonic() - start;
    843  1.3  christos 	ISC_QP_ADD(recycle_time, time);
    844  1.1  christos 
    845  1.1  christos 	if (free > 0) {
    846  1.1  christos 		LOG_STATS("qp marksweep" PRItime "free %u chunks", time, free);
    847  1.1  christos 		LOG_STATS(
    848  1.1  christos 			"qp marksweep leaf %u live %u used %u free %u hold %u",
    849  1.1  christos 			qpw->leaf_count, qpw->used_count - qpw->free_count,
    850  1.1  christos 			qpw->used_count, qpw->free_count, qpw->hold_count);
    851  1.1  christos 	}
    852  1.1  christos }
    853  1.1  christos 
    854  1.1  christos /***********************************************************************
    855  1.1  christos  *
    856  1.1  christos  *  garbage collector
    857  1.1  christos  */
    858  1.1  christos 
    859  1.1  christos /*
    860  1.1  christos  * Move a branch node's twigs to the `bump` chunk, for copy-on-write
    861  1.1  christos  * or for garbage collection. We don't update the node in place
    862  1.1  christos  * because `compact_recursive()` does not ensure the node itself is
    863  1.1  christos  * mutable until after it discovers evacuation was necessary.
    864  1.1  christos  *
    865  1.1  christos  * If free_twigs() could not immediately destroy the old twigs, we have
    866  1.1  christos  * to re-attach to any leaves.
    867  1.1  christos  */
    868  1.1  christos static dns_qpref_t
    869  1.1  christos evacuate(dns_qp_t *qp, dns_qpnode_t *n) {
    870  1.1  christos 	dns_qpweight_t size = branch_twigs_size(n);
    871  1.1  christos 	dns_qpref_t old_ref = branch_twigs_ref(n);
    872  1.1  christos 	dns_qpref_t new_ref = alloc_twigs(qp, size);
    873  1.1  christos 	dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref);
    874  1.1  christos 	dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref);
    875  1.1  christos 
    876  1.1  christos 	move_twigs(new_twigs, old_twigs, size);
    877  1.1  christos 	if (!free_twigs(qp, old_ref, size)) {
    878  1.1  christos 		attach_twigs(qp, new_twigs, size);
    879  1.1  christos 	}
    880  1.1  christos 
    881  1.1  christos 	return new_ref;
    882  1.1  christos }
    883  1.1  christos 
    884  1.1  christos /*
    885  1.1  christos  * Immutable nodes need copy-on-write. As we walk down the trie finding the
    886  1.1  christos  * right place to modify, make_root_mutable() and make_twigs_mutable()
    887  1.1  christos  * are called to ensure that immutable nodes on the path from the root are
    888  1.1  christos  * copied to a mutable chunk.
    889  1.1  christos  */
    890  1.1  christos 
    891  1.1  christos static inline dns_qpnode_t *
    892  1.1  christos make_root_mutable(dns_qp_t *qp) {
    893  1.1  christos 	if (cells_immutable(qp, qp->root_ref)) {
    894  1.1  christos 		qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp));
    895  1.1  christos 	}
    896  1.1  christos 	return ref_ptr(qp, qp->root_ref);
    897  1.1  christos }
    898  1.1  christos 
    899  1.1  christos static inline void
    900  1.1  christos make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) {
    901  1.1  christos 	if (cells_immutable(qp, branch_twigs_ref(n))) {
    902  1.1  christos 		*n = make_node(branch_index(n), evacuate(qp, n));
    903  1.1  christos 	}
    904  1.1  christos }
    905  1.1  christos 
    906  1.1  christos /*
    907  1.1  christos  * Compact the trie by traversing the whole thing recursively, copying
    908  1.1  christos  * bottom-up as required. The aim is to avoid evacuation as much as
    909  1.1  christos  * possible, but when parts of the trie are immutable, we need to evacuate
    910  1.1  christos  * the paths from the root to the parts of the trie that occupy
    911  1.1  christos  * fragmented chunks.
    912  1.1  christos  *
    913  1.1  christos  * Without the QP_MIN_USED check, the algorithm will leave the trie
    914  1.1  christos  * unchanged. If the children are all leaves, the loop changes nothing,
    915  1.1  christos  * so we will return this node's original ref. If all of the children
    916  1.1  christos  * that are branches did not need moving, again, the loop changes
    917  1.1  christos  * nothing. So the evacuation check is the only place that the
    918  1.1  christos  * algorithm introduces ref changes, that then bubble up towards the
    919  1.1  christos  * root through the logic inside the loop.
    920  1.1  christos  */
    921  1.1  christos static dns_qpref_t
    922  1.1  christos compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) {
    923  1.1  christos 	dns_qpweight_t size = branch_twigs_size(parent);
    924  1.1  christos 	dns_qpref_t twigs_ref = branch_twigs_ref(parent);
    925  1.1  christos 	dns_qpchunk_t chunk = ref_chunk(twigs_ref);
    926  1.1  christos 
    927  1.1  christos 	if (qp->compact_all ||
    928  1.1  christos 	    (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED))
    929  1.1  christos 	{
    930  1.1  christos 		twigs_ref = evacuate(qp, parent);
    931  1.1  christos 	}
    932  1.1  christos 	bool immutable = cells_immutable(qp, twigs_ref);
    933  1.1  christos 	for (dns_qpweight_t pos = 0; pos < size; pos++) {
    934  1.1  christos 		dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos;
    935  1.1  christos 		if (!is_branch(child)) {
    936  1.1  christos 			continue;
    937  1.1  christos 		}
    938  1.1  christos 		dns_qpref_t old_grandtwigs = branch_twigs_ref(child);
    939  1.1  christos 		dns_qpref_t new_grandtwigs = compact_recursive(qp, child);
    940  1.1  christos 		if (old_grandtwigs == new_grandtwigs) {
    941  1.1  christos 			continue;
    942  1.1  christos 		}
    943  1.1  christos 		if (immutable) {
    944  1.1  christos 			twigs_ref = evacuate(qp, parent);
    945  1.1  christos 			/* the twigs have moved */
    946  1.1  christos 			child = ref_ptr(qp, twigs_ref) + pos;
    947  1.1  christos 			immutable = false;
    948  1.1  christos 		}
    949  1.1  christos 		*child = make_node(branch_index(child), new_grandtwigs);
    950  1.1  christos 	}
    951  1.1  christos 	return twigs_ref;
    952  1.1  christos }
    953  1.1  christos 
    954  1.1  christos static void
    955  1.1  christos compact(dns_qp_t *qp) {
    956  1.1  christos 	LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u",
    957  1.1  christos 		  qp->leaf_count, qp->used_count - qp->free_count,
    958  1.1  christos 		  qp->used_count, qp->free_count, qp->hold_count);
    959  1.1  christos 
    960  1.1  christos 	isc_nanosecs_t start = isc_time_monotonic();
    961  1.1  christos 
    962  1.1  christos 	if (qp->usage[qp->bump].free > QP_MAX_FREE) {
    963  1.1  christos 		alloc_reset(qp);
    964  1.1  christos 	}
    965  1.1  christos 
    966  1.1  christos 	if (qp->leaf_count > 0) {
    967  1.1  christos 		qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp));
    968  1.1  christos 	}
    969  1.1  christos 	qp->compact_all = false;
    970  1.1  christos 
    971  1.1  christos 	isc_nanosecs_t time = isc_time_monotonic() - start;
    972  1.3  christos 	ISC_QP_ADD(compact_time, time);
    973  1.1  christos 
    974  1.1  christos 	LOG_STATS("qp compact" PRItime
    975  1.1  christos 		  "leaf %u live %u used %u free %u hold %u",
    976  1.1  christos 		  time, qp->leaf_count, qp->used_count - qp->free_count,
    977  1.1  christos 		  qp->used_count, qp->free_count, qp->hold_count);
    978  1.1  christos }
    979  1.1  christos 
    980  1.1  christos void
    981  1.1  christos dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) {
    982  1.1  christos 	REQUIRE(QP_VALID(qp));
    983  1.1  christos 	if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) {
    984  1.1  christos 		return;
    985  1.1  christos 	}
    986  1.1  christos 	if (mode == DNS_QPGC_ALL) {
    987  1.1  christos 		alloc_reset(qp);
    988  1.1  christos 		qp->compact_all = true;
    989  1.1  christos 	}
    990  1.1  christos 	compact(qp);
    991  1.1  christos 	recycle(qp);
    992  1.1  christos }
    993  1.1  christos 
    994  1.1  christos /*
    995  1.1  christos  * Free some twigs and (if they were destroyed immediately so that the
    996  1.1  christos  * result from QP_MAX_GARBAGE can change) compact the trie if necessary.
    997  1.1  christos  *
    998  1.1  christos  * This is called by the trie modification API entry points. The
    999  1.1  christos  * free_twigs() function requires the caller to attach or detach any
   1000  1.1  christos  * leaves as necessary. Callers of squash_twigs() satisfy this
   1001  1.1  christos  * requirement by calling make_twigs_mutable().
   1002  1.1  christos  *
   1003  1.1  christos  * Aside: In typical garbage collectors, compaction is triggered when
   1004  1.1  christos  * the allocator runs out of space. But that is because typical garbage
   1005  1.1  christos  * collectors do not know how much memory can be recovered, so they must
   1006  1.1  christos  * find out by scanning the heap. The qp-trie code was originally
   1007  1.1  christos  * designed to use malloc() and free(), so it has more information about
   1008  1.1  christos  * when garbage collection might be worthwhile. Hence we can trigger
   1009  1.1  christos  * collection when garbage passes a threshold.
   1010  1.1  christos  *
   1011  1.1  christos  * XXXFANF: If we need to avoid latency outliers caused by compaction in
   1012  1.1  christos  * write transactions, we can check qp->transaction_mode here.
   1013  1.1  christos  */
   1014  1.1  christos static inline bool
   1015  1.1  christos squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) {
   1016  1.1  christos 	bool destroyed = free_twigs(qp, twigs, size);
   1017  1.1  christos 	if (destroyed && QP_AUTOGC(qp)) {
   1018  1.1  christos 		compact(qp);
   1019  1.1  christos 		recycle(qp);
   1020  1.1  christos 		/*
   1021  1.1  christos 		 * This shouldn't happen if the garbage collector is
   1022  1.1  christos 		 * working correctly. We can recover at the cost of some
   1023  1.1  christos 		 * time and space, but recovery should be cheaper than
   1024  1.1  christos 		 * letting compact+recycle fail repeatedly.
   1025  1.1  christos 		 */
   1026  1.1  christos 		if (QP_AUTOGC(qp)) {
   1027  1.1  christos 			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
   1028  1.1  christos 				      DNS_LOGMODULE_QP, ISC_LOG_NOTICE,
   1029  1.1  christos 				      "qp %p uctx \"%s\" compact/recycle "
   1030  1.1  christos 				      "failed to recover any space, "
   1031  1.1  christos 				      "scheduling a full compaction",
   1032  1.1  christos 				      qp, TRIENAME(qp));
   1033  1.1  christos 			qp->compact_all = true;
   1034  1.1  christos 		}
   1035  1.1  christos 	}
   1036  1.1  christos 	return destroyed;
   1037  1.1  christos }
   1038  1.1  christos 
   1039  1.1  christos /***********************************************************************
   1040  1.1  christos  *
   1041  1.1  christos  *  public accessors for memory management internals
   1042  1.1  christos  */
   1043  1.1  christos 
   1044  1.1  christos dns_qp_memusage_t
   1045  1.1  christos dns_qp_memusage(dns_qp_t *qp) {
   1046  1.1  christos 	REQUIRE(QP_VALID(qp));
   1047  1.1  christos 
   1048  1.1  christos 	dns_qp_memusage_t memusage = {
   1049  1.1  christos 		.uctx = qp->uctx,
   1050  1.1  christos 		.leaves = qp->leaf_count,
   1051  1.1  christos 		.live = qp->used_count - qp->free_count,
   1052  1.1  christos 		.used = qp->used_count,
   1053  1.1  christos 		.hold = qp->hold_count,
   1054  1.1  christos 		.free = qp->free_count,
   1055  1.1  christos 		.node_size = sizeof(dns_qpnode_t),
   1056  1.1  christos 		.chunk_size = QP_CHUNK_SIZE,
   1057  1.1  christos 		.fragmented = QP_NEEDGC(qp),
   1058  1.1  christos 	};
   1059  1.1  christos 
   1060  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1061  1.1  christos 		if (qp->base->ptr[chunk] != NULL) {
   1062  1.1  christos 			memusage.chunk_count += 1;
   1063  1.1  christos 		}
   1064  1.1  christos 	}
   1065  1.1  christos 
   1066  1.1  christos 	/*
   1067  1.1  christos 	 * XXXFANF does not subtract chunks that have been shrunk,
   1068  1.1  christos 	 * and does not count unreclaimed dns_qpbase_t objects
   1069  1.1  christos 	 */
   1070  1.1  christos 	memusage.bytes = memusage.chunk_count * QP_CHUNK_BYTES +
   1071  1.1  christos 			 qp->chunk_max * sizeof(qp->base->ptr[0]) +
   1072  1.1  christos 			 qp->chunk_max * sizeof(qp->usage[0]);
   1073  1.1  christos 
   1074  1.1  christos 	return memusage;
   1075  1.1  christos }
   1076  1.1  christos 
   1077  1.1  christos dns_qp_memusage_t
   1078  1.1  christos dns_qpmulti_memusage(dns_qpmulti_t *multi) {
   1079  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1080  1.1  christos 	LOCK(&multi->mutex);
   1081  1.1  christos 
   1082  1.1  christos 	dns_qp_t *qp = &multi->writer;
   1083  1.1  christos 	INSIST(QP_VALID(qp));
   1084  1.1  christos 
   1085  1.1  christos 	dns_qp_memusage_t memusage = dns_qp_memusage(qp);
   1086  1.1  christos 
   1087  1.1  christos 	if (qp->transaction_mode == QP_UPDATE) {
   1088  1.1  christos 		memusage.bytes -= QP_CHUNK_BYTES;
   1089  1.1  christos 		memusage.bytes += qp->usage[qp->bump].used *
   1090  1.1  christos 				  sizeof(dns_qpnode_t);
   1091  1.1  christos 	}
   1092  1.1  christos 
   1093  1.1  christos 	UNLOCK(&multi->mutex);
   1094  1.1  christos 	return memusage;
   1095  1.1  christos }
   1096  1.1  christos 
   1097  1.1  christos void
   1098  1.1  christos dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p,
   1099  1.1  christos 	      isc_nanosecs_t *rollback_p) {
   1100  1.3  christos 	*compact_p = ISC_QP_GET(compact_time);
   1101  1.3  christos 	*recycle_p = ISC_QP_GET(recycle_time);
   1102  1.3  christos 	*rollback_p = ISC_QP_GET(rollback_time);
   1103  1.1  christos }
   1104  1.1  christos 
   1105  1.1  christos /***********************************************************************
   1106  1.1  christos  *
   1107  1.1  christos  *  read-write transactions
   1108  1.1  christos  */
   1109  1.1  christos 
   1110  1.1  christos static dns_qp_t *
   1111  1.1  christos transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1112  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1113  1.1  christos 	REQUIRE(qptp != NULL && *qptp == NULL);
   1114  1.1  christos 
   1115  1.1  christos 	LOCK(&multi->mutex);
   1116  1.1  christos 
   1117  1.1  christos 	dns_qp_t *qp = &multi->writer;
   1118  1.1  christos 	INSIST(QP_VALID(qp));
   1119  1.1  christos 
   1120  1.1  christos 	/*
   1121  1.1  christos 	 * Mark existing chunks as immutable.
   1122  1.1  christos 	 *
   1123  1.1  christos 	 * Aside: The bump chunk is special: in a series of write
   1124  1.1  christos 	 * transactions the bump chunk is reused; the first part (up
   1125  1.1  christos 	 * to fender) is immutable, the rest mutable. But we set its
   1126  1.1  christos 	 * immutable flag so that when the bump chunk fills up, the
   1127  1.1  christos 	 * first part continues to be treated as immutable. (And the
   1128  1.1  christos 	 * rest of the chunk too, but that's OK.)
   1129  1.1  christos 	 */
   1130  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1131  1.1  christos 		if (qp->usage[chunk].exists) {
   1132  1.1  christos 			qp->usage[chunk].immutable = true;
   1133  1.1  christos 			write_protect(qp, chunk);
   1134  1.1  christos 		}
   1135  1.1  christos 	}
   1136  1.1  christos 
   1137  1.1  christos 	/*
   1138  1.1  christos 	 * Ensure QP_AUTOGC() ignores free space in immutable chunks.
   1139  1.1  christos 	 */
   1140  1.1  christos 	qp->hold_count = qp->free_count;
   1141  1.1  christos 
   1142  1.1  christos 	*qptp = qp;
   1143  1.1  christos 	return qp;
   1144  1.1  christos }
   1145  1.1  christos 
   1146  1.1  christos /*
   1147  1.1  christos  * a write is light
   1148  1.1  christos  *
   1149  1.1  christos  * We need to ensure we allocate from a fresh chunk if the last transaction
   1150  1.1  christos  * shrunk the bump chunk; but usually in a sequence of write transactions
   1151  1.1  christos  * we just put `fender` at the point where we started this generation.
   1152  1.1  christos  *
   1153  1.1  christos  * (Aside: Instead of keeping the previous transaction's mode, I
   1154  1.1  christos  * considered forcing allocation into the slow path by fiddling with
   1155  1.1  christos  * the bump chunk's usage counters. But that is troublesome because
   1156  1.1  christos  * `chunk_free()` needs to know how much of the chunk to scan.)
   1157  1.1  christos  */
   1158  1.1  christos void
   1159  1.1  christos dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1160  1.1  christos 	dns_qp_t *qp = transaction_open(multi, qptp);
   1161  1.1  christos 	TRACE("");
   1162  1.1  christos 
   1163  1.1  christos 	if (qp->transaction_mode == QP_WRITE) {
   1164  1.1  christos 		qp->fender = qp->usage[qp->bump].used;
   1165  1.1  christos 	} else {
   1166  1.1  christos 		alloc_reset(qp);
   1167  1.1  christos 	}
   1168  1.1  christos 	qp->transaction_mode = QP_WRITE;
   1169  1.1  christos }
   1170  1.1  christos 
   1171  1.1  christos /*
   1172  1.1  christos  * an update is heavier
   1173  1.1  christos  *
   1174  1.1  christos  * We always reset the allocator to the start of a fresh chunk,
   1175  1.1  christos  * because the previous transaction was probably an update that shrunk
   1176  1.1  christos  * the bump chunk. It simplifies rollback because `fender` is always zero.
   1177  1.1  christos  *
   1178  1.1  christos  * To rollback a transaction, we need to reset all the allocation
   1179  1.1  christos  * counters to their previous state, in particular we need to un-free
   1180  1.1  christos  * any nodes that were copied to make them mutable. This means we need
   1181  1.1  christos  * to make a copy of basically the whole `dns_qp_t writer`: everything
   1182  1.1  christos  * but the chunks holding the trie nodes.
   1183  1.1  christos  *
   1184  1.1  christos  * We do most of the transaction setup before creating the rollback
   1185  1.1  christos  * state so that after rollback we have a correct idea of which chunks
   1186  1.1  christos  * are immutable, and so we have the correct transaction mode to make
   1187  1.1  christos  * the next transaction allocate a new bump chunk. The exception is
   1188  1.1  christos  * resetting the allocator, which we do after creating the rollback
   1189  1.1  christos  * state; if this transaction is rolled back then the next transaction
   1190  1.1  christos  * will start from the rollback state and also reset the allocator as
   1191  1.1  christos  * one of its first actions.
   1192  1.1  christos  */
   1193  1.1  christos void
   1194  1.1  christos dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1195  1.1  christos 	dns_qp_t *qp = transaction_open(multi, qptp);
   1196  1.1  christos 	TRACE("");
   1197  1.1  christos 
   1198  1.1  christos 	qp->transaction_mode = QP_UPDATE;
   1199  1.1  christos 
   1200  1.1  christos 	dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback));
   1201  1.1  christos 	memmove(rollback, qp, sizeof(*rollback));
   1202  1.1  christos 	/* can be uninitialized on the first transaction */
   1203  1.1  christos 	if (rollback->base != NULL) {
   1204  1.1  christos 		INSIST(QPBASE_VALID(rollback->base));
   1205  1.1  christos 		INSIST(qp->usage != NULL && qp->chunk_max > 0);
   1206  1.1  christos 		/* paired with either _commit() or _rollback() */
   1207  1.1  christos 		isc_refcount_increment(&rollback->base->refcount);
   1208  1.1  christos 		size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max;
   1209  1.1  christos 		rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes);
   1210  1.1  christos 		memmove(rollback->usage, qp->usage, usage_bytes);
   1211  1.1  christos 	}
   1212  1.1  christos 	INSIST(multi->rollback == NULL);
   1213  1.1  christos 	multi->rollback = rollback;
   1214  1.1  christos 
   1215  1.1  christos 	alloc_reset(qp);
   1216  1.1  christos }
   1217  1.1  christos 
   1218  1.1  christos void
   1219  1.1  christos dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1220  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1221  1.1  christos 	REQUIRE(qptp != NULL && *qptp == &multi->writer);
   1222  1.1  christos 	REQUIRE(multi->writer.transaction_mode == QP_WRITE ||
   1223  1.1  christos 		multi->writer.transaction_mode == QP_UPDATE);
   1224  1.1  christos 
   1225  1.1  christos 	dns_qp_t *qp = *qptp;
   1226  1.1  christos 	TRACE("");
   1227  1.1  christos 
   1228  1.1  christos 	if (qp->transaction_mode == QP_UPDATE) {
   1229  1.1  christos 		INSIST(multi->rollback != NULL);
   1230  1.1  christos 		/* paired with dns_qpmulti_update() */
   1231  1.1  christos 		if (qpbase_unref(multi->rollback)) {
   1232  1.1  christos 			isc_mem_free(qp->mctx, multi->rollback->base);
   1233  1.1  christos 		}
   1234  1.1  christos 		if (multi->rollback->usage != NULL) {
   1235  1.1  christos 			isc_mem_free(qp->mctx, multi->rollback->usage);
   1236  1.1  christos 		}
   1237  1.1  christos 		isc_mem_free(qp->mctx, multi->rollback);
   1238  1.1  christos 	}
   1239  1.1  christos 	INSIST(multi->rollback == NULL);
   1240  1.1  christos 
   1241  1.1  christos 	/* not the first commit? */
   1242  1.1  christos 	if (multi->reader_ref != INVALID_REF) {
   1243  1.1  christos 		INSIST(cells_immutable(qp, multi->reader_ref));
   1244  1.1  christos 		free_twigs(qp, multi->reader_ref, READER_SIZE);
   1245  1.1  christos 	}
   1246  1.1  christos 
   1247  1.1  christos 	if (qp->transaction_mode == QP_UPDATE) {
   1248  1.1  christos 		/* minimize memory overhead */
   1249  1.1  christos 		compact(qp);
   1250  1.1  christos 		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
   1251  1.1  christos 		qp->base->ptr[qp->bump] = chunk_shrink_raw(
   1252  1.1  christos 			qp, qp->base->ptr[qp->bump],
   1253  1.1  christos 			qp->usage[qp->bump].used * sizeof(dns_qpnode_t));
   1254  1.1  christos 	} else {
   1255  1.1  christos 		multi->reader_ref = alloc_twigs(qp, READER_SIZE);
   1256  1.1  christos 	}
   1257  1.1  christos 
   1258  1.1  christos 	/* anchor a new version of the trie */
   1259  1.1  christos 	dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref);
   1260  1.1  christos 	make_reader(reader, multi);
   1261  1.1  christos 	/* paired with chunk_free() */
   1262  1.1  christos 	isc_refcount_increment(&qp->base->refcount);
   1263  1.1  christos 
   1264  1.1  christos 	rcu_assign_pointer(multi->reader, reader); /* COMMIT */
   1265  1.1  christos 
   1266  1.1  christos 	/* clean up what we can right now */
   1267  1.1  christos 	if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) {
   1268  1.1  christos 		recycle(qp);
   1269  1.1  christos 	}
   1270  1.1  christos 
   1271  1.1  christos 	/* schedule the rest for later */
   1272  1.1  christos 	reclaim_chunks(multi);
   1273  1.1  christos 
   1274  1.1  christos 	*qptp = NULL;
   1275  1.1  christos 	UNLOCK(&multi->mutex);
   1276  1.1  christos }
   1277  1.1  christos 
   1278  1.1  christos /*
   1279  1.1  christos  * Throw away everything that was allocated during this transaction.
   1280  1.1  christos  */
   1281  1.1  christos void
   1282  1.1  christos dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) {
   1283  1.1  christos 	unsigned int free = 0;
   1284  1.1  christos 
   1285  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1286  1.1  christos 	REQUIRE(multi->writer.transaction_mode == QP_UPDATE);
   1287  1.1  christos 	REQUIRE(qptp != NULL && *qptp == &multi->writer);
   1288  1.1  christos 
   1289  1.1  christos 	dns_qp_t *qp = *qptp;
   1290  1.1  christos 	TRACE("");
   1291  1.1  christos 
   1292  1.1  christos 	isc_nanosecs_t start = isc_time_monotonic();
   1293  1.1  christos 
   1294  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1295  1.1  christos 		if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable)
   1296  1.1  christos 		{
   1297  1.1  christos 			chunk_free(qp, chunk);
   1298  1.1  christos 			/*
   1299  1.1  christos 			 * we need to clear its base pointer in the rollback
   1300  1.1  christos 			 * trie, in case the arrays were resized
   1301  1.1  christos 			 */
   1302  1.1  christos 			if (chunk < multi->rollback->chunk_max) {
   1303  1.1  christos 				INSIST(!multi->rollback->usage[chunk].exists);
   1304  1.1  christos 				multi->rollback->base->ptr[chunk] = NULL;
   1305  1.1  christos 			}
   1306  1.1  christos 			free++;
   1307  1.1  christos 		}
   1308  1.1  christos 	}
   1309  1.1  christos 
   1310  1.1  christos 	/*
   1311  1.1  christos 	 * multi->rollback->base and multi->writer->base are the same,
   1312  1.1  christos 	 * unless there was a realloc_chunk_arrays() during the transaction
   1313  1.1  christos 	 */
   1314  1.1  christos 	if (qpbase_unref(qp)) {
   1315  1.1  christos 		/* paired with dns_qpmulti_update() */
   1316  1.1  christos 		isc_mem_free(qp->mctx, qp->base);
   1317  1.1  christos 	}
   1318  1.1  christos 	isc_mem_free(qp->mctx, qp->usage);
   1319  1.1  christos 
   1320  1.1  christos 	/* reset allocator state */
   1321  1.1  christos 	INSIST(multi->rollback != NULL);
   1322  1.1  christos 	memmove(qp, multi->rollback, sizeof(*qp));
   1323  1.1  christos 	isc_mem_free(qp->mctx, multi->rollback);
   1324  1.1  christos 	INSIST(multi->rollback == NULL);
   1325  1.1  christos 
   1326  1.1  christos 	isc_nanosecs_t time = isc_time_monotonic() - start;
   1327  1.3  christos 	ISC_QP_ADD(rollback_time, time);
   1328  1.1  christos 
   1329  1.1  christos 	LOG_STATS("qp rollback" PRItime "free %u chunks", time, free);
   1330  1.1  christos 
   1331  1.1  christos 	*qptp = NULL;
   1332  1.1  christos 	UNLOCK(&multi->mutex);
   1333  1.1  christos }
   1334  1.1  christos 
   1335  1.1  christos /***********************************************************************
   1336  1.1  christos  *
   1337  1.1  christos  *  read-only transactions
   1338  1.1  christos  */
   1339  1.1  christos 
   1340  1.1  christos static dns_qpmulti_t *
   1341  1.1  christos reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) {
   1342  1.1  christos 	dns_qpreader_t *qp = dns_qpreader(qpr);
   1343  1.1  christos 	dns_qpnode_t *reader = rcu_dereference(multi->reader);
   1344  1.1  christos 	if (reader == NULL) {
   1345  1.1  christos 		QP_INIT(qp, multi->writer.methods, multi->writer.uctx);
   1346  1.1  christos 	} else {
   1347  1.1  christos 		multi = unpack_reader(qp, reader);
   1348  1.1  christos 	}
   1349  1.1  christos 	return multi;
   1350  1.1  christos }
   1351  1.1  christos 
   1352  1.1  christos /*
   1353  1.1  christos  * a query is light
   1354  1.1  christos  */
   1355  1.1  christos 
   1356  1.1  christos void
   1357  1.1  christos dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) {
   1358  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1359  1.1  christos 	REQUIRE(qp != NULL);
   1360  1.1  christos 
   1361  1.1  christos 	qp->tid = isc_tid();
   1362  1.1  christos 	rcu_read_lock();
   1363  1.1  christos 
   1364  1.1  christos 	dns_qpmulti_t *whence = reader_open(multi, qp);
   1365  1.1  christos 	INSIST(whence == multi);
   1366  1.1  christos }
   1367  1.1  christos 
   1368  1.1  christos void
   1369  1.1  christos dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) {
   1370  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1371  1.1  christos 	REQUIRE(QP_VALID(qp));
   1372  1.1  christos 	REQUIRE(qp->tid == isc_tid());
   1373  1.1  christos 	*qp = (dns_qpread_t){};
   1374  1.1  christos 	rcu_read_unlock();
   1375  1.1  christos }
   1376  1.1  christos 
   1377  1.1  christos /*
   1378  1.1  christos  * a snapshot is heavy
   1379  1.1  christos  */
   1380  1.1  christos 
   1381  1.1  christos void
   1382  1.1  christos dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
   1383  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1384  1.1  christos 	REQUIRE(qpsp != NULL && *qpsp == NULL);
   1385  1.1  christos 
   1386  1.1  christos 	rcu_read_lock();
   1387  1.1  christos 
   1388  1.1  christos 	LOCK(&multi->mutex);
   1389  1.1  christos 
   1390  1.1  christos 	dns_qp_t *qpw = &multi->writer;
   1391  1.1  christos 	size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) +
   1392  1.1  christos 		       sizeof(qpw->base->ptr[0]) * qpw->chunk_max;
   1393  1.1  christos 	dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes);
   1394  1.1  christos 
   1395  1.1  christos 	qps->whence = reader_open(multi, qps);
   1396  1.1  christos 	INSIST(qps->whence == multi);
   1397  1.1  christos 
   1398  1.1  christos 	/* not a separate allocation */
   1399  1.1  christos 	qps->base = (dns_qpbase_t *)(qps + 1);
   1400  1.1  christos 	isc_refcount_init(&qps->base->refcount, 0);
   1401  1.1  christos 
   1402  1.1  christos 	/*
   1403  1.1  christos 	 * only copy base pointers of chunks we need, so we can
   1404  1.1  christos 	 * reclaim unused memory in dns_qpsnap_destroy()
   1405  1.1  christos 	 */
   1406  1.1  christos 	qps->chunk_max = qpw->chunk_max;
   1407  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) {
   1408  1.1  christos 		if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) {
   1409  1.1  christos 			qpw->usage[chunk].snapshot = true;
   1410  1.1  christos 			qps->base->ptr[chunk] = qpw->base->ptr[chunk];
   1411  1.1  christos 		} else {
   1412  1.1  christos 			qps->base->ptr[chunk] = NULL;
   1413  1.1  christos 		}
   1414  1.1  christos 	}
   1415  1.1  christos 	ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link);
   1416  1.1  christos 
   1417  1.1  christos 	*qpsp = qps;
   1418  1.1  christos 	UNLOCK(&multi->mutex);
   1419  1.1  christos 
   1420  1.1  christos 	rcu_read_unlock();
   1421  1.1  christos }
   1422  1.1  christos 
   1423  1.1  christos void
   1424  1.1  christos dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) {
   1425  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1426  1.1  christos 	REQUIRE(qpsp != NULL && *qpsp != NULL);
   1427  1.1  christos 
   1428  1.1  christos 	LOCK(&multi->mutex);
   1429  1.1  christos 
   1430  1.1  christos 	dns_qpsnap_t *qp = *qpsp;
   1431  1.1  christos 
   1432  1.1  christos 	/* make sure the API is being used correctly */
   1433  1.1  christos 	REQUIRE(qp->whence == multi);
   1434  1.1  christos 
   1435  1.1  christos 	ISC_LIST_UNLINK(multi->snapshots, qp, link);
   1436  1.1  christos 
   1437  1.1  christos 	/*
   1438  1.1  christos 	 * eagerly reclaim chunks that are now unused, so that memory does
   1439  1.1  christos 	 * not accumulate when a trie has a lot of updates and snapshots
   1440  1.1  christos 	 */
   1441  1.1  christos 	marksweep_chunks(multi);
   1442  1.1  christos 
   1443  1.1  christos 	isc_mem_free(multi->writer.mctx, qp);
   1444  1.1  christos 
   1445  1.1  christos 	*qpsp = NULL;
   1446  1.1  christos 	UNLOCK(&multi->mutex);
   1447  1.1  christos }
   1448  1.1  christos 
   1449  1.1  christos /***********************************************************************
   1450  1.1  christos  *
   1451  1.1  christos  *  constructors, destructors
   1452  1.1  christos  */
   1453  1.1  christos 
   1454  1.1  christos void
   1455  1.1  christos dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
   1456  1.1  christos 	      dns_qp_t **qptp) {
   1457  1.1  christos 	REQUIRE(qptp != NULL && *qptp == NULL);
   1458  1.1  christos 
   1459  1.1  christos 	dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp));
   1460  1.1  christos 	QP_INIT(qp, methods, uctx);
   1461  1.1  christos 	isc_mem_attach(mctx, &qp->mctx);
   1462  1.1  christos 	alloc_reset(qp);
   1463  1.1  christos 	TRACE("");
   1464  1.1  christos 	*qptp = qp;
   1465  1.1  christos }
   1466  1.1  christos 
   1467  1.1  christos void
   1468  1.1  christos dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
   1469  1.1  christos 		   dns_qpmulti_t **qpmp) {
   1470  1.1  christos 	REQUIRE(qpmp != NULL && *qpmp == NULL);
   1471  1.1  christos 
   1472  1.1  christos 	dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi));
   1473  1.1  christos 	*multi = (dns_qpmulti_t){
   1474  1.1  christos 		.magic = QPMULTI_MAGIC,
   1475  1.1  christos 		.reader_ref = INVALID_REF,
   1476  1.1  christos 	};
   1477  1.1  christos 	isc_mutex_init(&multi->mutex);
   1478  1.1  christos 	ISC_LIST_INIT(multi->snapshots);
   1479  1.1  christos 	/*
   1480  1.1  christos 	 * Do not waste effort allocating a bump chunk that will be thrown
   1481  1.1  christos 	 * away when a transaction is opened. dns_qpmulti_update() always
   1482  1.1  christos 	 * allocates; to ensure dns_qpmulti_write() does too, pretend the
   1483  1.1  christos 	 * previous transaction was an update
   1484  1.1  christos 	 */
   1485  1.1  christos 	dns_qp_t *qp = &multi->writer;
   1486  1.1  christos 	QP_INIT(qp, methods, uctx);
   1487  1.1  christos 	isc_mem_attach(mctx, &qp->mctx);
   1488  1.1  christos 	qp->transaction_mode = QP_UPDATE;
   1489  1.1  christos 	TRACE("");
   1490  1.1  christos 	*qpmp = multi;
   1491  1.1  christos }
   1492  1.1  christos 
   1493  1.1  christos static void
   1494  1.1  christos destroy_guts(dns_qp_t *qp) {
   1495  1.1  christos 	if (qp->chunk_max == 0) {
   1496  1.1  christos 		return;
   1497  1.1  christos 	}
   1498  1.1  christos 	for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
   1499  1.1  christos 		if (qp->base->ptr[chunk] != NULL) {
   1500  1.1  christos 			chunk_free(qp, chunk);
   1501  1.1  christos 		}
   1502  1.1  christos 	}
   1503  1.1  christos 	ENSURE(qp->used_count == 0);
   1504  1.1  christos 	ENSURE(qp->free_count == 0);
   1505  1.1  christos 	ENSURE(isc_refcount_current(&qp->base->refcount) == 1);
   1506  1.1  christos 	isc_mem_free(qp->mctx, qp->base);
   1507  1.1  christos 	isc_mem_free(qp->mctx, qp->usage);
   1508  1.1  christos 	qp->magic = 0;
   1509  1.1  christos }
   1510  1.1  christos 
   1511  1.1  christos void
   1512  1.1  christos dns_qp_destroy(dns_qp_t **qptp) {
   1513  1.1  christos 	REQUIRE(qptp != NULL);
   1514  1.1  christos 	REQUIRE(QP_VALID(*qptp));
   1515  1.1  christos 
   1516  1.1  christos 	dns_qp_t *qp = *qptp;
   1517  1.1  christos 	*qptp = NULL;
   1518  1.1  christos 
   1519  1.1  christos 	/* do not try to destroy part of a dns_qpmulti_t */
   1520  1.1  christos 	REQUIRE(qp->transaction_mode == QP_NONE);
   1521  1.1  christos 
   1522  1.1  christos 	TRACE("");
   1523  1.1  christos 	destroy_guts(qp);
   1524  1.1  christos 	isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp));
   1525  1.1  christos }
   1526  1.1  christos 
   1527  1.1  christos static void
   1528  1.1  christos qpmulti_destroy_cb(struct rcu_head *arg) {
   1529  1.1  christos 	qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head);
   1530  1.1  christos 	REQUIRE(QPRCU_VALID(rcuctx));
   1531  1.1  christos 	/* only nonzero for reclaim_chunks_cb() */
   1532  1.1  christos 	REQUIRE(rcuctx->count == 0);
   1533  1.1  christos 
   1534  1.1  christos 	dns_qpmulti_t *multi = rcuctx->multi;
   1535  1.1  christos 	REQUIRE(QPMULTI_VALID(multi));
   1536  1.1  christos 
   1537  1.1  christos 	/* reassure thread sanitizer */
   1538  1.1  christos 	LOCK(&multi->mutex);
   1539  1.1  christos 
   1540  1.1  christos 	dns_qp_t *qp = &multi->writer;
   1541  1.1  christos 	REQUIRE(QP_VALID(qp));
   1542  1.1  christos 
   1543  1.1  christos 	destroy_guts(qp);
   1544  1.1  christos 
   1545  1.1  christos 	UNLOCK(&multi->mutex);
   1546  1.1  christos 
   1547  1.1  christos 	isc_mutex_destroy(&multi->mutex);
   1548  1.1  christos 	isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
   1549  1.1  christos 			     STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
   1550  1.1  christos 	isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
   1551  1.1  christos }
   1552  1.1  christos 
   1553  1.1  christos void
   1554  1.1  christos dns_qpmulti_destroy(dns_qpmulti_t **qpmp) {
   1555  1.1  christos 	dns_qp_t *qp = NULL;
   1556  1.1  christos 	dns_qpmulti_t *multi = NULL;
   1557  1.1  christos 	qp_rcuctx_t *rcuctx = NULL;
   1558  1.1  christos 
   1559  1.1  christos 	REQUIRE(qpmp != NULL);
   1560  1.1  christos 	REQUIRE(QPMULTI_VALID(*qpmp));
   1561  1.1  christos 
   1562  1.1  christos 	multi = *qpmp;
   1563  1.1  christos 	qp = &multi->writer;
   1564  1.1  christos 	*qpmp = NULL;
   1565  1.1  christos 
   1566  1.1  christos 	REQUIRE(QP_VALID(qp));
   1567  1.1  christos 	REQUIRE(multi->rollback == NULL);
   1568  1.1  christos 	REQUIRE(ISC_LIST_EMPTY(multi->snapshots));
   1569  1.1  christos 
   1570  1.1  christos 	rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0));
   1571  1.1  christos 	*rcuctx = (qp_rcuctx_t){
   1572  1.1  christos 		.magic = QPRCU_MAGIC,
   1573  1.1  christos 		.multi = multi,
   1574  1.1  christos 	};
   1575  1.1  christos 	isc_mem_attach(qp->mctx, &rcuctx->mctx);
   1576  1.1  christos 	call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb);
   1577  1.1  christos }
   1578  1.1  christos 
   1579  1.1  christos /***********************************************************************
   1580  1.1  christos  *
   1581  1.1  christos  *  modification
   1582  1.1  christos  */
   1583  1.1  christos 
   1584  1.1  christos isc_result_t
   1585  1.1  christos dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) {
   1586  1.1  christos 	dns_qpref_t new_ref, old_ref;
   1587  1.1  christos 	dns_qpnode_t new_leaf, old_node;
   1588  1.1  christos 	dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL;
   1589  1.1  christos 	dns_qpshift_t new_bit, old_bit;
   1590  1.1  christos 	dns_qpweight_t old_size, new_size;
   1591  1.1  christos 	dns_qpkey_t new_key, old_key;
   1592  1.1  christos 	size_t new_keylen, old_keylen;
   1593  1.1  christos 	size_t offset;
   1594  1.1  christos 	uint64_t index;
   1595  1.1  christos 	dns_qpshift_t bit;
   1596  1.1  christos 	dns_qpweight_t pos;
   1597  1.1  christos 	dns_qpnode_t *n = NULL;
   1598  1.1  christos 
   1599  1.1  christos 	REQUIRE(QP_VALID(qp));
   1600  1.1  christos 
   1601  1.1  christos 	new_leaf = make_leaf(pval, ival);
   1602  1.1  christos 	new_keylen = leaf_qpkey(qp, &new_leaf, new_key);
   1603  1.1  christos 
   1604  1.1  christos 	/* first leaf in an empty trie? */
   1605  1.1  christos 	if (qp->leaf_count == 0) {
   1606  1.1  christos 		new_ref = alloc_twigs(qp, 1);
   1607  1.1  christos 		new_twigs = ref_ptr(qp, new_ref);
   1608  1.1  christos 		*new_twigs = new_leaf;
   1609  1.1  christos 		attach_leaf(qp, new_twigs);
   1610  1.1  christos 		qp->leaf_count++;
   1611  1.1  christos 		qp->root_ref = new_ref;
   1612  1.1  christos 		return ISC_R_SUCCESS;
   1613  1.1  christos 	}
   1614  1.1  christos 
   1615  1.1  christos 	/*
   1616  1.1  christos 	 * We need to keep searching down to a leaf even if our key is
   1617  1.1  christos 	 * missing from this branch. It doesn't matter which twig we
   1618  1.1  christos 	 * choose since the keys are all the same up to this node's
   1619  1.1  christos 	 * offset. Note that if we simply use branch_twig_pos(n, bit)
   1620  1.1  christos 	 * we may get an out-of-bounds access if our bit is greater
   1621  1.1  christos 	 * than all the set bits in the node.
   1622  1.1  christos 	 */
   1623  1.1  christos 	n = ref_ptr(qp, qp->root_ref);
   1624  1.1  christos 	while (is_branch(n)) {
   1625  1.1  christos 		prefetch_twigs(qp, n);
   1626  1.1  christos 		dns_qpref_t ref = branch_twigs_ref(n);
   1627  1.1  christos 		bit = branch_keybit(n, new_key, new_keylen);
   1628  1.1  christos 		pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0;
   1629  1.1  christos 		n = ref_ptr(qp, ref + pos);
   1630  1.1  christos 	}
   1631  1.1  christos 
   1632  1.1  christos 	/* do the keys differ, and if so, where? */
   1633  1.1  christos 	old_keylen = leaf_qpkey(qp, n, old_key);
   1634  1.1  christos 	offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen);
   1635  1.1  christos 	if (offset == QPKEY_EQUAL) {
   1636  1.1  christos 		return ISC_R_EXISTS;
   1637  1.1  christos 	}
   1638  1.1  christos 	new_bit = qpkey_bit(new_key, new_keylen, offset);
   1639  1.1  christos 	old_bit = qpkey_bit(old_key, old_keylen, offset);
   1640  1.1  christos 
   1641  1.1  christos 	/* find where to insert a branch or grow an existing branch. */
   1642  1.1  christos 	n = make_root_mutable(qp);
   1643  1.1  christos 	while (is_branch(n)) {
   1644  1.1  christos 		prefetch_twigs(qp, n);
   1645  1.1  christos 		if (offset < branch_key_offset(n)) {
   1646  1.1  christos 			goto newbranch;
   1647  1.1  christos 		}
   1648  1.1  christos 		if (offset == branch_key_offset(n)) {
   1649  1.1  christos 			goto growbranch;
   1650  1.1  christos 		}
   1651  1.1  christos 		make_twigs_mutable(qp, n);
   1652  1.1  christos 		bit = branch_keybit(n, new_key, new_keylen);
   1653  1.1  christos 		INSIST(branch_has_twig(n, bit));
   1654  1.1  christos 		n = branch_twig_ptr(qp, n, bit);
   1655  1.1  christos 	}
   1656  1.1  christos 	/* fall through */
   1657  1.1  christos 
   1658  1.1  christos newbranch:
   1659  1.1  christos 	new_ref = alloc_twigs(qp, 2);
   1660  1.1  christos 	new_twigs = ref_ptr(qp, new_ref);
   1661  1.1  christos 
   1662  1.1  christos 	/* save before overwriting. */
   1663  1.1  christos 	old_node = *n;
   1664  1.1  christos 
   1665  1.1  christos 	/* new branch node takes old node's place */
   1666  1.1  christos 	index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) |
   1667  1.1  christos 		((uint64_t)offset << SHIFT_OFFSET);
   1668  1.1  christos 	*n = make_node(index, new_ref);
   1669  1.1  christos 
   1670  1.1  christos 	/* populate twigs */
   1671  1.1  christos 	new_twigs[old_bit > new_bit] = old_node;
   1672  1.1  christos 	new_twigs[new_bit > old_bit] = new_leaf;
   1673  1.1  christos 
   1674  1.1  christos 	attach_leaf(qp, &new_leaf);
   1675  1.1  christos 	qp->leaf_count++;
   1676  1.1  christos 
   1677  1.1  christos 	return ISC_R_SUCCESS;
   1678  1.1  christos 
   1679  1.1  christos growbranch:
   1680  1.1  christos 	INSIST(!branch_has_twig(n, new_bit));
   1681  1.1  christos 
   1682  1.1  christos 	/* locate twigs vectors */
   1683  1.1  christos 	old_size = branch_twigs_size(n);
   1684  1.1  christos 	new_size = old_size + 1;
   1685  1.1  christos 	old_ref = branch_twigs_ref(n);
   1686  1.1  christos 	new_ref = alloc_twigs(qp, new_size);
   1687  1.1  christos 	old_twigs = ref_ptr(qp, old_ref);
   1688  1.1  christos 	new_twigs = ref_ptr(qp, new_ref);
   1689  1.1  christos 
   1690  1.1  christos 	/* embiggen branch node */
   1691  1.1  christos 	index = branch_index(n) | (1ULL << new_bit);
   1692  1.1  christos 	*n = make_node(index, new_ref);
   1693  1.1  christos 
   1694  1.1  christos 	/* embiggen twigs vector */
   1695  1.1  christos 	pos = branch_twig_pos(n, new_bit);
   1696  1.1  christos 	move_twigs(new_twigs, old_twigs, pos);
   1697  1.1  christos 	new_twigs[pos] = new_leaf;
   1698  1.1  christos 	move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos);
   1699  1.1  christos 
   1700  1.1  christos 	if (squash_twigs(qp, old_ref, old_size)) {
   1701  1.1  christos 		/* old twigs destroyed, only attach to new leaf */
   1702  1.1  christos 		attach_leaf(qp, &new_leaf);
   1703  1.1  christos 	} else {
   1704  1.1  christos 		/* old twigs duplicated, attach to all leaves */
   1705  1.1  christos 		attach_twigs(qp, new_twigs, new_size);
   1706  1.1  christos 	}
   1707  1.1  christos 	qp->leaf_count++;
   1708  1.1  christos 
   1709  1.1  christos 	return ISC_R_SUCCESS;
   1710  1.1  christos }
   1711  1.1  christos 
   1712  1.1  christos isc_result_t
   1713  1.1  christos dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key,
   1714  1.1  christos 		 size_t search_keylen, void **pval_r, uint32_t *ival_r) {
   1715  1.1  christos 	REQUIRE(QP_VALID(qp));
   1716  1.1  christos 	REQUIRE(search_keylen < sizeof(dns_qpkey_t));
   1717  1.1  christos 
   1718  1.1  christos 	if (get_root(qp) == NULL) {
   1719  1.1  christos 		return ISC_R_NOTFOUND;
   1720  1.1  christos 	}
   1721  1.1  christos 
   1722  1.1  christos 	dns_qpshift_t bit = 0; /* suppress warning */
   1723  1.1  christos 	dns_qpnode_t *parent = NULL;
   1724  1.1  christos 	dns_qpnode_t *n = make_root_mutable(qp);
   1725  1.1  christos 	while (is_branch(n)) {
   1726  1.1  christos 		prefetch_twigs(qp, n);
   1727  1.1  christos 		bit = branch_keybit(n, search_key, search_keylen);
   1728  1.1  christos 		if (!branch_has_twig(n, bit)) {
   1729  1.1  christos 			return ISC_R_NOTFOUND;
   1730  1.1  christos 		}
   1731  1.1  christos 		make_twigs_mutable(qp, n);
   1732  1.1  christos 		parent = n;
   1733  1.1  christos 		n = branch_twig_ptr(qp, n, bit);
   1734  1.1  christos 	}
   1735  1.1  christos 
   1736  1.1  christos 	dns_qpkey_t found_key;
   1737  1.1  christos 	size_t found_keylen = leaf_qpkey(qp, n, found_key);
   1738  1.1  christos 	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
   1739  1.1  christos 	    QPKEY_EQUAL)
   1740  1.1  christos 	{
   1741  1.1  christos 		return ISC_R_NOTFOUND;
   1742  1.1  christos 	}
   1743  1.1  christos 
   1744  1.1  christos 	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   1745  1.1  christos 	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   1746  1.1  christos 	detach_leaf(qp, n);
   1747  1.1  christos 	qp->leaf_count--;
   1748  1.1  christos 
   1749  1.1  christos 	/* trie becomes empty */
   1750  1.1  christos 	if (qp->leaf_count == 0) {
   1751  1.1  christos 		INSIST(parent == NULL);
   1752  1.1  christos 		INSIST(n == get_root(qp));
   1753  1.1  christos 		free_twigs(qp, qp->root_ref, 1);
   1754  1.1  christos 		qp->root_ref = INVALID_REF;
   1755  1.1  christos 		return ISC_R_SUCCESS;
   1756  1.1  christos 	}
   1757  1.1  christos 
   1758  1.1  christos 	/* step back to parent node */
   1759  1.1  christos 	n = parent;
   1760  1.1  christos 	parent = NULL;
   1761  1.1  christos 
   1762  1.1  christos 	INSIST(bit != 0);
   1763  1.1  christos 	dns_qpweight_t size = branch_twigs_size(n);
   1764  1.1  christos 	dns_qpweight_t pos = branch_twig_pos(n, bit);
   1765  1.1  christos 	dns_qpref_t ref = branch_twigs_ref(n);
   1766  1.1  christos 	dns_qpnode_t *twigs = ref_ptr(qp, ref);
   1767  1.1  christos 
   1768  1.1  christos 	if (size == 2) {
   1769  1.1  christos 		/*
   1770  1.1  christos 		 * move the other twig to the parent branch.
   1771  1.1  christos 		 */
   1772  1.1  christos 		*n = twigs[!pos];
   1773  1.1  christos 		squash_twigs(qp, ref, 2);
   1774  1.1  christos 	} else {
   1775  1.1  christos 		/*
   1776  1.1  christos 		 * shrink the twigs in place, to avoid using the bump
   1777  1.1  christos 		 * chunk too fast - the gc will clean up after us
   1778  1.1  christos 		 */
   1779  1.1  christos 		*n = make_node(branch_index(n) & ~(1ULL << bit), ref);
   1780  1.1  christos 		move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1);
   1781  1.1  christos 		squash_twigs(qp, ref + size - 1, 1);
   1782  1.1  christos 	}
   1783  1.1  christos 
   1784  1.1  christos 	return ISC_R_SUCCESS;
   1785  1.1  christos }
   1786  1.1  christos 
   1787  1.1  christos isc_result_t
   1788  1.1  christos dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r,
   1789  1.1  christos 		  uint32_t *ival_r) {
   1790  1.1  christos 	dns_qpkey_t key;
   1791  1.1  christos 	size_t keylen = dns_qpkey_fromname(key, name);
   1792  1.1  christos 	return dns_qp_deletekey(qp, key, keylen, pval_r, ival_r);
   1793  1.1  christos }
   1794  1.1  christos 
   1795  1.1  christos /***********************************************************************
   1796  1.1  christos  *  chains
   1797  1.1  christos  */
   1798  1.1  christos static void
   1799  1.1  christos maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) {
   1800  1.1  christos 	dns_qpkey_t key;
   1801  1.1  christos 	size_t len;
   1802  1.1  christos 
   1803  1.1  christos 	if (name == NULL) {
   1804  1.1  christos 		return;
   1805  1.1  christos 	}
   1806  1.1  christos 
   1807  1.1  christos 	dns_name_reset(name);
   1808  1.1  christos 	len = leaf_qpkey(qp, node, key);
   1809  1.1  christos 	dns_qpkey_toname(key, len, name);
   1810  1.1  christos }
   1811  1.1  christos 
   1812  1.1  christos void
   1813  1.1  christos dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) {
   1814  1.1  christos 	dns_qpreader_t *qp = dns_qpreader(qpr);
   1815  1.1  christos 	REQUIRE(QP_VALID(qp));
   1816  1.1  christos 	REQUIRE(chain != NULL);
   1817  1.1  christos 
   1818  1.1  christos 	*chain = (dns_qpchain_t){
   1819  1.1  christos 		.magic = QPCHAIN_MAGIC,
   1820  1.1  christos 		.qp = qp,
   1821  1.1  christos 	};
   1822  1.1  christos }
   1823  1.1  christos 
   1824  1.1  christos unsigned int
   1825  1.1  christos dns_qpchain_length(dns_qpchain_t *chain) {
   1826  1.1  christos 	REQUIRE(QPCHAIN_VALID(chain));
   1827  1.1  christos 
   1828  1.1  christos 	return chain->len;
   1829  1.1  christos }
   1830  1.1  christos 
   1831  1.1  christos void
   1832  1.1  christos dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name,
   1833  1.1  christos 		 void **pval_r, uint32_t *ival_r) {
   1834  1.1  christos 	dns_qpnode_t *node = NULL;
   1835  1.1  christos 
   1836  1.1  christos 	REQUIRE(QPCHAIN_VALID(chain));
   1837  1.1  christos 	REQUIRE(level < chain->len);
   1838  1.1  christos 
   1839  1.1  christos 	node = chain->chain[level].node;
   1840  1.1  christos 	maybe_set_name(chain->qp, node, name);
   1841  1.1  christos 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
   1842  1.1  christos 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
   1843  1.1  christos }
   1844  1.1  christos 
   1845  1.1  christos /***********************************************************************
   1846  1.1  christos  *  iterators
   1847  1.1  christos  */
   1848  1.1  christos 
   1849  1.1  christos void
   1850  1.1  christos dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) {
   1851  1.1  christos 	dns_qpreader_t *qp = dns_qpreader(qpr);
   1852  1.1  christos 	REQUIRE(QP_VALID(qp));
   1853  1.1  christos 	REQUIRE(qpi != NULL);
   1854  1.1  christos 	*qpi = (dns_qpiter_t){
   1855  1.1  christos 		.qp = qp,
   1856  1.1  christos 		.magic = QPITER_MAGIC,
   1857  1.1  christos 	};
   1858  1.1  christos }
   1859  1.1  christos 
   1860  1.1  christos /*
   1861  1.1  christos  * are we at the last twig in this branch (in whichever direction
   1862  1.1  christos  * we're iterating)?
   1863  1.1  christos  */
   1864  1.1  christos static bool
   1865  1.1  christos last_twig(dns_qpiter_t *qpi, bool forward) {
   1866  1.1  christos 	dns_qpweight_t pos = 0, max = 0;
   1867  1.1  christos 	if (qpi->sp > 0) {
   1868  1.1  christos 		dns_qpnode_t *child = qpi->stack[qpi->sp];
   1869  1.1  christos 		dns_qpnode_t *parent = qpi->stack[qpi->sp - 1];
   1870  1.1  christos 		pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent));
   1871  1.1  christos 		if (forward) {
   1872  1.1  christos 			max = branch_twigs_size(parent) - 1;
   1873  1.1  christos 		}
   1874  1.1  christos 	}
   1875  1.1  christos 	return pos == max;
   1876  1.1  christos }
   1877  1.1  christos 
   1878  1.1  christos /*
   1879  1.1  christos  * move a QP iterator forward or back to the next or previous leaf.
   1880  1.1  christos  * note: this function can go wrong when the iterator refers to
   1881  1.1  christos  * a mutable view of the trie which is altered while iterating
   1882  1.1  christos  */
   1883  1.1  christos static isc_result_t
   1884  1.1  christos iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   1885  1.1  christos 	uint32_t *ival_r) {
   1886  1.1  christos 	dns_qpnode_t *node = NULL;
   1887  1.1  christos 	bool initial_branch = true;
   1888  1.1  christos 
   1889  1.1  christos 	REQUIRE(QPITER_VALID(qpi));
   1890  1.1  christos 
   1891  1.1  christos 	dns_qpreader_t *qp = qpi->qp;
   1892  1.1  christos 
   1893  1.1  christos 	REQUIRE(QP_VALID(qp));
   1894  1.1  christos 
   1895  1.1  christos 	node = get_root(qp);
   1896  1.1  christos 	if (node == NULL) {
   1897  1.1  christos 		return ISC_R_NOMORE;
   1898  1.1  christos 	}
   1899  1.1  christos 
   1900  1.1  christos 	do {
   1901  1.1  christos 		if (qpi->stack[qpi->sp] == NULL) {
   1902  1.1  christos 			/* newly initialized iterator: use the root node */
   1903  1.1  christos 			INSIST(qpi->sp == 0);
   1904  1.1  christos 			qpi->stack[0] = node;
   1905  1.1  christos 		} else if (!initial_branch) {
   1906  1.1  christos 			/*
   1907  1.1  christos 			 * in a prior loop, we reached a branch; from
   1908  1.1  christos 			 * here we just need to get the highest or lowest
   1909  1.1  christos 			 * leaf in the subtree; we don't need to bother
   1910  1.1  christos 			 * stepping forward or backward through twigs
   1911  1.1  christos 			 * anymore.
   1912  1.1  christos 			 */
   1913  1.1  christos 			INSIST(qpi->sp > 0);
   1914  1.1  christos 		} else if (last_twig(qpi, forward)) {
   1915  1.1  christos 			/*
   1916  1.1  christos 			 * we've stepped to the end (or the beginning,
   1917  1.1  christos 			 * if we're iterating backwards) of a set of twigs.
   1918  1.1  christos 			 */
   1919  1.1  christos 			if (qpi->sp == 0) {
   1920  1.1  christos 				/*
   1921  1.1  christos 				 * we've finished iterating. reinitialize
   1922  1.1  christos 				 * the iterator, then return ISC_R_NOMORE.
   1923  1.1  christos 				 */
   1924  1.1  christos 				dns_qpiter_init(qpi->qp, qpi);
   1925  1.1  christos 				return ISC_R_NOMORE;
   1926  1.1  christos 			}
   1927  1.1  christos 
   1928  1.1  christos 			/*
   1929  1.1  christos 			 * pop the stack, and resume at the parent branch.
   1930  1.1  christos 			 */
   1931  1.1  christos 			qpi->stack[qpi->sp] = NULL;
   1932  1.1  christos 			qpi->sp--;
   1933  1.1  christos 			continue;
   1934  1.1  christos 		} else {
   1935  1.1  christos 			/*
   1936  1.1  christos 			 * there are more twigs in the current branch,
   1937  1.1  christos 			 * so step the node pointer forward (or back).
   1938  1.1  christos 			 */
   1939  1.1  christos 			qpi->stack[qpi->sp] += (forward ? 1 : -1);
   1940  1.1  christos 			node = qpi->stack[qpi->sp];
   1941  1.1  christos 		}
   1942  1.1  christos 
   1943  1.1  christos 		/*
   1944  1.1  christos 		 * if we're at a branch now, we loop down to the
   1945  1.1  christos 		 * left- or rightmost leaf.
   1946  1.1  christos 		 */
   1947  1.1  christos 		if (is_branch(node)) {
   1948  1.1  christos 			qpi->sp++;
   1949  1.1  christos 			INSIST(qpi->sp < DNS_QP_MAXKEY);
   1950  1.1  christos 			node = ref_ptr(qp, branch_twigs_ref(node)) +
   1951  1.1  christos 			       (forward ? 0 : branch_twigs_size(node) - 1);
   1952  1.1  christos 			qpi->stack[qpi->sp] = node;
   1953  1.1  christos 			initial_branch = false;
   1954  1.1  christos 		}
   1955  1.1  christos 	} while (is_branch(node));
   1956  1.1  christos 
   1957  1.1  christos 	/* we're at a leaf: return its data to the caller */
   1958  1.1  christos 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
   1959  1.1  christos 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
   1960  1.1  christos 	maybe_set_name(qpi->qp, node, name);
   1961  1.1  christos 	return ISC_R_SUCCESS;
   1962  1.1  christos }
   1963  1.1  christos 
   1964  1.1  christos isc_result_t
   1965  1.1  christos dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   1966  1.1  christos 		uint32_t *ival_r) {
   1967  1.1  christos 	return iterate(true, qpi, name, pval_r, ival_r);
   1968  1.1  christos }
   1969  1.1  christos 
   1970  1.1  christos isc_result_t
   1971  1.1  christos dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   1972  1.1  christos 		uint32_t *ival_r) {
   1973  1.1  christos 	return iterate(false, qpi, name, pval_r, ival_r);
   1974  1.1  christos }
   1975  1.1  christos 
   1976  1.1  christos isc_result_t
   1977  1.1  christos dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r,
   1978  1.1  christos 		   uint32_t *ival_r) {
   1979  1.1  christos 	dns_qpnode_t *node = NULL;
   1980  1.1  christos 
   1981  1.1  christos 	REQUIRE(QPITER_VALID(qpi));
   1982  1.1  christos 
   1983  1.1  christos 	node = qpi->stack[qpi->sp];
   1984  1.1  christos 	if (node == NULL || is_branch(node)) {
   1985  1.1  christos 		return ISC_R_FAILURE;
   1986  1.1  christos 	}
   1987  1.1  christos 
   1988  1.1  christos 	SET_IF_NOT_NULL(pval_r, leaf_pval(node));
   1989  1.1  christos 	SET_IF_NOT_NULL(ival_r, leaf_ival(node));
   1990  1.1  christos 	maybe_set_name(qpi->qp, node, name);
   1991  1.1  christos 	return ISC_R_SUCCESS;
   1992  1.1  christos }
   1993  1.1  christos 
   1994  1.1  christos /***********************************************************************
   1995  1.1  christos  *
   1996  1.1  christos  *  search
   1997  1.1  christos  */
   1998  1.1  christos 
   1999  1.1  christos isc_result_t
   2000  1.1  christos dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key,
   2001  1.1  christos 	      size_t search_keylen, void **pval_r, uint32_t *ival_r) {
   2002  1.1  christos 	dns_qpreader_t *qp = dns_qpreader(qpr);
   2003  1.1  christos 	dns_qpkey_t found_key;
   2004  1.1  christos 	size_t found_keylen;
   2005  1.1  christos 	dns_qpshift_t bit;
   2006  1.1  christos 	dns_qpnode_t *n = NULL;
   2007  1.1  christos 
   2008  1.1  christos 	REQUIRE(QP_VALID(qp));
   2009  1.1  christos 	REQUIRE(search_keylen < sizeof(dns_qpkey_t));
   2010  1.1  christos 
   2011  1.1  christos 	n = get_root(qp);
   2012  1.1  christos 	if (n == NULL) {
   2013  1.1  christos 		return ISC_R_NOTFOUND;
   2014  1.1  christos 	}
   2015  1.1  christos 
   2016  1.1  christos 	while (is_branch(n)) {
   2017  1.1  christos 		prefetch_twigs(qp, n);
   2018  1.1  christos 		bit = branch_keybit(n, search_key, search_keylen);
   2019  1.1  christos 		if (!branch_has_twig(n, bit)) {
   2020  1.1  christos 			return ISC_R_NOTFOUND;
   2021  1.1  christos 		}
   2022  1.1  christos 		n = branch_twig_ptr(qp, n, bit);
   2023  1.1  christos 	}
   2024  1.1  christos 
   2025  1.1  christos 	found_keylen = leaf_qpkey(qp, n, found_key);
   2026  1.1  christos 	if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) !=
   2027  1.1  christos 	    QPKEY_EQUAL)
   2028  1.1  christos 	{
   2029  1.1  christos 		return ISC_R_NOTFOUND;
   2030  1.1  christos 	}
   2031  1.1  christos 
   2032  1.1  christos 	SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   2033  1.1  christos 	SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   2034  1.1  christos 	return ISC_R_SUCCESS;
   2035  1.1  christos }
   2036  1.1  christos 
   2037  1.1  christos isc_result_t
   2038  1.1  christos dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r,
   2039  1.1  christos 	       uint32_t *ival_r) {
   2040  1.1  christos 	dns_qpkey_t key;
   2041  1.1  christos 	size_t keylen = dns_qpkey_fromname(key, name);
   2042  1.1  christos 	return dns_qp_getkey(qpr, key, keylen, pval_r, ival_r);
   2043  1.1  christos }
   2044  1.1  christos 
   2045  1.1  christos static inline void
   2046  1.1  christos add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) {
   2047  1.1  christos 	/* prevent duplication */
   2048  1.1  christos 	if (chain->len != 0 && chain->chain[chain->len - 1].node == node) {
   2049  1.1  christos 		return;
   2050  1.1  christos 	}
   2051  1.1  christos 	chain->chain[chain->len].node = node;
   2052  1.1  christos 	chain->chain[chain->len].offset = offset;
   2053  1.1  christos 	chain->len++;
   2054  1.1  christos 	INSIST(chain->len <= DNS_NAME_MAXLABELS);
   2055  1.1  christos }
   2056  1.1  christos 
   2057  1.1  christos static inline void
   2058  1.1  christos prevleaf(dns_qpiter_t *it) {
   2059  1.1  christos 	isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL);
   2060  1.1  christos 	if (result == ISC_R_NOMORE) {
   2061  1.1  christos 		result = dns_qpiter_prev(it, NULL, NULL, NULL);
   2062  1.1  christos 	}
   2063  1.1  christos 	RUNTIME_CHECK(result == ISC_R_SUCCESS);
   2064  1.1  christos }
   2065  1.1  christos 
   2066  1.1  christos static inline void
   2067  1.1  christos greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) {
   2068  1.1  christos 	while (is_branch(n)) {
   2069  1.1  christos 		dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) -
   2070  1.1  christos 				  1;
   2071  1.1  christos 		iter->stack[++iter->sp] = n;
   2072  1.1  christos 		n = ref_ptr(qpr, ref);
   2073  1.1  christos 	}
   2074  1.1  christos 	iter->stack[++iter->sp] = n;
   2075  1.1  christos }
   2076  1.1  christos 
   2077  1.1  christos static inline dns_qpnode_t *
   2078  1.1  christos anyleaf(dns_qpreader_t *qp, dns_qpnode_t *n) {
   2079  1.1  christos 	while (is_branch(n)) {
   2080  1.1  christos 		n = branch_twigs(qp, n);
   2081  1.1  christos 	}
   2082  1.1  christos 	return n;
   2083  1.1  christos }
   2084  1.1  christos 
   2085  1.1  christos static inline int
   2086  1.1  christos twig_offset(dns_qpnode_t *n, dns_qpshift_t sbit, dns_qpshift_t kbit,
   2087  1.1  christos 	    dns_qpshift_t fbit) {
   2088  1.1  christos 	dns_qpweight_t pos = branch_twig_pos(n, sbit);
   2089  1.1  christos 	if (branch_has_twig(n, sbit)) {
   2090  1.1  christos 		return pos - (kbit < fbit);
   2091  1.1  christos 	}
   2092  1.1  christos 	return pos - 1;
   2093  1.1  christos }
   2094  1.1  christos 
   2095  1.1  christos /*
   2096  1.1  christos  * If dns_qp_lookup() was passed an iterator, we want it to point at the
   2097  1.1  christos  * matching name in the case of an exact match, or at the predecessor name
   2098  1.1  christos  * for a non-exact match.
   2099  1.1  christos  *
   2100  1.1  christos  * If there is an exact match, then there is nothing to be done. Otherwise,
   2101  1.1  christos  * we pop up the iterator stack until we find a parent branch with an offset
   2102  1.1  christos  * that is before the position where the search key differs from the found key.
   2103  1.1  christos  * From there we can step to the leaf that is the predecessor of the searched
   2104  1.1  christos  * name.
   2105  1.1  christos  *
   2106  1.1  christos  * Requires the iterator to be pointing at a leaf node.
   2107  1.1  christos  */
   2108  1.1  christos static void
   2109  1.1  christos fix_iterator(dns_qpreader_t *qp, dns_qpiter_t *it, dns_qpkey_t key,
   2110  1.1  christos 	     size_t len) {
   2111  1.1  christos 	dns_qpnode_t *n = it->stack[it->sp];
   2112  1.1  christos 
   2113  1.1  christos 	REQUIRE(!is_branch(n));
   2114  1.1  christos 
   2115  1.1  christos 	dns_qpkey_t found;
   2116  1.1  christos 	size_t foundlen = leaf_qpkey(qp, n, found);
   2117  1.1  christos 	size_t to = qpkey_compare(key, len, found, foundlen);
   2118  1.1  christos 
   2119  1.1  christos 	/* If the keys are equal, the iterator is already at the right node. */
   2120  1.1  christos 	if (to == QPKEY_EQUAL) {
   2121  1.1  christos 		return;
   2122  1.1  christos 	}
   2123  1.1  christos 
   2124  1.1  christos 	/*
   2125  1.1  christos 	 * Special case: if the key differs even before the root
   2126  1.1  christos 	 * key offset, it means the name desired either precedes or
   2127  1.1  christos 	 * follows the entire range of names in the database, and
   2128  1.1  christos 	 * popping up the stack won't help us, so just move the
   2129  1.1  christos 	 * iterator one step back from the origin and return.
   2130  1.1  christos 	 */
   2131  1.1  christos 	if (to < branch_key_offset(it->stack[0])) {
   2132  1.1  christos 		dns_qpiter_init(qp, it);
   2133  1.1  christos 		prevleaf(it);
   2134  1.1  christos 		return;
   2135  1.1  christos 	}
   2136  1.1  christos 
   2137  1.1  christos 	/*
   2138  1.1  christos 	 * As long as the branch offset point is after the point where the
   2139  1.1  christos 	 * key differs, we need to branch up and find a better node.
   2140  1.1  christos 	 */
   2141  1.1  christos 	while (it->sp > 0) {
   2142  1.1  christos 		dns_qpnode_t *b = it->stack[it->sp - 1];
   2143  1.1  christos 		if (branch_key_offset(b) < to) {
   2144  1.1  christos 			break;
   2145  1.1  christos 		}
   2146  1.1  christos 		it->sp--;
   2147  1.1  christos 	}
   2148  1.1  christos 	n = it->stack[it->sp];
   2149  1.1  christos 
   2150  1.1  christos 	/*
   2151  1.1  christos 	 * Either we are now at the correct branch, or we are at the
   2152  1.1  christos 	 * first unmatched node. Determine the bit position for the
   2153  1.1  christos 	 * twig we need (sbit).
   2154  1.1  christos 	 */
   2155  1.1  christos 	dns_qpshift_t kbit = qpkey_bit(key, len, to);
   2156  1.1  christos 	dns_qpshift_t fbit = qpkey_bit(found, foundlen, to);
   2157  1.1  christos 	dns_qpshift_t sbit = 0;
   2158  1.1  christos 
   2159  1.1  christos 	if (is_branch(n) && branch_key_offset(n) == to) {
   2160  1.1  christos 		/* We are on the correct branch now. */
   2161  1.1  christos 		sbit = kbit;
   2162  1.1  christos 	} else if (it->sp == 0) {
   2163  1.1  christos 		/*
   2164  1.1  christos 		 * We are on the root branch, popping up the stack won't
   2165  1.1  christos 		 * help us, so just move the iterator one step back from the
   2166  1.1  christos 		 * origin and return.
   2167  1.1  christos 		 */
   2168  1.1  christos 		dns_qpiter_init(qp, it);
   2169  1.1  christos 		prevleaf(it);
   2170  1.1  christos 		return;
   2171  1.1  christos 	} else {
   2172  1.1  christos 		/* We are at the first unmatched node, pop up the stack. */
   2173  1.1  christos 		n = it->stack[--it->sp];
   2174  1.1  christos 		sbit = qpkey_bit(key, len, branch_key_offset(n));
   2175  1.1  christos 	}
   2176  1.1  christos 
   2177  1.1  christos 	INSIST(is_branch(n));
   2178  1.1  christos 
   2179  1.1  christos 	prefetch_twigs(qp, n);
   2180  1.1  christos 	dns_qpnode_t *twigs = branch_twigs(qp, n);
   2181  1.1  christos 	int toff = twig_offset(n, sbit, kbit, fbit);
   2182  1.1  christos 	if (toff >= 0) {
   2183  1.1  christos 		/*
   2184  1.1  christos 		 * The name we want would've been after some twig in
   2185  1.1  christos 		 * this branch. Walk down from that twig to the
   2186  1.1  christos 		 * highest leaf in its subtree to get the predecessor.
   2187  1.1  christos 		 */
   2188  1.1  christos 		greatest_leaf(qp, twigs + toff, it);
   2189  1.1  christos 	} else {
   2190  1.1  christos 		/*
   2191  1.1  christos 		 * Every leaf below this node is greater than the one we
   2192  1.1  christos 		 * wanted, so the previous leaf is the predecessor.
   2193  1.1  christos 		 */
   2194  1.1  christos 		prevleaf(it);
   2195  1.1  christos 	}
   2196  1.1  christos }
   2197  1.1  christos 
   2198  1.1  christos /*
   2199  1.1  christos  * When searching for a requested name in dns_qp_lookup(), we might add
   2200  1.1  christos  * a leaf node to the chain, then subsequently determine that it was a
   2201  1.1  christos  * dead end. When this happens, the chain can be left holding a node
   2202  1.1  christos  * that is *not* an ancestor of the requested name. We correct for that
   2203  1.1  christos  * here.
   2204  1.1  christos  */
   2205  1.1  christos static void
   2206  1.1  christos fix_chain(dns_qpchain_t *chain, size_t offset) {
   2207  1.1  christos 	while (chain->len > 0 && chain->chain[chain->len - 1].offset >= offset)
   2208  1.1  christos 	{
   2209  1.1  christos 		chain->len--;
   2210  1.1  christos 		chain->chain[chain->len].node = NULL;
   2211  1.1  christos 		chain->chain[chain->len].offset = 0;
   2212  1.1  christos 	}
   2213  1.1  christos }
   2214  1.1  christos 
   2215  1.1  christos isc_result_t
   2216  1.1  christos dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name,
   2217  1.1  christos 	      dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain,
   2218  1.1  christos 	      void **pval_r, uint32_t *ival_r) {
   2219  1.1  christos 	dns_qpreader_t *qp = dns_qpreader(qpr);
   2220  1.1  christos 	dns_qpkey_t search, found;
   2221  1.1  christos 	size_t searchlen, foundlen;
   2222  1.1  christos 	size_t offset = 0;
   2223  1.1  christos 	dns_qpnode_t *n = NULL;
   2224  1.1  christos 	dns_qpshift_t bit = SHIFT_NOBYTE;
   2225  1.1  christos 	dns_qpchain_t oc;
   2226  1.1  christos 	dns_qpiter_t it;
   2227  1.1  christos 	bool matched = false;
   2228  1.1  christos 	bool setiter = true;
   2229  1.1  christos 
   2230  1.1  christos 	REQUIRE(QP_VALID(qp));
   2231  1.1  christos 	REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC));
   2232  1.1  christos 
   2233  1.1  christos 	searchlen = dns_qpkey_fromname(search, name);
   2234  1.1  christos 
   2235  1.1  christos 	if (chain == NULL) {
   2236  1.1  christos 		chain = &oc;
   2237  1.1  christos 	}
   2238  1.1  christos 	if (iter == NULL) {
   2239  1.1  christos 		iter = &it;
   2240  1.1  christos 		setiter = false;
   2241  1.1  christos 	}
   2242  1.1  christos 	dns_qpchain_init(qp, chain);
   2243  1.1  christos 	dns_qpiter_init(qp, iter);
   2244  1.1  christos 
   2245  1.1  christos 	n = get_root(qp);
   2246  1.1  christos 	if (n == NULL) {
   2247  1.1  christos 		return ISC_R_NOTFOUND;
   2248  1.1  christos 	}
   2249  1.1  christos 	iter->stack[0] = n;
   2250  1.1  christos 
   2251  1.1  christos 	/*
   2252  1.1  christos 	 * Like `dns_qp_insert()`, we must find a leaf. However, we don't make a
   2253  1.1  christos 	 * second pass: instead, we keep track of any leaves with shorter keys
   2254  1.1  christos 	 * that we discover along the way. (In general, qp-trie searches can be
   2255  1.1  christos 	 * one-pass, by recording their traversal, or two-pass, for less stack
   2256  1.1  christos 	 * memory usage.)
   2257  1.1  christos 	 */
   2258  1.1  christos 	while (is_branch(n)) {
   2259  1.1  christos 		prefetch_twigs(qp, n);
   2260  1.1  christos 
   2261  1.1  christos 		offset = branch_key_offset(n);
   2262  1.1  christos 		bit = qpkey_bit(search, searchlen, offset);
   2263  1.1  christos 		dns_qpnode_t *twigs = branch_twigs(qp, n);
   2264  1.1  christos 
   2265  1.1  christos 		/*
   2266  1.1  christos 		 * A shorter key that can be a parent domain always has a
   2267  1.1  christos 		 * leaf node at SHIFT_NOBYTE (indicating end of its key)
   2268  1.1  christos 		 * where our search key has a normal character immediately
   2269  1.1  christos 		 * after a label separator.
   2270  1.1  christos 		 *
   2271  1.1  christos 		 * Note 1: It is OK if `off - 1` underflows: it will
   2272  1.1  christos 		 * become SIZE_MAX, which is greater than `searchlen`, so
   2273  1.1  christos 		 * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we
   2274  1.1  christos 		 * want when `off == 0`.
   2275  1.1  christos 		 *
   2276  1.1  christos 		 * Note 2: If SHIFT_NOBYTE twig is present, it will always
   2277  1.1  christos 		 * be in position 0, the first location in 'twigs'.
   2278  1.1  christos 		 */
   2279  1.1  christos 		if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) &&
   2280  1.1  christos 		    qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE &&
   2281  1.1  christos 		    !is_branch(twigs))
   2282  1.1  christos 		{
   2283  1.1  christos 			add_link(chain, twigs, offset);
   2284  1.1  christos 		}
   2285  1.1  christos 
   2286  1.1  christos 		matched = branch_has_twig(n, bit);
   2287  1.1  christos 		if (matched) {
   2288  1.1  christos 			/*
   2289  1.1  christos 			 * found a match: if it's a branch, we keep
   2290  1.1  christos 			 * searching, and if it's a leaf, we drop out of
   2291  1.1  christos 			 * the loop.
   2292  1.1  christos 			 */
   2293  1.1  christos 			n = branch_twig_ptr(qp, n, bit);
   2294  1.1  christos 		} else {
   2295  1.1  christos 			/*
   2296  1.1  christos 			 * this branch is a dead end, and the predecessor
   2297  1.1  christos 			 * doesn't matter. now we just need to find a leaf
   2298  1.1  christos 			 * to end on so that qpkey_leaf() will work below.
   2299  1.1  christos 			 */
   2300  1.1  christos 			n = anyleaf(qp, twigs);
   2301  1.1  christos 		}
   2302  1.1  christos 
   2303  1.1  christos 		iter->stack[++iter->sp] = n;
   2304  1.1  christos 	}
   2305  1.1  christos 
   2306  1.1  christos 	if (setiter) {
   2307  1.1  christos 		/*
   2308  1.1  christos 		 * we found a leaf, but it might not be the leaf we wanted.
   2309  1.1  christos 		 * if it isn't, and if the caller passed us an iterator,
   2310  1.1  christos 		 * then we might need to reposition it.
   2311  1.1  christos 		 */
   2312  1.1  christos 		fix_iterator(qp, iter, search, searchlen);
   2313  1.1  christos 		n = iter->stack[iter->sp];
   2314  1.1  christos 	}
   2315  1.1  christos 
   2316  1.1  christos 	/* at this point, n can only be a leaf node */
   2317  1.1  christos 	INSIST(!is_branch(n));
   2318  1.1  christos 
   2319  1.1  christos 	foundlen = leaf_qpkey(qp, n, found);
   2320  1.1  christos 	offset = qpkey_compare(search, searchlen, found, foundlen);
   2321  1.1  christos 
   2322  1.1  christos 	/* the search ended with an exact or partial match */
   2323  1.1  christos 	if (offset == QPKEY_EQUAL || offset == foundlen) {
   2324  1.1  christos 		isc_result_t result = ISC_R_SUCCESS;
   2325  1.1  christos 
   2326  1.1  christos 		if (offset == foundlen) {
   2327  1.1  christos 			fix_chain(chain, offset);
   2328  1.1  christos 			result = DNS_R_PARTIALMATCH;
   2329  1.1  christos 		}
   2330  1.1  christos 		add_link(chain, n, offset);
   2331  1.1  christos 
   2332  1.1  christos 		SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   2333  1.1  christos 		SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   2334  1.1  christos 		maybe_set_name(qp, n, foundname);
   2335  1.1  christos 		return result;
   2336  1.1  christos 	}
   2337  1.1  christos 
   2338  1.1  christos 	/*
   2339  1.1  christos 	 * the requested name was not found, but if an ancestor
   2340  1.1  christos 	 * was, we can retrieve that from the chain.
   2341  1.1  christos 	 */
   2342  1.1  christos 	int len = chain->len;
   2343  1.1  christos 	while (len-- > 0) {
   2344  1.1  christos 		if (offset >= chain->chain[len].offset) {
   2345  1.1  christos 			n = chain->chain[len].node;
   2346  1.1  christos 			SET_IF_NOT_NULL(pval_r, leaf_pval(n));
   2347  1.1  christos 			SET_IF_NOT_NULL(ival_r, leaf_ival(n));
   2348  1.1  christos 			maybe_set_name(qp, n, foundname);
   2349  1.1  christos 			return DNS_R_PARTIALMATCH;
   2350  1.1  christos 		} else {
   2351  1.1  christos 			/*
   2352  1.1  christos 			 * oops, during the search we found and added
   2353  1.1  christos 			 * a leaf that's longer than the requested
   2354  1.1  christos 			 * name; remove it from the chain.
   2355  1.1  christos 			 */
   2356  1.1  christos 			chain->len--;
   2357  1.1  christos 		}
   2358  1.1  christos 	}
   2359  1.1  christos 
   2360  1.1  christos 	/* nothing was found at all */
   2361  1.1  christos 	return ISC_R_NOTFOUND;
   2362  1.1  christos }
   2363  1.1  christos 
   2364  1.1  christos /**********************************************************************/
   2365