1 /* $NetBSD: qp.c,v 1.5 2026/01/29 18:37:49 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 /* 17 * For an overview, see doc/design/qp-trie.md 18 */ 19 20 #include <inttypes.h> 21 #include <stdbool.h> 22 #include <stddef.h> 23 #include <stdint.h> 24 #include <string.h> 25 26 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 27 #include <sys/mman.h> 28 #include <unistd.h> 29 #endif 30 31 #include <isc/atomic.h> 32 #include <isc/buffer.h> 33 #include <isc/magic.h> 34 #include <isc/mem.h> 35 #include <isc/mutex.h> 36 #include <isc/refcount.h> 37 #include <isc/result.h> 38 #include <isc/rwlock.h> 39 #include <isc/tid.h> 40 #include <isc/time.h> 41 #include <isc/types.h> 42 #include <isc/urcu.h> 43 #include <isc/util.h> 44 45 #include <dns/fixedname.h> 46 #include <dns/log.h> 47 #include <dns/name.h> 48 #include <dns/qp.h> 49 #include <dns/types.h> 50 51 #include "qp_p.h" 52 53 #ifndef DNS_QP_LOG_STATS 54 #define DNS_QP_LOG_STATS 0 55 #endif 56 #ifndef DNS_QP_TRACE 57 #define DNS_QP_TRACE 0 58 #endif 59 60 /* 61 * very basic garbage collector statistics 62 * 63 * XXXFANF for now we're logging GC times, but ideally we should 64 * accumulate stats more quietly and report via the statschannel 65 */ 66 #ifdef _LP64 67 static atomic_uint_fast64_t compact_time; 68 static atomic_uint_fast64_t recycle_time; 69 static atomic_uint_fast64_t rollback_time; 70 #define ISC_QP_ADD(v, a) atomic_fetch_add_relaxed(&(v), (a)) 71 #define ISC_QP_GET(v) atomic_load_relaxed(&(v)) 72 #else 73 static uint64_t compact_time; 74 static uint64_t recycle_time; 75 static uint64_t rollback_time; 76 static isc_mutex_t qp_mutex = PTHREAD_MUTEX_INITIALIZER; 77 #define ISC_QP_ADD(v, a) \ 78 ({ \ 79 isc_mutex_lock(&qp_mutex); \ 80 uint64_t x = (v) + (a); \ 81 isc_mutex_unlock(&qp_mutex); \ 82 x; \ 83 }) 84 #define ISC_QP_GET(v) \ 85 ({ \ 86 isc_mutex_lock(&qp_mutex); \ 87 uint64_t x = (v); \ 88 isc_mutex_unlock(&qp_mutex); \ 89 x; \ 90 }) 91 #endif 92 93 94 /* for LOG_STATS() format strings */ 95 #define PRItime " %" PRIu64 " ns " 96 97 #if DNS_QP_LOG_STATS 98 #define LOG_STATS(...) \ 99 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_QP, \ 100 ISC_LOG_DEBUG(1), __VA_ARGS__) 101 #else 102 #define LOG_STATS(...) 103 #endif 104 105 #if DNS_QP_TRACE 106 /* 107 * TRACE is generally used in allocation-related functions so it doesn't 108 * trace very high-frequency ops 109 */ 110 #define TRACE(fmt, ...) \ 111 do { \ 112 if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(7))) { \ 113 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, \ 114 DNS_LOGMODULE_QP, ISC_LOG_DEBUG(7), \ 115 "%s:%d:%s(qp %p uctx \"%s\"):t%u: " fmt, \ 116 __FILE__, __LINE__, __func__, qp, \ 117 qp ? TRIENAME(qp) : "(null)", isc_tid(), \ 118 ##__VA_ARGS__); \ 119 } \ 120 } while (0) 121 #else 122 #define TRACE(...) 123 #endif 124 125 #if DNS_QPMULTI_TRACE 126 ISC_REFCOUNT_STATIC_TRACE_DECL(dns_qpmulti); 127 #define dns_qpmulti_ref(ptr) dns_qpmulti__ref(ptr, __func__, __FILE__, __LINE__) 128 #define dns_qpmulti_unref(ptr) \ 129 dns_qpmulti__unref(ptr, __func__, __FILE__, __LINE__) 130 #define dns_qpmulti_attach(ptr, ptrp) \ 131 dns_qpmulti__attach(ptr, ptrp, __func__, __FILE__, __LINE__) 132 #define dns_qpmulti_detach(ptrp) \ 133 dns_qpmulti__detach(ptrp, __func__, __FILE__, __LINE__) 134 #else 135 ISC_REFCOUNT_STATIC_DECL(dns_qpmulti); 136 #endif 137 138 /*********************************************************************** 139 * 140 * converting DNS names to trie keys 141 */ 142 143 /* 144 * Number of distinct byte values, i.e. 256 145 */ 146 #define BYTE_VALUES (UINT8_MAX + 1) 147 148 /* 149 * Lookup table mapping bytes in DNS names to bit positions, used 150 * by dns_qpkey_fromname() to convert DNS names to qp-trie keys. 151 * 152 * Each element holds one or two bit positions, bit_one in the 153 * lower half and bit_two in the upper half. 154 * 155 * For common hostname characters, bit_two is zero (which cannot 156 * be a valid bit position). 157 * 158 * For others, bit_one is the escape bit, and bit_two is the 159 * position of the character within the escaped range. 160 */ 161 uint16_t dns_qp_bits_for_byte[BYTE_VALUES] = { 0 }; 162 163 /* 164 * And the reverse, mapping bit positions to characters, so the tests 165 * can print diagnostics involving qp-trie keys. 166 * 167 * This table only handles the first bit in an escape sequence; we 168 * arrange that we can calculate the byte value for both bits by 169 * adding the the second bit to the first bit's byte value. 170 */ 171 uint8_t dns_qp_byte_for_bit[SHIFT_OFFSET] = { 0 }; 172 173 /* 174 * Fill in the lookup tables at program startup. (It doesn't matter 175 * when this is initialized relative to other startup code.) 176 */ 177 static void 178 initialize_bits_for_byte(void) ISC_CONSTRUCTOR; 179 180 /* 181 * The bit positions for bytes inside labels have to be between 182 * SHIFT_BITMAP and SHIFT_OFFSET. (SHIFT_NOBYTE separates labels.) 183 * 184 * Each byte range in between common hostname characters has a different 185 * escape character, to preserve the correct lexical order. 186 * 187 * Escaped byte ranges mostly fit into the space available in the 188 * bitmap, except for those above 'z' (which is mostly bytes with the 189 * top bit set). So, when we reach the end of the bitmap we roll over 190 * to the next escape character. 191 * 192 * After filling the table we ensure that the bit positions for 193 * hostname characters and escape characters all fit. 194 */ 195 static void 196 initialize_bits_for_byte(void) { 197 /* zero common character marker not a valid shift position */ 198 INSIST(0 < SHIFT_BITMAP); 199 /* first bit is common byte or escape byte */ 200 dns_qpshift_t bit_one = SHIFT_BITMAP; 201 /* second bit is position in escaped range */ 202 dns_qpshift_t bit_two = SHIFT_BITMAP; 203 bool escaping = true; 204 205 for (unsigned int byte = 0; byte < BYTE_VALUES; byte++) { 206 if (qp_common_character(byte)) { 207 escaping = false; 208 bit_one++; 209 dns_qp_byte_for_bit[bit_one] = byte; 210 dns_qp_bits_for_byte[byte] = bit_one; 211 } else if ('A' <= byte && byte <= 'Z') { 212 /* map upper case to lower case */ 213 dns_qpshift_t after_esc = bit_one + 1; 214 dns_qpshift_t skip_punct = 'a' - '_'; 215 dns_qpshift_t letter = byte - 'A'; 216 dns_qpshift_t bit = after_esc + skip_punct + letter; 217 dns_qp_bits_for_byte[byte] = bit; 218 /* to simplify reverse conversion */ 219 bit_two++; 220 } else { 221 /* non-hostname characters need to be escaped */ 222 if (!escaping || bit_two >= SHIFT_OFFSET) { 223 escaping = true; 224 bit_one++; 225 dns_qp_byte_for_bit[bit_one] = byte; 226 bit_two = SHIFT_BITMAP; 227 } 228 dns_qp_bits_for_byte[byte] = bit_two << 8 | bit_one; 229 bit_two++; 230 } 231 } 232 ENSURE(bit_one < SHIFT_OFFSET); 233 } 234 235 /* 236 * Convert a DNS name into a trie lookup key. 237 * 238 * Returns the length of the key. 239 * 240 * For performance we get our hands dirty in the guts of the name. 241 * 242 * We don't worry about the distinction between absolute and relative 243 * names. When the trie is only used with absolute names, the first byte 244 * of the key will always be SHIFT_NOBYTE and it will always be skipped 245 * when traversing the trie. So keeping the root label costs little, and 246 * it allows us to support tries of relative names too. In fact absolute 247 * and relative names can be mixed in the same trie without causing 248 * confusion, because the presence or absence of the initial 249 * SHIFT_NOBYTE in the key disambiguates them (exactly like a trailing 250 * dot in a zone file). 251 */ 252 size_t 253 dns_qpkey_fromname(dns_qpkey_t key, const dns_name_t *name) { 254 size_t len, label; 255 dns_fixedname_t fixed; 256 257 REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 258 259 if (name->labels == 0) { 260 key[0] = SHIFT_NOBYTE; 261 return 0; 262 } 263 264 if (name->offsets == NULL) { 265 dns_name_t *clone = dns_fixedname_initname(&fixed); 266 dns_name_clone(name, clone); 267 name = clone; 268 } 269 270 len = 0; 271 label = name->labels; 272 while (label-- > 0) { 273 const uint8_t *ldata = name->ndata + name->offsets[label]; 274 size_t label_len = *ldata++; 275 while (label_len-- > 0) { 276 uint16_t bits = dns_qp_bits_for_byte[*ldata++]; 277 key[len++] = bits & 0xFF; /* bit_one */ 278 if ((bits >> 8) != 0) { /* escape? */ 279 key[len++] = bits >> 8; /* bit_two */ 280 } 281 } 282 /* label terminator */ 283 key[len++] = SHIFT_NOBYTE; 284 } 285 /* mark end with a double NOBYTE */ 286 key[len] = SHIFT_NOBYTE; 287 ENSURE(len < sizeof(dns_qpkey_t)); 288 return len; 289 } 290 291 void 292 dns_qpkey_toname(const dns_qpkey_t key, size_t keylen, dns_name_t *name) { 293 size_t locs[DNS_NAME_MAXLABELS]; 294 size_t loc = 0, opos = 0; 295 size_t offset; 296 297 REQUIRE(ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 298 REQUIRE(name->buffer != NULL); 299 REQUIRE(name->offsets != NULL); 300 301 dns_name_reset(name); 302 303 if (keylen == 0) { 304 return; 305 } 306 307 /* Scan the key looking for label boundaries */ 308 for (offset = 0; offset <= keylen; offset++) { 309 INSIST(key[offset] >= SHIFT_NOBYTE && 310 key[offset] < SHIFT_OFFSET); 311 INSIST(loc < DNS_NAME_MAXLABELS); 312 if (qpkey_bit(key, keylen, offset) == SHIFT_NOBYTE) { 313 if (qpkey_bit(key, keylen, offset + 1) == SHIFT_NOBYTE) 314 { 315 locs[loc] = offset + 1; 316 goto scanned; 317 } 318 locs[loc++] = offset + 1; 319 } else if (offset == 0) { 320 /* This happens for a relative name */ 321 locs[loc++] = offset; 322 } 323 } 324 UNREACHABLE(); 325 scanned: 326 327 /* 328 * In the key the labels are encoded in reverse order, so 329 * we step backward through the label boundaries, then forward 330 * through the labels, to create the DNS wire format data. 331 */ 332 name->labels = loc; 333 while (loc-- > 0) { 334 uint8_t len = 0, *lenp = NULL; 335 336 /* Add a length byte to the name data and set an offset */ 337 lenp = isc_buffer_used(name->buffer); 338 isc_buffer_putuint8(name->buffer, 0); 339 name->offsets[opos++] = name->length++; 340 341 /* Convert from escaped byte ranges to ASCII */ 342 for (offset = locs[loc]; offset < locs[loc + 1] - 1; offset++) { 343 uint8_t bit = qpkey_bit(key, keylen, offset); 344 uint8_t byte = dns_qp_byte_for_bit[bit]; 345 if (qp_common_character(byte)) { 346 isc_buffer_putuint8(name->buffer, byte); 347 } else { 348 byte += key[++offset] - SHIFT_BITMAP; 349 isc_buffer_putuint8(name->buffer, byte); 350 } 351 len++; 352 } 353 354 name->length += len; 355 *lenp = len; 356 } 357 358 /* Add a root label for absolute names */ 359 if (key[0] == SHIFT_NOBYTE) { 360 name->attributes.absolute = true; 361 isc_buffer_putuint8(name->buffer, 0); 362 name->offsets[opos++] = name->length++; 363 name->labels++; 364 } 365 366 name->ndata = isc_buffer_base(name->buffer); 367 } 368 369 /* 370 * Sentinel value for equal keys 371 */ 372 #define QPKEY_EQUAL (~(size_t)0) 373 374 /* 375 * Compare two keys and return the offset where they differ. 376 * 377 * This offset is used to work out where a trie search diverged: when one 378 * of the keys is in the trie and one is not, the common prefix (up to the 379 * offset) is the part of the unknown key that exists in the trie. This 380 * matters for adding new keys or finding neighbours of missing keys. 381 * 382 * When the keys are different lengths it is possible (but unwise) for 383 * the longer key to be the same as the shorter key but with superfluous 384 * trailing SHIFT_NOBYTE elements. This makes the keys equal for the 385 * purpose of traversing the trie. 386 */ 387 static size_t 388 qpkey_compare(const dns_qpkey_t key_a, const size_t keylen_a, 389 const dns_qpkey_t key_b, const size_t keylen_b) { 390 size_t keylen = ISC_MAX(keylen_a, keylen_b); 391 for (size_t offset = 0; offset < keylen; offset++) { 392 if (qpkey_bit(key_a, keylen_a, offset) != 393 qpkey_bit(key_b, keylen_b, offset)) 394 { 395 return offset; 396 } 397 } 398 return QPKEY_EQUAL; 399 } 400 401 /*********************************************************************** 402 * 403 * allocator wrappers 404 */ 405 406 #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 407 408 /* 409 * Optionally (for debugging) during a copy-on-write transaction, use 410 * memory protection to ensure that the shared chunks are not modified. 411 * Once a chunk becomes shared, it remains read-only until it is freed. 412 * POSIX says we have to use mmap() to get an allocation that we can 413 * definitely pass to mprotect(). 414 */ 415 416 static size_t 417 chunk_size_raw(void) { 418 size_t size = (size_t)sysconf(_SC_PAGE_SIZE); 419 return ISC_MAX(size, QP_CHUNK_BYTES); 420 } 421 422 static void * 423 chunk_get_raw(dns_qp_t *qp) { 424 if (qp->write_protect) { 425 size_t size = chunk_size_raw(); 426 void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, 427 MAP_ANON | MAP_PRIVATE, -1, 0); 428 RUNTIME_CHECK(ptr != MAP_FAILED); 429 return ptr; 430 } else { 431 return isc_mem_allocate(qp->mctx, QP_CHUNK_BYTES); 432 } 433 } 434 435 static void 436 chunk_free_raw(dns_qp_t *qp, void *ptr) { 437 if (qp->write_protect) { 438 RUNTIME_CHECK(munmap(ptr, chunk_size_raw()) == 0); 439 } else { 440 isc_mem_free(qp->mctx, ptr); 441 } 442 } 443 444 static void * 445 chunk_shrink_raw(dns_qp_t *qp, void *ptr, size_t bytes) { 446 if (qp->write_protect) { 447 return ptr; 448 } else { 449 return isc_mem_reallocate(qp->mctx, ptr, bytes); 450 } 451 } 452 453 static void 454 write_protect(dns_qp_t *qp, dns_qpchunk_t chunk) { 455 if (qp->write_protect) { 456 /* see transaction_open() wrt this special case */ 457 if (qp->transaction_mode == QP_WRITE && chunk == qp->bump) { 458 return; 459 } 460 TRACE("chunk %u", chunk); 461 void *ptr = qp->base->ptr[chunk]; 462 size_t size = chunk_size_raw(); 463 RUNTIME_CHECK(mprotect(ptr, size, PROT_READ) >= 0); 464 } 465 } 466 467 #else 468 469 #define chunk_get_raw(qp, size) isc_mem_allocate(qp->mctx, size) 470 #define chunk_free_raw(qp, ptr) isc_mem_free(qp->mctx, ptr) 471 472 #define chunk_shrink_raw(qp, ptr, size) isc_mem_reallocate(qp->mctx, ptr, size) 473 474 #define write_protect(qp, chunk) 475 476 #endif 477 478 /*********************************************************************** 479 * 480 * allocator 481 */ 482 483 /* 484 * When we reuse the bump chunk across multiple write transactions, 485 * it can have an immutable prefix and a mutable suffix. 486 */ 487 static inline bool 488 cells_immutable(dns_qp_t *qp, dns_qpref_t ref) { 489 dns_qpchunk_t chunk = ref_chunk(ref); 490 dns_qpcell_t cell = ref_cell(ref); 491 if (chunk == qp->bump) { 492 return cell < qp->fender; 493 } else { 494 return qp->usage[chunk].immutable; 495 } 496 } 497 498 /* 499 * Find the next power that is both bigger than size and prev_capacity, 500 * but still within the chunk min and max sizes. 501 */ 502 static dns_qpcell_t 503 next_capacity(uint32_t prev_capacity, uint32_t size) { 504 /* 505 * Unfortunately builtin_clz is undefined for 0. We work around this 506 * issue by flooring the request size at 2. 507 */ 508 size = ISC_MAX3(size, prev_capacity, 2u); 509 uint32_t log2 = 32u - __builtin_clz(size - 1u); 510 511 return 1U << ISC_CLAMP(log2, QP_CHUNK_LOG_MIN, QP_CHUNK_LOG_MAX); 512 } 513 514 /* 515 * Create a fresh bump chunk and allocate some twigs from it. 516 */ 517 static dns_qpref_t 518 chunk_alloc(dns_qp_t *qp, dns_qpchunk_t chunk, dns_qpweight_t size) { 519 INSIST(qp->base->ptr[chunk] == NULL); 520 INSIST(qp->usage[chunk].used == 0); 521 INSIST(qp->usage[chunk].free == 0); 522 INSIST(qp->chunk_capacity <= QP_CHUNK_SIZE); 523 524 qp->chunk_capacity = next_capacity(qp->chunk_capacity * 2u, size); 525 qp->base->ptr[chunk] = 526 chunk_get_raw(qp, qp->chunk_capacity * sizeof(dns_qpnode_t)); 527 528 qp->usage[chunk] = (qp_usage_t){ .exists = true, 529 .used = size, 530 .capacity = qp->chunk_capacity }; 531 qp->used_count += size; 532 qp->bump = chunk; 533 qp->fender = 0; 534 535 if (qp->write_protect) { 536 TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); 537 } 538 return make_ref(chunk, 0); 539 } 540 541 /* 542 * This is used to grow the chunk arrays when they fill up. If the old 543 * base array is in use by readers, we must make a clone, otherwise we 544 * can reallocate in place. 545 * 546 * The isc_refcount_init() and qpbase_unref() in this function are a pair. 547 */ 548 static void 549 realloc_chunk_arrays(dns_qp_t *qp, dns_qpchunk_t newmax) { 550 size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max; 551 size_t newptrs = sizeof(qp->base->ptr[0]) * newmax; 552 size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax); 553 554 if (qp->base == NULL || qpbase_unref(qp)) { 555 qp->base = isc_mem_reallocate(qp->mctx, qp->base, size); 556 } else { 557 dns_qpbase_t *oldbase = qp->base; 558 qp->base = isc_mem_allocate(qp->mctx, size); 559 memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs); 560 } 561 memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); 562 isc_refcount_init(&qp->base->refcount, 1); 563 qp->base->magic = QPBASE_MAGIC; 564 565 /* usage array is exclusive to the writer */ 566 size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max; 567 size_t newusage = sizeof(qp->usage[0]) * newmax; 568 qp->usage = isc_mem_reallocate(qp->mctx, qp->usage, newusage); 569 memset(&qp->usage[qp->chunk_max], 0, newusage - oldusage); 570 571 qp->chunk_max = newmax; 572 573 TRACE("qpbase %p usage %p max %u", qp->base, qp->usage, qp->chunk_max); 574 } 575 576 /* 577 * There was no space in the bump chunk, so find a place to put a fresh 578 * chunk in the chunk arrays, then allocate some twigs from it. 579 */ 580 static dns_qpref_t 581 alloc_slow(dns_qp_t *qp, dns_qpweight_t size) { 582 dns_qpchunk_t chunk; 583 584 for (chunk = 0; chunk < qp->chunk_max; chunk++) { 585 if (!qp->usage[chunk].exists) { 586 return chunk_alloc(qp, chunk, size); 587 } 588 } 589 ENSURE(chunk == qp->chunk_max); 590 realloc_chunk_arrays(qp, GROWTH_FACTOR(chunk)); 591 return chunk_alloc(qp, chunk, size); 592 } 593 594 /* 595 * Ensure we are using a fresh bump chunk. 596 */ 597 static void 598 alloc_reset(dns_qp_t *qp) { 599 (void)alloc_slow(qp, 0); 600 } 601 602 /* 603 * Allocate some fresh twigs. This is the bump allocator fast path. 604 */ 605 static inline dns_qpref_t 606 alloc_twigs(dns_qp_t *qp, dns_qpweight_t size) { 607 dns_qpchunk_t chunk = qp->bump; 608 dns_qpcell_t cell = qp->usage[chunk].used; 609 610 if (cell + size <= qp->usage[chunk].capacity) { 611 qp->usage[chunk].used += size; 612 qp->used_count += size; 613 return make_ref(chunk, cell); 614 } else { 615 return alloc_slow(qp, size); 616 } 617 } 618 619 /* 620 * Record that some twigs are no longer being used, and if possible 621 * zero them to ensure that there isn't a spurious double detach when 622 * the chunk is later recycled. 623 * 624 * Returns true if the twigs were immediately destroyed. 625 * 626 * NOTE: the caller is responsible for attaching or detaching any 627 * leaves as required. 628 */ 629 static inline bool 630 free_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) { 631 dns_qpchunk_t chunk = ref_chunk(twigs); 632 633 qp->free_count += size; 634 qp->usage[chunk].free += size; 635 ENSURE(qp->free_count <= qp->used_count); 636 ENSURE(qp->usage[chunk].free <= qp->usage[chunk].used); 637 638 if (cells_immutable(qp, twigs)) { 639 qp->hold_count += size; 640 ENSURE(qp->free_count >= qp->hold_count); 641 return false; 642 } else { 643 zero_twigs(ref_ptr(qp, twigs), size); 644 return true; 645 } 646 } 647 648 /* 649 * When some twigs have been copied, and free_twigs() could not 650 * immediately destroy the old copy, we need to update the refcount 651 * on any leaves that were duplicated. 652 */ 653 static void 654 attach_twigs(dns_qp_t *qp, dns_qpnode_t *twigs, dns_qpweight_t size) { 655 for (dns_qpweight_t pos = 0; pos < size; pos++) { 656 if (node_tag(&twigs[pos]) == LEAF_TAG) { 657 attach_leaf(qp, &twigs[pos]); 658 } 659 } 660 } 661 662 /*********************************************************************** 663 * 664 * chunk reclamation 665 */ 666 667 /* 668 * Is any of this chunk still in use? 669 */ 670 static inline dns_qpcell_t 671 chunk_usage(dns_qp_t *qp, dns_qpchunk_t chunk) { 672 return qp->usage[chunk].used - qp->usage[chunk].free; 673 } 674 675 /* 676 * We remove each empty chunk from the total counts when the chunk is 677 * freed, or when it is scheduled for safe memory reclamation. We check 678 * the chunk's phase to avoid discounting it twice in the latter case. 679 */ 680 static void 681 chunk_discount(dns_qp_t *qp, dns_qpchunk_t chunk) { 682 if (qp->usage[chunk].discounted) { 683 return; 684 } 685 INSIST(qp->used_count >= qp->usage[chunk].used); 686 INSIST(qp->free_count >= qp->usage[chunk].free); 687 qp->used_count -= qp->usage[chunk].used; 688 qp->free_count -= qp->usage[chunk].free; 689 qp->usage[chunk].discounted = true; 690 } 691 692 /* 693 * When a chunk is being recycled, we need to detach any leaves that 694 * remain, and free any `base` arrays that have been marked as unused. 695 */ 696 static void 697 chunk_free(dns_qp_t *qp, dns_qpchunk_t chunk) { 698 if (qp->write_protect) { 699 TRACE("chunk %u base %p", chunk, qp->base->ptr[chunk]); 700 } 701 702 dns_qpnode_t *n = qp->base->ptr[chunk]; 703 for (dns_qpcell_t count = qp->usage[chunk].used; count > 0; 704 count--, n++) 705 { 706 if (node_tag(n) == LEAF_TAG && node_pointer(n) != NULL) { 707 detach_leaf(qp, n); 708 } else if (count > 1 && reader_valid(n)) { 709 dns_qpreader_t qpr; 710 unpack_reader(&qpr, n); 711 /* pairs with dns_qpmulti_commit() */ 712 if (qpbase_unref(&qpr)) { 713 isc_mem_free(qp->mctx, qpr.base); 714 } 715 } 716 } 717 chunk_discount(qp, chunk); 718 chunk_free_raw(qp, qp->base->ptr[chunk]); 719 qp->base->ptr[chunk] = NULL; 720 qp->usage[chunk] = (qp_usage_t){}; 721 } 722 723 /* 724 * Free any chunks that we can while a trie is in use. 725 */ 726 static void 727 recycle(dns_qp_t *qp) { 728 unsigned int nfree = 0; 729 730 isc_nanosecs_t start = isc_time_monotonic(); 731 732 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 733 if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 734 qp->usage[chunk].exists && !qp->usage[chunk].immutable) 735 { 736 chunk_free(qp, chunk); 737 nfree++; 738 } 739 } 740 741 isc_nanosecs_t time = isc_time_monotonic() - start; 742 ISC_QP_ADD(recycle_time, time); 743 744 if (nfree > 0) { 745 LOG_STATS("qp recycle" PRItime "free %u chunks", time, nfree); 746 LOG_STATS("qp recycle leaf %u live %u used %u free %u hold %u", 747 qp->leaf_count, qp->used_count - qp->free_count, 748 qp->used_count, qp->free_count, qp->hold_count); 749 } 750 } 751 752 /* 753 * asynchronous cleanup 754 */ 755 static void 756 reclaim_chunks_cb(struct rcu_head *arg) { 757 qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head); 758 REQUIRE(QPRCU_VALID(rcuctx)); 759 dns_qpmulti_t *multi = rcuctx->multi; 760 REQUIRE(QPMULTI_VALID(multi)); 761 762 LOCK(&multi->mutex); 763 dns_qp_t *qp = &multi->writer; 764 765 /* 766 * If chunk_max is zero, chunks have already been freed. 767 */ 768 if (qp->chunk_max != 0) { 769 unsigned int free = 0; 770 isc_nanosecs_t start = isc_time_monotonic(); 771 772 INSIST(QP_VALID(qp)); 773 774 for (unsigned int i = 0; i < rcuctx->count; i++) { 775 dns_qpchunk_t chunk = rcuctx->chunk[i]; 776 if (qp->usage[chunk].snapshot) { 777 /* clean up when snapshot is destroyed */ 778 qp->usage[chunk].snapfree = true; 779 } else { 780 chunk_free(qp, chunk); 781 free++; 782 } 783 } 784 785 isc_nanosecs_t time = isc_time_monotonic() - start; 786 recycle_time += time; 787 788 if (free > 0) { 789 LOG_STATS("qp reclaim" PRItime "free %u chunks", time, 790 free); 791 LOG_STATS( 792 "qp reclaim leaf %u live %u used %u free %u " 793 "hold %u", 794 qp->leaf_count, qp->used_count - qp->free_count, 795 qp->used_count, qp->free_count, qp->hold_count); 796 } 797 } 798 799 UNLOCK(&multi->mutex); 800 801 dns_qpmulti_detach(&multi); 802 isc_mem_putanddetach(&rcuctx->mctx, rcuctx, 803 STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); 804 } 805 806 /* 807 * At the end of a transaction, schedule empty but immutable chunks 808 * for reclamation later. 809 */ 810 static void 811 reclaim_chunks(dns_qpmulti_t *multi) { 812 dns_qp_t *qp = &multi->writer; 813 814 unsigned int count = 0; 815 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 816 if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 817 qp->usage[chunk].exists && qp->usage[chunk].immutable && 818 !qp->usage[chunk].discounted) 819 { 820 count++; 821 } 822 } 823 824 if (count == 0) { 825 return; 826 } 827 828 qp_rcuctx_t *rcuctx = 829 isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count)); 830 *rcuctx = (qp_rcuctx_t){ 831 .magic = QPRCU_MAGIC, 832 .multi = multi, 833 .count = count, 834 }; 835 isc_mem_attach(qp->mctx, &rcuctx->mctx); 836 837 unsigned int i = 0; 838 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 839 if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && 840 qp->usage[chunk].exists && qp->usage[chunk].immutable && 841 !qp->usage[chunk].discounted) 842 { 843 rcuctx->chunk[i++] = chunk; 844 chunk_discount(qp, chunk); 845 } 846 } 847 848 /* 849 * Reference the qpmulti object to keep it from being 850 * freed until reclaim_chunks_cb() runs. 851 */ 852 dns_qpmulti_ref(multi); 853 call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb); 854 855 LOG_STATS("qp will reclaim %u chunks", count); 856 } 857 858 /* 859 * When a snapshot is destroyed, clean up chunks that need free()ing 860 * and are not used by any remaining snapshots. 861 */ 862 static void 863 marksweep_chunks(dns_qpmulti_t *multi) { 864 unsigned int nfree = 0; 865 866 isc_nanosecs_t start = isc_time_monotonic(); 867 868 dns_qp_t *qpw = &multi->writer; 869 870 for (dns_qpsnap_t *qps = ISC_LIST_HEAD(multi->snapshots); qps != NULL; 871 qps = ISC_LIST_NEXT(qps, link)) 872 { 873 for (dns_qpchunk_t chunk = 0; chunk < qps->chunk_max; chunk++) { 874 if (qps->base->ptr[chunk] != NULL) { 875 INSIST(qps->base->ptr[chunk] == 876 qpw->base->ptr[chunk]); 877 qpw->usage[chunk].snapmark = true; 878 } 879 } 880 } 881 882 for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { 883 qpw->usage[chunk].snapshot = qpw->usage[chunk].snapmark; 884 qpw->usage[chunk].snapmark = false; 885 if (qpw->usage[chunk].snapfree && !qpw->usage[chunk].snapshot) { 886 chunk_free(qpw, chunk); 887 nfree++; 888 } 889 } 890 891 isc_nanosecs_t time = isc_time_monotonic() - start; 892 ISC_QP_ADD(recycle_time, time); 893 894 if (nfree > 0) { 895 LOG_STATS("qp marksweep" PRItime "free %u chunks", time, nfree); 896 LOG_STATS( 897 "qp marksweep leaf %u live %u used %u free %u hold %u", 898 qpw->leaf_count, qpw->used_count - qpw->free_count, 899 qpw->used_count, qpw->free_count, qpw->hold_count); 900 } 901 } 902 903 /*********************************************************************** 904 * 905 * garbage collector 906 */ 907 908 /* 909 * Move a branch node's twigs to the `bump` chunk, for copy-on-write 910 * or for garbage collection. We don't update the node in place 911 * because `compact_recursive()` does not ensure the node itself is 912 * mutable until after it discovers evacuation was necessary. 913 * 914 * If free_twigs() could not immediately destroy the old twigs, we have 915 * to re-attach to any leaves. 916 */ 917 static dns_qpref_t 918 evacuate(dns_qp_t *qp, dns_qpnode_t *n) { 919 dns_qpweight_t size = branch_twigs_size(n); 920 dns_qpref_t old_ref = branch_twigs_ref(n); 921 dns_qpref_t new_ref = alloc_twigs(qp, size); 922 dns_qpnode_t *old_twigs = ref_ptr(qp, old_ref); 923 dns_qpnode_t *new_twigs = ref_ptr(qp, new_ref); 924 925 move_twigs(new_twigs, old_twigs, size); 926 if (!free_twigs(qp, old_ref, size)) { 927 attach_twigs(qp, new_twigs, size); 928 } 929 930 return new_ref; 931 } 932 933 /* 934 * Immutable nodes need copy-on-write. As we walk down the trie finding the 935 * right place to modify, make_root_mutable() and make_twigs_mutable() 936 * are called to ensure that immutable nodes on the path from the root are 937 * copied to a mutable chunk. 938 */ 939 940 static inline dns_qpnode_t * 941 make_root_mutable(dns_qp_t *qp) { 942 if (cells_immutable(qp, qp->root_ref)) { 943 qp->root_ref = evacuate(qp, MOVABLE_ROOT(qp)); 944 } 945 return ref_ptr(qp, qp->root_ref); 946 } 947 948 static inline void 949 make_twigs_mutable(dns_qp_t *qp, dns_qpnode_t *n) { 950 if (cells_immutable(qp, branch_twigs_ref(n))) { 951 *n = make_node(branch_index(n), evacuate(qp, n)); 952 } 953 } 954 955 /* 956 * Compact the trie by traversing the whole thing recursively, copying 957 * bottom-up as required. The aim is to avoid evacuation as much as 958 * possible, but when parts of the trie are immutable, we need to evacuate 959 * the paths from the root to the parts of the trie that occupy 960 * fragmented chunks. 961 * 962 * Without the QP_MIN_USED check, the algorithm will leave the trie 963 * unchanged. If the children are all leaves, the loop changes nothing, 964 * so we will return this node's original ref. If all of the children 965 * that are branches did not need moving, again, the loop changes 966 * nothing. So the evacuation check is the only place that the 967 * algorithm introduces ref changes, that then bubble up towards the 968 * root through the logic inside the loop. 969 */ 970 static dns_qpref_t 971 compact_recursive(dns_qp_t *qp, dns_qpnode_t *parent) { 972 dns_qpweight_t size = branch_twigs_size(parent); 973 dns_qpref_t twigs_ref = branch_twigs_ref(parent); 974 dns_qpchunk_t chunk = ref_chunk(twigs_ref); 975 976 if (qp->compact_all || 977 (chunk != qp->bump && chunk_usage(qp, chunk) < QP_MIN_USED)) 978 { 979 twigs_ref = evacuate(qp, parent); 980 } 981 bool immutable = cells_immutable(qp, twigs_ref); 982 for (dns_qpweight_t pos = 0; pos < size; pos++) { 983 dns_qpnode_t *child = ref_ptr(qp, twigs_ref) + pos; 984 if (!is_branch(child)) { 985 continue; 986 } 987 dns_qpref_t old_grandtwigs = branch_twigs_ref(child); 988 dns_qpref_t new_grandtwigs = compact_recursive(qp, child); 989 if (old_grandtwigs == new_grandtwigs) { 990 continue; 991 } 992 if (immutable) { 993 twigs_ref = evacuate(qp, parent); 994 /* the twigs have moved */ 995 child = ref_ptr(qp, twigs_ref) + pos; 996 immutable = false; 997 } 998 *child = make_node(branch_index(child), new_grandtwigs); 999 } 1000 return twigs_ref; 1001 } 1002 1003 static void 1004 compact(dns_qp_t *qp) { 1005 LOG_STATS("qp compact before leaf %u live %u used %u free %u hold %u", 1006 qp->leaf_count, qp->used_count - qp->free_count, 1007 qp->used_count, qp->free_count, qp->hold_count); 1008 1009 isc_nanosecs_t start = isc_time_monotonic(); 1010 1011 if (qp->usage[qp->bump].free > QP_MAX_FREE) { 1012 alloc_reset(qp); 1013 } 1014 1015 if (qp->leaf_count > 0) { 1016 qp->root_ref = compact_recursive(qp, MOVABLE_ROOT(qp)); 1017 } 1018 qp->compact_all = false; 1019 1020 isc_nanosecs_t time = isc_time_monotonic() - start; 1021 ISC_QP_ADD(compact_time, time); 1022 1023 LOG_STATS("qp compact" PRItime 1024 "leaf %u live %u used %u free %u hold %u", 1025 time, qp->leaf_count, qp->used_count - qp->free_count, 1026 qp->used_count, qp->free_count, qp->hold_count); 1027 } 1028 1029 void 1030 dns_qp_compact(dns_qp_t *qp, dns_qpgc_t mode) { 1031 REQUIRE(QP_VALID(qp)); 1032 if (mode == DNS_QPGC_MAYBE && !QP_NEEDGC(qp)) { 1033 return; 1034 } 1035 if (mode == DNS_QPGC_ALL) { 1036 alloc_reset(qp); 1037 qp->compact_all = true; 1038 } 1039 compact(qp); 1040 recycle(qp); 1041 } 1042 1043 /* 1044 * Free some twigs and (if they were destroyed immediately so that the 1045 * result from QP_MAX_GARBAGE can change) compact the trie if necessary. 1046 * 1047 * This is called by the trie modification API entry points. The 1048 * free_twigs() function requires the caller to attach or detach any 1049 * leaves as necessary. Callers of squash_twigs() satisfy this 1050 * requirement by calling make_twigs_mutable(). 1051 * 1052 * Aside: In typical garbage collectors, compaction is triggered when 1053 * the allocator runs out of space. But that is because typical garbage 1054 * collectors do not know how much memory can be recovered, so they must 1055 * find out by scanning the heap. The qp-trie code was originally 1056 * designed to use malloc() and free(), so it has more information about 1057 * when garbage collection might be worthwhile. Hence we can trigger 1058 * collection when garbage passes a threshold. 1059 * 1060 * XXXFANF: If we need to avoid latency outliers caused by compaction in 1061 * write transactions, we can check qp->transaction_mode here. 1062 */ 1063 static inline bool 1064 squash_twigs(dns_qp_t *qp, dns_qpref_t twigs, dns_qpweight_t size) { 1065 bool destroyed = free_twigs(qp, twigs, size); 1066 if (destroyed && QP_AUTOGC(qp)) { 1067 compact(qp); 1068 recycle(qp); 1069 /* 1070 * This shouldn't happen if the garbage collector is 1071 * working correctly. We can recover at the cost of some 1072 * time and space, but recovery should be cheaper than 1073 * letting compact+recycle fail repeatedly. 1074 */ 1075 if (QP_AUTOGC(qp)) { 1076 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, 1077 DNS_LOGMODULE_QP, ISC_LOG_NOTICE, 1078 "qp %p uctx \"%s\" compact/recycle " 1079 "failed to recover any space, " 1080 "scheduling a full compaction", 1081 qp, TRIENAME(qp)); 1082 qp->compact_all = true; 1083 } 1084 } 1085 return destroyed; 1086 } 1087 1088 /*********************************************************************** 1089 * 1090 * public accessors for memory management internals 1091 */ 1092 1093 dns_qp_memusage_t 1094 dns_qp_memusage(dns_qp_t *qp) { 1095 REQUIRE(QP_VALID(qp)); 1096 1097 dns_qp_memusage_t memusage = { 1098 .uctx = qp->uctx, 1099 .leaves = qp->leaf_count, 1100 .live = qp->used_count - qp->free_count, 1101 .used = qp->used_count, 1102 .hold = qp->hold_count, 1103 .free = qp->free_count, 1104 .node_size = sizeof(dns_qpnode_t), 1105 .fragmented = QP_NEEDGC(qp), 1106 }; 1107 1108 size_t chunk_usage_bytes = 0; 1109 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1110 if (qp->base->ptr[chunk] != NULL) { 1111 chunk_usage_bytes += qp->usage[chunk].capacity; 1112 memusage.chunk_count += 1; 1113 } 1114 } 1115 1116 /* 1117 * XXXFANF does not subtract chunks that have been shrunk, 1118 * and does not count unreclaimed dns_qpbase_t objects 1119 */ 1120 memusage.bytes = chunk_usage_bytes + 1121 qp->chunk_max * sizeof(qp->base->ptr[0]) + 1122 qp->chunk_max * sizeof(qp->usage[0]); 1123 1124 return memusage; 1125 } 1126 1127 dns_qp_memusage_t 1128 dns_qpmulti_memusage(dns_qpmulti_t *multi) { 1129 REQUIRE(QPMULTI_VALID(multi)); 1130 LOCK(&multi->mutex); 1131 1132 dns_qp_t *qp = &multi->writer; 1133 INSIST(QP_VALID(qp)); 1134 1135 dns_qp_memusage_t memusage = dns_qp_memusage(qp); 1136 1137 if (qp->transaction_mode == QP_UPDATE && qp->usage != NULL) { 1138 memusage.bytes -= qp->usage[qp->bump].capacity; 1139 memusage.bytes += qp->usage[qp->bump].used * 1140 sizeof(dns_qpnode_t); 1141 } 1142 1143 UNLOCK(&multi->mutex); 1144 return memusage; 1145 } 1146 1147 void 1148 dns_qp_gctime(isc_nanosecs_t *compact_p, isc_nanosecs_t *recycle_p, 1149 isc_nanosecs_t *rollback_p) { 1150 *compact_p = ISC_QP_GET(compact_time); 1151 *recycle_p = ISC_QP_GET(recycle_time); 1152 *rollback_p = ISC_QP_GET(rollback_time); 1153 } 1154 1155 /*********************************************************************** 1156 * 1157 * read-write transactions 1158 */ 1159 1160 static dns_qp_t * 1161 transaction_open(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1162 REQUIRE(QPMULTI_VALID(multi)); 1163 REQUIRE(qptp != NULL && *qptp == NULL); 1164 1165 LOCK(&multi->mutex); 1166 1167 dns_qp_t *qp = &multi->writer; 1168 INSIST(QP_VALID(qp)); 1169 1170 /* 1171 * Mark existing chunks as immutable. 1172 * 1173 * Aside: The bump chunk is special: in a series of write 1174 * transactions the bump chunk is reused; the first part (up 1175 * to fender) is immutable, the rest mutable. But we set its 1176 * immutable flag so that when the bump chunk fills up, the 1177 * first part continues to be treated as immutable. (And the 1178 * rest of the chunk too, but that's OK.) 1179 */ 1180 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1181 if (qp->usage[chunk].exists) { 1182 qp->usage[chunk].immutable = true; 1183 write_protect(qp, chunk); 1184 } 1185 } 1186 1187 /* 1188 * Ensure QP_AUTOGC() ignores free space in immutable chunks. 1189 */ 1190 qp->hold_count = qp->free_count; 1191 1192 *qptp = qp; 1193 return qp; 1194 } 1195 1196 /* 1197 * a write is light 1198 * 1199 * We need to ensure we allocate from a fresh chunk if the last transaction 1200 * shrunk the bump chunk; but usually in a sequence of write transactions 1201 * we just put `fender` at the point where we started this generation. 1202 * 1203 * (Aside: Instead of keeping the previous transaction's mode, I 1204 * considered forcing allocation into the slow path by fiddling with 1205 * the bump chunk's usage counters. But that is troublesome because 1206 * `chunk_free()` needs to know how much of the chunk to scan.) 1207 */ 1208 void 1209 dns_qpmulti_write(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1210 dns_qp_t *qp = transaction_open(multi, qptp); 1211 TRACE(""); 1212 1213 if (qp->transaction_mode == QP_WRITE) { 1214 qp->fender = qp->usage[qp->bump].used; 1215 } else { 1216 alloc_reset(qp); 1217 } 1218 qp->transaction_mode = QP_WRITE; 1219 } 1220 1221 /* 1222 * an update is heavier 1223 * 1224 * We always reset the allocator to the start of a fresh chunk, 1225 * because the previous transaction was probably an update that shrunk 1226 * the bump chunk. It simplifies rollback because `fender` is always zero. 1227 * 1228 * To rollback a transaction, we need to reset all the allocation 1229 * counters to their previous state, in particular we need to un-free 1230 * any nodes that were copied to make them mutable. This means we need 1231 * to make a copy of basically the whole `dns_qp_t writer`: everything 1232 * but the chunks holding the trie nodes. 1233 * 1234 * We do most of the transaction setup before creating the rollback 1235 * state so that after rollback we have a correct idea of which chunks 1236 * are immutable, and so we have the correct transaction mode to make 1237 * the next transaction allocate a new bump chunk. The exception is 1238 * resetting the allocator, which we do after creating the rollback 1239 * state; if this transaction is rolled back then the next transaction 1240 * will start from the rollback state and also reset the allocator as 1241 * one of its first actions. 1242 */ 1243 void 1244 dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1245 dns_qp_t *qp = transaction_open(multi, qptp); 1246 TRACE(""); 1247 1248 qp->transaction_mode = QP_UPDATE; 1249 1250 dns_qp_t *rollback = isc_mem_allocate(qp->mctx, sizeof(*rollback)); 1251 memmove(rollback, qp, sizeof(*rollback)); 1252 /* can be uninitialized on the first transaction */ 1253 if (rollback->base != NULL) { 1254 INSIST(QPBASE_VALID(rollback->base)); 1255 INSIST(qp->usage != NULL && qp->chunk_max > 0); 1256 /* paired with either _commit() or _rollback() */ 1257 isc_refcount_increment(&rollback->base->refcount); 1258 size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max; 1259 rollback->usage = isc_mem_allocate(qp->mctx, usage_bytes); 1260 memmove(rollback->usage, qp->usage, usage_bytes); 1261 } 1262 INSIST(multi->rollback == NULL); 1263 multi->rollback = rollback; 1264 1265 alloc_reset(qp); 1266 } 1267 1268 void 1269 dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1270 REQUIRE(QPMULTI_VALID(multi)); 1271 REQUIRE(qptp != NULL && *qptp == &multi->writer); 1272 REQUIRE(multi->writer.transaction_mode == QP_WRITE || 1273 multi->writer.transaction_mode == QP_UPDATE); 1274 1275 dns_qp_t *qp = *qptp; 1276 TRACE(""); 1277 1278 if (qp->transaction_mode == QP_UPDATE) { 1279 INSIST(multi->rollback != NULL); 1280 /* paired with dns_qpmulti_update() */ 1281 if (qpbase_unref(multi->rollback)) { 1282 isc_mem_free(qp->mctx, multi->rollback->base); 1283 } 1284 if (multi->rollback->usage != NULL) { 1285 isc_mem_free(qp->mctx, multi->rollback->usage); 1286 } 1287 isc_mem_free(qp->mctx, multi->rollback); 1288 } 1289 INSIST(multi->rollback == NULL); 1290 1291 /* not the first commit? */ 1292 if (multi->reader_ref != INVALID_REF) { 1293 INSIST(cells_immutable(qp, multi->reader_ref)); 1294 free_twigs(qp, multi->reader_ref, READER_SIZE); 1295 } 1296 1297 if (qp->transaction_mode == QP_UPDATE) { 1298 /* minimize memory overhead */ 1299 compact(qp); 1300 multi->reader_ref = alloc_twigs(qp, READER_SIZE); 1301 qp->base->ptr[qp->bump] = chunk_shrink_raw( 1302 qp, qp->base->ptr[qp->bump], 1303 qp->usage[qp->bump].used * sizeof(dns_qpnode_t)); 1304 } else { 1305 multi->reader_ref = alloc_twigs(qp, READER_SIZE); 1306 } 1307 1308 /* anchor a new version of the trie */ 1309 dns_qpnode_t *reader = ref_ptr(qp, multi->reader_ref); 1310 make_reader(reader, multi); 1311 /* paired with chunk_free() */ 1312 isc_refcount_increment(&qp->base->refcount); 1313 1314 rcu_assign_pointer(multi->reader, reader); /* COMMIT */ 1315 1316 /* clean up what we can right now */ 1317 if (qp->transaction_mode == QP_UPDATE || QP_NEEDGC(qp)) { 1318 recycle(qp); 1319 } 1320 1321 /* schedule the rest for later */ 1322 reclaim_chunks(multi); 1323 1324 *qptp = NULL; 1325 UNLOCK(&multi->mutex); 1326 } 1327 1328 /* 1329 * Throw away everything that was allocated during this transaction. 1330 */ 1331 void 1332 dns_qpmulti_rollback(dns_qpmulti_t *multi, dns_qp_t **qptp) { 1333 /* 1334 * nfree is only used when logging stats, hence the attribute. 1335 */ 1336 unsigned int nfree ISC_ATTR_UNUSED = 0; 1337 1338 REQUIRE(QPMULTI_VALID(multi)); 1339 REQUIRE(multi->writer.transaction_mode == QP_UPDATE); 1340 REQUIRE(qptp != NULL && *qptp == &multi->writer); 1341 1342 dns_qp_t *qp = *qptp; 1343 TRACE(""); 1344 1345 isc_nanosecs_t start = isc_time_monotonic(); 1346 1347 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1348 if (qp->base->ptr[chunk] != NULL && !qp->usage[chunk].immutable) 1349 { 1350 chunk_free(qp, chunk); 1351 /* 1352 * we need to clear its base pointer in the rollback 1353 * trie, in case the arrays were resized 1354 */ 1355 if (chunk < multi->rollback->chunk_max) { 1356 INSIST(!multi->rollback->usage[chunk].exists); 1357 multi->rollback->base->ptr[chunk] = NULL; 1358 } 1359 nfree++; 1360 } 1361 } 1362 1363 /* 1364 * multi->rollback->base and multi->writer->base are the same, 1365 * unless there was a realloc_chunk_arrays() during the transaction 1366 */ 1367 if (qpbase_unref(qp)) { 1368 /* paired with dns_qpmulti_update() */ 1369 isc_mem_free(qp->mctx, qp->base); 1370 } 1371 isc_mem_free(qp->mctx, qp->usage); 1372 1373 /* reset allocator state */ 1374 INSIST(multi->rollback != NULL); 1375 memmove(qp, multi->rollback, sizeof(*qp)); 1376 isc_mem_free(qp->mctx, multi->rollback); 1377 INSIST(multi->rollback == NULL); 1378 1379 isc_nanosecs_t time = isc_time_monotonic() - start; 1380 ISC_QP_ADD(rollback_time, time); 1381 1382 LOG_STATS("qp rollback" PRItime "free %u chunks", time, nfree); 1383 1384 *qptp = NULL; 1385 UNLOCK(&multi->mutex); 1386 } 1387 1388 /*********************************************************************** 1389 * 1390 * read-only transactions 1391 */ 1392 1393 static dns_qpmulti_t * 1394 reader_open(dns_qpmulti_t *multi, dns_qpreadable_t qpr) { 1395 dns_qpreader_t *qp = dns_qpreader(qpr); 1396 dns_qpnode_t *reader = rcu_dereference(multi->reader); 1397 if (reader == NULL) { 1398 QP_INIT(qp, multi->writer.methods, multi->writer.uctx); 1399 } else { 1400 multi = unpack_reader(qp, reader); 1401 } 1402 return multi; 1403 } 1404 1405 /* 1406 * a query is light 1407 */ 1408 1409 void 1410 dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) { 1411 REQUIRE(QPMULTI_VALID(multi)); 1412 REQUIRE(qp != NULL); 1413 1414 qp->tid = isc_tid(); 1415 rcu_read_lock(); 1416 1417 dns_qpmulti_t *whence = reader_open(multi, qp); 1418 INSIST(whence == multi); 1419 } 1420 1421 void 1422 dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) { 1423 REQUIRE(QPMULTI_VALID(multi)); 1424 REQUIRE(QP_VALID(qp)); 1425 REQUIRE(qp->tid == isc_tid()); 1426 *qp = (dns_qpread_t){}; 1427 rcu_read_unlock(); 1428 } 1429 1430 /* 1431 * a snapshot is heavy 1432 */ 1433 1434 void 1435 dns_qpmulti_snapshot(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { 1436 REQUIRE(QPMULTI_VALID(multi)); 1437 REQUIRE(qpsp != NULL && *qpsp == NULL); 1438 1439 rcu_read_lock(); 1440 1441 LOCK(&multi->mutex); 1442 1443 dns_qp_t *qpw = &multi->writer; 1444 size_t bytes = sizeof(dns_qpsnap_t) + sizeof(dns_qpbase_t) + 1445 sizeof(qpw->base->ptr[0]) * qpw->chunk_max; 1446 dns_qpsnap_t *qps = isc_mem_allocate(qpw->mctx, bytes); 1447 1448 qps->whence = reader_open(multi, qps); 1449 INSIST(qps->whence == multi); 1450 1451 /* not a separate allocation */ 1452 qps->base = (dns_qpbase_t *)(qps + 1); 1453 isc_refcount_init(&qps->base->refcount, 0); 1454 1455 /* 1456 * only copy base pointers of chunks we need, so we can 1457 * reclaim unused memory in dns_qpsnap_destroy() 1458 */ 1459 qps->chunk_max = qpw->chunk_max; 1460 for (dns_qpchunk_t chunk = 0; chunk < qpw->chunk_max; chunk++) { 1461 if (qpw->usage[chunk].exists && chunk_usage(qpw, chunk) > 0) { 1462 qpw->usage[chunk].snapshot = true; 1463 qps->base->ptr[chunk] = qpw->base->ptr[chunk]; 1464 } else { 1465 qps->base->ptr[chunk] = NULL; 1466 } 1467 } 1468 ISC_LIST_INITANDAPPEND(multi->snapshots, qps, link); 1469 1470 *qpsp = qps; 1471 UNLOCK(&multi->mutex); 1472 1473 rcu_read_unlock(); 1474 } 1475 1476 void 1477 dns_qpsnap_destroy(dns_qpmulti_t *multi, dns_qpsnap_t **qpsp) { 1478 REQUIRE(QPMULTI_VALID(multi)); 1479 REQUIRE(qpsp != NULL && *qpsp != NULL); 1480 1481 LOCK(&multi->mutex); 1482 1483 dns_qpsnap_t *qp = *qpsp; 1484 1485 /* make sure the API is being used correctly */ 1486 REQUIRE(qp->whence == multi); 1487 1488 ISC_LIST_UNLINK(multi->snapshots, qp, link); 1489 1490 /* 1491 * eagerly reclaim chunks that are now unused, so that memory does 1492 * not accumulate when a trie has a lot of updates and snapshots 1493 */ 1494 marksweep_chunks(multi); 1495 1496 isc_mem_free(multi->writer.mctx, qp); 1497 1498 *qpsp = NULL; 1499 UNLOCK(&multi->mutex); 1500 } 1501 1502 /*********************************************************************** 1503 * 1504 * constructors, destructors 1505 */ 1506 1507 void 1508 dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, 1509 dns_qp_t **qptp) { 1510 REQUIRE(qptp != NULL && *qptp == NULL); 1511 1512 dns_qp_t *qp = isc_mem_get(mctx, sizeof(*qp)); 1513 QP_INIT(qp, methods, uctx); 1514 isc_mem_attach(mctx, &qp->mctx); 1515 alloc_reset(qp); 1516 TRACE(""); 1517 *qptp = qp; 1518 } 1519 1520 void 1521 dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, 1522 dns_qpmulti_t **qpmp) { 1523 REQUIRE(qpmp != NULL && *qpmp == NULL); 1524 1525 dns_qpmulti_t *multi = isc_mem_get(mctx, sizeof(*multi)); 1526 *multi = (dns_qpmulti_t){ .magic = QPMULTI_MAGIC, 1527 .reader_ref = INVALID_REF, 1528 .references = ISC_REFCOUNT_INITIALIZER(1) }; 1529 isc_mutex_init(&multi->mutex); 1530 ISC_LIST_INIT(multi->snapshots); 1531 1532 /* 1533 * Do not waste effort allocating a bump chunk that will be thrown 1534 * away when a transaction is opened. dns_qpmulti_update() always 1535 * allocates; to ensure dns_qpmulti_write() does too, pretend the 1536 * previous transaction was an update 1537 */ 1538 dns_qp_t *qp = &multi->writer; 1539 QP_INIT(qp, methods, uctx); 1540 isc_mem_attach(mctx, &qp->mctx); 1541 qp->transaction_mode = QP_UPDATE; 1542 TRACE(""); 1543 *qpmp = multi; 1544 } 1545 1546 static void 1547 destroy_guts(dns_qp_t *qp) { 1548 if (qp->chunk_max == 0) { 1549 return; 1550 } 1551 1552 for (dns_qpchunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { 1553 if (qp->base->ptr[chunk] != NULL) { 1554 chunk_free(qp, chunk); 1555 } 1556 } 1557 qp->chunk_max = 0; 1558 ENSURE(qp->used_count == 0); 1559 ENSURE(qp->free_count == 0); 1560 ENSURE(isc_refcount_current(&qp->base->refcount) == 1); 1561 isc_mem_free(qp->mctx, qp->base); 1562 isc_mem_free(qp->mctx, qp->usage); 1563 qp->magic = 0; 1564 } 1565 1566 void 1567 dns_qp_destroy(dns_qp_t **qptp) { 1568 REQUIRE(qptp != NULL); 1569 REQUIRE(QP_VALID(*qptp)); 1570 1571 dns_qp_t *qp = *qptp; 1572 *qptp = NULL; 1573 1574 /* do not try to destroy part of a dns_qpmulti_t */ 1575 REQUIRE(qp->transaction_mode == QP_NONE); 1576 1577 TRACE(""); 1578 destroy_guts(qp); 1579 isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp)); 1580 } 1581 1582 static void 1583 qpmulti_free_mem(dns_qpmulti_t *multi) { 1584 REQUIRE(QPMULTI_VALID(multi)); 1585 1586 /* reassure thread sanitizer */ 1587 LOCK(&multi->mutex); 1588 dns_qp_t *qp = &multi->writer; 1589 UNLOCK(&multi->mutex); 1590 1591 isc_mutex_destroy(&multi->mutex); 1592 isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); 1593 } 1594 1595 #if QPMULTI_TRACE 1596 ISC_REFCOUNT_STATIC_TRACE_IMPL(dns_qpmulti, qpmulti_free_mem) 1597 #else 1598 ISC_REFCOUNT_STATIC_IMPL(dns_qpmulti, qpmulti_free_mem) 1599 #endif 1600 1601 static void 1602 qpmulti_destroy_guts_cb(struct rcu_head *arg) { 1603 qp_rcuctx_t *rcuctx = caa_container_of(arg, qp_rcuctx_t, rcu_head); 1604 REQUIRE(QPRCU_VALID(rcuctx)); 1605 /* only nonzero for reclaim_chunks_cb() */ 1606 REQUIRE(rcuctx->count == 0); 1607 1608 dns_qpmulti_t *multi = rcuctx->multi; 1609 REQUIRE(QPMULTI_VALID(multi)); 1610 1611 /* reassure thread sanitizer */ 1612 LOCK(&multi->mutex); 1613 1614 dns_qp_t *qp = &multi->writer; 1615 REQUIRE(QP_VALID(qp)); 1616 1617 destroy_guts(qp); 1618 1619 UNLOCK(&multi->mutex); 1620 1621 dns_qpmulti_detach(&multi); 1622 isc_mem_putanddetach(&rcuctx->mctx, rcuctx, 1623 STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); 1624 } 1625 1626 void 1627 dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { 1628 dns_qp_t *qp = NULL; 1629 dns_qpmulti_t *multi = NULL; 1630 qp_rcuctx_t *rcuctx = NULL; 1631 1632 REQUIRE(qpmp != NULL); 1633 REQUIRE(QPMULTI_VALID(*qpmp)); 1634 1635 multi = *qpmp; 1636 qp = &multi->writer; 1637 *qpmp = NULL; 1638 1639 REQUIRE(QP_VALID(qp)); 1640 REQUIRE(multi->rollback == NULL); 1641 REQUIRE(ISC_LIST_EMPTY(multi->snapshots)); 1642 1643 rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0)); 1644 *rcuctx = (qp_rcuctx_t){ 1645 .magic = QPRCU_MAGIC, 1646 .multi = multi, 1647 }; 1648 isc_mem_attach(qp->mctx, &rcuctx->mctx); 1649 call_rcu(&rcuctx->rcu_head, qpmulti_destroy_guts_cb); 1650 } 1651 1652 /*********************************************************************** 1653 * 1654 * modification 1655 */ 1656 1657 isc_result_t 1658 dns_qp_insert(dns_qp_t *qp, void *pval, uint32_t ival) { 1659 dns_qpref_t new_ref, old_ref; 1660 dns_qpnode_t new_leaf, old_node; 1661 dns_qpnode_t *new_twigs = NULL, *old_twigs = NULL; 1662 dns_qpshift_t new_bit, old_bit; 1663 dns_qpweight_t old_size, new_size; 1664 dns_qpkey_t new_key, old_key; 1665 size_t new_keylen, old_keylen; 1666 size_t offset; 1667 uint64_t index; 1668 dns_qpshift_t bit; 1669 dns_qpweight_t pos; 1670 dns_qpnode_t *n = NULL; 1671 1672 REQUIRE(QP_VALID(qp)); 1673 1674 new_leaf = make_leaf(pval, ival); 1675 new_keylen = leaf_qpkey(qp, &new_leaf, new_key); 1676 1677 /* first leaf in an empty trie? */ 1678 if (qp->leaf_count == 0) { 1679 new_ref = alloc_twigs(qp, 1); 1680 new_twigs = ref_ptr(qp, new_ref); 1681 *new_twigs = new_leaf; 1682 attach_leaf(qp, new_twigs); 1683 qp->leaf_count++; 1684 qp->root_ref = new_ref; 1685 return ISC_R_SUCCESS; 1686 } 1687 1688 /* 1689 * We need to keep searching down to a leaf even if our key is 1690 * missing from this branch. It doesn't matter which twig we 1691 * choose since the keys are all the same up to this node's 1692 * offset. Note that if we simply use branch_twig_pos(n, bit) 1693 * we may get an out-of-bounds access if our bit is greater 1694 * than all the set bits in the node. 1695 */ 1696 n = ref_ptr(qp, qp->root_ref); 1697 while (is_branch(n)) { 1698 prefetch_twigs(qp, n); 1699 dns_qpref_t ref = branch_twigs_ref(n); 1700 bit = branch_keybit(n, new_key, new_keylen); 1701 pos = branch_has_twig(n, bit) ? branch_twig_pos(n, bit) : 0; 1702 n = ref_ptr(qp, ref + pos); 1703 } 1704 1705 /* do the keys differ, and if so, where? */ 1706 old_keylen = leaf_qpkey(qp, n, old_key); 1707 offset = qpkey_compare(new_key, new_keylen, old_key, old_keylen); 1708 if (offset == QPKEY_EQUAL) { 1709 return ISC_R_EXISTS; 1710 } 1711 new_bit = qpkey_bit(new_key, new_keylen, offset); 1712 old_bit = qpkey_bit(old_key, old_keylen, offset); 1713 1714 /* find where to insert a branch or grow an existing branch. */ 1715 n = make_root_mutable(qp); 1716 while (is_branch(n)) { 1717 prefetch_twigs(qp, n); 1718 if (offset < branch_key_offset(n)) { 1719 goto newbranch; 1720 } 1721 if (offset == branch_key_offset(n)) { 1722 goto growbranch; 1723 } 1724 make_twigs_mutable(qp, n); 1725 bit = branch_keybit(n, new_key, new_keylen); 1726 INSIST(branch_has_twig(n, bit)); 1727 n = branch_twig_ptr(qp, n, bit); 1728 } 1729 /* fall through */ 1730 1731 newbranch: 1732 new_ref = alloc_twigs(qp, 2); 1733 new_twigs = ref_ptr(qp, new_ref); 1734 1735 /* save before overwriting. */ 1736 old_node = *n; 1737 1738 /* new branch node takes old node's place */ 1739 index = BRANCH_TAG | (1ULL << new_bit) | (1ULL << old_bit) | 1740 ((uint64_t)offset << SHIFT_OFFSET); 1741 *n = make_node(index, new_ref); 1742 1743 /* populate twigs */ 1744 new_twigs[old_bit > new_bit] = old_node; 1745 new_twigs[new_bit > old_bit] = new_leaf; 1746 1747 attach_leaf(qp, &new_leaf); 1748 qp->leaf_count++; 1749 1750 return ISC_R_SUCCESS; 1751 1752 growbranch: 1753 INSIST(!branch_has_twig(n, new_bit)); 1754 1755 /* locate twigs vectors */ 1756 old_size = branch_twigs_size(n); 1757 new_size = old_size + 1; 1758 old_ref = branch_twigs_ref(n); 1759 new_ref = alloc_twigs(qp, new_size); 1760 old_twigs = ref_ptr(qp, old_ref); 1761 new_twigs = ref_ptr(qp, new_ref); 1762 1763 /* embiggen branch node */ 1764 index = branch_index(n) | (1ULL << new_bit); 1765 *n = make_node(index, new_ref); 1766 1767 /* embiggen twigs vector */ 1768 pos = branch_twig_pos(n, new_bit); 1769 move_twigs(new_twigs, old_twigs, pos); 1770 new_twigs[pos] = new_leaf; 1771 move_twigs(new_twigs + pos + 1, old_twigs + pos, old_size - pos); 1772 1773 if (squash_twigs(qp, old_ref, old_size)) { 1774 /* old twigs destroyed, only attach to new leaf */ 1775 attach_leaf(qp, &new_leaf); 1776 } else { 1777 /* old twigs duplicated, attach to all leaves */ 1778 attach_twigs(qp, new_twigs, new_size); 1779 } 1780 qp->leaf_count++; 1781 1782 return ISC_R_SUCCESS; 1783 } 1784 1785 isc_result_t 1786 dns_qp_deletekey(dns_qp_t *qp, const dns_qpkey_t search_key, 1787 size_t search_keylen, void **pval_r, uint32_t *ival_r) { 1788 REQUIRE(QP_VALID(qp)); 1789 REQUIRE(search_keylen < sizeof(dns_qpkey_t)); 1790 1791 if (get_root(qp) == NULL) { 1792 return ISC_R_NOTFOUND; 1793 } 1794 1795 dns_qpshift_t bit = 0; /* suppress warning */ 1796 dns_qpnode_t *parent = NULL; 1797 dns_qpnode_t *n = make_root_mutable(qp); 1798 while (is_branch(n)) { 1799 prefetch_twigs(qp, n); 1800 bit = branch_keybit(n, search_key, search_keylen); 1801 if (!branch_has_twig(n, bit)) { 1802 return ISC_R_NOTFOUND; 1803 } 1804 make_twigs_mutable(qp, n); 1805 parent = n; 1806 n = branch_twig_ptr(qp, n, bit); 1807 } 1808 1809 dns_qpkey_t found_key; 1810 size_t found_keylen = leaf_qpkey(qp, n, found_key); 1811 if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != 1812 QPKEY_EQUAL) 1813 { 1814 return ISC_R_NOTFOUND; 1815 } 1816 1817 SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 1818 SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 1819 detach_leaf(qp, n); 1820 qp->leaf_count--; 1821 1822 /* trie becomes empty */ 1823 if (qp->leaf_count == 0) { 1824 INSIST(parent == NULL); 1825 INSIST(n == get_root(qp)); 1826 free_twigs(qp, qp->root_ref, 1); 1827 qp->root_ref = INVALID_REF; 1828 return ISC_R_SUCCESS; 1829 } 1830 1831 /* step back to parent node */ 1832 n = parent; 1833 parent = NULL; 1834 1835 INSIST(bit != 0); 1836 dns_qpweight_t size = branch_twigs_size(n); 1837 dns_qpweight_t pos = branch_twig_pos(n, bit); 1838 dns_qpref_t ref = branch_twigs_ref(n); 1839 dns_qpnode_t *twigs = ref_ptr(qp, ref); 1840 1841 if (size == 2) { 1842 /* 1843 * move the other twig to the parent branch. 1844 */ 1845 *n = twigs[!pos]; 1846 squash_twigs(qp, ref, 2); 1847 } else { 1848 /* 1849 * shrink the twigs in place, to avoid using the bump 1850 * chunk too fast - the gc will clean up after us 1851 */ 1852 *n = make_node(branch_index(n) & ~(1ULL << bit), ref); 1853 move_twigs(twigs + pos, twigs + pos + 1, size - pos - 1); 1854 squash_twigs(qp, ref + size - 1, 1); 1855 } 1856 1857 return ISC_R_SUCCESS; 1858 } 1859 1860 isc_result_t 1861 dns_qp_deletename(dns_qp_t *qp, const dns_name_t *name, void **pval_r, 1862 uint32_t *ival_r) { 1863 dns_qpkey_t key; 1864 size_t keylen = dns_qpkey_fromname(key, name); 1865 return dns_qp_deletekey(qp, key, keylen, pval_r, ival_r); 1866 } 1867 1868 /*********************************************************************** 1869 * chains 1870 */ 1871 static void 1872 maybe_set_name(dns_qpreader_t *qp, dns_qpnode_t *node, dns_name_t *name) { 1873 dns_qpkey_t key; 1874 size_t len; 1875 1876 if (name == NULL) { 1877 return; 1878 } 1879 1880 dns_name_reset(name); 1881 len = leaf_qpkey(qp, node, key); 1882 dns_qpkey_toname(key, len, name); 1883 } 1884 1885 void 1886 dns_qpchain_init(dns_qpreadable_t qpr, dns_qpchain_t *chain) { 1887 dns_qpreader_t *qp = dns_qpreader(qpr); 1888 REQUIRE(QP_VALID(qp)); 1889 REQUIRE(chain != NULL); 1890 1891 *chain = (dns_qpchain_t){ 1892 .magic = QPCHAIN_MAGIC, 1893 .qp = qp, 1894 }; 1895 } 1896 1897 unsigned int 1898 dns_qpchain_length(dns_qpchain_t *chain) { 1899 REQUIRE(QPCHAIN_VALID(chain)); 1900 1901 return chain->len; 1902 } 1903 1904 void 1905 dns_qpchain_node(dns_qpchain_t *chain, unsigned int level, dns_name_t *name, 1906 void **pval_r, uint32_t *ival_r) { 1907 dns_qpnode_t *node = NULL; 1908 1909 REQUIRE(QPCHAIN_VALID(chain)); 1910 REQUIRE(level < chain->len); 1911 1912 node = chain->chain[level].node; 1913 maybe_set_name(chain->qp, node, name); 1914 SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 1915 SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 1916 } 1917 1918 /*********************************************************************** 1919 * iterators 1920 */ 1921 1922 void 1923 dns_qpiter_init(dns_qpreadable_t qpr, dns_qpiter_t *qpi) { 1924 dns_qpreader_t *qp = dns_qpreader(qpr); 1925 REQUIRE(QP_VALID(qp)); 1926 REQUIRE(qpi != NULL); 1927 *qpi = (dns_qpiter_t){ 1928 .qp = qp, 1929 .magic = QPITER_MAGIC, 1930 }; 1931 } 1932 1933 /* 1934 * are we at the last twig in this branch (in whichever direction 1935 * we're iterating)? 1936 */ 1937 static bool 1938 last_twig(dns_qpiter_t *qpi, bool forward) { 1939 dns_qpweight_t pos = 0, max = 0; 1940 if (qpi->sp > 0) { 1941 dns_qpnode_t *child = qpi->stack[qpi->sp]; 1942 dns_qpnode_t *parent = qpi->stack[qpi->sp - 1]; 1943 pos = child - ref_ptr(qpi->qp, branch_twigs_ref(parent)); 1944 if (forward) { 1945 max = branch_twigs_size(parent) - 1; 1946 } 1947 } 1948 return pos == max; 1949 } 1950 1951 /* 1952 * move a QP iterator forward or back to the next or previous leaf. 1953 * note: this function can go wrong when the iterator refers to 1954 * a mutable view of the trie which is altered while iterating 1955 */ 1956 static isc_result_t 1957 iterate(bool forward, dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 1958 uint32_t *ival_r) { 1959 dns_qpnode_t *node = NULL; 1960 bool initial_branch = true; 1961 1962 REQUIRE(QPITER_VALID(qpi)); 1963 1964 dns_qpreader_t *qp = qpi->qp; 1965 1966 REQUIRE(QP_VALID(qp)); 1967 1968 node = get_root(qp); 1969 if (node == NULL) { 1970 return ISC_R_NOMORE; 1971 } 1972 1973 do { 1974 if (qpi->stack[qpi->sp] == NULL) { 1975 /* newly initialized iterator: use the root node */ 1976 INSIST(qpi->sp == 0); 1977 qpi->stack[0] = node; 1978 } else if (!initial_branch) { 1979 /* 1980 * in a prior loop, we reached a branch; from 1981 * here we just need to get the highest or lowest 1982 * leaf in the subtree; we don't need to bother 1983 * stepping forward or backward through twigs 1984 * anymore. 1985 */ 1986 INSIST(qpi->sp > 0); 1987 } else if (last_twig(qpi, forward)) { 1988 /* 1989 * we've stepped to the end (or the beginning, 1990 * if we're iterating backwards) of a set of twigs. 1991 */ 1992 if (qpi->sp == 0) { 1993 /* 1994 * we've finished iterating. reinitialize 1995 * the iterator, then return ISC_R_NOMORE. 1996 */ 1997 dns_qpiter_init(qpi->qp, qpi); 1998 return ISC_R_NOMORE; 1999 } 2000 2001 /* 2002 * pop the stack, and resume at the parent branch. 2003 */ 2004 qpi->stack[qpi->sp] = NULL; 2005 qpi->sp--; 2006 continue; 2007 } else { 2008 /* 2009 * there are more twigs in the current branch, 2010 * so step the node pointer forward (or back). 2011 */ 2012 qpi->stack[qpi->sp] += (forward ? 1 : -1); 2013 node = qpi->stack[qpi->sp]; 2014 } 2015 2016 /* 2017 * if we're at a branch now, we loop down to the 2018 * left- or rightmost leaf. 2019 */ 2020 if (is_branch(node)) { 2021 qpi->sp++; 2022 INSIST(qpi->sp < DNS_QP_MAXKEY); 2023 node = ref_ptr(qp, branch_twigs_ref(node)) + 2024 (forward ? 0 : branch_twigs_size(node) - 1); 2025 qpi->stack[qpi->sp] = node; 2026 initial_branch = false; 2027 } 2028 } while (is_branch(node)); 2029 2030 /* we're at a leaf: return its data to the caller */ 2031 SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 2032 SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 2033 maybe_set_name(qpi->qp, node, name); 2034 return ISC_R_SUCCESS; 2035 } 2036 2037 isc_result_t 2038 dns_qpiter_next(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 2039 uint32_t *ival_r) { 2040 return iterate(true, qpi, name, pval_r, ival_r); 2041 } 2042 2043 isc_result_t 2044 dns_qpiter_prev(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 2045 uint32_t *ival_r) { 2046 return iterate(false, qpi, name, pval_r, ival_r); 2047 } 2048 2049 isc_result_t 2050 dns_qpiter_current(dns_qpiter_t *qpi, dns_name_t *name, void **pval_r, 2051 uint32_t *ival_r) { 2052 dns_qpnode_t *node = NULL; 2053 2054 REQUIRE(QPITER_VALID(qpi)); 2055 2056 node = qpi->stack[qpi->sp]; 2057 if (node == NULL || is_branch(node)) { 2058 return ISC_R_FAILURE; 2059 } 2060 2061 SET_IF_NOT_NULL(pval_r, leaf_pval(node)); 2062 SET_IF_NOT_NULL(ival_r, leaf_ival(node)); 2063 maybe_set_name(qpi->qp, node, name); 2064 return ISC_R_SUCCESS; 2065 } 2066 2067 /*********************************************************************** 2068 * 2069 * search 2070 */ 2071 2072 isc_result_t 2073 dns_qp_getkey(dns_qpreadable_t qpr, const dns_qpkey_t search_key, 2074 size_t search_keylen, void **pval_r, uint32_t *ival_r) { 2075 dns_qpreader_t *qp = dns_qpreader(qpr); 2076 dns_qpkey_t found_key; 2077 size_t found_keylen; 2078 dns_qpshift_t bit; 2079 dns_qpnode_t *n = NULL; 2080 2081 REQUIRE(QP_VALID(qp)); 2082 REQUIRE(search_keylen < sizeof(dns_qpkey_t)); 2083 2084 n = get_root(qp); 2085 if (n == NULL) { 2086 return ISC_R_NOTFOUND; 2087 } 2088 2089 while (is_branch(n)) { 2090 prefetch_twigs(qp, n); 2091 bit = branch_keybit(n, search_key, search_keylen); 2092 if (!branch_has_twig(n, bit)) { 2093 return ISC_R_NOTFOUND; 2094 } 2095 n = branch_twig_ptr(qp, n, bit); 2096 } 2097 2098 found_keylen = leaf_qpkey(qp, n, found_key); 2099 if (qpkey_compare(search_key, search_keylen, found_key, found_keylen) != 2100 QPKEY_EQUAL) 2101 { 2102 return ISC_R_NOTFOUND; 2103 } 2104 2105 SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 2106 SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 2107 return ISC_R_SUCCESS; 2108 } 2109 2110 isc_result_t 2111 dns_qp_getname(dns_qpreadable_t qpr, const dns_name_t *name, void **pval_r, 2112 uint32_t *ival_r) { 2113 dns_qpkey_t key; 2114 size_t keylen = dns_qpkey_fromname(key, name); 2115 return dns_qp_getkey(qpr, key, keylen, pval_r, ival_r); 2116 } 2117 2118 static inline void 2119 add_link(dns_qpchain_t *chain, dns_qpnode_t *node, size_t offset) { 2120 /* prevent duplication */ 2121 if (chain->len != 0 && chain->chain[chain->len - 1].node == node) { 2122 return; 2123 } 2124 chain->chain[chain->len].node = node; 2125 chain->chain[chain->len].offset = offset; 2126 chain->len++; 2127 INSIST(chain->len <= DNS_NAME_MAXLABELS); 2128 } 2129 2130 static inline void 2131 prevleaf(dns_qpiter_t *it) { 2132 isc_result_t result = dns_qpiter_prev(it, NULL, NULL, NULL); 2133 if (result == ISC_R_NOMORE) { 2134 result = dns_qpiter_prev(it, NULL, NULL, NULL); 2135 } 2136 RUNTIME_CHECK(result == ISC_R_SUCCESS); 2137 } 2138 2139 static inline void 2140 greatest_leaf(dns_qpreadable_t qpr, dns_qpnode_t *n, dns_qpiter_t *iter) { 2141 while (is_branch(n)) { 2142 dns_qpref_t ref = branch_twigs_ref(n) + branch_twigs_size(n) - 2143 1; 2144 iter->stack[++iter->sp] = n; 2145 n = ref_ptr(qpr, ref); 2146 } 2147 iter->stack[++iter->sp] = n; 2148 } 2149 2150 static inline dns_qpnode_t * 2151 anyleaf(dns_qpreader_t *qp, dns_qpnode_t *n) { 2152 while (is_branch(n)) { 2153 n = branch_twigs(qp, n); 2154 } 2155 return n; 2156 } 2157 2158 static inline int 2159 twig_offset(dns_qpnode_t *n, dns_qpshift_t sbit, dns_qpshift_t kbit, 2160 dns_qpshift_t fbit) { 2161 dns_qpweight_t pos = branch_twig_pos(n, sbit); 2162 if (branch_has_twig(n, sbit)) { 2163 return pos - (kbit < fbit); 2164 } 2165 return pos - 1; 2166 } 2167 2168 /* 2169 * If dns_qp_lookup() was passed an iterator, we want it to point at the 2170 * matching name in the case of an exact match, or at the predecessor name 2171 * for a non-exact match. 2172 * 2173 * If there is an exact match, then there is nothing to be done. Otherwise, 2174 * we pop up the iterator stack until we find a parent branch with an offset 2175 * that is before the position where the search key differs from the found key. 2176 * From there we can step to the leaf that is the predecessor of the searched 2177 * name. 2178 * 2179 * Requires the iterator to be pointing at a leaf node. 2180 */ 2181 static void 2182 fix_iterator(dns_qpreader_t *qp, dns_qpiter_t *it, dns_qpkey_t key, 2183 size_t len) { 2184 dns_qpnode_t *n = it->stack[it->sp]; 2185 2186 REQUIRE(!is_branch(n)); 2187 2188 dns_qpkey_t found; 2189 size_t foundlen = leaf_qpkey(qp, n, found); 2190 size_t to = qpkey_compare(key, len, found, foundlen); 2191 2192 /* If the keys are equal, the iterator is already at the right node. */ 2193 if (to == QPKEY_EQUAL) { 2194 return; 2195 } 2196 2197 /* 2198 * Special case: if the key differs even before the root 2199 * key offset, it means the name desired either precedes or 2200 * follows the entire range of names in the database, and 2201 * popping up the stack won't help us, so just move the 2202 * iterator one step back from the origin and return. 2203 */ 2204 if (to < branch_key_offset(it->stack[0])) { 2205 dns_qpiter_init(qp, it); 2206 prevleaf(it); 2207 return; 2208 } 2209 2210 /* 2211 * As long as the branch offset point is after the point where the 2212 * key differs, we need to branch up and find a better node. 2213 */ 2214 while (it->sp > 0) { 2215 dns_qpnode_t *b = it->stack[it->sp - 1]; 2216 if (branch_key_offset(b) < to) { 2217 break; 2218 } 2219 it->sp--; 2220 } 2221 n = it->stack[it->sp]; 2222 2223 /* 2224 * Either we are now at the correct branch, or we are at the 2225 * first unmatched node. Determine the bit position for the 2226 * twig we need (sbit). 2227 */ 2228 dns_qpshift_t kbit = qpkey_bit(key, len, to); 2229 dns_qpshift_t fbit = qpkey_bit(found, foundlen, to); 2230 dns_qpshift_t sbit = 0; 2231 2232 if (is_branch(n) && branch_key_offset(n) == to) { 2233 /* We are on the correct branch now. */ 2234 sbit = kbit; 2235 } else if (it->sp == 0) { 2236 /* 2237 * We are on the root branch, popping up the stack won't 2238 * help us, so just move the iterator one step back from the 2239 * origin and return. 2240 */ 2241 dns_qpiter_init(qp, it); 2242 prevleaf(it); 2243 return; 2244 } else { 2245 /* We are at the first unmatched node, pop up the stack. */ 2246 n = it->stack[--it->sp]; 2247 sbit = qpkey_bit(key, len, branch_key_offset(n)); 2248 } 2249 2250 INSIST(is_branch(n)); 2251 2252 prefetch_twigs(qp, n); 2253 dns_qpnode_t *twigs = branch_twigs(qp, n); 2254 int toff = twig_offset(n, sbit, kbit, fbit); 2255 if (toff >= 0) { 2256 /* 2257 * The name we want would've been after some twig in 2258 * this branch. Walk down from that twig to the 2259 * highest leaf in its subtree to get the predecessor. 2260 */ 2261 greatest_leaf(qp, twigs + toff, it); 2262 } else { 2263 /* 2264 * Every leaf below this node is greater than the one we 2265 * wanted, so the previous leaf is the predecessor. 2266 */ 2267 prevleaf(it); 2268 } 2269 } 2270 2271 /* 2272 * When searching for a requested name in dns_qp_lookup(), we might add 2273 * a leaf node to the chain, then subsequently determine that it was a 2274 * dead end. When this happens, the chain can be left holding a node 2275 * that is *not* an ancestor of the requested name. We correct for that 2276 * here. 2277 */ 2278 static void 2279 fix_chain(dns_qpchain_t *chain, size_t offset) { 2280 while (chain->len > 0 && chain->chain[chain->len - 1].offset >= offset) 2281 { 2282 chain->len--; 2283 chain->chain[chain->len].node = NULL; 2284 chain->chain[chain->len].offset = 0; 2285 } 2286 } 2287 2288 isc_result_t 2289 dns_qp_lookup(dns_qpreadable_t qpr, const dns_name_t *name, 2290 dns_name_t *foundname, dns_qpiter_t *iter, dns_qpchain_t *chain, 2291 void **pval_r, uint32_t *ival_r) { 2292 dns_qpreader_t *qp = dns_qpreader(qpr); 2293 dns_qpkey_t search, found; 2294 size_t searchlen, foundlen; 2295 size_t offset = 0; 2296 dns_qpnode_t *n = NULL; 2297 dns_qpshift_t bit = SHIFT_NOBYTE; 2298 dns_qpchain_t oc; 2299 dns_qpiter_t it; 2300 bool matched = false; 2301 bool setiter = true; 2302 2303 REQUIRE(QP_VALID(qp)); 2304 REQUIRE(foundname == NULL || ISC_MAGIC_VALID(name, DNS_NAME_MAGIC)); 2305 2306 searchlen = dns_qpkey_fromname(search, name); 2307 2308 if (chain == NULL) { 2309 chain = &oc; 2310 } 2311 if (iter == NULL) { 2312 iter = ⁢ 2313 setiter = false; 2314 } 2315 dns_qpchain_init(qp, chain); 2316 dns_qpiter_init(qp, iter); 2317 2318 n = get_root(qp); 2319 if (n == NULL) { 2320 return ISC_R_NOTFOUND; 2321 } 2322 iter->stack[0] = n; 2323 2324 /* 2325 * Like `dns_qp_insert()`, we must find a leaf. However, we don't make a 2326 * second pass: instead, we keep track of any leaves with shorter keys 2327 * that we discover along the way. (In general, qp-trie searches can be 2328 * one-pass, by recording their traversal, or two-pass, for less stack 2329 * memory usage.) 2330 */ 2331 while (is_branch(n)) { 2332 prefetch_twigs(qp, n); 2333 2334 offset = branch_key_offset(n); 2335 bit = qpkey_bit(search, searchlen, offset); 2336 dns_qpnode_t *twigs = branch_twigs(qp, n); 2337 2338 /* 2339 * A shorter key that can be a parent domain always has a 2340 * leaf node at SHIFT_NOBYTE (indicating end of its key) 2341 * where our search key has a normal character immediately 2342 * after a label separator. 2343 * 2344 * Note 1: It is OK if `off - 1` underflows: it will 2345 * become SIZE_MAX, which is greater than `searchlen`, so 2346 * `qpkey_bit()` will return SHIFT_NOBYTE, which is what we 2347 * want when `off == 0`. 2348 * 2349 * Note 2: If SHIFT_NOBYTE twig is present, it will always 2350 * be in position 0, the first location in 'twigs'. 2351 */ 2352 if (bit != SHIFT_NOBYTE && branch_has_twig(n, SHIFT_NOBYTE) && 2353 qpkey_bit(search, searchlen, offset - 1) == SHIFT_NOBYTE && 2354 !is_branch(twigs)) 2355 { 2356 add_link(chain, twigs, offset); 2357 } 2358 2359 matched = branch_has_twig(n, bit); 2360 if (matched) { 2361 /* 2362 * found a match: if it's a branch, we keep 2363 * searching, and if it's a leaf, we drop out of 2364 * the loop. 2365 */ 2366 n = branch_twig_ptr(qp, n, bit); 2367 } else { 2368 /* 2369 * this branch is a dead end, and the predecessor 2370 * doesn't matter. now we just need to find a leaf 2371 * to end on so that qpkey_leaf() will work below. 2372 */ 2373 n = anyleaf(qp, twigs); 2374 } 2375 2376 iter->stack[++iter->sp] = n; 2377 } 2378 2379 if (setiter) { 2380 /* 2381 * we found a leaf, but it might not be the leaf we wanted. 2382 * if it isn't, and if the caller passed us an iterator, 2383 * then we might need to reposition it. 2384 */ 2385 fix_iterator(qp, iter, search, searchlen); 2386 n = iter->stack[iter->sp]; 2387 } 2388 2389 /* at this point, n can only be a leaf node */ 2390 INSIST(!is_branch(n)); 2391 2392 foundlen = leaf_qpkey(qp, n, found); 2393 offset = qpkey_compare(search, searchlen, found, foundlen); 2394 2395 /* the search ended with an exact or partial match */ 2396 if (offset == QPKEY_EQUAL || offset == foundlen) { 2397 isc_result_t result = ISC_R_SUCCESS; 2398 2399 if (offset == foundlen) { 2400 fix_chain(chain, offset); 2401 result = DNS_R_PARTIALMATCH; 2402 } 2403 add_link(chain, n, offset); 2404 2405 SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 2406 SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 2407 maybe_set_name(qp, n, foundname); 2408 return result; 2409 } 2410 2411 /* 2412 * the requested name was not found, but if an ancestor 2413 * was, we can retrieve that from the chain. 2414 */ 2415 int len = chain->len; 2416 while (len-- > 0) { 2417 if (offset >= chain->chain[len].offset) { 2418 n = chain->chain[len].node; 2419 SET_IF_NOT_NULL(pval_r, leaf_pval(n)); 2420 SET_IF_NOT_NULL(ival_r, leaf_ival(n)); 2421 maybe_set_name(qp, n, foundname); 2422 return DNS_R_PARTIALMATCH; 2423 } else { 2424 /* 2425 * oops, during the search we found and added 2426 * a leaf that's longer than the requested 2427 * name; remove it from the chain. 2428 */ 2429 chain->len--; 2430 } 2431 } 2432 2433 /* nothing was found at all */ 2434 return ISC_R_NOTFOUND; 2435 } 2436 2437 /**********************************************************************/ 2438