1 /* $NetBSD: subr_pool.c,v 1.297 2026/01/04 03:20:29 riastradh Exp $ */ 2 3 /* 4 * Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018, 5 * 2020, 2021 The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace 10 * Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by 11 * Maxime Villard. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.297 2026/01/04 03:20:29 riastradh Exp $"); 37 38 #ifdef _KERNEL_OPT 39 #include "opt_ddb.h" 40 #include "opt_lockdebug.h" 41 #include "opt_pool.h" 42 #endif 43 44 #include <sys/param.h> 45 #include <sys/types.h> 46 47 #include <sys/asan.h> 48 #include <sys/atomic.h> 49 #include <sys/bitops.h> 50 #include <sys/cpu.h> 51 #include <sys/debug.h> 52 #include <sys/errno.h> 53 #include <sys/fault.h> 54 #include <sys/kernel.h> 55 #include <sys/lock.h> 56 #include <sys/lockdebug.h> 57 #include <sys/msan.h> 58 #include <sys/pool.h> 59 #include <sys/proc.h> 60 #include <sys/sdt.h> 61 #include <sys/sysctl.h> 62 #include <sys/syslog.h> 63 #include <sys/systm.h> 64 #include <sys/vmem.h> 65 #include <sys/xcall.h> 66 67 #include <uvm/uvm_extern.h> 68 69 /* 70 * Pool resource management utility. 71 * 72 * Memory is allocated in pages which are split into pieces according to 73 * the pool item size. Each page is kept on one of three lists in the 74 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages', 75 * for empty, full and partially-full pages respectively. The individual 76 * pool items are on a linked list headed by `ph_itemlist' in each page 77 * header. The memory for building the page list is either taken from 78 * the allocated pages themselves (for small pool items) or taken from 79 * an internal pool of page headers (`phpool'). 80 */ 81 82 /* List of all pools. 
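Kept sorted by pr_wchan: pool_init() inserts each new pool in name order, under pool_head_lock.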
Non static as needed by 'vmstat -m' */ 83 TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head); 84 85 /* Private pool for page header structures */ 86 #define PHPOOL_MAX 8 87 static struct pool phpool[PHPOOL_MAX]; 88 #define PHPOOL_FREELIST_NELEM(idx) \ 89 (((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx))) 90 91 #if !defined(KMSAN) && (defined(DIAGNOSTIC) || defined(KASAN)) 92 #define POOL_REDZONE 93 #endif 94 95 #if defined(POOL_QUARANTINE) 96 #define POOL_NOCACHE 97 #endif 98 99 #ifdef POOL_REDZONE 100 # ifdef KASAN 101 # define POOL_REDZONE_SIZE 8 102 # else 103 # define POOL_REDZONE_SIZE 2 104 # endif 105 static void pool_redzone_init(struct pool *, size_t); 106 static void pool_redzone_fill(struct pool *, void *); 107 static void pool_redzone_check(struct pool *, void *); 108 static void pool_cache_redzone_check(pool_cache_t, void *); 109 #else 110 # define pool_redzone_init(pp, sz) __nothing 111 # define pool_redzone_fill(pp, ptr) __nothing 112 # define pool_redzone_check(pp, ptr) __nothing 113 # define pool_cache_redzone_check(pc, ptr) __nothing 114 #endif 115 116 #ifdef KMSAN 117 static inline void pool_get_kmsan(struct pool *, void *); 118 static inline void pool_put_kmsan(struct pool *, void *); 119 static inline void pool_cache_get_kmsan(pool_cache_t, void *); 120 static inline void pool_cache_put_kmsan(pool_cache_t, void *); 121 #else 122 #define pool_get_kmsan(pp, ptr) __nothing 123 #define pool_put_kmsan(pp, ptr) __nothing 124 #define pool_cache_get_kmsan(pc, ptr) __nothing 125 #define pool_cache_put_kmsan(pc, ptr) __nothing 126 #endif 127 128 #ifdef POOL_QUARANTINE 129 static void pool_quarantine_init(struct pool *); 130 static void pool_quarantine_flush(struct pool *); 131 static bool pool_put_quarantine(struct pool *, void *, 132 struct pool_pagelist *); 133 #else 134 #define pool_quarantine_init(a) __nothing 135 #define pool_quarantine_flush(a) __nothing 136 #define pool_put_quarantine(a, b, c) false 137 #endif 138 139 #ifdef POOL_NOCACHE 140 static bool pool_cache_put_nocache(pool_cache_t, void *); 141 #else 142 #define pool_cache_put_nocache(a, b) false 143 #endif 144 145 #define NO_CTOR __FPTRCAST(int (*)(void *, void *, int), nullop) 146 #define NO_DTOR __FPTRCAST(void (*)(void *, void *), nullop) 147 148 #define pc_has_pser(pc) (((pc)->pc_roflags & PR_PSERIALIZE) != 0) 149 #define pc_has_ctor(pc) ((pc)->pc_ctor != NO_CTOR) 150 #define pc_has_dtor(pc) ((pc)->pc_dtor != NO_DTOR) 151 152 #define pp_has_pser(pp) (((pp)->pr_roflags & PR_PSERIALIZE) != 0) 153 154 #define pool_barrier() xc_barrier(0) 155 156 /* 157 * Pool backend allocators. 158 * 159 * Each pool has a backend allocator that handles allocation, deallocation, 160 * and any additional draining that might be needed. 161 * 162 * We provide two standard allocators: 163 * 164 * pool_allocator_kmem - the default when no allocator is specified 165 * 166 * pool_allocator_nointr - used for pools that will not be accessed 167 * in interrupt context. 
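 *
 * Illustrative sketch only (the "mydrv" names are hypothetical, not part
 * of this file): a subsystem wanting its own backend can declare
 *
 *      static struct pool_allocator mydrv_allocator = {
 *              .pa_alloc = pool_page_alloc,
 *              .pa_free = pool_page_free,
 *              .pa_pagesz = 0,
 *      };
 *
 * (a pa_pagesz of 0 is taken to mean PAGE_SIZE) and pass
 * &mydrv_allocator to pool_init(); passing a NULL allocator instead
 * selects pool_allocator_nointr or pool_allocator_kmem based on the
 * pool's IPL, or a pool_allocator_big[] entry for oversized items.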
168 */ 169 void *pool_page_alloc(struct pool *, int); 170 void pool_page_free(struct pool *, void *); 171 172 static void *pool_page_alloc_meta(struct pool *, int); 173 static void pool_page_free_meta(struct pool *, void *); 174 175 struct pool_allocator pool_allocator_kmem = { 176 .pa_alloc = pool_page_alloc, 177 .pa_free = pool_page_free, 178 .pa_pagesz = 0 179 }; 180 181 struct pool_allocator pool_allocator_nointr = { 182 .pa_alloc = pool_page_alloc, 183 .pa_free = pool_page_free, 184 .pa_pagesz = 0 185 }; 186 187 struct pool_allocator pool_allocator_meta = { 188 .pa_alloc = pool_page_alloc_meta, 189 .pa_free = pool_page_free_meta, 190 .pa_pagesz = 0 191 }; 192 193 #define POOL_ALLOCATOR_BIG_BASE 13 194 static struct pool_allocator pool_allocator_big[] = { 195 { 196 .pa_alloc = pool_page_alloc, 197 .pa_free = pool_page_free, 198 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0), 199 }, 200 { 201 .pa_alloc = pool_page_alloc, 202 .pa_free = pool_page_free, 203 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1), 204 }, 205 { 206 .pa_alloc = pool_page_alloc, 207 .pa_free = pool_page_free, 208 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2), 209 }, 210 { 211 .pa_alloc = pool_page_alloc, 212 .pa_free = pool_page_free, 213 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3), 214 }, 215 { 216 .pa_alloc = pool_page_alloc, 217 .pa_free = pool_page_free, 218 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4), 219 }, 220 { 221 .pa_alloc = pool_page_alloc, 222 .pa_free = pool_page_free, 223 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5), 224 }, 225 { 226 .pa_alloc = pool_page_alloc, 227 .pa_free = pool_page_free, 228 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6), 229 }, 230 { 231 .pa_alloc = pool_page_alloc, 232 .pa_free = pool_page_free, 233 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7), 234 }, 235 { 236 .pa_alloc = pool_page_alloc, 237 .pa_free = pool_page_free, 238 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 8), 239 }, 240 { 241 .pa_alloc = pool_page_alloc, 242 .pa_free = pool_page_free, 243 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 9), 244 }, 245 { 246 .pa_alloc = pool_page_alloc, 247 .pa_free = pool_page_free, 248 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 10), 249 }, 250 { 251 .pa_alloc = pool_page_alloc, 252 .pa_free = pool_page_free, 253 .pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 11), 254 } 255 }; 256 257 static int pool_bigidx(size_t); 258 259 /* # of seconds to retain page after last use */ 260 int pool_inactive_time = 10; 261 262 /* Next candidate for drainage (see pool_drain()) */ 263 static struct pool *drainpp; 264 265 /* This lock protects both pool_head and drainpp. 
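The pool_busy condvar is waited on under it until a pool's pr_refcnt drains; pool_drain() broadcasts it when dropping a reference (see pool_destroy()).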
*/ 266 static kmutex_t pool_head_lock; 267 static kcondvar_t pool_busy; 268 269 /* This lock protects initialization of a potentially shared pool allocator */ 270 static kmutex_t pool_allocator_lock; 271 272 static unsigned int poolid_counter = 0; 273 274 typedef uint32_t pool_item_bitmap_t; 275 #define BITMAP_SIZE (CHAR_BIT * sizeof(pool_item_bitmap_t)) 276 #define BITMAP_MASK (BITMAP_SIZE - 1) 277 #define BITMAP_MIN_SIZE (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2)) 278 279 struct pool_item_header { 280 /* Page headers */ 281 LIST_ENTRY(pool_item_header) 282 ph_pagelist; /* pool page list */ 283 union { 284 /* !PR_PHINPAGE */ 285 struct { 286 SPLAY_ENTRY(pool_item_header) 287 phu_node; /* off-page page headers */ 288 } phu_offpage; 289 /* PR_PHINPAGE */ 290 struct { 291 unsigned int phu_poolid; 292 } phu_onpage; 293 } ph_u1; 294 void * ph_page; /* this page's address */ 295 uint32_t ph_time; /* last referenced */ 296 uint16_t ph_nmissing; /* # of chunks in use */ 297 uint16_t ph_off; /* start offset in page */ 298 union { 299 /* !PR_USEBMAP */ 300 struct { 301 LIST_HEAD(, pool_item) 302 phu_itemlist; /* chunk list for this page */ 303 } phu_normal; 304 /* PR_USEBMAP */ 305 struct { 306 pool_item_bitmap_t phu_bitmap[1]; 307 } phu_notouch; 308 } ph_u2; 309 }; 310 #define ph_node ph_u1.phu_offpage.phu_node 311 #define ph_poolid ph_u1.phu_onpage.phu_poolid 312 #define ph_itemlist ph_u2.phu_normal.phu_itemlist 313 #define ph_bitmap ph_u2.phu_notouch.phu_bitmap 314 315 #define PHSIZE ALIGN(sizeof(struct pool_item_header)) 316 317 CTASSERT(offsetof(struct pool_item_header, ph_u2) + 318 BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header)); 319 320 #if defined(DIAGNOSTIC) && !defined(KASAN) 321 #define POOL_CHECK_MAGIC 322 #endif 323 324 struct pool_item { 325 #ifdef POOL_CHECK_MAGIC 326 u_int pi_magic; 327 #endif 328 #define PI_MAGIC 0xdeaddeadU 329 /* Other entries use only this list entry */ 330 LIST_ENTRY(pool_item) pi_list; 331 }; 332 333 #define POOL_NEEDS_CATCHUP(pp) \ 334 ((pp)->pr_nitems < (pp)->pr_minitems || \ 335 (pp)->pr_npages < (pp)->pr_minpages) 336 #define POOL_OBJ_TO_PAGE(pp, v) \ 337 (void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask) 338 339 /* 340 * Pool cache management. 341 * 342 * Pool caches provide a way for constructed objects to be cached by the 343 * pool subsystem. This can lead to performance improvements by avoiding 344 * needless object construction/destruction; it is deferred until absolutely 345 * necessary. 346 * 347 * Caches are grouped into cache groups. Each cache group references up 348 * to PCG_NUMOBJECTS constructed objects. When a cache allocates an 349 * object from the pool, it calls the object's constructor and places it 350 * into a cache group. When a cache group frees an object back to the 351 * pool, it first calls the object's destructor. This allows the object 352 * to persist in constructed form while freed to the cache. 353 * 354 * The pool references each cache, so that when a pool is drained by the 355 * pagedaemon, it can drain each individual cache as well. Each time a 356 * cache is drained, the most idle cache group is freed to the pool in 357 * its entirety. 358 * 359 * Pool caches are laid on top of pools. By layering them, we can avoid 360 * the complexity of cache management for pools which would not benefit 361 * from it. 
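 *
 * Illustrative sketch only (the "foo" names are hypothetical, not part of
 * this file): a typical consumer does
 *
 *      static pool_cache_t foo_cache;
 *
 *      foo_cache = pool_cache_init(sizeof(struct foo), coherency_unit,
 *          0, 0, "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
 *
 *      struct foo *f = pool_cache_get(foo_cache, PR_WAITOK);
 *      ...
 *      pool_cache_put(foo_cache, f);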
362 */ 363 364 static struct pool pcg_normal_pool; 365 static struct pool pcg_large_pool; 366 static struct pool cache_pool; 367 static struct pool cache_cpu_pool; 368 369 static pcg_t *volatile pcg_large_cache __cacheline_aligned; 370 static pcg_t *volatile pcg_normal_cache __cacheline_aligned; 371 372 /* List of all caches. */ 373 TAILQ_HEAD(,pool_cache) pool_cache_head = 374 TAILQ_HEAD_INITIALIZER(pool_cache_head); 375 376 int pool_cache_disable; /* global disable for caching */ 377 static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */ 378 379 static bool pool_cache_put_slow(pool_cache_t, pool_cache_cpu_t *, int, 380 void *); 381 static bool pool_cache_get_slow(pool_cache_t, pool_cache_cpu_t *, int, 382 void **, paddr_t *, int); 383 static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t); 384 static int pool_cache_invalidate_groups(pool_cache_t, pcg_t *); 385 static void pool_cache_invalidate_cpu(pool_cache_t, u_int); 386 static void pool_cache_transfer(pool_cache_t); 387 static int pool_pcg_get(pcg_t *volatile *, pcg_t **); 388 static int pool_pcg_put(pcg_t *volatile *, pcg_t *); 389 static pcg_t * pool_pcg_trunc(pcg_t *volatile *); 390 391 static int pool_catchup(struct pool *); 392 static void pool_prime_page(struct pool *, void *, 393 struct pool_item_header *); 394 static void pool_update_curpage(struct pool *); 395 396 static int pool_grow(struct pool *, int); 397 static void *pool_allocator_alloc(struct pool *, int); 398 static void pool_allocator_free(struct pool *, void *); 399 400 static void pool_print_pagelist(struct pool *, struct pool_pagelist *, 401 void (*)(const char *, ...) __printflike(1, 2)); 402 static void pool_print1(struct pool *, const char *, 403 void (*)(const char *, ...) __printflike(1, 2)); 404 405 static int pool_chk_page(struct pool *, const char *, 406 struct pool_item_header *); 407 408 /* -------------------------------------------------------------------------- */ 409 410 static inline unsigned int 411 pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph, 412 const void *v) 413 { 414 const char *cp = v; 415 unsigned int idx; 416 417 KASSERT(pp->pr_roflags & PR_USEBMAP); 418 idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size; 419 420 if (__predict_false(idx >= pp->pr_itemsperpage)) { 421 panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx, 422 pp->pr_itemsperpage); 423 } 424 425 return idx; 426 } 427 428 static inline void 429 pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph, 430 void *obj) 431 { 432 unsigned int idx = pr_item_bitmap_index(pp, ph, obj); 433 pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE); 434 pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK); 435 436 if (__predict_false((*bitmap & mask) != 0)) { 437 panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj); 438 } 439 440 *bitmap |= mask; 441 } 442 443 static inline void * 444 pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph) 445 { 446 pool_item_bitmap_t *bitmap = ph->ph_bitmap; 447 unsigned int idx; 448 int i; 449 450 for (i = 0; ; i++) { 451 int bit; 452 453 KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage); 454 bit = ffs32(bitmap[i]); 455 if (bit) { 456 pool_item_bitmap_t mask; 457 458 bit--; 459 idx = (i * BITMAP_SIZE) + bit; 460 mask = 1U << bit; 461 KASSERT((bitmap[i] & mask) != 0); 462 bitmap[i] &= ~mask; 463 break; 464 } 465 } 466 KASSERT(idx < pp->pr_itemsperpage); 467 return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size; 468 } 469 470 
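/*
 * Worked example (illustrative, not from the original source): with
 * pool_item_bitmap_t being uint32_t, BITMAP_SIZE is 32, so item index 37
 * lives in ph_bitmap[37 / 32] == ph_bitmap[1] at bit 37 & BITMAP_MASK == 5.
 * pr_item_bitmap_init() below sets every bit, a set bit meaning "free";
 * pr_item_bitmap_get() clears the first set bit it finds, and
 * pr_item_bitmap_put() sets it again, panicking on a double free.
 */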
static inline void 471 pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph) 472 { 473 pool_item_bitmap_t *bitmap = ph->ph_bitmap; 474 const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE); 475 int i; 476 477 for (i = 0; i < n; i++) { 478 bitmap[i] = (pool_item_bitmap_t)-1; 479 } 480 } 481 482 /* -------------------------------------------------------------------------- */ 483 484 static inline void 485 pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph, 486 void *obj) 487 { 488 struct pool_item *pi = obj; 489 490 KASSERT(!pp_has_pser(pp)); 491 492 #ifdef POOL_CHECK_MAGIC 493 pi->pi_magic = PI_MAGIC; 494 #endif 495 496 if (pp->pr_redzone) { 497 /* 498 * Mark the pool_item as valid. The rest is already 499 * invalid. 500 */ 501 kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0); 502 } 503 504 LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); 505 } 506 507 static inline void * 508 pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph) 509 { 510 struct pool_item *pi; 511 void *v; 512 513 v = pi = LIST_FIRST(&ph->ph_itemlist); 514 if (__predict_false(v == NULL)) { 515 mutex_exit(&pp->pr_lock); 516 panic("%s: [%s] page empty", __func__, pp->pr_wchan); 517 } 518 KASSERTMSG((pp->pr_nitems > 0), 519 "%s: [%s] nitems %u inconsistent on itemlist", 520 __func__, pp->pr_wchan, pp->pr_nitems); 521 #ifdef POOL_CHECK_MAGIC 522 KASSERTMSG((pi->pi_magic == PI_MAGIC), 523 "%s: [%s] free list modified: " 524 "magic=%x; page %p; item addr %p", __func__, 525 pp->pr_wchan, pi->pi_magic, ph->ph_page, pi); 526 #endif 527 528 /* 529 * Remove from item list. 530 */ 531 LIST_REMOVE(pi, pi_list); 532 533 return v; 534 } 535 536 /* -------------------------------------------------------------------------- */ 537 538 static inline void 539 pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page, 540 void *object) 541 { 542 if (__predict_false((void *)ph->ph_page != page)) { 543 panic("%s: [%s] item %p not part of pool", __func__, 544 pp->pr_wchan, object); 545 } 546 if (__predict_false((char *)object < (char *)page + ph->ph_off)) { 547 panic("%s: [%s] item %p below item space", __func__, 548 pp->pr_wchan, object); 549 } 550 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { 551 panic("%s: [%s] item %p poolid %u != %u", __func__, 552 pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid); 553 } 554 } 555 556 static inline void 557 pc_phinpage_check(pool_cache_t pc, void *object) 558 { 559 struct pool_item_header *ph; 560 struct pool *pp; 561 void *page; 562 563 pp = &pc->pc_pool; 564 page = POOL_OBJ_TO_PAGE(pp, object); 565 ph = (struct pool_item_header *)page; 566 567 pr_phinpage_check(pp, ph, page, object); 568 } 569 570 /* -------------------------------------------------------------------------- */ 571 572 static inline int 573 phtree_compare(struct pool_item_header *a, struct pool_item_header *b) 574 { 575 576 /* 577 * We consider pool_item_header with smaller ph_page bigger. This 578 * unnatural ordering is for the benefit of pr_find_pagehead. 
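 * (pr_find_pagehead_noalign() wants, for an item address, the header with
 * the greatest ph_page not exceeding it; under this reversed ordering a
 * failed SPLAY_FIND followed by at most one SPLAY_NEXT step yields it.)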
579 */ 580 if (a->ph_page < b->ph_page) 581 return 1; 582 else if (a->ph_page > b->ph_page) 583 return -1; 584 else 585 return 0; 586 } 587 588 SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare); 589 SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare); 590 591 static inline struct pool_item_header * 592 pr_find_pagehead_noalign(struct pool *pp, void *v) 593 { 594 struct pool_item_header *ph, tmp; 595 596 tmp.ph_page = (void *)(uintptr_t)v; 597 ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); 598 if (ph == NULL) { 599 ph = SPLAY_ROOT(&pp->pr_phtree); 600 if (ph != NULL && phtree_compare(&tmp, ph) >= 0) { 601 ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph); 602 } 603 KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0); 604 } 605 606 return ph; 607 } 608 609 /* 610 * Return the pool page header based on item address. 611 */ 612 static inline struct pool_item_header * 613 pr_find_pagehead(struct pool *pp, void *v) 614 { 615 struct pool_item_header *ph, tmp; 616 617 if ((pp->pr_roflags & PR_NOALIGN) != 0) { 618 ph = pr_find_pagehead_noalign(pp, v); 619 } else { 620 void *page = POOL_OBJ_TO_PAGE(pp, v); 621 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 622 ph = (struct pool_item_header *)page; 623 pr_phinpage_check(pp, ph, page, v); 624 } else { 625 tmp.ph_page = page; 626 ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp); 627 } 628 } 629 630 KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) || 631 ((char *)ph->ph_page <= (char *)v && 632 (char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz)); 633 return ph; 634 } 635 636 static void 637 pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq) 638 { 639 struct pool_item_header *ph; 640 641 while ((ph = LIST_FIRST(pq)) != NULL) { 642 LIST_REMOVE(ph, ph_pagelist); 643 pool_allocator_free(pp, ph->ph_page); 644 if ((pp->pr_roflags & PR_PHINPAGE) == 0) 645 pool_put(pp->pr_phpool, ph); 646 } 647 } 648 649 /* 650 * Remove a page from the pool. 651 */ 652 static inline void 653 pr_rmpage(struct pool *pp, struct pool_item_header *ph, 654 struct pool_pagelist *pq) 655 { 656 657 KASSERT(mutex_owned(&pp->pr_lock)); 658 659 /* 660 * If the page was idle, decrement the idle page count. 661 */ 662 if (ph->ph_nmissing == 0) { 663 KASSERT(pp->pr_nidle != 0); 664 KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage), 665 "%s: [%s] nitems=%u < itemsperpage=%u", __func__, 666 pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage); 667 pp->pr_nidle--; 668 } 669 670 pp->pr_nitems -= pp->pr_itemsperpage; 671 672 /* 673 * Unlink the page from the pool and queue it for release. 674 */ 675 LIST_REMOVE(ph, ph_pagelist); 676 if (pp->pr_roflags & PR_PHINPAGE) { 677 if (__predict_false(ph->ph_poolid != pp->pr_poolid)) { 678 panic("%s: [%s] ph %p poolid %u != %u", 679 __func__, pp->pr_wchan, ph, ph->ph_poolid, 680 pp->pr_poolid); 681 } 682 } else { 683 SPLAY_REMOVE(phtree, &pp->pr_phtree, ph); 684 } 685 LIST_INSERT_HEAD(pq, ph, ph_pagelist); 686 687 pp->pr_npages--; 688 pp->pr_npagefree++; 689 690 pool_update_curpage(pp); 691 } 692 693 /* 694 * Initialize all the pools listed in the "pools" link set. 695 */ 696 void 697 pool_subsystem_init(void) 698 { 699 size_t size; 700 int idx; 701 702 mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE); 703 mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE); 704 cv_init(&pool_busy, "poolbusy"); 705 706 /* 707 * Initialize private page header pool and cache magazine pool if we 708 * haven't done so yet. 
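 * (phpool[idx] serves the off-page headers whose free-item bitmaps cover
 * up to PHPOOL_FREELIST_NELEM(idx) items: BITMAP_MIN_SIZE for idx 0 and
 * 32 << idx above that, e.g. 64, 64, 128, ..., 4096 on LP64.)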
709 */ 710 for (idx = 0; idx < PHPOOL_MAX; idx++) { 711 static char phpool_names[PHPOOL_MAX][6+1+6+1]; 712 int nelem; 713 size_t sz; 714 715 nelem = PHPOOL_FREELIST_NELEM(idx); 716 KASSERT(nelem != 0); 717 snprintf(phpool_names[idx], sizeof(phpool_names[idx]), 718 "phpool-%d", nelem); 719 sz = offsetof(struct pool_item_header, 720 ph_bitmap[howmany(nelem, BITMAP_SIZE)]); 721 pool_init(&phpool[idx], sz, 0, 0, 0, 722 phpool_names[idx], &pool_allocator_meta, IPL_VM); 723 } 724 725 size = sizeof(pcg_t) + 726 (PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t); 727 pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0, 728 "pcgnormal", &pool_allocator_meta, IPL_VM); 729 730 size = sizeof(pcg_t) + 731 (PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t); 732 pool_init(&pcg_large_pool, size, coherency_unit, 0, 0, 733 "pcglarge", &pool_allocator_meta, IPL_VM); 734 735 pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit, 736 0, 0, "pcache", &pool_allocator_meta, IPL_NONE); 737 738 pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit, 739 0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE); 740 } 741 742 static inline bool 743 pool_init_is_phinpage(const struct pool *pp) 744 { 745 size_t pagesize; 746 747 if (pp->pr_roflags & PR_PHINPAGE) { 748 return true; 749 } 750 if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) { 751 return false; 752 } 753 754 pagesize = pp->pr_alloc->pa_pagesz; 755 756 /* 757 * Threshold: the item size is below 1/16 of a page size, and below 758 * 8 times the page header size. The latter ensures we go off-page 759 * if the page header would make us waste a rather big item. 760 */ 761 if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) { 762 return true; 763 } 764 765 /* Put the header into the page if it doesn't waste any items. */ 766 if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) { 767 return true; 768 } 769 770 return false; 771 } 772 773 static inline bool 774 pool_init_is_usebmap(const struct pool *pp) 775 { 776 size_t bmapsize; 777 778 if (pp->pr_roflags & PR_NOTOUCH) { 779 return true; 780 } 781 782 /* 783 * If we're off-page, go with a bitmap. 784 */ 785 if (!(pp->pr_roflags & PR_PHINPAGE)) { 786 return true; 787 } 788 789 /* 790 * If we're on-page, and the page header can already contain a bitmap 791 * big enough to cover all the items of the page, go with a bitmap. 792 */ 793 bmapsize = roundup(PHSIZE, pp->pr_align) - 794 offsetof(struct pool_item_header, ph_bitmap[0]); 795 KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0); 796 if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) { 797 return true; 798 } 799 800 return false; 801 } 802 803 /* 804 * Initialize the given pool resource structure. 805 * 806 * We export this routine to allow other kernel parts to declare 807 * static pools that must be initialized before kmem(9) is available. 808 */ 809 void 810 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags, 811 const char *wchan, struct pool_allocator *palloc, int ipl) 812 { 813 struct pool *pp1; 814 size_t prsize; 815 int itemspace, slack; 816 817 /* XXX ioff will be removed. */ 818 KASSERT(ioff == 0); 819 820 #ifdef DEBUG 821 if (__predict_true(!cold)) 822 mutex_enter(&pool_head_lock); 823 /* 824 * Check that the pool hasn't already been initialised and 825 * added to the list of all pools. 
826 */ 827 TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { 828 if (pp == pp1) 829 panic("%s: [%s] already initialised", __func__, 830 wchan); 831 } 832 if (__predict_true(!cold)) 833 mutex_exit(&pool_head_lock); 834 #endif 835 836 if (palloc == NULL) { 837 if (size > PAGE_SIZE) { 838 int bigidx = pool_bigidx(size); 839 840 palloc = &pool_allocator_big[bigidx]; 841 flags |= PR_NOALIGN; 842 } else if (ipl == IPL_NONE) { 843 palloc = &pool_allocator_nointr; 844 } else { 845 palloc = &pool_allocator_kmem; 846 } 847 } 848 849 if (!cold) 850 mutex_enter(&pool_allocator_lock); 851 if (palloc->pa_refcnt++ == 0) { 852 if (palloc->pa_pagesz == 0) 853 palloc->pa_pagesz = PAGE_SIZE; 854 855 TAILQ_INIT(&palloc->pa_list); 856 857 mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM); 858 palloc->pa_pagemask = ~(palloc->pa_pagesz - 1); 859 palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1; 860 } 861 if (!cold) 862 mutex_exit(&pool_allocator_lock); 863 864 /* 865 * PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain 866 * valid until the the backing page is returned to the system. 867 */ 868 if (flags & PR_PSERIALIZE) { 869 flags |= PR_NOTOUCH; 870 } 871 872 if (align == 0) 873 align = ALIGN(1); 874 875 prsize = size; 876 if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item)) 877 prsize = sizeof(struct pool_item); 878 879 prsize = roundup(prsize, align); 880 KASSERTMSG((prsize <= palloc->pa_pagesz), 881 "%s: [%s] pool item size (%zu) larger than page size (%u)", 882 __func__, wchan, prsize, palloc->pa_pagesz); 883 884 /* 885 * Initialize the pool structure. 886 */ 887 LIST_INIT(&pp->pr_emptypages); 888 LIST_INIT(&pp->pr_fullpages); 889 LIST_INIT(&pp->pr_partpages); 890 pp->pr_cache = NULL; 891 pp->pr_curpage = NULL; 892 pp->pr_npages = 0; 893 pp->pr_minitems = 0; 894 pp->pr_minpages = 0; 895 pp->pr_maxitems = UINT_MAX; 896 pp->pr_maxpages = UINT_MAX; 897 pp->pr_roflags = flags; 898 pp->pr_flags = 0; 899 pp->pr_size = prsize; 900 pp->pr_reqsize = size; 901 pp->pr_align = align; 902 pp->pr_wchan = wchan; 903 pp->pr_alloc = palloc; 904 pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter); 905 pp->pr_nitems = 0; 906 pp->pr_nout = 0; 907 pp->pr_hardlimit = UINT_MAX; 908 pp->pr_hardlimit_warning = NULL; 909 pp->pr_hardlimit_ratecap.tv_sec = 0; 910 pp->pr_hardlimit_ratecap.tv_usec = 0; 911 pp->pr_hardlimit_warning_last.tv_sec = 0; 912 pp->pr_hardlimit_warning_last.tv_usec = 0; 913 pp->pr_drain_hook = NULL; 914 pp->pr_drain_hook_arg = NULL; 915 pp->pr_freecheck = NULL; 916 pp->pr_redzone = false; 917 pool_redzone_init(pp, size); 918 pool_quarantine_init(pp); 919 920 /* 921 * Decide whether to put the page header off-page to avoid wasting too 922 * large a part of the page or too big an item. Off-page page headers 923 * go on a hash table, so we can match a returned item with its header 924 * based on the page address. 925 */ 926 if (pool_init_is_phinpage(pp)) { 927 /* Use the beginning of the page for the page header */ 928 itemspace = palloc->pa_pagesz - roundup(PHSIZE, align); 929 pp->pr_itemoffset = roundup(PHSIZE, align); 930 pp->pr_roflags |= PR_PHINPAGE; 931 } else { 932 /* The page header will be taken from our page header pool */ 933 itemspace = palloc->pa_pagesz; 934 pp->pr_itemoffset = 0; 935 SPLAY_INIT(&pp->pr_phtree); 936 } 937 938 pp->pr_itemsperpage = itemspace / pp->pr_size; 939 KASSERT(pp->pr_itemsperpage != 0); 940 941 /* 942 * Decide whether to use a bitmap or a linked list to manage freed 943 * items. 
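 * A bitmap keeps the free/allocated state in the page header, so free
 * items are never written to (which PR_NOTOUCH and PR_PSERIALIZE require);
 * the linked list instead threads a struct pool_item through the free
 * items themselves.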
944 */ 945 if (pool_init_is_usebmap(pp)) { 946 pp->pr_roflags |= PR_USEBMAP; 947 } 948 949 /* 950 * If we're off-page, then we're using a bitmap; choose the appropriate 951 * pool to allocate page headers, whose size varies depending on the 952 * bitmap. If we're on-page, nothing to do. 953 */ 954 if (!(pp->pr_roflags & PR_PHINPAGE)) { 955 int idx; 956 957 KASSERT(pp->pr_roflags & PR_USEBMAP); 958 959 for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx); 960 idx++) { 961 /* nothing */ 962 } 963 if (idx >= PHPOOL_MAX) { 964 /* 965 * if you see this panic, consider to tweak 966 * PHPOOL_MAX and PHPOOL_FREELIST_NELEM. 967 */ 968 panic("%s: [%s] too large itemsperpage(%d) for " 969 "PR_USEBMAP", __func__, 970 pp->pr_wchan, pp->pr_itemsperpage); 971 } 972 pp->pr_phpool = &phpool[idx]; 973 } else { 974 pp->pr_phpool = NULL; 975 } 976 977 /* 978 * Use the slack between the chunks and the page header 979 * for "cache coloring". 980 */ 981 slack = itemspace - pp->pr_itemsperpage * pp->pr_size; 982 pp->pr_maxcolor = rounddown(slack, align); 983 pp->pr_curcolor = 0; 984 985 pp->pr_nget = 0; 986 pp->pr_nfail = 0; 987 pp->pr_nput = 0; 988 pp->pr_npagealloc = 0; 989 pp->pr_npagefree = 0; 990 pp->pr_hiwat = 0; 991 pp->pr_nidle = 0; 992 pp->pr_refcnt = 0; 993 994 mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl); 995 cv_init(&pp->pr_cv, wchan); 996 pp->pr_ipl = ipl; 997 998 /* Insert into the list of all pools. */ 999 if (!cold) 1000 mutex_enter(&pool_head_lock); 1001 TAILQ_FOREACH(pp1, &pool_head, pr_poollist) { 1002 if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0) 1003 break; 1004 } 1005 if (pp1 == NULL) 1006 TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist); 1007 else 1008 TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist); 1009 if (!cold) 1010 mutex_exit(&pool_head_lock); 1011 1012 /* Insert this into the list of pools using this allocator. */ 1013 if (!cold) 1014 mutex_enter(&palloc->pa_lock); 1015 TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list); 1016 if (!cold) 1017 mutex_exit(&palloc->pa_lock); 1018 } 1019 1020 /* 1021 * De-commission a pool resource. 1022 */ 1023 void 1024 pool_destroy(struct pool *pp) 1025 { 1026 struct pool_pagelist pq; 1027 struct pool_item_header *ph; 1028 1029 pool_quarantine_flush(pp); 1030 1031 /* Remove from global pool list */ 1032 mutex_enter(&pool_head_lock); 1033 while (pp->pr_refcnt != 0) 1034 cv_wait(&pool_busy, &pool_head_lock); 1035 TAILQ_REMOVE(&pool_head, pp, pr_poollist); 1036 if (drainpp == pp) 1037 drainpp = NULL; 1038 mutex_exit(&pool_head_lock); 1039 1040 /* Remove this pool from its allocator's list of pools. 
*/ 1041 mutex_enter(&pp->pr_alloc->pa_lock); 1042 TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list); 1043 mutex_exit(&pp->pr_alloc->pa_lock); 1044 1045 mutex_enter(&pool_allocator_lock); 1046 if (--pp->pr_alloc->pa_refcnt == 0) 1047 mutex_destroy(&pp->pr_alloc->pa_lock); 1048 mutex_exit(&pool_allocator_lock); 1049 1050 mutex_enter(&pp->pr_lock); 1051 1052 KASSERT(pp->pr_cache == NULL); 1053 KASSERTMSG((pp->pr_nout == 0), 1054 "%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan, 1055 pp->pr_nout); 1056 KASSERT(LIST_EMPTY(&pp->pr_fullpages)); 1057 KASSERT(LIST_EMPTY(&pp->pr_partpages)); 1058 1059 /* Remove all pages */ 1060 LIST_INIT(&pq); 1061 while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) 1062 pr_rmpage(pp, ph, &pq); 1063 1064 mutex_exit(&pp->pr_lock); 1065 1066 pr_pagelist_free(pp, &pq); 1067 cv_destroy(&pp->pr_cv); 1068 mutex_destroy(&pp->pr_lock); 1069 } 1070 1071 void 1072 pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg) 1073 { 1074 1075 /* XXX no locking -- must be used just after pool_init() */ 1076 KASSERTMSG((pp->pr_drain_hook == NULL), 1077 "%s: [%s] already set", __func__, pp->pr_wchan); 1078 pp->pr_drain_hook = fn; 1079 pp->pr_drain_hook_arg = arg; 1080 } 1081 1082 static struct pool_item_header * 1083 pool_alloc_item_header(struct pool *pp, void *storage, int flags) 1084 { 1085 struct pool_item_header *ph; 1086 1087 if ((pp->pr_roflags & PR_PHINPAGE) != 0) 1088 ph = storage; 1089 else 1090 ph = pool_get(pp->pr_phpool, flags); 1091 1092 return ph; 1093 } 1094 1095 /* 1096 * Grab an item from the pool. 1097 */ 1098 void * 1099 pool_get(struct pool *pp, int flags) 1100 { 1101 struct pool_item_header *ph; 1102 void *v; 1103 1104 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 1105 KASSERTMSG((pp->pr_itemsperpage != 0), 1106 "%s: [%s] pr_itemsperpage is zero, " 1107 "pool not initialized?", __func__, pp->pr_wchan); 1108 KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p()) 1109 || pp->pr_ipl != IPL_NONE || cold || panicstr != NULL), 1110 "%s: [%s] is IPL_NONE, but called from interrupt context", 1111 __func__, pp->pr_wchan); 1112 if (flags & PR_WAITOK) { 1113 ASSERT_SLEEPABLE(); 1114 } 1115 1116 if (flags & PR_NOWAIT) { 1117 if (fault_inject()) 1118 return NULL; 1119 } 1120 1121 mutex_enter(&pp->pr_lock); 1122 startover: 1123 /* 1124 * Check to see if we've reached the hard limit. If we have, 1125 * and we can wait, then wait until an item has been returned to 1126 * the pool. 1127 */ 1128 KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit), 1129 "%s: %s: crossed hard limit", __func__, pp->pr_wchan); 1130 if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) { 1131 if (pp->pr_drain_hook != NULL) { 1132 /* 1133 * Since the drain hook is going to free things 1134 * back to the pool, unlock, call the hook, re-lock, 1135 * and check the hardlimit condition again. 1136 */ 1137 mutex_exit(&pp->pr_lock); 1138 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags); 1139 mutex_enter(&pp->pr_lock); 1140 if (pp->pr_nout < pp->pr_hardlimit) 1141 goto startover; 1142 } 1143 1144 if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) { 1145 /* 1146 * XXX: A warning isn't logged in this case. Should 1147 * it be? 1148 */ 1149 pp->pr_flags |= PR_WANTED; 1150 do { 1151 cv_wait(&pp->pr_cv, &pp->pr_lock); 1152 } while (pp->pr_flags & PR_WANTED); 1153 goto startover; 1154 } 1155 1156 /* 1157 * Log a message that the hard limit has been hit. 
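 * (rate-limited to one message per pr_hardlimit_ratecap, as set by
 * pool_sethardlimit()).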
1158 */ 1159 if (pp->pr_hardlimit_warning != NULL && 1160 ratecheck(&pp->pr_hardlimit_warning_last, 1161 &pp->pr_hardlimit_ratecap)) 1162 log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning); 1163 1164 pp->pr_nfail++; 1165 1166 mutex_exit(&pp->pr_lock); 1167 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); 1168 return NULL; 1169 } 1170 1171 /* 1172 * The convention we use is that if `curpage' is not NULL, then 1173 * it points at a non-empty bucket. In particular, `curpage' 1174 * never points at a page header which has PR_PHINPAGE set and 1175 * has no items in its bucket. 1176 */ 1177 if ((ph = pp->pr_curpage) == NULL) { 1178 int error; 1179 1180 KASSERTMSG((pp->pr_nitems == 0), 1181 "%s: [%s] curpage NULL, inconsistent nitems %u", 1182 __func__, pp->pr_wchan, pp->pr_nitems); 1183 1184 /* 1185 * Call the back-end page allocator for more memory. 1186 * Release the pool lock, as the back-end page allocator 1187 * may block. 1188 */ 1189 error = pool_grow(pp, flags); 1190 if (error != 0) { 1191 /* 1192 * pool_grow aborts when another thread 1193 * is allocating a new page. Retry if it 1194 * waited for it. 1195 */ 1196 if (error == ERESTART) 1197 goto startover; 1198 1199 /* 1200 * We were unable to allocate a page or item 1201 * header, but we released the lock during 1202 * allocation, so perhaps items were freed 1203 * back to the pool. Check for this case. 1204 */ 1205 if (pp->pr_curpage != NULL) 1206 goto startover; 1207 1208 pp->pr_nfail++; 1209 mutex_exit(&pp->pr_lock); 1210 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); 1211 return NULL; 1212 } 1213 1214 /* Start the allocation process over. */ 1215 goto startover; 1216 } 1217 if (pp->pr_roflags & PR_USEBMAP) { 1218 KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage), 1219 "%s: [%s] pool page empty", __func__, pp->pr_wchan); 1220 v = pr_item_bitmap_get(pp, ph); 1221 } else { 1222 v = pr_item_linkedlist_get(pp, ph); 1223 } 1224 pp->pr_nitems--; 1225 pp->pr_nout++; 1226 if (ph->ph_nmissing == 0) { 1227 KASSERT(pp->pr_nidle > 0); 1228 pp->pr_nidle--; 1229 1230 /* 1231 * This page was previously empty. Move it to the list of 1232 * partially-full pages. This page is already curpage. 1233 */ 1234 LIST_REMOVE(ph, ph_pagelist); 1235 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); 1236 } 1237 ph->ph_nmissing++; 1238 if (ph->ph_nmissing == pp->pr_itemsperpage) { 1239 KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) || 1240 LIST_EMPTY(&ph->ph_itemlist)), 1241 "%s: [%s] nmissing (%u) inconsistent", __func__, 1242 pp->pr_wchan, ph->ph_nmissing); 1243 /* 1244 * This page is now full. Move it to the full list 1245 * and select a new current page. 1246 */ 1247 LIST_REMOVE(ph, ph_pagelist); 1248 LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist); 1249 pool_update_curpage(pp); 1250 } 1251 1252 pp->pr_nget++; 1253 1254 /* 1255 * If we have a low water mark and we are now below that low 1256 * water mark, add more items to the pool. 1257 */ 1258 if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { 1259 /* 1260 * XXX: Should we log a warning? Should we set up a timeout 1261 * to try again in a second or so? The latter could break 1262 * a caller's assumptions about interrupt protection, etc. 1263 */ 1264 } 1265 1266 mutex_exit(&pp->pr_lock); 1267 KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0); 1268 FREECHECK_OUT(&pp->pr_freecheck, v); 1269 pool_redzone_fill(pp, v); 1270 pool_get_kmsan(pp, v); 1271 if (flags & PR_ZERO) 1272 memset(v, 0, pp->pr_reqsize); 1273 return v; 1274 } 1275 1276 /* 1277 * Internal version of pool_put(). 
Pool is already locked/entered. 1278 */ 1279 static void 1280 pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq) 1281 { 1282 struct pool_item_header *ph; 1283 1284 KASSERT(mutex_owned(&pp->pr_lock)); 1285 pool_redzone_check(pp, v); 1286 pool_put_kmsan(pp, v); 1287 FREECHECK_IN(&pp->pr_freecheck, v); 1288 LOCKDEBUG_MEM_CHECK(v, pp->pr_size); 1289 1290 KASSERTMSG((pp->pr_nout > 0), 1291 "%s: [%s] putting with none out", __func__, pp->pr_wchan); 1292 1293 if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) { 1294 panic("%s: [%s] page header missing", __func__, pp->pr_wchan); 1295 } 1296 1297 /* 1298 * Return to item list. 1299 */ 1300 if (pp->pr_roflags & PR_USEBMAP) { 1301 pr_item_bitmap_put(pp, ph, v); 1302 } else { 1303 pr_item_linkedlist_put(pp, ph, v); 1304 } 1305 KDASSERT(ph->ph_nmissing != 0); 1306 ph->ph_nmissing--; 1307 pp->pr_nput++; 1308 pp->pr_nitems++; 1309 pp->pr_nout--; 1310 1311 /* Cancel "pool empty" condition if it exists */ 1312 if (pp->pr_curpage == NULL) 1313 pp->pr_curpage = ph; 1314 1315 if (pp->pr_flags & PR_WANTED) { 1316 pp->pr_flags &= ~PR_WANTED; 1317 cv_broadcast(&pp->pr_cv); 1318 } 1319 1320 /* 1321 * If this page is now empty, do one of two things: 1322 * 1323 * (1) If we have more pages than the page high water mark, 1324 * free the page back to the system. ONLY CONSIDER 1325 * FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE 1326 * CLAIM. 1327 * 1328 * (2) Otherwise, move the page to the empty page list. 1329 * 1330 * Either way, select a new current page (so we use a partially-full 1331 * page if one is available). 1332 */ 1333 if (ph->ph_nmissing == 0) { 1334 pp->pr_nidle++; 1335 if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems && 1336 pp->pr_npages > pp->pr_minpages && 1337 (pp->pr_npages > pp->pr_maxpages || 1338 pp->pr_nitems > pp->pr_maxitems)) { 1339 pr_rmpage(pp, ph, pq); 1340 } else { 1341 LIST_REMOVE(ph, ph_pagelist); 1342 LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); 1343 1344 /* 1345 * Update the timestamp on the page. A page must 1346 * be idle for some period of time before it can 1347 * be reclaimed by the pagedaemon. This minimizes 1348 * ping-pong'ing for memory. 1349 * 1350 * note for 64-bit time_t: truncating to 32-bit is not 1351 * a problem for our usage. 1352 */ 1353 ph->ph_time = time_uptime; 1354 } 1355 pool_update_curpage(pp); 1356 } 1357 1358 /* 1359 * If the page was previously completely full, move it to the 1360 * partially-full list and make it the current page. The next 1361 * allocation will get the item from this page, instead of 1362 * further fragmenting the pool. 1363 */ 1364 else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) { 1365 LIST_REMOVE(ph, ph_pagelist); 1366 LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist); 1367 pp->pr_curpage = ph; 1368 } 1369 } 1370 1371 void 1372 pool_put(struct pool *pp, void *v) 1373 { 1374 struct pool_pagelist pq; 1375 1376 LIST_INIT(&pq); 1377 1378 mutex_enter(&pp->pr_lock); 1379 if (!pool_put_quarantine(pp, v, &pq)) { 1380 pool_do_put(pp, v, &pq); 1381 } 1382 mutex_exit(&pp->pr_lock); 1383 1384 pr_pagelist_free(pp, &pq); 1385 } 1386 1387 /* 1388 * pool_grow: grow a pool by a page. 1389 * 1390 * => called with pool locked. 1391 * => unlock and relock the pool. 1392 * => return with pool locked. 
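 * => may fail with ERESTART if another thread was already growing the
 *    pool; callers such as pool_get() and pool_catchup() simply retry.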
1393 */ 1394 1395 static int 1396 pool_grow(struct pool *pp, int flags) 1397 { 1398 struct pool_item_header *ph; 1399 char *storage; 1400 1401 /* 1402 * If there's a pool_grow in progress, wait for it to complete 1403 * and try again from the top. 1404 */ 1405 if (pp->pr_flags & PR_GROWING) { 1406 if (flags & PR_WAITOK) { 1407 do { 1408 cv_wait(&pp->pr_cv, &pp->pr_lock); 1409 } while (pp->pr_flags & PR_GROWING); 1410 return SET_ERROR(ERESTART); 1411 } else { 1412 if (pp->pr_flags & PR_GROWINGNOWAIT) { 1413 /* 1414 * This needs an unlock/relock dance so 1415 * that the other caller has a chance to 1416 * run and actually do the thing. Note 1417 * that this is effectively a busy-wait. 1418 */ 1419 mutex_exit(&pp->pr_lock); 1420 mutex_enter(&pp->pr_lock); 1421 return SET_ERROR(ERESTART); 1422 } 1423 return SET_ERROR(EWOULDBLOCK); 1424 } 1425 } 1426 pp->pr_flags |= PR_GROWING; 1427 if (flags & PR_WAITOK) 1428 mutex_exit(&pp->pr_lock); 1429 else 1430 pp->pr_flags |= PR_GROWINGNOWAIT; 1431 1432 storage = pool_allocator_alloc(pp, flags); 1433 if (__predict_false(storage == NULL)) 1434 goto out; 1435 1436 ph = pool_alloc_item_header(pp, storage, flags); 1437 if (__predict_false(ph == NULL)) { 1438 pool_allocator_free(pp, storage); 1439 goto out; 1440 } 1441 1442 if (flags & PR_WAITOK) 1443 mutex_enter(&pp->pr_lock); 1444 pool_prime_page(pp, storage, ph); 1445 pp->pr_npagealloc++; 1446 KASSERT(pp->pr_flags & PR_GROWING); 1447 pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); 1448 /* 1449 * If anyone was waiting for pool_grow, notify them that we 1450 * may have just done it. 1451 */ 1452 cv_broadcast(&pp->pr_cv); 1453 return 0; 1454 out: 1455 if (flags & PR_WAITOK) 1456 mutex_enter(&pp->pr_lock); 1457 KASSERT(pp->pr_flags & PR_GROWING); 1458 pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT); 1459 return SET_ERROR(ENOMEM); 1460 } 1461 1462 void 1463 pool_prime(struct pool *pp, int n) 1464 { 1465 1466 mutex_enter(&pp->pr_lock); 1467 pp->pr_minpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1468 if (pp->pr_maxpages <= pp->pr_minpages) 1469 pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */ 1470 while (pp->pr_npages < pp->pr_minpages) 1471 (void) pool_grow(pp, PR_WAITOK); 1472 mutex_exit(&pp->pr_lock); 1473 } 1474 1475 /* 1476 * Add a page worth of items to the pool. 1477 * 1478 * Note, we must be called with the pool descriptor LOCKED. 1479 */ 1480 static void 1481 pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph) 1482 { 1483 const unsigned int align = pp->pr_align; 1484 struct pool_item *pi; 1485 void *cp = storage; 1486 int n; 1487 1488 KASSERT(mutex_owned(&pp->pr_lock)); 1489 KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) || 1490 (((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)), 1491 "%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp); 1492 1493 /* 1494 * Insert page header. 1495 */ 1496 LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist); 1497 LIST_INIT(&ph->ph_itemlist); 1498 ph->ph_page = storage; 1499 ph->ph_nmissing = 0; 1500 ph->ph_time = time_uptime; 1501 if (pp->pr_roflags & PR_PHINPAGE) 1502 ph->ph_poolid = pp->pr_poolid; 1503 else 1504 SPLAY_INSERT(phtree, &pp->pr_phtree, ph); 1505 1506 pp->pr_nidle++; 1507 1508 /* 1509 * The item space starts after the on-page header, if any. 1510 */ 1511 ph->ph_off = pp->pr_itemoffset; 1512 1513 /* 1514 * Color this page. 
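 * Successive pages start their items pr_align bytes further into the
 * page, wrapping once pr_curcolor passes pr_maxcolor, so items from
 * different pages do not all land on the same cache lines.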
1515 */ 1516 ph->ph_off += pp->pr_curcolor; 1517 cp = (char *)cp + ph->ph_off; 1518 if ((pp->pr_curcolor += align) > pp->pr_maxcolor) 1519 pp->pr_curcolor = 0; 1520 1521 KASSERT((((vaddr_t)cp) & (align - 1)) == 0); 1522 1523 /* 1524 * Insert remaining chunks on the bucket list. 1525 */ 1526 n = pp->pr_itemsperpage; 1527 pp->pr_nitems += n; 1528 1529 if (pp->pr_roflags & PR_USEBMAP) { 1530 pr_item_bitmap_init(pp, ph); 1531 } else { 1532 while (n--) { 1533 pi = (struct pool_item *)cp; 1534 1535 KASSERT((((vaddr_t)pi) & (align - 1)) == 0); 1536 1537 /* Insert on page list */ 1538 LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list); 1539 #ifdef POOL_CHECK_MAGIC 1540 pi->pi_magic = PI_MAGIC; 1541 #endif 1542 cp = (char *)cp + pp->pr_size; 1543 1544 KASSERT((((vaddr_t)cp) & (align - 1)) == 0); 1545 } 1546 } 1547 1548 /* 1549 * If the pool was depleted, point at the new page. 1550 */ 1551 if (pp->pr_curpage == NULL) 1552 pp->pr_curpage = ph; 1553 1554 if (++pp->pr_npages > pp->pr_hiwat) 1555 pp->pr_hiwat = pp->pr_npages; 1556 } 1557 1558 /* 1559 * Used by pool_get() when nitems drops below the low water mark. This 1560 * is used to catch up pr_nitems with the low water mark. 1561 * 1562 * Note 1, we never wait for memory here, we let the caller decide what to do. 1563 * 1564 * Note 2, we must be called with the pool already locked, and we return 1565 * with it locked. 1566 */ 1567 static int 1568 pool_catchup(struct pool *pp) 1569 { 1570 int error = 0; 1571 1572 while (POOL_NEEDS_CATCHUP(pp)) { 1573 error = pool_grow(pp, PR_NOWAIT); 1574 if (error) { 1575 if (error == ERESTART) 1576 continue; 1577 break; 1578 } 1579 } 1580 return error; 1581 } 1582 1583 static void 1584 pool_update_curpage(struct pool *pp) 1585 { 1586 1587 pp->pr_curpage = LIST_FIRST(&pp->pr_partpages); 1588 if (pp->pr_curpage == NULL) { 1589 pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages); 1590 } 1591 KASSERTMSG((pp->pr_curpage == NULL) == (pp->pr_nitems == 0), 1592 "pp=%p curpage=%p nitems=%u", pp, pp->pr_curpage, pp->pr_nitems); 1593 } 1594 1595 void 1596 pool_setlowat(struct pool *pp, int n) 1597 { 1598 1599 mutex_enter(&pp->pr_lock); 1600 pp->pr_minitems = n; 1601 1602 /* Make sure we're caught up with the newly-set low water mark. */ 1603 if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) { 1604 /* 1605 * XXX: Should we log a warning? Should we set up a timeout 1606 * to try again in a second or so? The latter could break 1607 * a caller's assumptions about interrupt protection, etc. 1608 */ 1609 } 1610 1611 mutex_exit(&pp->pr_lock); 1612 } 1613 1614 void 1615 pool_sethiwat(struct pool *pp, int n) 1616 { 1617 1618 mutex_enter(&pp->pr_lock); 1619 1620 pp->pr_maxitems = n; 1621 1622 mutex_exit(&pp->pr_lock); 1623 } 1624 1625 void 1626 pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap) 1627 { 1628 1629 mutex_enter(&pp->pr_lock); 1630 1631 pp->pr_hardlimit = n; 1632 pp->pr_hardlimit_warning = warnmess; 1633 pp->pr_hardlimit_ratecap.tv_sec = ratecap; 1634 pp->pr_hardlimit_warning_last.tv_sec = 0; 1635 pp->pr_hardlimit_warning_last.tv_usec = 0; 1636 1637 pp->pr_maxpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage; 1638 1639 mutex_exit(&pp->pr_lock); 1640 } 1641 1642 unsigned int 1643 pool_nget(struct pool *pp) 1644 { 1645 1646 return pp->pr_nget; 1647 } 1648 1649 unsigned int 1650 pool_nput(struct pool *pp) 1651 { 1652 1653 return pp->pr_nput; 1654 } 1655 1656 /* 1657 * Release all complete pages that have not been used recently. 1658 * 1659 * Must not be called from interrupt context. 
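 * Returns nonzero if any pages were freed back to the allocator.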
1660 */ 1661 int 1662 pool_reclaim(struct pool *pp) 1663 { 1664 struct pool_item_header *ph, *phnext; 1665 struct pool_pagelist pq; 1666 struct pool_cache *pc; 1667 uint32_t curtime; 1668 bool klock; 1669 int rv; 1670 1671 KASSERT(!cpu_intr_p()); 1672 KASSERT(!cpu_softintr_p()); 1673 1674 if (pp->pr_drain_hook != NULL) { 1675 /* 1676 * The drain hook must be called with the pool unlocked. 1677 */ 1678 (*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT); 1679 } 1680 1681 /* 1682 * XXXSMP Because we do not want to cause non-MPSAFE code 1683 * to block. 1684 */ 1685 if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK || 1686 pp->pr_ipl == IPL_SOFTSERIAL) { 1687 KERNEL_LOCK(1, NULL); 1688 klock = true; 1689 } else 1690 klock = false; 1691 1692 /* Reclaim items from the pool's cache (if any). */ 1693 if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) 1694 pool_cache_invalidate(pc); 1695 1696 if (mutex_tryenter(&pp->pr_lock) == 0) { 1697 if (klock) { 1698 KERNEL_UNLOCK_ONE(NULL); 1699 } 1700 return 0; 1701 } 1702 1703 LIST_INIT(&pq); 1704 1705 curtime = time_uptime; 1706 1707 for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) { 1708 phnext = LIST_NEXT(ph, ph_pagelist); 1709 1710 /* Check our minimum page claim */ 1711 if (pp->pr_npages <= pp->pr_minpages) 1712 break; 1713 1714 KASSERT(ph->ph_nmissing == 0); 1715 if (curtime - ph->ph_time < pool_inactive_time) 1716 continue; 1717 1718 /* 1719 * If freeing this page would put us below the minimum free items 1720 * or the minimum pages, stop now. 1721 */ 1722 if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems || 1723 pp->pr_npages - 1 < pp->pr_minpages) 1724 break; 1725 1726 pr_rmpage(pp, ph, &pq); 1727 } 1728 1729 mutex_exit(&pp->pr_lock); 1730 1731 if (LIST_EMPTY(&pq)) 1732 rv = 0; 1733 else { 1734 pr_pagelist_free(pp, &pq); 1735 rv = 1; 1736 } 1737 1738 if (klock) { 1739 KERNEL_UNLOCK_ONE(NULL); 1740 } 1741 1742 return rv; 1743 } 1744 1745 /* 1746 * Drain pools, one at a time. The drained pool is returned within ppp. 1747 * 1748 * Note, must never be called from interrupt context. 1749 */ 1750 bool 1751 pool_drain(struct pool **ppp) 1752 { 1753 bool reclaimed; 1754 struct pool *pp; 1755 1756 KASSERT(!TAILQ_EMPTY(&pool_head)); 1757 1758 pp = NULL; 1759 1760 /* Find next pool to drain, and add a reference. */ 1761 mutex_enter(&pool_head_lock); 1762 do { 1763 if (drainpp == NULL) { 1764 drainpp = TAILQ_FIRST(&pool_head); 1765 } 1766 if (drainpp != NULL) { 1767 pp = drainpp; 1768 drainpp = TAILQ_NEXT(pp, pr_poollist); 1769 } 1770 /* 1771 * Skip completely idle pools. We depend on at least 1772 * one pool in the system being active. 1773 */ 1774 } while (pp == NULL || pp->pr_npages == 0); 1775 pp->pr_refcnt++; 1776 mutex_exit(&pool_head_lock); 1777 1778 /* Drain the cache (if any) and pool.. */ 1779 reclaimed = pool_reclaim(pp); 1780 1781 /* Finally, unlock the pool. */ 1782 mutex_enter(&pool_head_lock); 1783 pp->pr_refcnt--; 1784 cv_broadcast(&pool_busy); 1785 mutex_exit(&pool_head_lock); 1786 1787 if (ppp != NULL) 1788 *ppp = pp; 1789 1790 return reclaimed; 1791 } 1792 1793 /* 1794 * Calculate the total number of pages consumed by pools. 
1795 */ 1796 int 1797 pool_totalpages(void) 1798 { 1799 1800 mutex_enter(&pool_head_lock); 1801 int pages = pool_totalpages_locked(); 1802 mutex_exit(&pool_head_lock); 1803 1804 return pages; 1805 } 1806 1807 int 1808 pool_totalpages_locked(void) 1809 { 1810 struct pool *pp; 1811 uint64_t total = 0; 1812 1813 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 1814 uint64_t bytes = 1815 (uint64_t)pp->pr_npages * pp->pr_alloc->pa_pagesz; 1816 1817 if ((pp->pr_roflags & PR_RECURSIVE) != 0) 1818 bytes -= ((uint64_t)pp->pr_nout * pp->pr_size); 1819 total += bytes; 1820 } 1821 1822 return atop(total); 1823 } 1824 1825 /* 1826 * Diagnostic helpers. 1827 */ 1828 1829 void 1830 pool_printall(const char *modif, void (*pr)(const char *, ...)) 1831 { 1832 struct pool *pp; 1833 1834 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 1835 pool_printit(pp, modif, pr); 1836 } 1837 } 1838 1839 void 1840 pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) 1841 { 1842 1843 if (pp == NULL) { 1844 (*pr)("Must specify a pool to print.\n"); 1845 return; 1846 } 1847 1848 pool_print1(pp, modif, pr); 1849 } 1850 1851 static void 1852 pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl, 1853 void (*pr)(const char *, ...)) 1854 { 1855 struct pool_item_header *ph; 1856 1857 LIST_FOREACH(ph, pl, ph_pagelist) { 1858 (*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n", 1859 ph->ph_page, ph->ph_nmissing, ph->ph_time); 1860 #ifdef POOL_CHECK_MAGIC 1861 struct pool_item *pi; 1862 if (!(pp->pr_roflags & PR_USEBMAP)) { 1863 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 1864 if (pi->pi_magic != PI_MAGIC) { 1865 (*pr)("\t\t\titem %p, magic 0x%x\n", 1866 pi, pi->pi_magic); 1867 } 1868 } 1869 } 1870 #endif 1871 } 1872 } 1873 1874 static void 1875 pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...)) 1876 { 1877 struct pool_item_header *ph; 1878 pool_cache_t pc; 1879 pcg_t *pcg; 1880 pool_cache_cpu_t *cc; 1881 uint64_t cpuhit, cpumiss, pchit, pcmiss; 1882 uint32_t nfull; 1883 int i; 1884 bool print_log = false, print_pagelist = false, print_cache = false; 1885 bool print_short = false, skip_empty = false; 1886 char c; 1887 1888 while ((c = *modif++) != '\0') { 1889 if (c == 'l') 1890 print_log = true; 1891 if (c == 'p') 1892 print_pagelist = true; 1893 if (c == 'c') 1894 print_cache = true; 1895 if (c == 's') 1896 print_short = true; 1897 if (c == 'S') 1898 skip_empty = true; 1899 } 1900 1901 if (skip_empty && pp->pr_nget == 0) 1902 return; 1903 1904 if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) { 1905 (*pr)("POOLCACHE"); 1906 } else { 1907 (*pr)("POOL"); 1908 } 1909 1910 /* Single line output. 
*/ 1911 if (print_short) { 1912 (*pr)(" %s:%p:%u:%u:%u:%u:%u:%u:%u:%u:%u:%u:%zu\n", 1913 pp->pr_wchan, pp, pp->pr_size, pp->pr_align, pp->pr_npages, 1914 pp->pr_nitems, pp->pr_nout, pp->pr_nget, pp->pr_nput, 1915 pp->pr_npagealloc, pp->pr_npagefree, pp->pr_nidle, 1916 (size_t)pp->pr_npagealloc * pp->pr_alloc->pa_pagesz); 1917 return; 1918 } 1919 1920 (*pr)(" %s: itemsize %u, totalmem %zu align %u, ioff %u, roflags 0x%08x\n", 1921 pp->pr_wchan, pp->pr_size, 1922 (size_t)pp->pr_npagealloc * pp->pr_alloc->pa_pagesz, 1923 pp->pr_align, pp->pr_itemoffset, pp->pr_roflags); 1924 (*pr)("\tpool %p, alloc %p\n", pp, pp->pr_alloc); 1925 (*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n", 1926 pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages); 1927 (*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n", 1928 pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit); 1929 1930 (*pr)("\tnget %lu, nfail %lu, nput %lu\n", 1931 pp->pr_nget, pp->pr_nfail, pp->pr_nput); 1932 (*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n", 1933 pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle); 1934 1935 if (!print_pagelist) 1936 goto skip_pagelist; 1937 1938 if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL) 1939 (*pr)("\n\tempty page list:\n"); 1940 pool_print_pagelist(pp, &pp->pr_emptypages, pr); 1941 if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL) 1942 (*pr)("\n\tfull page list:\n"); 1943 pool_print_pagelist(pp, &pp->pr_fullpages, pr); 1944 if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL) 1945 (*pr)("\n\tpartial-page list:\n"); 1946 pool_print_pagelist(pp, &pp->pr_partpages, pr); 1947 1948 if (pp->pr_curpage == NULL) 1949 (*pr)("\tno current page\n"); 1950 else 1951 (*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page); 1952 1953 skip_pagelist: 1954 if (print_log) 1955 goto skip_log; 1956 1957 (*pr)("\n"); 1958 1959 skip_log: 1960 1961 #define PR_GROUPLIST(pcg) \ 1962 (*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \ 1963 for (i = 0; i < pcg->pcg_size; i++) { \ 1964 if (pcg->pcg_objects[i].pcgo_pa != \ 1965 POOL_PADDR_INVALID) { \ 1966 (*pr)("\t\t\t%p, 0x%llx\n", \ 1967 pcg->pcg_objects[i].pcgo_va, \ 1968 (unsigned long long) \ 1969 pcg->pcg_objects[i].pcgo_pa); \ 1970 } else { \ 1971 (*pr)("\t\t\t%p\n", \ 1972 pcg->pcg_objects[i].pcgo_va); \ 1973 } \ 1974 } 1975 1976 if (pc != NULL) { 1977 cpuhit = 0; 1978 cpumiss = 0; 1979 pcmiss = 0; 1980 nfull = 0; 1981 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 1982 if ((cc = pc->pc_cpus[i]) == NULL) 1983 continue; 1984 cpuhit += cc->cc_hits; 1985 cpumiss += cc->cc_misses; 1986 pcmiss += cc->cc_pcmisses; 1987 nfull += cc->cc_nfull; 1988 } 1989 pchit = cpumiss - pcmiss; 1990 (*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss); 1991 (*pr)("\tcache layer hits %llu misses %llu\n", pchit, pcmiss); 1992 (*pr)("\tcache layer full groups %u\n", nfull); 1993 if (print_cache) { 1994 (*pr)("\tfull cache groups:\n"); 1995 for (pcg = pc->pc_fullgroups; pcg != NULL; 1996 pcg = pcg->pcg_next) { 1997 PR_GROUPLIST(pcg); 1998 } 1999 } 2000 } 2001 #undef PR_GROUPLIST 2002 } 2003 2004 static int 2005 pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph) 2006 { 2007 struct pool_item *pi; 2008 void *page; 2009 int n; 2010 2011 if ((pp->pr_roflags & PR_NOALIGN) == 0) { 2012 page = POOL_OBJ_TO_PAGE(pp, ph); 2013 if (page != ph->ph_page && 2014 (pp->pr_roflags & PR_PHINPAGE) != 0) { 2015 if (label != NULL) 2016 printf("%s: ", label); 2017 printf("pool(%p:%s): page inconsistency: page %p;" 
2018 " at page head addr %p (p %p)\n", pp, 2019 pp->pr_wchan, ph->ph_page, 2020 ph, page); 2021 return 1; 2022 } 2023 } 2024 2025 if ((pp->pr_roflags & PR_USEBMAP) != 0) 2026 return 0; 2027 2028 for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0; 2029 pi != NULL; 2030 pi = LIST_NEXT(pi,pi_list), n++) { 2031 2032 #ifdef POOL_CHECK_MAGIC 2033 if (pi->pi_magic != PI_MAGIC) { 2034 if (label != NULL) 2035 printf("%s: ", label); 2036 printf("pool(%s): free list modified: magic=%x;" 2037 " page %p; item ordinal %d; addr %p\n", 2038 pp->pr_wchan, pi->pi_magic, ph->ph_page, 2039 n, pi); 2040 panic("pool"); 2041 } 2042 #endif 2043 if ((pp->pr_roflags & PR_NOALIGN) != 0) { 2044 continue; 2045 } 2046 page = POOL_OBJ_TO_PAGE(pp, pi); 2047 if (page == ph->ph_page) 2048 continue; 2049 2050 if (label != NULL) 2051 printf("%s: ", label); 2052 printf("pool(%p:%s): page inconsistency: page %p;" 2053 " item ordinal %d; addr %p (p %p)\n", pp, 2054 pp->pr_wchan, ph->ph_page, 2055 n, pi, page); 2056 return 1; 2057 } 2058 return 0; 2059 } 2060 2061 2062 int 2063 pool_chk(struct pool *pp, const char *label) 2064 { 2065 struct pool_item_header *ph; 2066 int r = 0; 2067 2068 mutex_enter(&pp->pr_lock); 2069 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 2070 r = pool_chk_page(pp, label, ph); 2071 if (r) { 2072 goto out; 2073 } 2074 } 2075 LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { 2076 r = pool_chk_page(pp, label, ph); 2077 if (r) { 2078 goto out; 2079 } 2080 } 2081 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 2082 r = pool_chk_page(pp, label, ph); 2083 if (r) { 2084 goto out; 2085 } 2086 } 2087 2088 out: 2089 mutex_exit(&pp->pr_lock); 2090 return r; 2091 } 2092 2093 /* 2094 * pool_cache_init: 2095 * 2096 * Initialize a pool cache. 2097 */ 2098 pool_cache_t 2099 pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags, 2100 const char *wchan, struct pool_allocator *palloc, int ipl, 2101 int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg) 2102 { 2103 pool_cache_t pc; 2104 2105 pc = pool_get(&cache_pool, PR_WAITOK); 2106 if (pc == NULL) 2107 return NULL; 2108 2109 pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan, 2110 palloc, ipl, ctor, dtor, arg); 2111 2112 return pc; 2113 } 2114 2115 /* 2116 * pool_cache_bootstrap: 2117 * 2118 * Kernel-private version of pool_cache_init(). The caller 2119 * provides initial storage. 2120 */ 2121 void 2122 pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align, 2123 u_int align_offset, u_int flags, const char *wchan, 2124 struct pool_allocator *palloc, int ipl, 2125 int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), 2126 void *arg) 2127 { 2128 CPU_INFO_ITERATOR cii; 2129 pool_cache_t pc1; 2130 struct cpu_info *ci; 2131 struct pool *pp; 2132 unsigned int ppflags; 2133 2134 pp = &pc->pc_pool; 2135 ppflags = flags; 2136 if (ctor == NULL) { 2137 ctor = NO_CTOR; 2138 } 2139 if (dtor == NULL) { 2140 dtor = NO_DTOR; 2141 } else { 2142 /* 2143 * If we have a destructor, then the pool layer does not 2144 * need to worry about PR_PSERIALIZE. 
2145 */ 2146 ppflags &= ~PR_PSERIALIZE; 2147 } 2148 2149 pool_init(pp, size, align, align_offset, ppflags, wchan, palloc, ipl); 2150 2151 pc->pc_fullgroups = NULL; 2152 pc->pc_partgroups = NULL; 2153 pc->pc_ctor = ctor; 2154 pc->pc_dtor = dtor; 2155 pc->pc_arg = arg; 2156 pc->pc_refcnt = 0; 2157 pc->pc_roflags = flags; 2158 pc->pc_freecheck = NULL; 2159 2160 if ((flags & PR_LARGECACHE) != 0) { 2161 pc->pc_pcgsize = PCG_NOBJECTS_LARGE; 2162 pc->pc_pcgpool = &pcg_large_pool; 2163 pc->pc_pcgcache = &pcg_large_cache; 2164 } else { 2165 pc->pc_pcgsize = PCG_NOBJECTS_NORMAL; 2166 pc->pc_pcgpool = &pcg_normal_pool; 2167 pc->pc_pcgcache = &pcg_normal_cache; 2168 } 2169 2170 /* Allocate per-CPU caches. */ 2171 memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus)); 2172 pc->pc_ncpu = 0; 2173 if (ncpu < 2) { 2174 /* XXX For sparc: boot CPU is not attached yet. */ 2175 pool_cache_cpu_init1(curcpu(), pc); 2176 } else { 2177 for (CPU_INFO_FOREACH(cii, ci)) { 2178 pool_cache_cpu_init1(ci, pc); 2179 } 2180 } 2181 2182 /* Add to list of all pools. */ 2183 if (__predict_true(!cold)) 2184 mutex_enter(&pool_head_lock); 2185 TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) { 2186 if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0) 2187 break; 2188 } 2189 if (pc1 == NULL) 2190 TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist); 2191 else 2192 TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist); 2193 if (__predict_true(!cold)) 2194 mutex_exit(&pool_head_lock); 2195 2196 atomic_store_release(&pp->pr_cache, pc); 2197 } 2198 2199 /* 2200 * pool_cache_destroy: 2201 * 2202 * Destroy a pool cache. 2203 */ 2204 void 2205 pool_cache_destroy(pool_cache_t pc) 2206 { 2207 2208 pool_cache_bootstrap_destroy(pc); 2209 pool_put(&cache_pool, pc); 2210 } 2211 2212 /* 2213 * pool_cache_bootstrap_destroy: 2214 * 2215 * Destroy a pool cache. 2216 */ 2217 void 2218 pool_cache_bootstrap_destroy(pool_cache_t pc) 2219 { 2220 struct pool *pp = &pc->pc_pool; 2221 u_int i; 2222 2223 /* Remove it from the global list. */ 2224 mutex_enter(&pool_head_lock); 2225 while (pc->pc_refcnt != 0) 2226 cv_wait(&pool_busy, &pool_head_lock); 2227 TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist); 2228 mutex_exit(&pool_head_lock); 2229 2230 /* First, invalidate the entire cache. */ 2231 pool_cache_invalidate(pc); 2232 2233 /* Disassociate it from the pool. */ 2234 mutex_enter(&pp->pr_lock); 2235 atomic_store_relaxed(&pp->pr_cache, NULL); 2236 mutex_exit(&pp->pr_lock); 2237 2238 /* Destroy per-CPU data */ 2239 for (i = 0; i < __arraycount(pc->pc_cpus); i++) 2240 pool_cache_invalidate_cpu(pc, i); 2241 2242 /* Finally, destroy it. */ 2243 pool_destroy(pp); 2244 } 2245 2246 /* 2247 * pool_cache_cpu_init1: 2248 * 2249 * Called for each pool_cache whenever a new CPU is attached. 2250 */ 2251 static void 2252 pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc) 2253 { 2254 pool_cache_cpu_t *cc; 2255 int index; 2256 2257 index = ci->ci_index; 2258 2259 KASSERT(index < __arraycount(pc->pc_cpus)); 2260 2261 if ((cc = pc->pc_cpus[index]) != NULL) { 2262 return; 2263 } 2264 2265 /* 2266 * The first CPU is 'free'. This needs to be the case for 2267 * bootstrap - we may not be able to allocate yet. 
2268 */ 2269 if (pc->pc_ncpu == 0) { 2270 cc = &pc->pc_cpu0; 2271 pc->pc_ncpu = 1; 2272 } else { 2273 pc->pc_ncpu++; 2274 cc = pool_get(&cache_cpu_pool, PR_WAITOK); 2275 } 2276 2277 cc->cc_current = __UNCONST(&pcg_dummy); 2278 cc->cc_previous = __UNCONST(&pcg_dummy); 2279 cc->cc_pcgcache = pc->pc_pcgcache; 2280 cc->cc_hits = 0; 2281 cc->cc_misses = 0; 2282 cc->cc_pcmisses = 0; 2283 cc->cc_contended = 0; 2284 cc->cc_nfull = 0; 2285 cc->cc_npart = 0; 2286 2287 pc->pc_cpus[index] = cc; 2288 } 2289 2290 /* 2291 * pool_cache_cpu_init: 2292 * 2293 * Called whenever a new CPU is attached. 2294 */ 2295 void 2296 pool_cache_cpu_init(struct cpu_info *ci) 2297 { 2298 pool_cache_t pc; 2299 2300 mutex_enter(&pool_head_lock); 2301 TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) { 2302 pc->pc_refcnt++; 2303 mutex_exit(&pool_head_lock); 2304 2305 pool_cache_cpu_init1(ci, pc); 2306 2307 mutex_enter(&pool_head_lock); 2308 pc->pc_refcnt--; 2309 cv_broadcast(&pool_busy); 2310 } 2311 mutex_exit(&pool_head_lock); 2312 } 2313 2314 /* 2315 * pool_cache_reclaim: 2316 * 2317 * Reclaim memory from a pool cache. 2318 */ 2319 bool 2320 pool_cache_reclaim(pool_cache_t pc) 2321 { 2322 2323 return pool_reclaim(&pc->pc_pool); 2324 } 2325 2326 static inline void 2327 pool_cache_pre_destruct(pool_cache_t pc) 2328 { 2329 /* 2330 * Perform a passive serialization barrier before destructing 2331 * a batch of one or more objects. 2332 */ 2333 if (__predict_false(pc_has_pser(pc))) { 2334 pool_barrier(); 2335 } 2336 } 2337 2338 static void 2339 pool_cache_destruct_object1(pool_cache_t pc, void *object) 2340 { 2341 (*pc->pc_dtor)(pc->pc_arg, object); 2342 pool_put(&pc->pc_pool, object); 2343 } 2344 2345 /* 2346 * pool_cache_destruct_object: 2347 * 2348 * Force destruction of an object and its release back into 2349 * the pool. 2350 */ 2351 void 2352 pool_cache_destruct_object(pool_cache_t pc, void *object) 2353 { 2354 2355 FREECHECK_IN(&pc->pc_freecheck, object); 2356 2357 pool_cache_pre_destruct(pc); 2358 pool_cache_destruct_object1(pc, object); 2359 } 2360 2361 /* 2362 * pool_cache_invalidate_groups: 2363 * 2364 * Invalidate a chain of groups and destruct all objects. Return the 2365 * number of groups that were invalidated. 2366 */ 2367 static int 2368 pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg) 2369 { 2370 void *object; 2371 pcg_t *next; 2372 int i, n; 2373 2374 if (pcg == NULL) { 2375 return 0; 2376 } 2377 2378 pool_cache_pre_destruct(pc); 2379 2380 for (n = 0; pcg != NULL; pcg = next, n++) { 2381 next = pcg->pcg_next; 2382 2383 for (i = 0; i < pcg->pcg_avail; i++) { 2384 object = pcg->pcg_objects[i].pcgo_va; 2385 pool_cache_destruct_object1(pc, object); 2386 } 2387 2388 if (pcg->pcg_size == PCG_NOBJECTS_LARGE) { 2389 pool_put(&pcg_large_pool, pcg); 2390 } else { 2391 KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL); 2392 pool_put(&pcg_normal_pool, pcg); 2393 } 2394 } 2395 return n; 2396 } 2397 2398 /* 2399 * pool_cache_invalidate: 2400 * 2401 * Invalidate a pool cache (destruct and release all of the 2402 * cached objects). Does not reclaim objects from the pool. 2403 * 2404 * Note: For pool caches that provide constructed objects, there 2405 * is an assumption that another level of synchronization is occurring 2406 * between the input to the constructor and the cache invalidation. 2407 * 2408 * Invalidation is a costly process and should not be called from 2409 * interrupt context. 
 */
2411 void
2412 pool_cache_invalidate(pool_cache_t pc)
2413 {
2414 	uint64_t where;
2415 	pcg_t *pcg;
2416 	int n, s;
2417 
2418 	KASSERT(!cpu_intr_p());
2419 	KASSERT(!cpu_softintr_p());
2420 
2421 	if (ncpu < 2 || !mp_online) {
2422 		/*
2423 		 * We might be called early enough in the boot process
2424 		 * for the CPU data structures to not be fully initialized.
2425 		 * In this case, transfer the content of the local CPU's
2426 		 * cache back into the global cache, as only this CPU is
2427 		 * currently running.
2428 		 */
2429 		pool_cache_transfer(pc);
2430 	} else {
2431 		/*
2432 		 * Signal all CPUs that they must transfer their local
2433 		 * cache back to the global pool, then wait for the xcall
2434 		 * to complete.
2435 		 */
2436 		where = xc_broadcast(0,
2437 		    __FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL);
2438 		xc_wait(where);
2439 	}
2440 
2441 	/* Now dequeue and invalidate everything. */
2442 	pcg = pool_pcg_trunc(&pcg_normal_cache);
2443 	(void)pool_cache_invalidate_groups(pc, pcg);
2444 
2445 	pcg = pool_pcg_trunc(&pcg_large_cache);
2446 	(void)pool_cache_invalidate_groups(pc, pcg);
2447 
2448 	pcg = pool_pcg_trunc(&pc->pc_fullgroups);
2449 	n = pool_cache_invalidate_groups(pc, pcg);
2450 	s = splvm();
2451 	((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n;
2452 	splx(s);
2453 
2454 	pcg = pool_pcg_trunc(&pc->pc_partgroups);
2455 	n = pool_cache_invalidate_groups(pc, pcg);
2456 	s = splvm();
2457 	((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n;
2458 	splx(s);
2459 }
2460 
2461 /*
2462  * pool_cache_invalidate_cpu:
2463  *
2464  *	Invalidate all CPU-bound cached objects in the pool cache, the CPU
2465  *	being identified by its associated index.
2466  *	It is the caller's responsibility to ensure that no operation is
2467  *	taking place on this pool cache while doing this invalidation.
2468  *	WARNING: as no inter-CPU locking is enforced, trying to invalidate
2469  *	pool cached objects from a CPU different from the one currently
2470  *	running may result in undefined behaviour.
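 *	Within this file it is only called from pool_cache_bootstrap_destroy(),
 *	after the cache has been unlinked from pool_cache_head and fully
 *	invalidated.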
2471 */ 2472 static void 2473 pool_cache_invalidate_cpu(pool_cache_t pc, u_int index) 2474 { 2475 pool_cache_cpu_t *cc; 2476 pcg_t *pcg; 2477 2478 if ((cc = pc->pc_cpus[index]) == NULL) 2479 return; 2480 2481 if ((pcg = cc->cc_current) != &pcg_dummy) { 2482 pcg->pcg_next = NULL; 2483 pool_cache_invalidate_groups(pc, pcg); 2484 } 2485 if ((pcg = cc->cc_previous) != &pcg_dummy) { 2486 pcg->pcg_next = NULL; 2487 pool_cache_invalidate_groups(pc, pcg); 2488 } 2489 if (cc != &pc->pc_cpu0) 2490 pool_put(&cache_cpu_pool, cc); 2491 2492 } 2493 2494 void 2495 pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg) 2496 { 2497 2498 pool_set_drain_hook(&pc->pc_pool, fn, arg); 2499 } 2500 2501 void 2502 pool_cache_setlowat(pool_cache_t pc, int n) 2503 { 2504 2505 pool_setlowat(&pc->pc_pool, n); 2506 } 2507 2508 void 2509 pool_cache_sethiwat(pool_cache_t pc, int n) 2510 { 2511 2512 pool_sethiwat(&pc->pc_pool, n); 2513 } 2514 2515 void 2516 pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap) 2517 { 2518 2519 pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap); 2520 } 2521 2522 void 2523 pool_cache_prime(pool_cache_t pc, int n) 2524 { 2525 2526 pool_prime(&pc->pc_pool, n); 2527 } 2528 2529 unsigned int 2530 pool_cache_nget(pool_cache_t pc) 2531 { 2532 2533 return pool_nget(&pc->pc_pool); 2534 } 2535 2536 unsigned int 2537 pool_cache_nput(pool_cache_t pc) 2538 { 2539 2540 return pool_nput(&pc->pc_pool); 2541 } 2542 2543 /* 2544 * pool_pcg_get: 2545 * 2546 * Get a cache group from the specified list. Return true if 2547 * contention was encountered. Must be called at IPL_VM because 2548 * of spin wait vs. kernel_lock. 2549 */ 2550 static int 2551 pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp) 2552 { 2553 int count = SPINLOCK_BACKOFF_MIN; 2554 pcg_t *o, *n; 2555 2556 for (o = atomic_load_relaxed(head);; o = n) { 2557 if (__predict_false(o == &pcg_dummy)) { 2558 /* Wait for concurrent get to complete. */ 2559 SPINLOCK_BACKOFF(count); 2560 n = atomic_load_relaxed(head); 2561 continue; 2562 } 2563 if (__predict_false(o == NULL)) { 2564 break; 2565 } 2566 /* Lock out concurrent get/put. */ 2567 n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy)); 2568 if (o == n) { 2569 /* Fetch pointer to next item and then unlock. */ 2570 membar_datadep_consumer(); /* alpha */ 2571 n = atomic_load_relaxed(&o->pcg_next); 2572 atomic_store_release(head, n); 2573 break; 2574 } 2575 } 2576 *pcgp = o; 2577 return count != SPINLOCK_BACKOFF_MIN; 2578 } 2579 2580 /* 2581 * pool_pcg_trunc: 2582 * 2583 * Chop out entire list of pool cache groups. 2584 */ 2585 static pcg_t * 2586 pool_pcg_trunc(pcg_t *volatile *head) 2587 { 2588 int count = SPINLOCK_BACKOFF_MIN, s; 2589 pcg_t *o, *n; 2590 2591 s = splvm(); 2592 for (o = atomic_load_relaxed(head);; o = n) { 2593 if (__predict_false(o == &pcg_dummy)) { 2594 /* Wait for concurrent get to complete. */ 2595 SPINLOCK_BACKOFF(count); 2596 n = atomic_load_relaxed(head); 2597 continue; 2598 } 2599 n = atomic_cas_ptr(head, o, NULL); 2600 if (o == n) { 2601 splx(s); 2602 membar_datadep_consumer(); /* alpha */ 2603 return o; 2604 } 2605 } 2606 } 2607 2608 /* 2609 * pool_pcg_put: 2610 * 2611 * Put a pool cache group to the specified list. Return true if 2612 * contention was encountered. Must be called at IPL_VM because of 2613 * spin wait vs. kernel_lock. 
2614 */ 2615 static int 2616 pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg) 2617 { 2618 int count = SPINLOCK_BACKOFF_MIN; 2619 pcg_t *o, *n; 2620 2621 for (o = atomic_load_relaxed(head);; o = n) { 2622 if (__predict_false(o == &pcg_dummy)) { 2623 /* Wait for concurrent get to complete. */ 2624 SPINLOCK_BACKOFF(count); 2625 n = atomic_load_relaxed(head); 2626 continue; 2627 } 2628 pcg->pcg_next = o; 2629 membar_release(); 2630 n = atomic_cas_ptr(head, o, pcg); 2631 if (o == n) { 2632 return count != SPINLOCK_BACKOFF_MIN; 2633 } 2634 } 2635 } 2636 2637 static bool __noinline 2638 pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, 2639 void **objectp, paddr_t *pap, int flags) 2640 { 2641 pcg_t *pcg, *cur; 2642 void *object; 2643 2644 KASSERT(cc->cc_current->pcg_avail == 0); 2645 KASSERT(cc->cc_previous->pcg_avail == 0); 2646 2647 cc->cc_misses++; 2648 2649 /* 2650 * If there's a full group, release our empty group back to the 2651 * cache. Install the full group as cc_current and return. 2652 */ 2653 cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg); 2654 if (__predict_true(pcg != NULL)) { 2655 KASSERT(pcg->pcg_avail == pcg->pcg_size); 2656 if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) { 2657 KASSERT(cur->pcg_avail == 0); 2658 (void)pool_pcg_put(cc->cc_pcgcache, cur); 2659 } 2660 cc->cc_nfull--; 2661 cc->cc_current = pcg; 2662 return true; 2663 } 2664 2665 /* 2666 * Nothing available locally or in cache. Take the slow 2667 * path: fetch a new object from the pool and construct 2668 * it. 2669 */ 2670 cc->cc_pcmisses++; 2671 splx(s); 2672 2673 object = pool_get(&pc->pc_pool, flags); 2674 *objectp = object; 2675 if (__predict_false(object == NULL)) { 2676 KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0); 2677 return false; 2678 } 2679 2680 if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) { 2681 pool_put(&pc->pc_pool, object); 2682 *objectp = NULL; 2683 return false; 2684 } 2685 2686 KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0); 2687 2688 if (pap != NULL) { 2689 #ifdef POOL_VTOPHYS 2690 *pap = POOL_VTOPHYS(object); 2691 #else 2692 *pap = POOL_PADDR_INVALID; 2693 #endif 2694 } 2695 2696 FREECHECK_OUT(&pc->pc_freecheck, object); 2697 return false; 2698 } 2699 2700 /* 2701 * pool_cache_get{,_paddr}: 2702 * 2703 * Get an object from a pool cache (optionally returning 2704 * the physical address of the object). 2705 */ 2706 void * 2707 pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap) 2708 { 2709 pool_cache_cpu_t *cc; 2710 pcg_t *pcg; 2711 void *object; 2712 int s; 2713 2714 KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK)); 2715 if (pc->pc_pool.pr_ipl == IPL_NONE && 2716 __predict_true(!cold) && 2717 __predict_true(panicstr == NULL)) { 2718 KASSERTMSG(!cpu_intr_p(), 2719 "%s: [%s] is IPL_NONE, but called from interrupt context", 2720 __func__, pc->pc_pool.pr_wchan); 2721 KASSERTMSG(!cpu_softintr_p(), 2722 "%s: [%s] is IPL_NONE," 2723 " but called from soft interrupt context", 2724 __func__, pc->pc_pool.pr_wchan); 2725 } 2726 2727 if (flags & PR_WAITOK) { 2728 ASSERT_SLEEPABLE(); 2729 } 2730 2731 if (flags & PR_NOWAIT) { 2732 if (fault_inject()) 2733 return NULL; 2734 } 2735 2736 /* Lock out interrupts and disable preemption. */ 2737 s = splvm(); 2738 while (/* CONSTCOND */ true) { 2739 /* Try and allocate an object from the current group. 
*/ 2740 cc = pc->pc_cpus[curcpu()->ci_index]; 2741 pcg = cc->cc_current; 2742 if (__predict_true(pcg->pcg_avail > 0)) { 2743 object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va; 2744 if (__predict_false(pap != NULL)) 2745 *pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa; 2746 #if defined(DIAGNOSTIC) 2747 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL; 2748 KASSERT(pcg->pcg_avail < pcg->pcg_size); 2749 KASSERT(object != NULL); 2750 #endif 2751 cc->cc_hits++; 2752 splx(s); 2753 FREECHECK_OUT(&pc->pc_freecheck, object); 2754 pool_redzone_fill(&pc->pc_pool, object); 2755 pool_cache_get_kmsan(pc, object); 2756 return object; 2757 } 2758 2759 /* 2760 * That failed. If the previous group isn't empty, swap 2761 * it with the current group and allocate from there. 2762 */ 2763 pcg = cc->cc_previous; 2764 if (__predict_true(pcg->pcg_avail > 0)) { 2765 cc->cc_previous = cc->cc_current; 2766 cc->cc_current = pcg; 2767 continue; 2768 } 2769 2770 /* 2771 * Can't allocate from either group: try the slow path. 2772 * If get_slow() allocated an object for us, or if 2773 * no more objects are available, it will return false. 2774 * Otherwise, we need to retry. 2775 */ 2776 if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) { 2777 if (object != NULL) { 2778 kmsan_orig(object, pc->pc_pool.pr_size, 2779 KMSAN_TYPE_POOL, __RET_ADDR); 2780 } 2781 break; 2782 } 2783 } 2784 2785 /* 2786 * We would like to KASSERT(object || (flags & PR_NOWAIT)), but 2787 * pool_cache_get can fail even in the PR_WAITOK case, if the 2788 * constructor fails. 2789 */ 2790 return object; 2791 } 2792 2793 static bool __noinline 2794 pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object) 2795 { 2796 pcg_t *pcg, *cur; 2797 2798 KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size); 2799 KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size); 2800 2801 cc->cc_misses++; 2802 2803 /* 2804 * Try to get an empty group from the cache. If there are no empty 2805 * groups in the cache then allocate one. 2806 */ 2807 (void)pool_pcg_get(cc->cc_pcgcache, &pcg); 2808 if (__predict_false(pcg == NULL)) { 2809 if (__predict_true(!pool_cache_disable)) { 2810 pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT); 2811 } 2812 if (__predict_true(pcg != NULL)) { 2813 pcg->pcg_avail = 0; 2814 pcg->pcg_size = pc->pc_pcgsize; 2815 } 2816 } 2817 2818 /* 2819 * If there's a empty group, release our full group back to the 2820 * cache. Install the empty group to the local CPU and return. 2821 */ 2822 if (pcg != NULL) { 2823 KASSERT(pcg->pcg_avail == 0); 2824 if (__predict_false(cc->cc_previous == &pcg_dummy)) { 2825 cc->cc_previous = pcg; 2826 } else { 2827 cur = cc->cc_current; 2828 if (__predict_true(cur != &pcg_dummy)) { 2829 KASSERT(cur->pcg_avail == cur->pcg_size); 2830 cc->cc_contended += 2831 pool_pcg_put(&pc->pc_fullgroups, cur); 2832 cc->cc_nfull++; 2833 } 2834 cc->cc_current = pcg; 2835 } 2836 return true; 2837 } 2838 2839 /* 2840 * Nothing available locally or in cache, and we didn't 2841 * allocate an empty group. Take the slow path and destroy 2842 * the object here and now. 2843 */ 2844 cc->cc_pcmisses++; 2845 splx(s); 2846 pool_cache_destruct_object(pc, object); 2847 2848 return false; 2849 } 2850 2851 /* 2852 * pool_cache_put{,_paddr}: 2853 * 2854 * Put an object back to the pool cache (optionally caching the 2855 * physical address of the object). 
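 *
 *	Sketch of the paired calls (error handling trimmed; "foo_cache"
 *	is a hypothetical cache created with pool_cache_init()):
 *
 *		paddr_t pa;
 *		void *obj = pool_cache_get_paddr(foo_cache, PR_WAITOK, &pa);
 *		if (obj != NULL) {
 *			...
 *			pool_cache_put_paddr(foo_cache, obj, pa);
 *		}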
2856 */ 2857 void 2858 pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa) 2859 { 2860 pool_cache_cpu_t *cc; 2861 pcg_t *pcg; 2862 int s; 2863 2864 KASSERT(object != NULL); 2865 pool_cache_put_kmsan(pc, object); 2866 pool_cache_redzone_check(pc, object); 2867 FREECHECK_IN(&pc->pc_freecheck, object); 2868 2869 if (pc->pc_pool.pr_roflags & PR_PHINPAGE) { 2870 pc_phinpage_check(pc, object); 2871 } 2872 2873 if (pool_cache_put_nocache(pc, object)) { 2874 return; 2875 } 2876 2877 /* Lock out interrupts and disable preemption. */ 2878 s = splvm(); 2879 while (/* CONSTCOND */ true) { 2880 /* If the current group isn't full, release it there. */ 2881 cc = pc->pc_cpus[curcpu()->ci_index]; 2882 pcg = cc->cc_current; 2883 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2884 pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object; 2885 pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa; 2886 pcg->pcg_avail++; 2887 cc->cc_hits++; 2888 splx(s); 2889 return; 2890 } 2891 2892 /* 2893 * That failed. If the previous group isn't full, swap 2894 * it with the current group and try again. 2895 */ 2896 pcg = cc->cc_previous; 2897 if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) { 2898 cc->cc_previous = cc->cc_current; 2899 cc->cc_current = pcg; 2900 continue; 2901 } 2902 2903 /* 2904 * Can't free to either group: try the slow path. 2905 * If put_slow() releases the object for us, it 2906 * will return false. Otherwise we need to retry. 2907 */ 2908 if (!pool_cache_put_slow(pc, cc, s, object)) 2909 break; 2910 } 2911 } 2912 2913 /* 2914 * pool_cache_transfer: 2915 * 2916 * Transfer objects from the per-CPU cache to the global cache. 2917 * Run within a cross-call thread. 2918 */ 2919 static void 2920 pool_cache_transfer(pool_cache_t pc) 2921 { 2922 pool_cache_cpu_t *cc; 2923 pcg_t *prev, *cur; 2924 int s; 2925 2926 s = splvm(); 2927 cc = pc->pc_cpus[curcpu()->ci_index]; 2928 cur = cc->cc_current; 2929 cc->cc_current = __UNCONST(&pcg_dummy); 2930 prev = cc->cc_previous; 2931 cc->cc_previous = __UNCONST(&pcg_dummy); 2932 if (cur != &pcg_dummy) { 2933 if (cur->pcg_avail == cur->pcg_size) { 2934 (void)pool_pcg_put(&pc->pc_fullgroups, cur); 2935 cc->cc_nfull++; 2936 } else if (cur->pcg_avail == 0) { 2937 (void)pool_pcg_put(pc->pc_pcgcache, cur); 2938 } else { 2939 (void)pool_pcg_put(&pc->pc_partgroups, cur); 2940 cc->cc_npart++; 2941 } 2942 } 2943 if (prev != &pcg_dummy) { 2944 if (prev->pcg_avail == prev->pcg_size) { 2945 (void)pool_pcg_put(&pc->pc_fullgroups, prev); 2946 cc->cc_nfull++; 2947 } else if (prev->pcg_avail == 0) { 2948 (void)pool_pcg_put(pc->pc_pcgcache, prev); 2949 } else { 2950 (void)pool_pcg_put(&pc->pc_partgroups, prev); 2951 cc->cc_npart++; 2952 } 2953 } 2954 splx(s); 2955 } 2956 2957 static int 2958 pool_bigidx(size_t size) 2959 { 2960 int i; 2961 2962 for (i = 0; i < __arraycount(pool_allocator_big); i++) { 2963 if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size) 2964 return i; 2965 } 2966 panic("pool item size %zu too large, use a custom allocator", size); 2967 } 2968 2969 static void * 2970 pool_allocator_alloc(struct pool *pp, int flags) 2971 { 2972 struct pool_allocator *pa = pp->pr_alloc; 2973 void *res; 2974 2975 res = (*pa->pa_alloc)(pp, flags); 2976 return res; 2977 } 2978 2979 static void 2980 pool_allocator_free(struct pool *pp, void *v) 2981 { 2982 struct pool_allocator *pa = pp->pr_alloc; 2983 2984 if (pp->pr_redzone) { 2985 KASSERT(!pp_has_pser(pp)); 2986 kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0); 2987 } else if (__predict_false(pp_has_pser(pp))) { 2988 /* 2989 * Perform a 
passive serialization barrier before freeing 2990 * the pool page back to the system. 2991 */ 2992 pool_barrier(); 2993 } 2994 (*pa->pa_free)(pp, v); 2995 } 2996 2997 void * 2998 pool_page_alloc(struct pool *pp, int flags) 2999 { 3000 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 3001 vmem_addr_t va; 3002 int ret; 3003 3004 ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz, 3005 vflags | VM_INSTANTFIT, &va); 3006 3007 return ret ? NULL : (void *)va; 3008 } 3009 3010 void 3011 pool_page_free(struct pool *pp, void *v) 3012 { 3013 3014 uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz); 3015 } 3016 3017 static void * 3018 pool_page_alloc_meta(struct pool *pp, int flags) 3019 { 3020 const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP; 3021 vmem_addr_t va; 3022 int ret; 3023 3024 ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz, 3025 vflags | VM_INSTANTFIT, &va); 3026 3027 return ret ? NULL : (void *)va; 3028 } 3029 3030 static void 3031 pool_page_free_meta(struct pool *pp, void *v) 3032 { 3033 3034 vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz); 3035 } 3036 3037 #ifdef KMSAN 3038 static inline void 3039 pool_get_kmsan(struct pool *pp, void *p) 3040 { 3041 kmsan_orig(p, pp->pr_size, KMSAN_TYPE_POOL, __RET_ADDR); 3042 kmsan_mark(p, pp->pr_size, KMSAN_STATE_UNINIT); 3043 } 3044 3045 static inline void 3046 pool_put_kmsan(struct pool *pp, void *p) 3047 { 3048 kmsan_mark(p, pp->pr_size, KMSAN_STATE_INITED); 3049 } 3050 3051 static inline void 3052 pool_cache_get_kmsan(pool_cache_t pc, void *p) 3053 { 3054 if (__predict_false(pc_has_ctor(pc))) { 3055 return; 3056 } 3057 pool_get_kmsan(&pc->pc_pool, p); 3058 } 3059 3060 static inline void 3061 pool_cache_put_kmsan(pool_cache_t pc, void *p) 3062 { 3063 pool_put_kmsan(&pc->pc_pool, p); 3064 } 3065 #endif 3066 3067 #ifdef POOL_QUARANTINE 3068 static void 3069 pool_quarantine_init(struct pool *pp) 3070 { 3071 pp->pr_quar.rotor = 0; 3072 memset(&pp->pr_quar, 0, sizeof(pp->pr_quar)); 3073 } 3074 3075 static void 3076 pool_quarantine_flush(struct pool *pp) 3077 { 3078 pool_quar_t *quar = &pp->pr_quar; 3079 struct pool_pagelist pq; 3080 size_t i; 3081 3082 LIST_INIT(&pq); 3083 3084 mutex_enter(&pp->pr_lock); 3085 for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) { 3086 if (quar->list[i] == 0) 3087 continue; 3088 pool_do_put(pp, (void *)quar->list[i], &pq); 3089 } 3090 mutex_exit(&pp->pr_lock); 3091 3092 pr_pagelist_free(pp, &pq); 3093 } 3094 3095 static bool 3096 pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq) 3097 { 3098 pool_quar_t *quar = &pp->pr_quar; 3099 uintptr_t old; 3100 3101 if (pp->pr_roflags & PR_NOTOUCH) { 3102 return false; 3103 } 3104 3105 pool_redzone_check(pp, v); 3106 3107 old = quar->list[quar->rotor]; 3108 quar->list[quar->rotor] = (uintptr_t)v; 3109 quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH; 3110 if (old != 0) { 3111 pool_do_put(pp, (void *)old, pq); 3112 } 3113 3114 return true; 3115 } 3116 #endif 3117 3118 #ifdef POOL_NOCACHE 3119 static bool 3120 pool_cache_put_nocache(pool_cache_t pc, void *p) 3121 { 3122 pool_cache_destruct_object(pc, p); 3123 return true; 3124 } 3125 #endif 3126 3127 #ifdef POOL_REDZONE 3128 #if defined(_LP64) 3129 # define PRIME 0x9e37fffffffc0000UL 3130 #else /* defined(_LP64) */ 3131 # define PRIME 0x9e3779b1 3132 #endif /* defined(_LP64) */ 3133 #define STATIC_BYTE 0xFE 3134 CTASSERT(POOL_REDZONE_SIZE > 1); 3135 3136 #ifndef KASAN 3137 static inline uint8_t 3138 
pool_pattern_generate(const void *p) 3139 { 3140 return (uint8_t)(((uintptr_t)p) * PRIME 3141 >> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT); 3142 } 3143 #endif 3144 3145 static void 3146 pool_redzone_init(struct pool *pp, size_t requested_size) 3147 { 3148 size_t redzsz; 3149 size_t nsz; 3150 3151 #ifdef KASAN 3152 redzsz = requested_size; 3153 kasan_add_redzone(&redzsz); 3154 redzsz -= requested_size; 3155 #else 3156 redzsz = POOL_REDZONE_SIZE; 3157 #endif 3158 3159 if (pp->pr_roflags & PR_NOTOUCH) { 3160 pp->pr_redzone = false; 3161 return; 3162 } 3163 3164 /* 3165 * We may have extended the requested size earlier; check if 3166 * there's naturally space in the padding for a red zone. 3167 */ 3168 if (pp->pr_size - requested_size >= redzsz) { 3169 pp->pr_reqsize_with_redzone = requested_size + redzsz; 3170 pp->pr_redzone = true; 3171 return; 3172 } 3173 3174 /* 3175 * No space in the natural padding; check if we can extend a 3176 * bit the size of the pool. 3177 * 3178 * Avoid using redzone for allocations half of a page or larger. 3179 * For pagesize items, we'd waste a whole new page (could be 3180 * unmapped?), and for half pagesize items, approximately half 3181 * the space is lost (eg, 4K pages, you get one 2K allocation.) 3182 */ 3183 nsz = roundup(pp->pr_size + redzsz, pp->pr_align); 3184 if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) { 3185 /* Ok, we can */ 3186 pp->pr_size = nsz; 3187 pp->pr_reqsize_with_redzone = requested_size + redzsz; 3188 pp->pr_redzone = true; 3189 } else { 3190 /* No space for a red zone... snif :'( */ 3191 pp->pr_redzone = false; 3192 aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan); 3193 } 3194 } 3195 3196 static void 3197 pool_redzone_fill(struct pool *pp, void *p) 3198 { 3199 if (!pp->pr_redzone) 3200 return; 3201 KASSERT(!pp_has_pser(pp)); 3202 #ifdef KASAN 3203 kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone, 3204 KASAN_POOL_REDZONE); 3205 #else 3206 uint8_t *cp, pat; 3207 const uint8_t *ep; 3208 3209 cp = (uint8_t *)p + pp->pr_reqsize; 3210 ep = cp + POOL_REDZONE_SIZE; 3211 3212 /* 3213 * We really don't want the first byte of the red zone to be '\0'; 3214 * an off-by-one in a string may not be properly detected. 3215 */ 3216 pat = pool_pattern_generate(cp); 3217 *cp = (pat == '\0') ? STATIC_BYTE: pat; 3218 cp++; 3219 3220 while (cp < ep) { 3221 *cp = pool_pattern_generate(cp); 3222 cp++; 3223 } 3224 #endif 3225 } 3226 3227 static void 3228 pool_redzone_check(struct pool *pp, void *p) 3229 { 3230 if (!pp->pr_redzone) 3231 return; 3232 KASSERT(!pp_has_pser(pp)); 3233 #ifdef KASAN 3234 kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED); 3235 #else 3236 uint8_t *cp, pat, expected; 3237 const uint8_t *ep; 3238 3239 cp = (uint8_t *)p + pp->pr_reqsize; 3240 ep = cp + POOL_REDZONE_SIZE; 3241 3242 pat = pool_pattern_generate(cp); 3243 expected = (pat == '\0') ? STATIC_BYTE: pat; 3244 if (__predict_false(*cp != expected)) { 3245 panic("%s: [%s] 0x%02x != 0x%02x", __func__, 3246 pp->pr_wchan, *cp, expected); 3247 } 3248 cp++; 3249 3250 while (cp < ep) { 3251 expected = pool_pattern_generate(cp); 3252 if (__predict_false(*cp != expected)) { 3253 panic("%s: [%s] 0x%02x != 0x%02x", __func__, 3254 pp->pr_wchan, *cp, expected); 3255 } 3256 cp++; 3257 } 3258 #endif 3259 } 3260 3261 static void 3262 pool_cache_redzone_check(pool_cache_t pc, void *p) 3263 { 3264 #ifdef KASAN 3265 /* 3266 * If there is a ctor/dtor, or if the cache objects use 3267 * passive serialization, leave the data as valid. 
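	 * A cached object may still be reachable by pserialize readers,
	 * or may carry constructed state that the dtor expects to find
	 * intact, so poisoning it here would produce false positives.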
3268 */ 3269 if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc) || 3270 pc_has_pser(pc))) { 3271 return; 3272 } 3273 #endif 3274 pool_redzone_check(&pc->pc_pool, p); 3275 } 3276 3277 #endif /* POOL_REDZONE */ 3278 3279 #if defined(DDB) 3280 static bool 3281 pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 3282 { 3283 3284 return (uintptr_t)ph->ph_page <= addr && 3285 addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz; 3286 } 3287 3288 static bool 3289 pool_in_item(struct pool *pp, void *item, uintptr_t addr) 3290 { 3291 3292 return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size; 3293 } 3294 3295 static bool 3296 pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr) 3297 { 3298 int i; 3299 3300 if (pcg == NULL) { 3301 return false; 3302 } 3303 for (i = 0; i < pcg->pcg_avail; i++) { 3304 if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) { 3305 return true; 3306 } 3307 } 3308 return false; 3309 } 3310 3311 static bool 3312 pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr) 3313 { 3314 3315 if ((pp->pr_roflags & PR_USEBMAP) != 0) { 3316 unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr); 3317 pool_item_bitmap_t *bitmap = 3318 ph->ph_bitmap + (idx / BITMAP_SIZE); 3319 pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK); 3320 3321 return (*bitmap & mask) == 0; 3322 } else { 3323 struct pool_item *pi; 3324 3325 LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) { 3326 if (pool_in_item(pp, pi, addr)) { 3327 return false; 3328 } 3329 } 3330 return true; 3331 } 3332 } 3333 3334 void 3335 pool_whatis(uintptr_t addr, void (*pr)(const char *, ...)) 3336 { 3337 struct pool *pp; 3338 3339 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3340 struct pool_item_header *ph; 3341 struct pool_cache *pc; 3342 uintptr_t item; 3343 bool allocated = true; 3344 bool incache = false; 3345 bool incpucache = false; 3346 char cpucachestr[32]; 3347 3348 if ((pp->pr_roflags & PR_PHINPAGE) != 0) { 3349 LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) { 3350 if (pool_in_page(pp, ph, addr)) { 3351 goto found; 3352 } 3353 } 3354 LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) { 3355 if (pool_in_page(pp, ph, addr)) { 3356 allocated = 3357 pool_allocated(pp, ph, addr); 3358 goto found; 3359 } 3360 } 3361 LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) { 3362 if (pool_in_page(pp, ph, addr)) { 3363 allocated = false; 3364 goto found; 3365 } 3366 } 3367 continue; 3368 } else { 3369 ph = pr_find_pagehead_noalign(pp, (void *)addr); 3370 if (ph == NULL || !pool_in_page(pp, ph, addr)) { 3371 continue; 3372 } 3373 allocated = pool_allocated(pp, ph, addr); 3374 } 3375 found: 3376 if (allocated && 3377 (pc = atomic_load_consume(&pp->pr_cache)) != NULL) { 3378 struct pool_cache_group *pcg; 3379 int i; 3380 3381 for (pcg = pc->pc_fullgroups; pcg != NULL; 3382 pcg = pcg->pcg_next) { 3383 if (pool_in_cg(pp, pcg, addr)) { 3384 incache = true; 3385 goto print; 3386 } 3387 } 3388 for (i = 0; i < __arraycount(pc->pc_cpus); i++) { 3389 pool_cache_cpu_t *cc; 3390 3391 if ((cc = pc->pc_cpus[i]) == NULL) { 3392 continue; 3393 } 3394 if (pool_in_cg(pp, cc->cc_current, addr) || 3395 pool_in_cg(pp, cc->cc_previous, addr)) { 3396 struct cpu_info *ci = 3397 cpu_lookup(i); 3398 3399 incpucache = true; 3400 snprintf(cpucachestr, 3401 sizeof(cpucachestr), 3402 "cached by CPU %u", 3403 ci->ci_index); 3404 goto print; 3405 } 3406 } 3407 } 3408 print: 3409 item = (uintptr_t)ph->ph_page + ph->ph_off; 3410 item = item + rounddown(addr - item, 
pp->pr_size); 3411 (*pr)("%p is %p+%zu in POOL '%s' (%s)\n", 3412 (void *)addr, item, (size_t)(addr - item), 3413 pp->pr_wchan, 3414 incpucache ? cpucachestr : 3415 incache ? "cached" : allocated ? "allocated" : "free"); 3416 } 3417 } 3418 #endif /* defined(DDB) */ 3419 3420 static int 3421 pool_sysctl(SYSCTLFN_ARGS) 3422 { 3423 struct pool_sysctl data; 3424 struct pool *pp; 3425 struct pool_cache *pc; 3426 pool_cache_cpu_t *cc; 3427 int error; 3428 size_t i, written; 3429 3430 if (oldp == NULL) { 3431 *oldlenp = 0; 3432 TAILQ_FOREACH(pp, &pool_head, pr_poollist) 3433 *oldlenp += sizeof(data); 3434 return 0; 3435 } 3436 3437 memset(&data, 0, sizeof(data)); 3438 error = 0; 3439 written = 0; 3440 mutex_enter(&pool_head_lock); 3441 TAILQ_FOREACH(pp, &pool_head, pr_poollist) { 3442 if (written + sizeof(data) > *oldlenp) 3443 break; 3444 pp->pr_refcnt++; 3445 strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan)); 3446 data.pr_pagesize = pp->pr_alloc->pa_pagesz; 3447 data.pr_flags = pp->pr_roflags | pp->pr_flags; 3448 #define COPY(field) data.field = pp->field 3449 COPY(pr_size); 3450 3451 COPY(pr_itemsperpage); 3452 COPY(pr_nitems); 3453 COPY(pr_nout); 3454 COPY(pr_hardlimit); 3455 COPY(pr_npages); 3456 COPY(pr_minpages); 3457 COPY(pr_maxpages); 3458 3459 COPY(pr_nget); 3460 COPY(pr_nfail); 3461 COPY(pr_nput); 3462 COPY(pr_npagealloc); 3463 COPY(pr_npagefree); 3464 COPY(pr_hiwat); 3465 COPY(pr_nidle); 3466 #undef COPY 3467 3468 data.pr_cache_nmiss_pcpu = 0; 3469 data.pr_cache_nhit_pcpu = 0; 3470 data.pr_cache_nmiss_global = 0; 3471 data.pr_cache_nempty = 0; 3472 data.pr_cache_ncontended = 0; 3473 data.pr_cache_npartial = 0; 3474 if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) { 3475 uint32_t nfull = 0; 3476 data.pr_cache_meta_size = pc->pc_pcgsize; 3477 for (i = 0; i < pc->pc_ncpu; ++i) { 3478 cc = pc->pc_cpus[i]; 3479 if (cc == NULL) 3480 continue; 3481 data.pr_cache_ncontended += cc->cc_contended; 3482 data.pr_cache_nmiss_pcpu += cc->cc_misses; 3483 data.pr_cache_nhit_pcpu += cc->cc_hits; 3484 data.pr_cache_nmiss_global += cc->cc_pcmisses; 3485 nfull += cc->cc_nfull; /* 32-bit rollover! */ 3486 data.pr_cache_npartial += cc->cc_npart; 3487 } 3488 data.pr_cache_nfull = nfull; 3489 } else { 3490 data.pr_cache_meta_size = 0; 3491 data.pr_cache_nfull = 0; 3492 } 3493 data.pr_cache_nhit_global = data.pr_cache_nmiss_pcpu - 3494 data.pr_cache_nmiss_global; 3495 3496 if (pp->pr_refcnt == UINT_MAX) /* XXX possible? */ 3497 continue; 3498 mutex_exit(&pool_head_lock); 3499 error = sysctl_copyout(l, &data, oldp, sizeof(data)); 3500 mutex_enter(&pool_head_lock); 3501 if (--pp->pr_refcnt == 0) 3502 cv_broadcast(&pool_busy); 3503 if (error) 3504 break; 3505 written += sizeof(data); 3506 oldp = (char *)oldp + sizeof(data); 3507 } 3508 mutex_exit(&pool_head_lock); 3509 3510 *oldlenp = written; 3511 return error; 3512 } 3513 3514 SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup") 3515 { 3516 const struct sysctlnode *rnode = NULL; 3517 3518 sysctl_createv(clog, 0, NULL, &rnode, 3519 CTLFLAG_PERMANENT, 3520 CTLTYPE_STRUCT, "pool", 3521 SYSCTL_DESCR("Get pool statistics"), 3522 pool_sysctl, 0, NULL, 0, 3523 CTL_KERN, CTL_CREATE, CTL_EOL); 3524 } 3525
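
/*
 * Userland consumption sketch (not part of this file, and only a sketch):
 * the kern.pool node created above exports one struct pool_sysctl record
 * per pool, which a monitoring tool could read along these lines (error
 * handling omitted; the exact field layout comes from <sys/pool.h>):
 *
 *	size_t len = 0;
 *	sysctlbyname("kern.pool", NULL, &len, NULL, 0);
 *	struct pool_sysctl *ps = malloc(len);
 *	sysctlbyname("kern.pool", ps, &len, NULL, 0);
 *	for (size_t i = 0; i < len / sizeof(*ps); i++)
 *		printf("%s: %" PRIu64 " gets\n", ps[i].pr_wchan, ps[i].pr_nget);
 */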