/*	$NetBSD: subr_percpu.c,v 1.26 2026/01/04 03:19:56 riastradh Exp $	*/

/*-
 * Copyright (c)2007,2008 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * per-cpu storage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.26 2026/01/04 03:19:56 riastradh Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)
#define	PERCPU_QCACHE_MAX	0
#define	PERCPU_IMPORT_SIZE	2048

struct percpu {
        unsigned                pc_offset;
        size_t                  pc_size;
        percpu_callback_t       pc_ctor;
        percpu_callback_t       pc_dtor;
        void                    *pc_cookie;
        LIST_ENTRY(percpu)      pc_list;
};

static krwlock_t        percpu_swap_lock        __cacheline_aligned;
static vmem_t *         percpu_offset_arena     __read_mostly;
static struct {
        kmutex_t        lock;
        unsigned int    nextoff;
        LIST_HEAD(, percpu) ctor_list;
        struct lwp      *busy;
        kcondvar_t      cv;
} percpu_allocation __cacheline_aligned;

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

        return &ci->ci_data.cpu_percpu;
}

static unsigned int
percpu_offset(percpu_t *pc)
{
        const unsigned int off = pc->pc_offset;

        KASSERT(off < percpu_allocation.nextoff);
        return off;
}

/*
 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
 */
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
        struct cpu_info * const ci = p1;
        percpu_cpu_t * const newpcc = p2;
        percpu_cpu_t * const pcc = cpu_percpu(ci);

        KASSERT(ci == curcpu() || !mp_online);

        /*
         * swap *pcc and *newpcc unless somebody has beaten us to it.
         */
        rw_enter(&percpu_swap_lock, RW_WRITER);
        if (newpcc->pcc_size > pcc->pcc_size) {
                percpu_cpu_t tmp;
                int s;

                tmp = *pcc;

                /*
                 * block interrupts so that we don't lose their modifications.
                 */

                s = splhigh();

                /*
                 * copy data to new storage.
                 */

                memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);

                /*
                 * this assignment needs to be atomic for percpu_getptr_remote.
                 */

                pcc->pcc_data = newpcc->pcc_data;

                splx(s);

                pcc->pcc_size = newpcc->pcc_size;
                *newpcc = tmp;
        }
        rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;

        for (CPU_INFO_FOREACH(cii, ci)) {
                percpu_cpu_t pcc;

                pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
                pcc.pcc_size = size;
                if (!mp_online) {
                        percpu_cpu_swap(ci, &pcc);
                } else {
                        uint64_t where;

                        where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
                        xc_wait(where);
                }
                KASSERT(pcc.pcc_size <= size);
                if (pcc.pcc_data != NULL) {
                        kmem_free(pcc.pcc_data, pcc.pcc_size);
                }
        }
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags, vmem_addr_t *addrp)
{
        unsigned int offset;
        unsigned int nextoff;

        ASSERT_SLEEPABLE();
        KASSERT(dummy == NULL);

        if ((vmflags & VM_NOSLEEP) != 0)
                return SET_ERROR(ENOMEM);

        size = roundup(size, PERCPU_IMPORT_SIZE);
        mutex_enter(&percpu_allocation.lock);
        offset = percpu_allocation.nextoff;
        percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size;
        mutex_exit(&percpu_allocation.lock);

        percpu_cpu_enlarge(nextoff);

        *resultsize = size;
        *addrp = (vmem_addr_t)offset;
        return 0;
}

static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
        size_t sz = (uintptr_t)vp2;

        memset(vp, 0, sz);
}

/*
 * percpu_zero: initialize percpu storage with zero.
 */

static void
percpu_zero(percpu_t *pc, size_t sz)
{

        percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

        ASSERT_SLEEPABLE();
        rw_init(&percpu_swap_lock);
        mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE);
        percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE;
        LIST_INIT(&percpu_allocation.ctor_list);
        percpu_allocation.busy = NULL;
        cv_init(&percpu_allocation.cv, "percpu");

        percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
            percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
            IPL_NONE);
}

/*
 * percpu_init_cpu: cpu initialization
 *
 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
 * => may be called for static CPUs afterward (typically just primary CPU)
 */

void
percpu_init_cpu(struct cpu_info *ci)
{
        percpu_cpu_t * const pcc = cpu_percpu(ci);
        struct percpu *pc;
        size_t size = percpu_allocation.nextoff; /* XXX racy */

        ASSERT_SLEEPABLE();

        /*
         * For the primary CPU, prior percpu_create may have already
         * triggered allocation, so there's nothing more for us to do
         * here.
         */
        if (pcc->pcc_size)
                return;
        KASSERT(pcc->pcc_data == NULL);

        /*
         * Otherwise, allocate storage and, while the constructor list
         * is locked, run constructors for all percpus on this CPU.
         */
        pcc->pcc_size = size;
        if (size) {
                pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
                mutex_enter(&percpu_allocation.lock);
                while (percpu_allocation.busy)
                        cv_wait(&percpu_allocation.cv,
                            &percpu_allocation.lock);
                percpu_allocation.busy = curlwp;
                LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) {
                        KASSERT(pc->pc_ctor);
                        mutex_exit(&percpu_allocation.lock);
                        (*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset,
                            pc->pc_cookie, ci);
                        mutex_enter(&percpu_allocation.lock);
                }
                KASSERT(percpu_allocation.busy == curlwp);
                percpu_allocation.busy = NULL;
                cv_broadcast(&percpu_allocation.cv);
                mutex_exit(&percpu_allocation.lock);
        }
}

/*
 * percpu_alloc: allocate percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized with zeros.
 */

percpu_t *
percpu_alloc(size_t size)
{

        return percpu_create(size, NULL, NULL, NULL);
}

/*
 * percpu_create: allocate percpu storage and associate ctor/dtor with it
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized by ctor, or with zeros if ctor is null
 * => percpu_free will call dtor first, if dtor is nonnull
 * => ctor or dtor may sleep, even on allocation
 */

percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
    void *cookie)
{
        vmem_addr_t offset;
        percpu_t *pc;

        ASSERT_SLEEPABLE();
        (void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
            &offset);

        pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
        pc->pc_offset = offset;
        pc->pc_size = size;
        pc->pc_ctor = ctor;
        pc->pc_dtor = dtor;
        pc->pc_cookie = cookie;

        if (ctor) {
                CPU_INFO_ITERATOR cii;
                struct cpu_info *ci;
                void *buf;

                /*
                 * Wait until nobody is using the list of percpus with
                 * constructors.
                 */
                mutex_enter(&percpu_allocation.lock);
                while (percpu_allocation.busy)
                        cv_wait(&percpu_allocation.cv,
                            &percpu_allocation.lock);
                percpu_allocation.busy = curlwp;
                mutex_exit(&percpu_allocation.lock);

                /*
                 * Run the constructor for all CPUs.  We use a
                 * temporary buffer so that we need not hold the
                 * percpu_swap_lock while running the constructor.
                 */
                buf = kmem_alloc(size, KM_SLEEP);
                for (CPU_INFO_FOREACH(cii, ci)) {
                        memset(buf, 0, size);
                        (*ctor)(buf, cookie, ci);
                        percpu_traverse_enter();
                        memcpy(percpu_getptr_remote(pc, ci), buf, size);
                        percpu_traverse_exit();
                }
                explicit_memset(buf, 0, size);
                kmem_free(buf, size);

                /*
                 * Insert the percpu into the list of percpus with
                 * constructors.  We are now done using the list, so it
                 * is safe for concurrent percpu_create or concurrent
                 * percpu_init_cpu to run.
                 */
                mutex_enter(&percpu_allocation.lock);
                KASSERT(percpu_allocation.busy == curlwp);
                percpu_allocation.busy = NULL;
                cv_broadcast(&percpu_allocation.cv);
                LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list);
                mutex_exit(&percpu_allocation.lock);
        } else {
                percpu_zero(pc, size);
        }

        return pc;
}
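
/*
 * Usage sketch (illustrative only, not part of this file): a hypothetical
 * subsystem creating per-CPU statistics with a constructor.  The names
 * frobstat, frob_ctor, frob_init and frob_percpu are made up; a nonnull
 * dtor, if passed, would be run by percpu_free on each CPU's copy first.
 *
 *	struct frobstat {
 *		uint64_t	fs_count;
 *	};
 *
 *	static percpu_t *frob_percpu;
 *
 *	static void
 *	frob_ctor(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct frobstat *fs = ptr;
 *
 *		fs->fs_count = 0;
 *	}
 *
 *	void
 *	frob_init(void)
 *	{
 *
 *		frob_percpu = percpu_create(sizeof(struct frobstat),
 *		    frob_ctor, NULL, NULL);
 *	}
 */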

/*
 * percpu_free: free percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 */

void
percpu_free(percpu_t *pc, size_t size)
{

        ASSERT_SLEEPABLE();
        KASSERT(size == pc->pc_size);

        /*
         * If there's a constructor, take the percpu off the list of
         * percpus with constructors, but first wait until nobody is
         * using the list.
         */
        if (pc->pc_ctor) {
                mutex_enter(&percpu_allocation.lock);
                while (percpu_allocation.busy)
                        cv_wait(&percpu_allocation.cv,
                            &percpu_allocation.lock);
                LIST_REMOVE(pc, pc_list);
                mutex_exit(&percpu_allocation.lock);
        }

        /* If there's a destructor, run it now for all CPUs. */
        if (pc->pc_dtor) {
                CPU_INFO_ITERATOR cii;
                struct cpu_info *ci;
                void *buf;

                buf = kmem_alloc(size, KM_SLEEP);
                for (CPU_INFO_FOREACH(cii, ci)) {
                        percpu_traverse_enter();
                        memcpy(buf, percpu_getptr_remote(pc, ci), size);
                        explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
                        percpu_traverse_exit();
                        (*pc->pc_dtor)(buf, pc->pc_cookie, ci);
                }
                explicit_memset(buf, 0, size);
                kmem_free(buf, size);
        }

        vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
        kmem_free(pc, sizeof(*pc));
}

/*
 * percpu_getref:
 *
 * => safe to be used in either thread or interrupt context
 * => disables preemption; must be bracketed with a percpu_putref()
 */

void *
percpu_getref(percpu_t *pc)
{

        kpreempt_disable();
        return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_putref:
 *
 * => drops the preemption-disabled count after the caller is done with
 *    per-cpu data
 */

void
percpu_putref(percpu_t *pc)
{

        kpreempt_enable();
}
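
/*
 * Usage sketch (illustrative only): bumping the hypothetical per-CPU
 * counter from the sketch above on the current CPU.  percpu_getref
 * disables preemption, so the pointer refers to the current CPU's copy
 * until the matching percpu_putref; the caller must not block in between.
 *
 *	struct frobstat *fs;
 *
 *	fs = percpu_getref(frob_percpu);
 *	fs->fs_count++;
 *	percpu_putref(frob_percpu);
 */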

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access a remote cpu's percpu data.
 *
 * => called in thread context.
 * => percpu_traverse_enter can block low-priority xcalls.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

        ASSERT_SLEEPABLE();
        rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

        rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

        return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

/*
 * percpu_foreach: call the specified callback function for each cpu.
 *
 * => must be called from thread context.
 * => callback executes on **current** CPU (or, really, an arbitrary CPU,
 *    in case of preemption)
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it is executed while
 *    holding a global lock, which can block low-priority xcalls.
 *    e.g. it's illegal for a callback function to sleep for memory allocation.
 */
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;

        percpu_traverse_enter();
        for (CPU_INFO_FOREACH(cii, ci)) {
                (*cb)(percpu_getptr_remote(pc, ci), arg, ci);
        }
        percpu_traverse_exit();
}

struct percpu_xcall_ctx {
        percpu_callback_t  ctx_cb;
        void              *ctx_arg;
};

static void
percpu_xcfunc(void * const v1, void * const v2)
{
        percpu_t * const pc = v1;
        struct percpu_xcall_ctx * const ctx = v2;

        (*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu());
        percpu_putref(pc);
}

/*
 * percpu_foreach_xcall: call the specified callback function for each
 * cpu.  This version uses an xcall to run the callback on each cpu.
 *
 * => must be called from thread context.
 * => callback executes on **remote** CPU in soft-interrupt context
 *    (at the specified soft interrupt priority).
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it may be
 *    executed in soft-interrupt context.  e.g. it's illegal for
 *    a callback function to sleep for memory allocation.
 */
void
percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb,
    void *arg)
{
        struct percpu_xcall_ctx ctx = {
                .ctx_cb = cb,
                .ctx_arg = arg,
        };
        CPU_INFO_ITERATOR cii;
        struct cpu_info *ci;

        for (CPU_INFO_FOREACH(cii, ci)) {
                xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci));
        }
}
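
/*
 * Usage sketch (illustrative only): summing the hypothetical per-CPU
 * counter from the sketches above across all CPUs with percpu_foreach.
 * The callback is kept short and does not sleep, since it runs with the
 * traversal lock held; the same callback could instead be handed to
 * percpu_foreach_xcall to run on each CPU in soft-interrupt context.
 *
 *	static void
 *	frob_sum_cb(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		const struct frobstat *fs = ptr;
 *		uint64_t *sump = cookie;
 *
 *		*sump += fs->fs_count;
 *	}
 *
 *	uint64_t sum = 0;
 *
 *	percpu_foreach(frob_percpu, frob_sum_cb, &sum);
 */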