1 /* $NetBSD: kern_lock.c,v 1.195 2026/03/29 08:36:43 kre Exp $ */ 2 3 /*- 4 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 10 * NASA Ames Research Center, and by Andrew Doran. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <sys/cdefs.h> 35 __KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.195 2026/03/29 08:36:43 kre Exp $"); 36 37 #ifdef _KERNEL_OPT 38 #include "opt_lockdebug.h" 39 #endif 40 41 #include <sys/param.h> 42 #include <sys/types.h> 43 44 #include <sys/atomic.h> 45 #include <sys/cpu.h> 46 #include <sys/kernel.h> 47 #include <sys/lock.h> 48 #include <sys/lockdebug.h> 49 #include <sys/lwp.h> 50 #include <sys/proc.h> 51 #include <sys/pserialize.h> 52 #include <sys/sdt.h> 53 #include <sys/syslog.h> 54 #include <sys/systm.h> 55 56 #if defined(DIAGNOSTIC) && !defined(LOCKDEBUG) 57 #include <sys/ksyms.h> 58 #endif 59 60 #include <machine/lock.h> 61 62 #include <dev/lockstat.h> 63 64 SDT_PROBE_DEFINE1(sdt, kernel, lock, entry, 65 "unsigned"/*nlocks*/); 66 SDT_PROBE_DEFINE1(sdt, kernel, lock, exit, 67 "unsigned"/*nlocks*/); 68 69 #define RETURN_ADDRESS (uintptr_t)__builtin_return_address(0) 70 71 bool kernel_lock_dodebug; 72 73 struct kernel_lock { 74 __cpu_simple_lock_t lock __aligned(CACHE_LINE_SIZE); 75 struct cpu_info *volatile holder; 76 } kernel_lock_cacheline[CACHE_LINE_SIZE / sizeof(struct kernel_lock)] 77 __cacheline_aligned; 78 __strong_alias(kernel_lock, kernel_lock_cacheline) 79 #define kernel_lock_holder (kernel_lock_cacheline[0].holder) 80 81 void 82 assert_sleepable(void) 83 { 84 const char *reason; 85 long pctr; 86 bool idle; 87 88 if (__predict_false(panicstr != NULL)) { 89 return; 90 } 91 92 LOCKDEBUG_BARRIER(kernel_lock, 1); 93 94 /* 95 * Avoid disabling/re-enabling preemption here since this 96 * routine may be called in delicate situations. 97 */ 98 do { 99 pctr = lwp_pctr(); 100 idle = CURCPU_IDLE_P(); 101 } while (__predict_false(pctr != lwp_pctr())); 102 103 reason = NULL; 104 if (__predict_false(idle) && !cold) { 105 reason = "idle"; 106 goto panic; 107 } 108 if (__predict_false(cpu_intr_p())) { 109 reason = "interrupt"; 110 goto panic; 111 } 112 if (__predict_false(cpu_softintr_p())) { 113 reason = "softint"; 114 goto panic; 115 } 116 if (__predict_false(!pserialize_not_in_read_section())) { 117 reason = "pserialize"; 118 goto panic; 119 } 120 return; 121 122 panic: panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS); 123 } 124 125 /* 126 * Functions for manipulating the kernel_lock. We put them here 127 * so that they show up in profiles. 128 */ 129 130 #define _KERNEL_LOCK_ABORT(msg) \ 131 LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg) 132 133 #ifdef LOCKDEBUG 134 #define _KERNEL_LOCK_ASSERT(cond) \ 135 do { \ 136 if (!(cond)) \ 137 _KERNEL_LOCK_ABORT("assertion failed: " #cond); \ 138 } while (/* CONSTCOND */ 0) 139 #else 140 #define _KERNEL_LOCK_ASSERT(cond) /* nothing */ 141 #endif 142 143 static void _kernel_lock_dump(const volatile void *, lockop_printer_t); 144 145 lockops_t _kernel_lock_ops = { 146 .lo_name = "Kernel lock", 147 .lo_type = LOCKOPS_SPIN, 148 .lo_dump = _kernel_lock_dump, 149 }; 150 151 #ifdef DDB 152 #include <ddb/ddb.h> 153 #endif 154 155 static void 156 kernel_lock_trace_ipi(void *cookie) 157 { 158 159 printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()), 160 curlwp->l_lid, 161 curlwp->l_name ? curlwp->l_name : curproc->p_comm); 162 #ifdef DDB 163 db_stacktrace(); 164 165 /* 166 * Make sure we leave a return address around for db_stacktrace 167 * to find. 168 */ 169 __insn_barrier(); 170 return; 171 #endif 172 } 173 174 static void 175 kernel_lock_spinout(void) 176 { 177 static volatile unsigned kernel_lock_last_report; 178 ipi_msg_t msg = { 179 .func = kernel_lock_trace_ipi, 180 }; 181 unsigned now, then; 182 struct cpu_info *holder; 183 184 /* 185 * Disable preemption so that curcpu() is stable in this function. 186 * Otherwise, it's possible for the curlwp to be migrated to the 187 * holder cpu in the meantime. 188 */ 189 kpreempt_disable(); 190 191 /* 192 * Find who holds the kernel lock. If nobody, we can't report 193 * anything, so pass -- but while this is possible in principle 194 * because we first take the lock and then set the holder, it 195 * is rather unlikely to actually happen in practice because we 196 * wait 10sec to take the lock before trying to report a 197 * problem anyway. 198 */ 199 if (!__SIMPLELOCK_LOCKED_P(kernel_lock)) 200 goto out; 201 202 /* 203 * Note: holder == NULL here basically means 204 * "no one has acquired kernel lock since the boot". 205 * 206 * Theoretically it's possible the first locker has acquired 207 * kernel_lock but has not updated kernel_lock_holder yet. 208 * But it's only theoretical, I suppose. 209 */ 210 holder = atomic_load_relaxed(&kernel_lock_holder); 211 if (holder == NULL) 212 goto out; 213 214 /* 215 * We know we don't have the kernel lock. 216 * 217 * However, the holder value is not reliable because we don't 218 * hold kernel lock. For example, an interrupt on this cpu may 219 * acquire/release the kernel lock and leave kernel_lock_holder 220 * pointing to us. 221 */ 222 if (holder == curcpu()) 223 goto out; 224 225 /* 226 * If we already reported kernel lock hogging in the last ten 227 * seconds, probably not worthwhile to fill the log buffer with 228 * repeated reports, so pass. 229 * 230 * XXX This can roll over, but only after decades of uptime. 231 */ 232 then = atomic_load_relaxed(&kernel_lock_last_report); 233 now = time_uptime; 234 if (now - then <= 10) 235 goto out; 236 if (atomic_cas_uint(&kernel_lock_last_report, then, now) != then) 237 goto out; 238 239 printf("%s[%d %s]: kernel lock spinout\n", cpu_name(curcpu()), 240 curlwp->l_lid, 241 curlwp->l_name ? curlwp->l_name : curproc->p_comm); 242 /* 243 * Send an IPI to whatever CPU holds the kernel lock. 244 */ 245 ipi_unicast(&msg, holder); 246 ipi_wait(&msg); 247 out: 248 kpreempt_enable(); 249 250 #ifdef LOCKDEBUG 251 _KERNEL_LOCK_ABORT("spinout"); 252 #endif 253 return; 254 } 255 256 /* 257 * Initialize the kernel lock. 258 */ 259 void 260 kernel_lock_init(void) 261 { 262 263 __cpu_simple_lock_init(kernel_lock); 264 kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops, 265 RETURN_ADDRESS); 266 } 267 CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t)); 268 269 /* 270 * Print debugging information about the kernel lock. 271 */ 272 static void 273 _kernel_lock_dump(const volatile void *junk, lockop_printer_t pr) 274 { 275 struct cpu_info *ci = curcpu(); 276 277 (void)junk; 278 279 pr("curcpu holds : %18d wanted by: %#018lx\n", 280 ci->ci_biglock_count, (long)ci->ci_biglock_wanted); 281 } 282 283 /* 284 * Acquire 'nlocks' holds on the kernel lock. 285 * 286 * Although it may not look it, this is one of the most central, intricate 287 * routines in the kernel, and tons of code elsewhere depends on its exact 288 * behaviour. If you change something in here, expect it to bite you in the 289 * rear. 290 */ 291 void 292 _kernel_lock(int nlocks) 293 { 294 struct cpu_info *ci; 295 LOCKSTAT_TIMER(spintime); 296 LOCKSTAT_FLAG(lsflag); 297 struct lwp *owant; 298 u_int starttime; 299 int s; 300 struct lwp *l = curlwp; 301 302 _KERNEL_LOCK_ASSERT(nlocks > 0); 303 304 s = splvm(); 305 ci = curcpu(); 306 if (ci->ci_biglock_count != 0) { 307 _KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock)); 308 SDT_PROBE1(sdt, kernel, lock, entry, nlocks); 309 ci->ci_biglock_count += nlocks; 310 l->l_blcnt += nlocks; 311 splx(s); 312 return; 313 } 314 315 _KERNEL_LOCK_ASSERT(l->l_blcnt == 0); 316 LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS, 317 0); 318 319 if (__predict_true(__cpu_simple_lock_try(kernel_lock))) { 320 atomic_store_relaxed(&kernel_lock_holder, curcpu()); 321 SDT_PROBE1(sdt, kernel, lock, entry, nlocks); 322 ci->ci_biglock_count = nlocks; 323 l->l_blcnt = nlocks; 324 LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL, 325 RETURN_ADDRESS, 0); 326 splx(s); 327 return; 328 } 329 330 /* 331 * To remove the ordering constraint between adaptive mutexes 332 * and kernel_lock we must make it appear as if this thread is 333 * blocking. For non-interlocked mutex release, a store fence 334 * is required to ensure that the result of any mutex_exit() 335 * by the current LWP becomes visible on the bus before the set 336 * of ci->ci_biglock_wanted becomes visible. 337 * 338 * This membar_producer matches the membar_consumer in 339 * mutex_vector_enter. 340 * 341 * That way, if l has just released a mutex, mutex_vector_enter 342 * can't see this store ci->ci_biglock_wanted := l until it 343 * will also see the mutex_exit store mtx->mtx_owner := 0 which 344 * clears the has-waiters bit. 345 */ 346 membar_producer(); 347 owant = ci->ci_biglock_wanted; 348 atomic_store_relaxed(&ci->ci_biglock_wanted, l); 349 #if defined(DIAGNOSTIC) && !defined(LOCKDEBUG) 350 l->l_ld_wanted = __builtin_return_address(0); 351 #endif 352 353 /* 354 * Spin until we acquire the lock. Once we have it, record the 355 * time spent with lockstat. 356 */ 357 LOCKSTAT_ENTER(lsflag); 358 LOCKSTAT_START_TIMER(lsflag, spintime); 359 360 starttime = getticks(); 361 do { 362 splx(s); 363 while (__SIMPLELOCK_LOCKED_P(kernel_lock)) { 364 if (start_init_exec && 365 (getticks() - starttime) > 10*hz) { 366 kernel_lock_spinout(); 367 } 368 SPINLOCK_BACKOFF_HOOK; 369 SPINLOCK_SPIN_HOOK; 370 } 371 s = splvm(); 372 } while (!__cpu_simple_lock_try(kernel_lock)); 373 374 atomic_store_relaxed(&kernel_lock_holder, curcpu()); 375 376 SDT_PROBE1(sdt, kernel, lock, entry, nlocks); 377 ci->ci_biglock_count = nlocks; 378 l->l_blcnt = nlocks; 379 LOCKSTAT_STOP_TIMER(lsflag, spintime); 380 LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL, 381 RETURN_ADDRESS, 0); 382 if (owant == NULL) { 383 LOCKSTAT_EVENT_RA(lsflag, kernel_lock, 384 LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS); 385 } 386 LOCKSTAT_EXIT(lsflag); 387 splx(s); 388 389 /* 390 * Now that we have kernel_lock, reset ci_biglock_wanted. This 391 * store must be visible on other CPUs before a mutex_exit() on 392 * this CPU can test the has-waiters bit. 393 * 394 * This membar_enter matches the membar_enter in 395 * mutex_vector_enter. (Yes, not membar_exit -- the legacy 396 * naming is confusing, but store-before-load usually pairs 397 * with store-before-load, in the extremely rare cases where it 398 * is used at all.) 399 * 400 * That way, mutex_vector_enter can't see this store 401 * ci->ci_biglock_wanted := owant until it has set the 402 * has-waiters bit. 403 */ 404 (void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant); 405 #ifndef __HAVE_ATOMIC_AS_MEMBAR 406 membar_enter(); 407 #endif 408 } 409 410 /* 411 * Release 'nlocks' holds on the kernel lock. If 'nlocks' is zero, release 412 * all holds. 413 */ 414 void 415 _kernel_unlock(int nlocks, int *countp) 416 { 417 struct cpu_info *ci; 418 u_int olocks; 419 int s; 420 struct lwp *l = curlwp; 421 422 _KERNEL_LOCK_ASSERT(nlocks < 2); 423 424 olocks = l->l_blcnt; 425 426 if (olocks == 0) { 427 _KERNEL_LOCK_ASSERT(nlocks <= 0); 428 if (countp != NULL) 429 *countp = 0; 430 return; 431 } 432 433 _KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock)); 434 435 if (nlocks == 0) 436 nlocks = olocks; 437 else if (nlocks == -1) { 438 nlocks = 1; 439 _KERNEL_LOCK_ASSERT(olocks == 1); 440 } 441 s = splvm(); 442 ci = curcpu(); 443 _KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt); 444 if (ci->ci_biglock_count == nlocks) { 445 LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock, 446 RETURN_ADDRESS, 0); 447 ci->ci_biglock_count = 0; 448 __cpu_simple_unlock(kernel_lock); 449 l->l_blcnt -= nlocks; 450 splx(s); 451 if (l->l_dopreempt) 452 kpreempt(0); 453 } else { 454 ci->ci_biglock_count -= nlocks; 455 l->l_blcnt -= nlocks; 456 splx(s); 457 } 458 459 SDT_PROBE1(sdt, kernel, lock, exit, nlocks); 460 461 if (countp != NULL) 462 *countp = olocks; 463 } 464 465 bool 466 _kernel_locked_p(void) 467 { 468 return __SIMPLELOCK_LOCKED_P(kernel_lock); 469 } 470