/*	$NetBSD: kern_lock.c,v 1.191 2026/01/03 23:08:16 riastradh Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.191 2026/01/03 23:08:16 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_lockdebug.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/ipi.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/sdt.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
#include <sys/ksyms.h>
#endif

#include <machine/lock.h>

#include <dev/lockstat.h>

SDT_PROBE_DEFINE1(sdt, kernel, lock, entry,
    "unsigned"/*nlocks*/);
SDT_PROBE_DEFINE1(sdt, kernel, lock, exit,
    "unsigned"/*nlocks*/);

#define	RETURN_ADDRESS	(uintptr_t)__builtin_return_address(0)

bool	kernel_lock_dodebug;

struct kernel_lock {
	__cpu_simple_lock_t lock __aligned(CACHE_LINE_SIZE);
	struct cpu_info *volatile holder;
} kernel_lock_cacheline[CACHE_LINE_SIZE / sizeof(struct kernel_lock)]
    __cacheline_aligned;
__strong_alias(kernel_lock, kernel_lock_cacheline)
#define	kernel_lock_holder	(kernel_lock_cacheline[0].holder)

void
assert_sleepable(void)
{
	const char *reason;
	long pctr;
	bool idle;

	if (__predict_false(panicstr != NULL)) {
		return;
	}

	LOCKDEBUG_BARRIER(kernel_lock, 1);

	/*
	 * Avoid disabling/re-enabling preemption here since this
	 * routine may be called in delicate situations.
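	 *
	 * lwp_pctr() advances every time this LWP is switched out, so
	 * if it reads the same before and after the CURCPU_IDLE_P()
	 * check below, that check ran without preemption or migration
	 * and its result can be trusted.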
	 */
	do {
		pctr = lwp_pctr();
		idle = CURCPU_IDLE_P();
	} while (__predict_false(pctr != lwp_pctr()));

	reason = NULL;
	if (__predict_false(idle) && !cold) {
		reason = "idle";
		goto panic;
	}
	if (__predict_false(cpu_intr_p())) {
		reason = "interrupt";
		goto panic;
	}
	if (__predict_false(cpu_softintr_p())) {
		reason = "softint";
		goto panic;
	}
	if (__predict_false(!pserialize_not_in_read_section())) {
		reason = "pserialize";
		goto panic;
	}
	return;

panic:	panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS);
}

/*
 * Functions for manipulating the kernel_lock.  We put them here
 * so that they show up in profiles.
 */

#define	_KERNEL_LOCK_ABORT(msg)						\
	LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)

#ifdef LOCKDEBUG
#define	_KERNEL_LOCK_ASSERT(cond)					\
do {									\
	if (!(cond))							\
		_KERNEL_LOCK_ABORT("assertion failed: " #cond);		\
} while (/* CONSTCOND */ 0)
#else
#define	_KERNEL_LOCK_ASSERT(cond)	/* nothing */
#endif

static void	_kernel_lock_dump(const volatile void *, lockop_printer_t);

lockops_t _kernel_lock_ops = {
	.lo_name = "Kernel lock",
	.lo_type = LOCKOPS_SPIN,
	.lo_dump = _kernel_lock_dump,
};

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void
kernel_lock_trace_ipi(void *cookie)
{

	printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()),
	    curlwp->l_lid,
	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
#ifdef DDB
	db_stacktrace();

	/*
	 * Make sure we leave a return address around for db_stacktrace
	 * to find.
	 */
	__insn_barrier();
	return;
#endif
}

static void
kernel_lock_spinout(void)
{
	static volatile unsigned kernel_lock_last_report;
	ipi_msg_t msg = {
		.func = kernel_lock_trace_ipi,
	};
	unsigned now, then;
	struct cpu_info *holder;

	/*
	 * Find who holds the kernel lock.  If nobody, we can't report
	 * anything, so pass -- but while this is possible in principle
	 * because we first take the lock and then set the holder, it
	 * is rather unlikely to actually happen in practice because we
	 * wait 10sec to take the lock before trying to report a
	 * problem anyway.
	 */
	holder = atomic_load_relaxed(&kernel_lock_holder);
	if (holder == NULL)
		goto out;

	/*
	 * If we already reported kernel lock hogging in the last ten
	 * seconds, probably not worthwhile to fill the log buffer with
	 * repeated reports, so pass.
	 *
	 * XXX This can roll over, but only after decades of uptime.
	 */
	then = atomic_load_relaxed(&kernel_lock_last_report);
	now = time_uptime;
	if (now - then <= 10)
		goto out;
	if (atomic_cas_uint(&kernel_lock_last_report, then, now) != then)
		goto out;

	/*
	 * Disable preemption while we send an IPI to whatever CPU
	 * holds the kernel lock.
	 */
	printf("%s[%d %s]: kernel lock spinout\n", cpu_name(curcpu()),
	    curlwp->l_lid,
	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
	kpreempt_disable();
	ipi_unicast(&msg, holder);
	ipi_wait(&msg);
	kpreempt_enable();

out:
#ifdef LOCKDEBUG
	_KERNEL_LOCK_ABORT("spinout");
#endif
	return;
}

/*
 * Initialize the kernel lock.
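 *
 * The lock storage is allocated statically above, so this only sets up
 * the simple lock and registers it with LOCKDEBUG.  Afterwards the lock
 * is normally taken and released through the KERNEL_LOCK() and
 * KERNEL_UNLOCK_*() wrapper macros rather than by calling
 * _kernel_lock()/_kernel_unlock() directly; illustrative usage:
 *
 *	KERNEL_LOCK(1, NULL);
 *	...
 *	KERNEL_UNLOCK_ONE(NULL);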
 */
void
kernel_lock_init(void)
{

	__cpu_simple_lock_init(kernel_lock);
	kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
	    RETURN_ADDRESS);
}
CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));

/*
 * Print debugging information about the kernel lock.
 */
static void
_kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
{
	struct cpu_info *ci = curcpu();

	(void)junk;

	pr("curcpu holds : %18d wanted by: %#018lx\n",
	    ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
}

/*
 * Acquire 'nlocks' holds on the kernel lock.
 *
 * Although it may not look it, this is one of the most central, intricate
 * routines in the kernel, and tons of code elsewhere depends on its exact
 * behaviour.  If you change something in here, expect it to bite you in the
 * rear.
 */
void
_kernel_lock(int nlocks)
{
	struct cpu_info *ci;
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_FLAG(lsflag);
	struct lwp *owant;
	u_int starttime;
	int s;
	struct lwp *l = curlwp;

	_KERNEL_LOCK_ASSERT(nlocks > 0);

	s = splvm();
	ci = curcpu();
	if (ci->ci_biglock_count != 0) {
		_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
		SDT_PROBE1(sdt, kernel, lock, entry, nlocks);
		ci->ci_biglock_count += nlocks;
		l->l_blcnt += nlocks;
		splx(s);
		return;
	}

	_KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
	LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
	    0);

	if (__predict_true(__cpu_simple_lock_try(kernel_lock))) {
		atomic_store_relaxed(&kernel_lock_holder, curcpu());
		SDT_PROBE1(sdt, kernel, lock, entry, nlocks);
		ci->ci_biglock_count = nlocks;
		l->l_blcnt = nlocks;
		LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
		    RETURN_ADDRESS, 0);
		splx(s);
		return;
	}

	/*
	 * To remove the ordering constraint between adaptive mutexes
	 * and kernel_lock we must make it appear as if this thread is
	 * blocking.  For non-interlocked mutex release, a store fence
	 * is required to ensure that the result of any mutex_exit()
	 * by the current LWP becomes visible on the bus before the set
	 * of ci->ci_biglock_wanted becomes visible.
	 *
	 * This membar_producer matches the membar_consumer in
	 * mutex_vector_enter.
	 *
	 * That way, if l has just released a mutex, mutex_vector_enter
	 * can't see this store ci->ci_biglock_wanted := l until it
	 * will also see the mutex_exit store mtx->mtx_owner := 0 which
	 * clears the has-waiters bit.
	 */
	membar_producer();
	owant = ci->ci_biglock_wanted;
	atomic_store_relaxed(&ci->ci_biglock_wanted, l);
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
	l->l_ld_wanted = __builtin_return_address(0);
#endif

	/*
	 * Spin until we acquire the lock.  Once we have it, record the
	 * time spent with lockstat.
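	 *
	 * The IPL is dropped with splx() while we spin so that this CPU
	 * can still take interrupts, and raised again with splvm()
	 * before each attempt to take the lock.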
	 */
	LOCKSTAT_ENTER(lsflag);
	LOCKSTAT_START_TIMER(lsflag, spintime);

	starttime = getticks();
	do {
		splx(s);
		while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
			if (start_init_exec &&
			    (getticks() - starttime) > 10*hz) {
				kernel_lock_spinout();
			}
			SPINLOCK_BACKOFF_HOOK;
			SPINLOCK_SPIN_HOOK;
		}
		s = splvm();
	} while (!__cpu_simple_lock_try(kernel_lock));

	atomic_store_relaxed(&kernel_lock_holder, curcpu());

	SDT_PROBE1(sdt, kernel, lock, entry, nlocks);
	ci->ci_biglock_count = nlocks;
	l->l_blcnt = nlocks;
	LOCKSTAT_STOP_TIMER(lsflag, spintime);
	LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
	    RETURN_ADDRESS, 0);
	if (owant == NULL) {
		LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
		    LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
	}
	LOCKSTAT_EXIT(lsflag);
	splx(s);

	/*
	 * Now that we have kernel_lock, reset ci_biglock_wanted.  This
	 * store must be visible on other CPUs before a mutex_exit() on
	 * this CPU can test the has-waiters bit.
	 *
	 * This membar_enter matches the membar_enter in
	 * mutex_vector_enter.  (Yes, not membar_exit -- the legacy
	 * naming is confusing, but store-before-load usually pairs
	 * with store-before-load, in the extremely rare cases where it
	 * is used at all.)
	 *
	 * That way, mutex_vector_enter can't see this store
	 * ci->ci_biglock_wanted := owant until it has set the
	 * has-waiters bit.
	 */
	(void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif
}

/*
 * Release 'nlocks' holds on the kernel lock.  If 'nlocks' is zero, release
 * all holds.
 */
void
_kernel_unlock(int nlocks, int *countp)
{
	struct cpu_info *ci;
	u_int olocks;
	int s;
	struct lwp *l = curlwp;

	_KERNEL_LOCK_ASSERT(nlocks < 2);

	olocks = l->l_blcnt;

	if (olocks == 0) {
		_KERNEL_LOCK_ASSERT(nlocks <= 0);
		if (countp != NULL)
			*countp = 0;
		return;
	}

	_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));

	if (nlocks == 0)
		nlocks = olocks;
	else if (nlocks == -1) {
		nlocks = 1;
		_KERNEL_LOCK_ASSERT(olocks == 1);
	}
	s = splvm();
	ci = curcpu();
	_KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
	if (ci->ci_biglock_count == nlocks) {
		LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
		    RETURN_ADDRESS, 0);
		ci->ci_biglock_count = 0;
		__cpu_simple_unlock(kernel_lock);
		l->l_blcnt -= nlocks;
		splx(s);
		if (l->l_dopreempt)
			kpreempt(0);
	} else {
		ci->ci_biglock_count -= nlocks;
		l->l_blcnt -= nlocks;
		splx(s);
	}

	SDT_PROBE1(sdt, kernel, lock, exit, nlocks);

	if (countp != NULL)
		*countp = olocks;
}

bool
_kernel_locked_p(void)
{
	return __SIMPLELOCK_LOCKED_P(kernel_lock);
}