/*	$NetBSD: kern_lock.c,v 1.191 2026/01/03 23:08:16 riastradh Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.191 2026/01/03 23:08:16 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_lockdebug.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/sdt.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
#include <sys/ksyms.h>
#endif

#include <machine/lock.h>

#include <dev/lockstat.h>

SDT_PROBE_DEFINE1(sdt, kernel, lock, entry,
    "unsigned"/*nlocks*/);
SDT_PROBE_DEFINE1(sdt, kernel, lock, exit,
    "unsigned"/*nlocks*/);

#define	RETURN_ADDRESS	(uintptr_t)__builtin_return_address(0)

bool	kernel_lock_dodebug;

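/*
 * The kernel lock word and the identity of the CPU holding it live in
 * a cache-line-aligned structure of their own, so that they do not
 * share a cache line with unrelated data.  "kernel_lock" is a strong
 * alias for the start of that structure.
 */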
struct kernel_lock {
	__cpu_simple_lock_t	lock __aligned(CACHE_LINE_SIZE);
	struct cpu_info		*volatile holder;
} kernel_lock_cacheline[CACHE_LINE_SIZE / sizeof(struct kernel_lock)]
    __cacheline_aligned;
__strong_alias(kernel_lock, kernel_lock_cacheline)
#define	kernel_lock_holder	(kernel_lock_cacheline[0].holder)

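/*
 * assert_sleepable:
 *
 *	Panic if the current context is not allowed to sleep: that is,
 *	if we are running on the idle LWP, in hard or soft interrupt
 *	context, or inside a pserialize read section.
 */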
void
assert_sleepable(void)
{
	const char *reason;
	long pctr;
	bool idle;

	if (__predict_false(panicstr != NULL)) {
		return;
	}

	LOCKDEBUG_BARRIER(kernel_lock, 1);

	/*
	 * Avoid disabling/re-enabling preemption here since this
	 * routine may be called in delicate situations.
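	 *
	 * Instead, lwp_pctr() is used: it counts this LWP's context
	 * switches, so if it is unchanged across the CURCPU_IDLE_P()
	 * check below, the LWP was neither preempted nor migrated in
	 * between and the result of the check is valid.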
	 */
	do {
		pctr = lwp_pctr();
		idle = CURCPU_IDLE_P();
	} while (__predict_false(pctr != lwp_pctr()));

	reason = NULL;
	if (__predict_false(idle) && !cold) {
		reason = "idle";
		goto panic;
	}
	if (__predict_false(cpu_intr_p())) {
		reason = "interrupt";
		goto panic;
	}
	if (__predict_false(cpu_softintr_p())) {
		reason = "softint";
		goto panic;
	}
	if (__predict_false(!pserialize_not_in_read_section())) {
		reason = "pserialize";
		goto panic;
	}
	return;

panic:	panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS);
}

/*
 * Functions for manipulating the kernel_lock.  We put them here
 * so that they show up in profiles.
 */

#define	_KERNEL_LOCK_ABORT(msg)						\
    LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)

#ifdef LOCKDEBUG
#define	_KERNEL_LOCK_ASSERT(cond)					\
do {									\
	if (!(cond))							\
		_KERNEL_LOCK_ABORT("assertion failed: " #cond);		\
} while (/* CONSTCOND */ 0)
#else
#define	_KERNEL_LOCK_ASSERT(cond)	/* nothing */
#endif

static void	_kernel_lock_dump(const volatile void *, lockop_printer_t);

lockops_t _kernel_lock_ops = {
	.lo_name = "Kernel lock",
	.lo_type = LOCKOPS_SPIN,
	.lo_dump = _kernel_lock_dump,
};

#ifdef DDB
#include <ddb/ddb.h>
#endif

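/*
 * kernel_lock_trace_ipi:
 *
 *	IPI handler run on the CPU that is hogging the kernel lock:
 *	print who is holding it and, if DDB is present, a stack trace
 *	of the holder.
 */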
static void
kernel_lock_trace_ipi(void *cookie)
{

	printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()),
	    curlwp->l_lid,
	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
#ifdef DDB
	db_stacktrace();

	/*
	 * Make sure we leave a return address around for db_stacktrace
	 * to find.
	 */
	__insn_barrier();
	return;
#endif
}

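/*
 * kernel_lock_spinout:
 *
 *	Called when we have spun too long waiting for the kernel lock.
 *	Report the spinout (rate-limited to one report every ten
 *	seconds), IPI the holder so that it prints a stack trace, and
 *	abort if LOCKDEBUG is enabled.
 */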
static void
kernel_lock_spinout(void)
{
	static volatile unsigned kernel_lock_last_report;
	ipi_msg_t msg = {
		.func = kernel_lock_trace_ipi,
	};
	unsigned now, then;
	struct cpu_info *holder;

	/*
	 * Find who holds the kernel lock.  If nobody does, we can't
	 * report anything, so pass.  This is possible in principle
	 * because the lock is taken before the holder is recorded, but
	 * it is unlikely in practice: we have already spun for ten
	 * seconds before trying to report a problem.
	 */
	holder = atomic_load_relaxed(&kernel_lock_holder);
	if (holder == NULL)
		goto out;

	/*
	 * If we already reported kernel lock hogging in the last ten
	 * seconds, probably not worthwhile to fill the log buffer with
	 * repeated reports, so pass.
	 *
	 * XXX This can roll over, but only after decades of uptime.
	 */
	then = atomic_load_relaxed(&kernel_lock_last_report);
	now = time_uptime;
	if (now - then <= 10)
		goto out;
	if (atomic_cas_uint(&kernel_lock_last_report, then, now) != then)
		goto out;

	/*
	 * Disable preemption while we send an IPI to whatever CPU
	 * holds the kernel lock.
	 */
	printf("%s[%d %s]: kernel lock spinout\n", cpu_name(curcpu()),
	    curlwp->l_lid,
	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
	kpreempt_disable();
	ipi_unicast(&msg, holder);
	ipi_wait(&msg);
	kpreempt_enable();

out:
#ifdef LOCKDEBUG
	_KERNEL_LOCK_ABORT("spinout");
#endif
	return;
}

/*
 * Initialize the kernel lock.
 */
void
kernel_lock_init(void)
{

	__cpu_simple_lock_init(kernel_lock);
	kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
	    RETURN_ADDRESS);
}
CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));

/*
 * Print debugging information about the kernel lock.
 */
static void
_kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
{
	struct cpu_info *ci = curcpu();

	(void)junk;

	pr("curcpu holds : %18d wanted by: %#018lx\n",
	    ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
}

/*
 * Acquire 'nlocks' holds on the kernel lock.
 *
 * Although it may not look it, this is one of the most central, intricate
 * routines in the kernel, and tons of code elsewhere depends on its exact
 * behaviour.  If you change something in here, expect it to bite you in the
 * rear.
 */
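/*
 * Note (illustrative): most code does not call _kernel_lock() and
 * _kernel_unlock() directly, but goes through the KERNEL_LOCK() and
 * KERNEL_UNLOCK_*() macros (see <sys/lock.h>), e.g.:
 *
 *	KERNEL_LOCK(1, NULL);
 *	... code that requires the big lock ...
 *	KERNEL_UNLOCK_ONE(NULL);
 */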
void
_kernel_lock(int nlocks)
{
	struct cpu_info *ci;
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_FLAG(lsflag);
	struct lwp *owant;
	u_int starttime;
	int s;
	struct lwp *l = curlwp;

	_KERNEL_LOCK_ASSERT(nlocks > 0);

	s = splvm();
	ci = curcpu();
	if (ci->ci_biglock_count != 0) {
		_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
		SDT_PROBE1(sdt, kernel, lock, entry,  nlocks);
		ci->ci_biglock_count += nlocks;
		l->l_blcnt += nlocks;
		splx(s);
		return;
	}

	_KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
	LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
	    0);

	if (__predict_true(__cpu_simple_lock_try(kernel_lock))) {
		atomic_store_relaxed(&kernel_lock_holder, curcpu());
		SDT_PROBE1(sdt, kernel, lock, entry,  nlocks);
		ci->ci_biglock_count = nlocks;
		l->l_blcnt = nlocks;
		LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
		    RETURN_ADDRESS, 0);
		splx(s);
		return;
	}

	/*
	 * To remove the ordering constraint between adaptive mutexes
	 * and kernel_lock we must make it appear as if this thread is
	 * blocking.  For non-interlocked mutex release, a store fence
	 * is required to ensure that the result of any mutex_exit()
	 * by the current LWP becomes visible on the bus before the set
	 * of ci->ci_biglock_wanted becomes visible.
	 *
	 * This membar_producer matches the membar_consumer in
	 * mutex_vector_enter.
	 *
	 * That way, if l has just released a mutex, mutex_vector_enter
	 * can't see this store ci->ci_biglock_wanted := l until it
	 * also sees the mutex_exit store mtx->mtx_owner := 0 which
	 * clears the has-waiters bit.
	 */
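	/*
	 * In other words, this CPU's stores are ordered as follows:
	 *
	 *	mtx->mtx_owner := 0		(earlier mutex_exit())
	 *	membar_producer()
	 *	ci->ci_biglock_wanted := l
	 *
	 * so a remote CPU that observes the last store (after the
	 * matching membar_consumer) must also observe the first.
	 */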
	membar_producer();
	owant = ci->ci_biglock_wanted;
	atomic_store_relaxed(&ci->ci_biglock_wanted, l);
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
	l->l_ld_wanted = __builtin_return_address(0);
#endif

	/*
	 * Spin until we acquire the lock.  Once we have it, record the
	 * time spent with lockstat.
	 */
	LOCKSTAT_ENTER(lsflag);
	LOCKSTAT_START_TIMER(lsflag, spintime);

	starttime = getticks();
	do {
		splx(s);
		while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
			if (start_init_exec &&
			    (getticks() - starttime) > 10*hz) {
				kernel_lock_spinout();
			}
			SPINLOCK_BACKOFF_HOOK;
			SPINLOCK_SPIN_HOOK;
		}
		s = splvm();
	} while (!__cpu_simple_lock_try(kernel_lock));

	atomic_store_relaxed(&kernel_lock_holder, curcpu());

	SDT_PROBE1(sdt, kernel, lock, entry,  nlocks);
	ci->ci_biglock_count = nlocks;
	l->l_blcnt = nlocks;
	LOCKSTAT_STOP_TIMER(lsflag, spintime);
	LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
	    RETURN_ADDRESS, 0);
	if (owant == NULL) {
		LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
		    LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
	}
	LOCKSTAT_EXIT(lsflag);
	splx(s);

	/*
	 * Now that we have kernel_lock, reset ci_biglock_wanted.  This
	 * store must be visible on other CPUs before a mutex_exit() on
	 * this CPU can test the has-waiters bit.
	 *
	 * This membar_enter matches the membar_enter in
	 * mutex_vector_enter.  (Yes, not membar_exit -- the legacy
	 * naming is confusing, but store-before-load usually pairs
	 * with store-before-load, in the extremely rare cases where it
	 * is used at all.)
	 *
	 * That way, mutex_vector_enter can't see this store
	 * ci->ci_biglock_wanted := owant until it has set the
	 * has-waiters bit.
	 */
	(void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif
}

/*
 * Release 'nlocks' holds on the kernel lock.  If 'nlocks' is zero, release
 * all holds.
 */
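/*
 * Note (illustrative): nlocks == -1 means "release the single hold we
 * are known to have" and asserts that exactly one hold is held; this
 * is the convention used by KERNEL_UNLOCK_LAST() in <sys/lock.h>.  If
 * countp is not NULL, the number of holds previously held is stored
 * there so the caller can reacquire them later with _kernel_lock().
 */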
void
_kernel_unlock(int nlocks, int *countp)
{
	struct cpu_info *ci;
	u_int olocks;
	int s;
	struct lwp *l = curlwp;

	_KERNEL_LOCK_ASSERT(nlocks < 2);

	olocks = l->l_blcnt;

	if (olocks == 0) {
		_KERNEL_LOCK_ASSERT(nlocks <= 0);
		if (countp != NULL)
			*countp = 0;
		return;
	}

	_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));

	if (nlocks == 0)
		nlocks = olocks;
	else if (nlocks == -1) {
		nlocks = 1;
		_KERNEL_LOCK_ASSERT(olocks == 1);
	}
	s = splvm();
	ci = curcpu();
	_KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
	if (ci->ci_biglock_count == nlocks) {
		LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
		    RETURN_ADDRESS, 0);
		ci->ci_biglock_count = 0;
		__cpu_simple_unlock(kernel_lock);
		l->l_blcnt -= nlocks;
		splx(s);
		if (l->l_dopreempt)
			kpreempt(0);
	} else {
		ci->ci_biglock_count -= nlocks;
		l->l_blcnt -= nlocks;
		splx(s);
	}

	SDT_PROBE1(sdt, kernel, lock, exit,  nlocks);

	if (countp != NULL)
		*countp = olocks;
}

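/*
 * _kernel_locked_p:
 *
 *	Return true if the kernel lock is currently held by some CPU.
 *	This only inspects the lock word; it does not tell whether the
 *	current LWP is the holder.
 */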
bool
_kernel_locked_p(void)
{
	return __SIMPLELOCK_LOCKED_P(kernel_lock);
}
    443