Home | History | Annotate | Line # | Download | only in kern
      1 /*	$NetBSD: kern_lock.c,v 1.195 2026/03/29 08:36:43 kre Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023
      5  *     The NetBSD Foundation, Inc.
      6  * All rights reserved.
      7  *
      8  * This code is derived from software contributed to The NetBSD Foundation
      9  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
     10  * NASA Ames Research Center, and by Andrew Doran.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  *
     21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     31  * POSSIBILITY OF SUCH DAMAGE.
     32  */
     33 
     34 #include <sys/cdefs.h>
     35 __KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.195 2026/03/29 08:36:43 kre Exp $");
     36 
     37 #ifdef _KERNEL_OPT
     38 #include "opt_lockdebug.h"
     39 #endif
     40 
     41 #include <sys/param.h>
     42 #include <sys/types.h>
     43 
     44 #include <sys/atomic.h>
     45 #include <sys/cpu.h>
     46 #include <sys/kernel.h>
     47 #include <sys/lock.h>
     48 #include <sys/lockdebug.h>
     49 #include <sys/lwp.h>
     50 #include <sys/proc.h>
     51 #include <sys/pserialize.h>
     52 #include <sys/sdt.h>
     53 #include <sys/syslog.h>
     54 #include <sys/systm.h>
     55 
     56 #if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
     57 #include <sys/ksyms.h>
     58 #endif
     59 
     60 #include <machine/lock.h>
     61 
     62 #include <dev/lockstat.h>
     63 
/*
 * Statically defined tracing probes sdt:kernel:lock:entry and
 * sdt:kernel:lock:exit.  Each takes a single argument, the number of
 * holds involved; they are fired from _kernel_lock() and
 * _kernel_unlock() respectively.
 */
SDT_PROBE_DEFINE1(sdt, kernel, lock, entry,
    "unsigned"/*nlocks*/);
SDT_PROBE_DEFINE1(sdt, kernel, lock, exit,
    "unsigned"/*nlocks*/);
     68 
/*
 * Address the current function will return to, as an integer.  It is
 * handed to the LOCKDEBUG/lockstat hooks and to panic() to identify the
 * caller.  The expansion is fully parenthesized so that it remains a
 * single operand in any expression context (for example under sizeof,
 * or when followed by a postfix operator).
 */
#define	RETURN_ADDRESS	((uintptr_t)__builtin_return_address(0))
     70 
/*
 * Result of LOCKDEBUG_ALLOC() for the kernel lock.  Set once by
 * kernel_lock_init() and passed to the LOCKDEBUG_*() hooks in
 * _kernel_lock() and _kernel_unlock().
 */
bool	kernel_lock_dodebug;

/*
 * Storage for the kernel lock.
 *
 * lock:	the spin lock word, aligned to CACHE_LINE_SIZE.
 * holder:	the CPU that most recently took the lock.  It is stored
 *		(relaxed) by _kernel_lock() right after the lock is
 *		acquired and is not cleared by _kernel_unlock(), so it
 *		is only a hint; kernel_lock_spinout() is the consumer.
 *
 * `kernel_lock' is a strong alias for kernel_lock_cacheline.  Because
 * `lock' is the first member, the alias has the same address as the
 * lock word of element 0.
 */
struct kernel_lock {
	__cpu_simple_lock_t	lock __aligned(CACHE_LINE_SIZE);
	struct cpu_info		*volatile holder;
} kernel_lock_cacheline[CACHE_LINE_SIZE / sizeof(struct kernel_lock)]
    __cacheline_aligned;
__strong_alias(kernel_lock, kernel_lock_cacheline)
/* Shorthand for the holder hint stored next to the lock word. */
#define	kernel_lock_holder	(kernel_lock_cacheline[0].holder)
     80 
     81 void
     82 assert_sleepable(void)
     83 {
     84 	const char *reason;
     85 	long pctr;
     86 	bool idle;
     87 
     88 	if (__predict_false(panicstr != NULL)) {
     89 		return;
     90 	}
     91 
     92 	LOCKDEBUG_BARRIER(kernel_lock, 1);
     93 
     94 	/*
     95 	 * Avoid disabling/re-enabling preemption here since this
     96 	 * routine may be called in delicate situations.
     97 	 */
     98 	do {
     99 		pctr = lwp_pctr();
    100 		idle = CURCPU_IDLE_P();
    101 	} while (__predict_false(pctr != lwp_pctr()));
    102 
    103 	reason = NULL;
    104 	if (__predict_false(idle) && !cold) {
    105 		reason = "idle";
    106 		goto panic;
    107 	}
    108 	if (__predict_false(cpu_intr_p())) {
    109 		reason = "interrupt";
    110 		goto panic;
    111 	}
    112 	if (__predict_false(cpu_softintr_p())) {
    113 		reason = "softint";
    114 		goto panic;
    115 	}
    116 	if (__predict_false(!pserialize_not_in_read_section())) {
    117 		reason = "pserialize";
    118 		goto panic;
    119 	}
    120 	return;
    121 
    122 panic:	panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS);
    123 }
    124 
    125 /*
    126  * Functions for manipulating the kernel_lock.  We put them here
    127  * so that they show up in profiles.
    128  */
    129 
/*
 * Report a kernel lock problem through LOCKDEBUG_ABORT(), tagged with
 * the current function name and line number and the kernel lock's
 * lockops.
 */
#define	_KERNEL_LOCK_ABORT(msg)						\
    LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)

/*
 * Assertion used by the kernel lock routines.  With LOCKDEBUG, a false
 * `cond' calls _KERNEL_LOCK_ABORT() with the stringified condition.
 * Without LOCKDEBUG the macro expands to nothing, so `cond' is not
 * evaluated at all and must not carry side effects.
 */
#ifdef LOCKDEBUG
#define	_KERNEL_LOCK_ASSERT(cond)					\
do {									\
	if (!(cond))							\
		_KERNEL_LOCK_ABORT("assertion failed: " #cond);		\
} while (/* CONSTCOND */ 0)
#else
#define	_KERNEL_LOCK_ASSERT(cond)	/* nothing */
#endif
    142 
/* Dump routine referenced by the lockops below; defined later in this file. */
static void	_kernel_lock_dump(const volatile void *, lockop_printer_t);

/*
 * Lock operations descriptor for the kernel lock: its printable name,
 * its type (a spin lock) and the routine that dumps its state.  It is
 * passed to LOCKDEBUG_ALLOC() and LOCKDEBUG_ABORT().
 */
lockops_t _kernel_lock_ops = {
	.lo_name = "Kernel lock",
	.lo_type = LOCKOPS_SPIN,
	.lo_dump = _kernel_lock_dump,
};
    150 
    151 #ifdef DDB
    152 #include <ddb/ddb.h>
    153 #endif
    154 
    155 static void
    156 kernel_lock_trace_ipi(void *cookie)
    157 {
    158 
    159 	printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()),
    160 	    curlwp->l_lid,
    161 	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
    162 #ifdef DDB
    163 	db_stacktrace();
    164 
    165 	/*
    166 	 * Make sure we leave a return address around for db_stacktrace
    167 	 * to find.
    168 	 */
    169 	__insn_barrier();
    170 	return;
    171 #endif
    172 }
    173 
    174 static void
    175 kernel_lock_spinout(void)
    176 {
    177 	static volatile unsigned kernel_lock_last_report;
    178 	ipi_msg_t msg = {
    179 		.func = kernel_lock_trace_ipi,
    180 	};
    181 	unsigned now, then;
    182 	struct cpu_info *holder;
    183 
    184 	/*
    185 	 * Disable preemption so that curcpu() is stable in this function.
    186 	 * Otherwise, it's possible for the curlwp to be migrated to the
    187 	 * holder cpu in the meantime.
    188 	 */
    189 	kpreempt_disable();
    190 
    191 	/*
    192 	 * Find who holds the kernel lock.  If nobody, we can't report
    193 	 * anything, so pass -- but while this is possible in principle
    194 	 * because we first take the lock and then set the holder, it
    195 	 * is rather unlikely to actually happen in practice because we
    196 	 * wait 10sec to take the lock before trying to report a
    197 	 * problem anyway.
    198 	 */
    199 	if (!__SIMPLELOCK_LOCKED_P(kernel_lock))
    200 		goto out;
    201 
    202 	/*
    203 	 * Note: holder == NULL here basically means
    204 	 * "no one has acquired kernel lock since the boot".
    205 	 *
    206 	 * Theoretically it's possible the first locker has acquired
    207 	 * kernel_lock but has not updated kernel_lock_holder yet.
    208 	 * But it's only theoretical, I suppose.
    209 	 */
    210 	holder = atomic_load_relaxed(&kernel_lock_holder);
    211 	if (holder == NULL)
    212 		goto out;
    213 
    214 	/*
    215 	 * We know we don't have the kernel lock.
    216 	 *
    217 	 * However, the holder value is not reliable because we don't
    218 	 * hold kernel lock. For example, an interrupt on this cpu may
    219 	 * acquire/release the kernel lock and leave kernel_lock_holder
    220 	 * pointing to us.
    221 	 */
    222 	if (holder == curcpu())
    223 		goto out;
    224 
    225 	/*
    226 	 * If we already reported kernel lock hogging in the last ten
    227 	 * seconds, probably not worthwhile to fill the log buffer with
    228 	 * repeated reports, so pass.
    229 	 *
    230 	 * XXX This can roll over, but only after decades of uptime.
    231 	 */
    232 	then = atomic_load_relaxed(&kernel_lock_last_report);
    233 	now = time_uptime;
    234 	if (now - then <= 10)
    235 		goto out;
    236 	if (atomic_cas_uint(&kernel_lock_last_report, then, now) != then)
    237 		goto out;
    238 
    239 	printf("%s[%d %s]: kernel lock spinout\n", cpu_name(curcpu()),
    240 	    curlwp->l_lid,
    241 	    curlwp->l_name ? curlwp->l_name : curproc->p_comm);
    242 	/*
    243 	 * Send an IPI to whatever CPU holds the kernel lock.
    244 	 */
    245 	ipi_unicast(&msg, holder);
    246 	ipi_wait(&msg);
    247 out:
    248 	kpreempt_enable();
    249 
    250 #ifdef LOCKDEBUG
    251 	_KERNEL_LOCK_ABORT("spinout");
    252 #endif
    253 	return;
    254 }
    255 
    256 /*
    257  * Initialize the kernel lock.
    258  */
    259 void
    260 kernel_lock_init(void)
    261 {
    262 
    263 	__cpu_simple_lock_init(kernel_lock);
    264 	kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
    265 	    RETURN_ADDRESS);
    266 }
    267 CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));
    268 
    269 /*
    270  * Print debugging information about the kernel lock.
    271  */
    272 static void
    273 _kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
    274 {
    275 	struct cpu_info *ci = curcpu();
    276 
    277 	(void)junk;
    278 
    279 	pr("curcpu holds : %18d wanted by: %#018lx\n",
    280 	    ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
    281 }
    282 
/*
 * Acquire 'nlocks' holds on the kernel lock.
 *
 * Although it may not look it, this is one of the most central, intricate
 * routines in the kernel, and tons of code elsewhere depends on its exact
 * behaviour.  If you change something in here, expect it to bite you in the
 * rear.
 *
 * 'nlocks' must be positive (asserted under LOCKDEBUG).  Holds are
 * counted both per CPU (ci_biglock_count) and per LWP (l_blcnt).  There
 * are three paths:
 *
 *  - recursive:   this CPU already has holds, so only the counts grow;
 *  - uncontended: a single try-lock succeeds;
 *  - contended:   advertise the LWP in ci_biglock_wanted, spin until the
 *                 lock is taken, then restore the previous
 *                 ci_biglock_wanted value.
 */
void
_kernel_lock(int nlocks)
{
	struct cpu_info *ci;
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_FLAG(lsflag);
	struct lwp *owant;
	u_int starttime;
	int s;
	struct lwp *l = curlwp;

	_KERNEL_LOCK_ASSERT(nlocks > 0);

	s = splvm();
	ci = curcpu();
	/* Recursive path: this CPU already holds the lock; just count. */
	if (ci->ci_biglock_count != 0) {
		_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
		SDT_PROBE1(sdt, kernel, lock, entry,  nlocks);
		ci->ci_biglock_count += nlocks;
		l->l_blcnt += nlocks;
		splx(s);
		return;
	}

	_KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
	LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
	    0);

	/* Uncontended path: one try-lock and we are done. */
	if (__predict_true(__cpu_simple_lock_try(kernel_lock))) {
		/* Leave a hint for kernel_lock_spinout() on other CPUs. */
		atomic_store_relaxed(&kernel_lock_holder, curcpu());
		SDT_PROBE1(sdt, kernel, lock, entry,  nlocks);
		ci->ci_biglock_count = nlocks;
		l->l_blcnt = nlocks;
		LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
		    RETURN_ADDRESS, 0);
		splx(s);
		return;
	}

	/*
	 * To remove the ordering constraint between adaptive mutexes
	 * and kernel_lock we must make it appear as if this thread is
	 * blocking.  For non-interlocked mutex release, a store fence
	 * is required to ensure that the result of any mutex_exit()
	 * by the current LWP becomes visible on the bus before the set
	 * of ci->ci_biglock_wanted becomes visible.
	 *
	 * This membar_producer matches the membar_consumer in
	 * mutex_vector_enter.
	 *
	 * That way, if l has just released a mutex, mutex_vector_enter
	 * can't see this store ci->ci_biglock_wanted := l until it
	 * will also see the mutex_exit store mtx->mtx_owner := 0 which
	 * clears the has-waiters bit.
	 */
	membar_producer();
	/* Remember the previous value; it is put back once we have the lock. */
	owant = ci->ci_biglock_wanted;
	atomic_store_relaxed(&ci->ci_biglock_wanted, l);
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
	/* Record where this LWP started wanting the lock. */
	l->l_ld_wanted = __builtin_return_address(0);
#endif

	/*
	 * Spin until we acquire the lock.  Once we have it, record the
	 * time spent with lockstat.
	 */
	LOCKSTAT_ENTER(lsflag);
	LOCKSTAT_START_TIMER(lsflag, spintime);

	starttime = getticks();
	do {
		/*
		 * Spin at the caller's original IPL (s) and only return
		 * to splvm for the actual attempt to take the lock.
		 */
		splx(s);
		while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
			/*
			 * Once start_init_exec is set and we have been
			 * waiting for more than 10*hz ticks, try to
			 * report who is holding the lock.
			 */
			if (start_init_exec &&
			    (getticks() - starttime) > 10*hz) {
				kernel_lock_spinout();
			}
			SPINLOCK_BACKOFF_HOOK;
			SPINLOCK_SPIN_HOOK;
		}
		s = splvm();
	} while (!__cpu_simple_lock_try(kernel_lock));

	/* Leave a hint for kernel_lock_spinout() on other CPUs. */
	atomic_store_relaxed(&kernel_lock_holder, curcpu());

	SDT_PROBE1(sdt, kernel, lock, entry,  nlocks);
	ci->ci_biglock_count = nlocks;
	l->l_blcnt = nlocks;
	LOCKSTAT_STOP_TIMER(lsflag, spintime);
	LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
	    RETURN_ADDRESS, 0);
	/*
	 * Only log a lockstat event when ci_biglock_wanted was empty at
	 * the time we started spinning.
	 */
	if (owant == NULL) {
		LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
		    LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
	}
	LOCKSTAT_EXIT(lsflag);
	splx(s);

	/*
	 * Now that we have kernel_lock, reset ci_biglock_wanted.  This
	 * store must be visible on other CPUs before a mutex_exit() on
	 * this CPU can test the has-waiters bit.
	 *
	 * This membar_enter matches the membar_enter in
	 * mutex_vector_enter.  (Yes, not membar_exit -- the legacy
	 * naming is confusing, but store-before-load usually pairs
	 * with store-before-load, in the extremely rare cases where it
	 * is used at all.)
	 *
	 * That way, mutex_vector_enter can't see this store
	 * ci->ci_biglock_wanted := owant until it has set the
	 * has-waiters bit.
	 */
	(void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif
}
    409 
/*
 * Release 'nlocks' holds on the kernel lock.  If 'nlocks' is zero, release
 * all holds.
 *
 * Accepted values of 'nlocks' (anything >= 2 trips an assertion under
 * LOCKDEBUG):
 *
 *	 1	release one hold;
 *	 0	release every hold the current LWP has;
 *	-1	release one hold and assert that it was the only one.
 *
 * If 'countp' is not NULL it receives the number of holds the current
 * LWP had on entry (zero if it had none, in which case nothing else is
 * done).
 */
void
_kernel_unlock(int nlocks, int *countp)
{
	struct cpu_info *ci;
	u_int olocks;
	int s;
	struct lwp *l = curlwp;

	_KERNEL_LOCK_ASSERT(nlocks < 2);

	olocks = l->l_blcnt;

	/* This LWP holds nothing: report zero and leave. */
	if (olocks == 0) {
		_KERNEL_LOCK_ASSERT(nlocks <= 0);
		if (countp != NULL)
			*countp = 0;
		return;
	}

	_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));

	/* Translate the special values into an actual release count. */
	if (nlocks == 0)
		nlocks = olocks;
	else if (nlocks == -1) {
		nlocks = 1;
		_KERNEL_LOCK_ASSERT(olocks == 1);
	}
	s = splvm();
	ci = curcpu();
	_KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
	if (ci->ci_biglock_count == nlocks) {
		/*
		 * Dropping the CPU's last holds: really release the spin
		 * lock, and once the IPL is restored call kpreempt(0) if
		 * l_dopreempt is set on this LWP.
		 */
		LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
		    RETURN_ADDRESS, 0);
		ci->ci_biglock_count = 0;
		__cpu_simple_unlock(kernel_lock);
		l->l_blcnt -= nlocks;
		splx(s);
		if (l->l_dopreempt)
			kpreempt(0);
	} else {
		/* Holds remain on this CPU: only the counts shrink. */
		ci->ci_biglock_count -= nlocks;
		l->l_blcnt -= nlocks;
		splx(s);
	}

	/* Fires with the number of holds actually released. */
	SDT_PROBE1(sdt, kernel, lock, exit,  nlocks);

	if (countp != NULL)
		*countp = olocks;
}
    464 
    465 bool
    466 _kernel_locked_p(void)
    467 {
    468 	return __SIMPLELOCK_LOCKED_P(kernel_lock);
    469 }
    470