/*	$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel reader/writer lock implementation, modeled after those
 * found in Solaris, a description of which can be found in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * The NetBSD implementation is different from that described in the book,
 * in that the locks are adaptive.  Lock waiters spin wait while the lock
 * holders are on CPU (if the holds can be tracked: up to N per-thread).
 *
 * While spin waiting, threads compete for the lock without the assistance
 * of turnstiles.  If a lock holder sleeps for any reason, the lock waiters
 * will also sleep in response and at that point turnstiles, priority
 * inheritance and strong efforts at ensuring fairness come into play.
 *
 * The adaptive behaviour is controlled by the RW_SPIN flag bit, which is
 * cleared by a lock owner that is going off the CPU, and set again by the
 * lock owner that releases the last hold on the lock.
 */
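
/*
 * A minimal usage sketch of the interface implemented below (see rwlock(9)
 * for the authoritative description); the lock and data names here are
 * hypothetical, not part of this file:
 *
 *	static krwlock_t data_lock;
 *
 *	rw_init(&data_lock);
 *
 *	rw_enter(&data_lock, RW_READER);	readers may run concurrently
 *	... inspect shared data ...
 *	rw_exit(&data_lock);
 *
 *	rw_enter(&data_lock, RW_WRITER);	writers have exclusive access
 *	... modify shared data ...
 *	rw_exit(&data_lock);
 *
 *	rw_destroy(&data_lock);
 */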

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $");

#include "opt_lockdebug.h"

#define	__RWLOCK_PRIVATE

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/systm.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lock.h>
#include <sys/pserialize.h>

#include <dev/lockstat.h>

/*
 * LOCKDEBUG
 */

#define	RW_DEBUG_P(rw)		(((rw)->rw_owner & RW_NODEBUG) == 0)

#define	RW_WANTLOCK(rw, op) \
    LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_LOCKED(rw, op) \
    LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_UNLOCKED(rw, op) \
    LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);

/*
 * DIAGNOSTIC
 */

#if defined(DIAGNOSTIC)
#define	RW_ASSERT(rw, cond) \
do { \
	if (__predict_false(!(cond))) \
		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define	RW_ASSERT(rw, cond)	/* nothing */
#endif	/* DIAGNOSTIC */

/*
 * Memory barriers.
 */
#ifdef __HAVE_ATOMIC_AS_MEMBAR
#define	RW_MEMBAR_ENTER()
#define	RW_MEMBAR_EXIT()
#define	RW_MEMBAR_PRODUCER()
#else
#define	RW_MEMBAR_ENTER()		membar_enter()
#define	RW_MEMBAR_EXIT()		membar_exit()
#define	RW_MEMBAR_PRODUCER()		membar_producer()
#endif

static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
static void	rw_dump(const volatile void *, lockop_printer_t);
static lwp_t	*rw_owner(wchan_t);

lockops_t rwlock_lockops = {
	.lo_name = "Reader / writer lock",
	.lo_type = LOCKOPS_SLEEP,
	.lo_dump = rw_dump,
};

syncobj_t rw_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= turnstile_unsleep,
	.sobj_changepri	= turnstile_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= rw_owner,
};

/*
 * rw_cas:
 *
 *	Do an atomic compare-and-swap on the lock word.
 */
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
	    (void *)o, (void *)n);
}
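
/*
 * All lock word updates go through rw_cas() (or the atomics below) using
 * the usual optimistic retry idiom; a sketch of the pattern that the
 * enter/exit paths in this file follow:
 *
 *	for (owner = rw->rw_owner;;) {
 *		newown = ...computed from owner...;
 *		next = rw_cas(rw, owner, newown);
 *		if (next == owner)
 *			break;		the CAS took effect
 *		owner = next;		lost a race: retry with a fresh value
 *	}
 */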

/*
 * rw_and:
 *
 *	Do an atomic AND on the lock word.
 */
static inline void
rw_and(krwlock_t *rw, uintptr_t m)
{

#ifdef _LP64
	atomic_and_64(&rw->rw_owner, m);
#else
	atomic_and_32(&rw->rw_owner, m);
#endif
}

/*
 * rw_swap:
 *
 *	Do an atomic swap of the lock word.  This is used only when it's
 *	known that the lock word is set up such that it can't be changed
 *	behind us (assert this), so there's no point considering the result.
 */
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
	    (void *)n);

	RW_ASSERT(rw, n == o);
	RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}

/*
 * rw_hold_remember:
 *
 *	Helper - when acquiring a lock, record the new hold.
 */
static inline uintptr_t
rw_hold_remember(krwlock_t *rw, lwp_t *l)
{
	int i;

	KASSERT(kpreempt_disabled());

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (__predict_true(l->l_rwlocks[i] == NULL)) {
			l->l_rwlocks[i] = rw;
			/*
			 * Clear the write wanted flag on every acquire to
			 * give readers a chance once again.
			 */
			return ~RW_WRITE_WANTED;
		}
	}

	/*
	 * Nowhere to track the hold so we lose: temporarily disable
	 * spinning on the lock.
	 */
	return ~(RW_WRITE_WANTED | RW_SPIN);
}

/*
 * rw_hold_forget:
 *
 *	Helper - when releasing a lock, stop tracking the hold.
 */
static inline void
rw_hold_forget(krwlock_t *rw, lwp_t *l)
{
	int i;

	KASSERT(kpreempt_disabled());

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (__predict_true(l->l_rwlocks[i] == rw)) {
			l->l_rwlocks[i] = NULL;
			return;
		}
	}
}

/*
 * rw_switch:
 *
 *	Called by mi_switch() to indicate that an LWP is going off the CPU.
 */
void
rw_switch(void)
{
	lwp_t *l = curlwp;
	int i;

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (l->l_rwlocks[i] != NULL) {
			rw_and(l->l_rwlocks[i], ~RW_SPIN);
			/* Leave in place for exit to clear. */
		}
	}
}

/*
 * rw_dump:
 *
 *	Dump the contents of a rwlock structure.
 */
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
	const volatile krwlock_t *rw = cookie;

	pr("owner/count  : %#018lx flags    : %#018x\n",
	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}

/*
 * rw_abort:
 *
 *	Dump information about an error and panic the system.  This
 *	generates a lot of machine code in the DIAGNOSTIC case, so
 *	we ask the compiler to not inline it.
 */
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{

	if (panicstr != NULL)
		return;

	LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}

/*
 * rw_init:
 *
 *	Initialize a rwlock for use.
 */
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{

	if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
		rw->rw_owner = RW_SPIN;
	else
		rw->rw_owner = RW_SPIN | RW_NODEBUG;
}

void
rw_init(krwlock_t *rw)
{

	_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_destroy:
 *
 *	Tear down a rwlock.
 */
void
rw_destroy(krwlock_t *rw)
{

	RW_ASSERT(rw, (rw->rw_owner & ~(RW_NODEBUG | RW_SPIN)) == 0);
	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}

/*
 * rw_vector_enter:
 *
 *	The slow path for acquiring a rwlock, that considers all conditions.
 *	Marked __noinline to prevent the compiler pulling it into rw_enter().
 */
static void __noinline
rw_vector_enter(krwlock_t *rw, const krw_t op, uintptr_t mask, uintptr_t ra)
{
	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
	turnstile_t *ts;
	int queue;
	lwp_t *l;
	LOCKSTAT_TIMER(slptime);
	LOCKSTAT_COUNTER(slpcnt);
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_COUNTER(spincnt);
	LOCKSTAT_FLAG(lsflag);

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, kpreempt_disabled());
	RW_WANTLOCK(rw, op);

	if (panicstr == NULL) {
		KDASSERT(pserialize_not_in_read_section());
		LOCKDEBUG_BARRIER(&kernel_lock, 1);
	}

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		set_wait = RW_HAS_WAITERS;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
		queue = TS_READER_Q;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
		queue = TS_WRITER_Q;
	}

	LOCKSTAT_ENTER(lsflag);

	for (owner = rw->rw_owner;;) {
		/*
		 * Read the lock owner field.  If the need-to-wait
		 * indicator is clear, then try to acquire the lock.
		 */
		if ((owner & need_wait) == 0) {
			next = rw_cas(rw, owner, (owner + incr) & mask);
			if (__predict_true(next == owner)) {
				/* Got it! */
				RW_MEMBAR_ENTER();
				break;
			}

			/*
			 * Didn't get it -- spin around again (we'll
			 * probably sleep on the next iteration).
			 */
			owner = next;
			continue;
		}
		if (__predict_false(RW_OWNER(rw) == curthread)) {
			rw_abort(__func__, __LINE__, rw,
			    "locking against myself");
		}

		/*
		 * If the lock owner is running on another CPU, and there
		 * are no existing waiters, then spin.  Notes:
		 *
		 * 1) If an LWP on this CPU (possibly curlwp, or an LWP that
		 * curlwp has interrupted) holds kernel_lock, we can't spin
		 * without a deadlock.  The CPU that holds the rwlock may be
		 * blocked trying to acquire kernel_lock, or there may be an
		 * unseen chain of dependent locks.  To defeat the potential
		 * deadlock, this LWP needs to sleep (and thereby directly
		 * drop the kernel_lock, or permit the interrupted LWP that
		 * holds kernel_lock to complete its work).
		 *
		 * 2) If trying to acquire a write lock, and the lock is
		 * currently read held, after a brief wait set the write
		 * wanted bit to block out new readers and try to avoid
		 * starvation.  When the hold is acquired, we'll clear the
		 * WRITE_WANTED flag to give readers a chance again.  With
		 * luck this should nudge things in the direction of
		 * interleaving readers and writers when there is high
		 * contention.
		 *
		 * 3) The spin wait can't be done in soft interrupt context,
		 * because a lock holder could be pinned down underneath the
		 * soft interrupt LWP (i.e. curlwp) on the same CPU.  For
		 * the lock holder to make progress and release the lock,
		 * the soft interrupt needs to sleep.
		 */
		if ((owner & RW_SPIN) != 0 && !cpu_softintr_p()) {
			LOCKSTAT_START_TIMER(lsflag, spintime);
			u_int count = SPINLOCK_BACKOFF_MIN;
			do {
				KPREEMPT_ENABLE(curlwp);
				SPINLOCK_BACKOFF(count);
				KPREEMPT_DISABLE(curlwp);
				owner = rw->rw_owner;
				if ((owner & need_wait) == 0)
					break;
				if (count != SPINLOCK_BACKOFF_MAX)
					continue;
				if (curcpu()->ci_biglock_count != 0)
					break;
				if (op == RW_WRITER &&
				    (owner & RW_WRITE_LOCKED) == 0 &&
				    (owner & RW_WRITE_WANTED) == 0) {
					(void)rw_cas(rw, owner,
					    owner | RW_WRITE_WANTED);
				}
			} while ((owner & RW_SPIN) != 0);
			LOCKSTAT_STOP_TIMER(lsflag, spintime);
			LOCKSTAT_COUNT(spincnt, 1);
			if ((owner & need_wait) == 0)
				continue;
		}

		/*
		 * Grab the turnstile chain lock.  Once we have that, we
		 * can adjust the waiter bits and sleep queue.
		 */
		ts = turnstile_lookup(rw);

		/*
		 * Mark the rwlock as having waiters, and disable spinning.
		 * If the set fails, then we may not need to sleep and
		 * should spin again.  Reload rw_owner now that we own
		 * the turnstile chain lock.
		 */
		owner = rw->rw_owner;
		if ((owner & need_wait) == 0 ||
		    ((owner & RW_SPIN) != 0 && !cpu_softintr_p())) {
			turnstile_exit(rw);
			continue;
		}
		next = rw_cas(rw, owner, (owner | set_wait) & ~RW_SPIN);
		if (__predict_false(next != owner)) {
			turnstile_exit(rw);
			owner = next;
			continue;
		}

		LOCKSTAT_START_TIMER(lsflag, slptime);
		turnstile_block(ts, queue, rw, &rw_syncobj);
		LOCKSTAT_STOP_TIMER(lsflag, slptime);
		LOCKSTAT_COUNT(slpcnt, 1);

		/*
		 * No need for a memory barrier because of context switch.
		 * If not handed the lock, then spin again.
		 */
		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
			break;

		owner = rw->rw_owner;
	}
	KPREEMPT_ENABLE(curlwp);

	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
	LOCKSTAT_EXIT(lsflag);

	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));
	RW_LOCKED(rw, op);
}

/*
 * rw_enter:
 *
 *	The fast path for acquiring a lock that considers only the
 *	uncontended case.  Falls back to rw_vector_enter().
 */
void
rw_enter(krwlock_t *rw, const krw_t op)
{
	uintptr_t owner, incr, need_wait, curthread, next, mask;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_WANTLOCK(rw, op);

	KPREEMPT_DISABLE(l);
	mask = rw_hold_remember(rw, l);

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	/*
	 * Read the lock owner field.  If the need-to-wait
	 * indicator is clear, then try to acquire the lock.
	 */
	owner = rw->rw_owner;
	if ((owner & need_wait) == 0) {
		next = rw_cas(rw, owner, (owner + incr) & mask);
		if (__predict_true(next == owner)) {
			/* Got it! */
			KPREEMPT_ENABLE(l);
			RW_MEMBAR_ENTER();
			return;
		}
	}

	rw_vector_enter(rw, op, mask, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_vector_exit:
 *
 *	The slow path for releasing a rwlock, that considers all conditions.
 *	Marked __noinline to prevent the compiler pulling it into rw_exit().
 */
static void __noinline
rw_vector_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, kpreempt_disabled());

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/*
	 * Compute what we expect the new value of the lock to be. Only
	 * proceed to do direct handoff if there are waiters, and if the
	 * lock would become unowned.
	 */
	RW_MEMBAR_EXIT();
	for (;;) {
		newown = (owner - decr);
		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
			break;
		/* Want spinning enabled if lock is becoming free. */
		if ((newown & RW_THREAD) == 0)
			newown |= RW_SPIN;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			rw_hold_forget(rw, l);
			kpreempt_enable();
			return;
		}
		owner = next;
	}

	/*
	 * Grab the turnstile chain lock.  This gets the interlock
	 * on the sleep queue.  Once we have that, we can adjust the
	 * waiter bits.
	 */
	ts = turnstile_lookup(rw);
	owner = rw->rw_owner;
	RW_ASSERT(rw, ts != NULL);
	RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);

	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
	rcnt = TS_WAITERS(ts, TS_READER_Q);

	/*
	 * Give the lock away.
	 *
	 * If we are releasing a write lock, then prefer to wake all
	 * outstanding readers.  Otherwise, wake one writer if there
	 * are outstanding readers, or all writers if there are no
	 * pending readers.  If waking one specific writer, the writer
	 * is handed the lock here.  If waking multiple writers, we
	 * set WRITE_WANTED to block out new readers, and let them
	 * do the work of acquiring the lock in rw_vector_enter().
	 */
	if (rcnt == 0 || decr == RW_READ_INCR) {
		RW_ASSERT(rw, wcnt != 0);
		RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);

		if (rcnt != 0) {
			/* Give the lock to the longest waiting writer. */
			l = TS_FIRST(ts, TS_WRITER_Q);
			newown = (uintptr_t)l | (owner & RW_NODEBUG);
			newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
			if (wcnt > 1)
				newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			rw_hold_forget(rw, l);
			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
		} else {
			/* Wake all writers and let them fight it out. */
			newown = owner & RW_NODEBUG;
			newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			rw_hold_forget(rw, l);
			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
		}
	} else {
		RW_ASSERT(rw, rcnt != 0);

		/*
		 * Give the lock to all blocked readers.  If there
		 * is a writer waiting, new readers that arrive
		 * after the release will be blocked out.
		 */
		newown = owner & RW_NODEBUG;
		newown += rcnt << RW_READ_COUNT_SHIFT;
		if (wcnt != 0)
			newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

		/* Wake up all sleeping readers. */
		rw_swap(rw, owner, newown);
		rw_hold_forget(rw, l);
		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
	}
	kpreempt_enable();
}

/*
 * rw_exit:
 *
 *	The fast path for releasing a lock that considers only the
 *	uncontended case.  Falls back to rw_vector_exit().
 */
void
rw_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/* Now try to release it. */
	RW_MEMBAR_EXIT();
	KPREEMPT_DISABLE(l);
	newown = (owner - decr);
	if (__predict_true((newown & (RW_THREAD | RW_HAS_WAITERS)) !=
	    RW_HAS_WAITERS)) {
		/* Want spinning (re-)enabled if lock is becoming free. */
		if ((newown & RW_THREAD) == 0)
			newown |= RW_SPIN;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			rw_hold_forget(rw, l);
			KPREEMPT_ENABLE(l);
			return;
		}
	}
	rw_vector_exit(rw);
}

/*
 * rw_tryenter:
 *
 *	Try to acquire a rwlock.
 */
int
rw_tryenter(krwlock_t *rw, const krw_t op)
{
	uintptr_t curthread, owner, incr, need_wait, next, mask;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, curthread != 0);

	KPREEMPT_DISABLE(l);
	mask = rw_hold_remember(rw, l);

	if (op == RW_READER) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	for (owner = rw->rw_owner;; owner = next) {
		if (__predict_false((owner & need_wait) != 0)) {
			rw_hold_forget(rw, l);
			KPREEMPT_ENABLE(l);
			return 0;
		}
		next = rw_cas(rw, owner, (owner + incr) & mask);
		if (__predict_true(next == owner)) {
			/* Got it! */
			break;
		}
	}

	RW_WANTLOCK(rw, op);
	RW_LOCKED(rw, op);
	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));

	KPREEMPT_ENABLE(l);
	RW_MEMBAR_ENTER();
	return 1;
}

/*
 * rw_downgrade:
 *
 *	Downgrade a write lock to a read lock.
 */
void
rw_downgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
	RW_UNLOCKED(rw, RW_WRITER);
#if !defined(DIAGNOSTIC)
	__USE(curthread);
#endif

	RW_MEMBAR_PRODUCER();

	for (owner = rw->rw_owner;; owner = next) {
		/*
		 * If there are no waiters we can do this the easy way.  Try
		 * swapping us down to one read hold.  If it fails, the lock
		 * condition has changed and we most likely now have
		 * waiters.
		 */
		if ((owner & RW_HAS_WAITERS) == 0) {
			newown = (owner & RW_NODEBUG) | RW_SPIN;
			next = rw_cas(rw, owner, newown + RW_READ_INCR);
			if (__predict_true(next == owner)) {
				RW_LOCKED(rw, RW_READER);
				RW_ASSERT(rw,
				    (rw->rw_owner & RW_WRITE_LOCKED) == 0);
				RW_ASSERT(rw, RW_COUNT(rw) != 0);
				return;
			}
			continue;
		}

		/*
		 * Grab the turnstile chain lock.  This gets the interlock
		 * on the sleep queue.  Once we have that, we can adjust the
		 * waiter bits.
		 */
		ts = turnstile_lookup(rw);
		RW_ASSERT(rw, ts != NULL);

		rcnt = TS_WAITERS(ts, TS_READER_Q);
		wcnt = TS_WAITERS(ts, TS_WRITER_Q);

		if (rcnt == 0) {
			/*
			 * If there are no readers, just preserve the
			 * waiters bits, swap us down to one read hold and
			 * return.  Don't set the spin bit as nobody's
			 * running yet.
			 */
			RW_ASSERT(rw, wcnt != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);

			newown = owner & RW_NODEBUG;
			newown |= RW_READ_INCR | RW_HAS_WAITERS |
			    RW_WRITE_WANTED;
			next = rw_cas(rw, owner, newown);
			turnstile_exit(rw);
			if (__predict_true(next == owner))
				break;
		} else {
			/*
			 * Give the lock to all blocked readers.  We may
			 * retain one read hold if downgrading.  If there is
			 * a writer waiting, new readers will be blocked
			 * out.  Don't set the spin bit as nobody's running
			 * yet.
			 */
			newown = owner & RW_NODEBUG;
			newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
			if (wcnt != 0)
				newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

			next = rw_cas(rw, owner, newown);
			if (__predict_true(next == owner)) {
				/* Wake up all sleeping readers. */
				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
				break;
			}
			turnstile_exit(rw);
		}
	}

	RW_WANTLOCK(rw, RW_READER);
	RW_LOCKED(rw, RW_READER);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
	RW_ASSERT(rw, RW_COUNT(rw) != 0);
}

/*
 * rw_tryupgrade:
 *
 *	Try to upgrade a read lock to a write lock.  We must be the only
 *	reader.
 */
int
rw_tryupgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	struct lwp *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, rw_read_held(rw));

	for (owner = RW_READ_INCR;; owner = next) {
		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			RW_MEMBAR_PRODUCER();
			break;
		}
		RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
		if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
			RW_ASSERT(rw, (next & RW_THREAD) != 0);
			return 0;
		}
	}

	RW_UNLOCKED(rw, RW_READER);
	RW_WANTLOCK(rw, RW_WRITER);
	RW_LOCKED(rw, RW_WRITER);
	RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);

	return 1;
}
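
/*
 * Caller-side note on rw_tryupgrade(): on failure the read hold is
 * retained.  A minimal sketch of the usual pattern, with hypothetical
 * names (not code from this file): drop the read hold, take the write
 * lock, and revalidate anything examined under the read hold.
 *
 *	rw_enter(&data_lock, RW_READER);
 *	if (need_write && !rw_tryupgrade(&data_lock)) {
 *		rw_exit(&data_lock);
 *		rw_enter(&data_lock, RW_WRITER);
 *		recheck state: it may have changed while unlocked
 *	}
 */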

/*
 * rw_read_held:
 *
 *	Returns true if the rwlock is held for reading.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_read_held(krwlock_t *rw)
{
	uintptr_t owner;

	if (rw == NULL)
		return 0;
	owner = rw->rw_owner;
	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}

/*
 * rw_write_held:
 *
 *	Returns true if the rwlock is held for writing.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_write_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
}

/*
 * rw_lock_held:
 *
 *	Returns true if the rwlock is held for reading or writing.  Must
 *	only be used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_lock_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & RW_THREAD) != 0;
}

/*
 * rw_owner:
 *
 *	Return the current owner of an RW lock, but only if it is write
 *	held.  Used for priority inheritance.
 */
static lwp_t *
rw_owner(wchan_t obj)
{
	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
	uintptr_t owner = rw->rw_owner;

	if ((owner & RW_WRITE_LOCKED) == 0)
		return NULL;

	return (void *)(owner & RW_THREAD);
}

/*
 * rw_owner_running:
 *
 *	Return true if a RW lock is unheld, or held and the owner is running
 *	on a CPU.  For the pagedaemon only - do not document or use in other
 *	code.
 */
bool
rw_owner_running(const krwlock_t *rw)
{
	uintptr_t owner = rw->rw_owner;

	return (owner & RW_THREAD) == 0 || (owner & RW_SPIN) != 0;
}