/*	$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Kernel reader/writer lock implementation, modeled after those
 * found in Solaris, a description of which can be found in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	    Richard McDougall.
 *
 * The NetBSD implementation differs from that described in the book,
 * in that the locks are adaptive.  Lock waiters spin-wait while the lock
 * holders are on CPU (provided the holds can be tracked: up to N per
 * thread).
 *
 * While spin-waiting, threads compete for the lock without the assistance
 * of turnstiles.  If a lock holder sleeps for any reason, the lock waiters
 * will also sleep in response, and at that point turnstiles, priority
 * inheritance and strong efforts at ensuring fairness come into play.
 *
 * The adaptive behaviour is controlled by the RW_SPIN flag bit, which is
 * cleared by a lock owner that is going off the CPU, and set again by the
 * lock owner that releases the last hold on the lock.
 */
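
/*
 * For reference, a minimal sketch of the consumer-facing rwlock(9)
 * pattern this file implements ("example_lock" and "example_data" are
 * hypothetical names, used only for illustration):
 *
 *	static krwlock_t example_lock;
 *	static int example_data;
 *
 *	rw_init(&example_lock);
 *
 *	rw_enter(&example_lock, RW_READER);	(many concurrent readers)
 *	(void)example_data;
 *	rw_exit(&example_lock);
 *
 *	rw_enter(&example_lock, RW_WRITER);	(one exclusive writer)
 *	example_data++;
 *	rw_exit(&example_lock);
 *
 *	rw_destroy(&example_lock);
 */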

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $");

#include "opt_lockdebug.h"

#define	__RWLOCK_PRIVATE

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/systm.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lock.h>
#include <sys/pserialize.h>

#include <dev/lockstat.h>

/*
 * LOCKDEBUG
 */

#define	RW_DEBUG_P(rw)		(((rw)->rw_owner & RW_NODEBUG) == 0)

#define	RW_WANTLOCK(rw, op) \
    LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_LOCKED(rw, op) \
    LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_UNLOCKED(rw, op) \
    LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
        (uintptr_t)__builtin_return_address(0), op == RW_READER);

/*
 * DIAGNOSTIC
 */

#if defined(DIAGNOSTIC)
#define	RW_ASSERT(rw, cond) \
do { \
	if (__predict_false(!(cond))) \
		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define	RW_ASSERT(rw, cond)	/* nothing */
#endif	/* DIAGNOSTIC */

/*
 * Memory barriers.
 */
#ifdef __HAVE_ATOMIC_AS_MEMBAR
#define	RW_MEMBAR_ENTER()
#define	RW_MEMBAR_EXIT()
#define	RW_MEMBAR_PRODUCER()
#else
#define	RW_MEMBAR_ENTER()		membar_enter()
#define	RW_MEMBAR_EXIT()		membar_exit()
#define	RW_MEMBAR_PRODUCER()		membar_producer()
#endif

static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
static void	rw_dump(const volatile void *, lockop_printer_t);
static lwp_t	*rw_owner(wchan_t);

lockops_t rwlock_lockops = {
	.lo_name = "Reader / writer lock",
	.lo_type = LOCKOPS_SLEEP,
	.lo_dump = rw_dump,
};

syncobj_t rw_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= turnstile_unsleep,
	.sobj_changepri	= turnstile_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= rw_owner,
};

/*
 * rw_cas:
 *
 *	Do an atomic compare-and-swap on the lock word.
 */
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
	    (void *)o, (void *)n);
}
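
/*
 * For orientation, the canonical retry pattern built on rw_cas()
 * throughout this file: rw_cas() returns the value it observed in the
 * lock word, so a successful swap is detected by comparing that value
 * with the expected one ("<new value>" is a placeholder):
 *
 *	for (owner = rw->rw_owner;; owner = next) {
 *		next = rw_cas(rw, owner, <new value>);
 *		if (next == owner)
 *			break;		(the CAS took effect)
 *	}
 */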

/*
 * rw_and:
 *
 *	Do an atomic AND on the lock word.
 */
static inline void
rw_and(krwlock_t *rw, uintptr_t m)
{

#ifdef _LP64
	atomic_and_64(&rw->rw_owner, m);
#else
	atomic_and_32(&rw->rw_owner, m);
#endif
}

/*
 * rw_swap:
 *
 *	Do an atomic swap of the lock word.  This is used only when it's
 *	known that the lock word is set up such that it can't be changed
 *	behind us (assert this), so there's no point considering the result.
 */
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
	    (void *)n);

	RW_ASSERT(rw, n == o);
	RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}

/*
 * rw_hold_remember:
 *
 *	Helper - when acquiring a lock, record the new hold.
 */
static inline uintptr_t
rw_hold_remember(krwlock_t *rw, lwp_t *l)
{
	int i;

	KASSERT(kpreempt_disabled());

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (__predict_true(l->l_rwlocks[i] == NULL)) {
			l->l_rwlocks[i] = rw;
			/*
			 * Clear the write wanted flag on every acquire to
			 * give readers a chance once again.
			 */
			return ~RW_WRITE_WANTED;
		}
	}

	/*
	 * Nowhere to track the hold so we lose: temporarily disable
	 * spinning on the lock.
	 */
	return ~(RW_WRITE_WANTED | RW_SPIN);
}
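
/*
 * A worked example of how the mask returned above is consumed (the bit
 * values are illustrative, not the literal definitions).  The acquire
 * paths apply it as "(owner + incr) & mask", so with a trackable hold:
 *
 *	owner = <two read holds> | RW_WRITE_WANTED
 *	next  = (owner + RW_READ_INCR) & ~RW_WRITE_WANTED
 *	      = <three read holds>		(write wanted bit dropped)
 *
 * With an untrackable hold the mask also strips RW_SPIN, so other
 * threads stop spin-waiting on a hold that rw_switch() can't see.
 */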

/*
 * rw_hold_forget:
 *
 *	Helper - when releasing a lock, stop tracking the hold.
 */
static inline void
rw_hold_forget(krwlock_t *rw, lwp_t *l)
{
	int i;

	KASSERT(kpreempt_disabled());

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (__predict_true(l->l_rwlocks[i] == rw)) {
			l->l_rwlocks[i] = NULL;
			return;
		}
	}
}

/*
 * rw_switch:
 *
 *	Called by mi_switch() to indicate that an LWP is going off the CPU.
 */
void
rw_switch(void)
{
	lwp_t *l = curlwp;
	int i;

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (l->l_rwlocks[i] != NULL) {
			rw_and(l->l_rwlocks[i], ~RW_SPIN);
			/* Leave in place for exit to clear. */
		}
	}
}

/*
 * rw_dump:
 *
 *	Dump the contents of a rwlock structure.
 */
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
	const volatile krwlock_t *rw = cookie;

	pr("owner/count  : %#018lx flags    : %#018x\n",
	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}

/*
 * rw_abort:
 *
 *	Dump information about an error and panic the system.  This
 *	generates a lot of machine code in the DIAGNOSTIC case, so
 *	we ask the compiler to not inline it.
 */
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{

	if (panicstr != NULL)
		return;

	LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}

/*
 * rw_init:
 *
 *	Initialize a rwlock for use.
 */
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{

	if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
		rw->rw_owner = RW_SPIN;
	else
		rw->rw_owner = RW_SPIN | RW_NODEBUG;
}

void
rw_init(krwlock_t *rw)
{

	_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_destroy:
 *
 *	Tear down a rwlock.
 */
void
rw_destroy(krwlock_t *rw)
{

	RW_ASSERT(rw, (rw->rw_owner & ~(RW_NODEBUG | RW_SPIN)) == 0);
	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}

/*
 * rw_vector_enter:
 *
 *	The slow path for acquiring a rwlock, which considers all conditions.
 *	Marked __noinline to prevent the compiler pulling it into rw_enter().
 */
static void __noinline
rw_vector_enter(krwlock_t *rw, const krw_t op, uintptr_t mask, uintptr_t ra)
{
	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
	turnstile_t *ts;
	int queue;
	lwp_t *l;
	LOCKSTAT_TIMER(slptime);
	LOCKSTAT_TIMER(slpcnt);
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_COUNTER(spincnt);
	LOCKSTAT_FLAG(lsflag);

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, kpreempt_disabled());
	RW_WANTLOCK(rw, op);

	if (panicstr == NULL) {
		KDASSERT(pserialize_not_in_read_section());
		LOCKDEBUG_BARRIER(&kernel_lock, 1);
	}

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		set_wait = RW_HAS_WAITERS;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
		queue = TS_READER_Q;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
		queue = TS_WRITER_Q;
	}
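
	/*
	 * Illustrative arithmetic for the trick above, starting from an
	 * unowned lock word of 0 (layout simplified):
	 *
	 *	reader:	0 + RW_READ_INCR
	 *		-> a read count of one
	 *	writer:	0 + (curthread | RW_WRITE_LOCKED)
	 *		-> owner pointer plus the write-locked bit
	 *
	 * The release paths subtract exactly the same quantity, so one
	 * add/subtract pair serves both lock types.
	 */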

	LOCKSTAT_ENTER(lsflag);

	for (owner = rw->rw_owner;;) {
		/*
		 * Read the lock owner field.  If the need-to-wait
		 * indicator is clear, then try to acquire the lock.
		 */
		if ((owner & need_wait) == 0) {
			next = rw_cas(rw, owner, (owner + incr) & mask);
			if (__predict_true(next == owner)) {
				/* Got it! */
				RW_MEMBAR_ENTER();
				break;
			}

			/*
			 * Didn't get it -- spin around again (we'll
			 * probably sleep on the next iteration).
			 */
			owner = next;
			continue;
		}
		if (__predict_false(RW_OWNER(rw) == curthread)) {
			rw_abort(__func__, __LINE__, rw,
			    "locking against myself");
		}

		/*
		 * If the lock owner is running on another CPU, and there
		 * are no existing waiters, then spin.  Notes:
		 *
		 * 1) If an LWP on this CPU (possibly curlwp, or an LWP that
		 * curlwp has interrupted) holds kernel_lock, we can't spin
		 * without a deadlock.  The CPU that holds the rwlock may be
		 * blocked trying to acquire kernel_lock, or there may be an
		 * unseen chain of dependent locks.  To defeat the potential
		 * deadlock, this LWP needs to sleep (and thereby directly
		 * drop the kernel_lock, or permit the interrupted LWP that
		 * holds kernel_lock to complete its work).
		 *
		 * 2) If trying to acquire a write lock, and the lock is
		 * currently read held, after a brief wait set the write
		 * wanted bit to block out new readers and try to avoid
		 * starvation.  When the hold is acquired, we'll clear the
		 * WRITE_WANTED flag to give readers a chance again.  With
		 * luck this should nudge things in the direction of
		 * interleaving readers and writers when there is high
		 * contention.
		 *
		 * 3) The spin wait can't be done in soft interrupt context,
		 * because a lock holder could be pinned down underneath the
		 * soft interrupt LWP (i.e. curlwp) on the same CPU.  For
		 * the lock holder to make progress and release the lock,
		 * the soft interrupt needs to sleep.
		 */
		if ((owner & RW_SPIN) != 0 && !cpu_softintr_p()) {
			LOCKSTAT_START_TIMER(lsflag, spintime);
			u_int count = SPINLOCK_BACKOFF_MIN;
			do {
				KPREEMPT_ENABLE(curlwp);
				SPINLOCK_BACKOFF(count);
				KPREEMPT_DISABLE(curlwp);
				owner = rw->rw_owner;
				if ((owner & need_wait) == 0)
					break;
				if (count != SPINLOCK_BACKOFF_MAX)
					continue;
				if (curcpu()->ci_biglock_count != 0)
					break;
				if (op == RW_WRITER &&
				    (owner & RW_WRITE_LOCKED) == 0 &&
				    (owner & RW_WRITE_WANTED) == 0) {
					(void)rw_cas(rw, owner,
					    owner | RW_WRITE_WANTED);
				}
			} while ((owner & RW_SPIN) != 0);
			LOCKSTAT_STOP_TIMER(lsflag, spintime);
			LOCKSTAT_COUNT(spincnt, 1);
			if ((owner & need_wait) == 0)
				continue;
		}

		/*
		 * Grab the turnstile chain lock.  Once we have that, we
		 * can adjust the waiter bits and sleep queue.
		 */
		ts = turnstile_lookup(rw);

		/*
		 * Mark the rwlock as having waiters, and disable spinning.
		 * If the set fails, then we may not need to sleep and
		 * should spin again.  Reload rw_owner now that we own
		 * the turnstile chain lock.
		 */
		owner = rw->rw_owner;
		if ((owner & need_wait) == 0 ||
		    ((owner & RW_SPIN) != 0 && !cpu_softintr_p())) {
			turnstile_exit(rw);
			continue;
		}
		next = rw_cas(rw, owner, (owner | set_wait) & ~RW_SPIN);
		if (__predict_false(next != owner)) {
			turnstile_exit(rw);
			owner = next;
			continue;
		}

		LOCKSTAT_START_TIMER(lsflag, slptime);
		turnstile_block(ts, queue, rw, &rw_syncobj);
		LOCKSTAT_STOP_TIMER(lsflag, slptime);
		LOCKSTAT_COUNT(slpcnt, 1);

		/*
		 * No need for a memory barrier because of context switch.
		 * If not handed the lock, then spin again.
		 */
		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
			break;

		owner = rw->rw_owner;
	}
	KPREEMPT_ENABLE(curlwp);

	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
	LOCKSTAT_EXIT(lsflag);

	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));
	RW_LOCKED(rw, op);
}

/*
 * rw_enter:
 *
 *	The fast path for acquiring a lock that considers only the
 *	uncontended case.  Falls back to rw_vector_enter().
 */
void
rw_enter(krwlock_t *rw, const krw_t op)
{
	uintptr_t owner, incr, need_wait, curthread, next, mask;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_WANTLOCK(rw, op);

	KPREEMPT_DISABLE(l);
	mask = rw_hold_remember(rw, l);

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	/*
	 * Read the lock owner field.  If the need-to-wait
	 * indicator is clear, then try to acquire the lock.
	 */
	owner = rw->rw_owner;
	if ((owner & need_wait) == 0) {
		next = rw_cas(rw, owner, (owner + incr) & mask);
		if (__predict_true(next == owner)) {
			/* Got it! */
			KPREEMPT_ENABLE(l);
			RW_MEMBAR_ENTER();
			return;
		}
	}

	rw_vector_enter(rw, op, mask, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_vector_exit:
 *
 *	The slow path for releasing a rwlock, which considers all conditions.
 *	Marked __noinline to prevent the compiler pulling it into rw_exit().
 */
static void __noinline
rw_vector_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, kpreempt_disabled());

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/*
	 * Compute what we expect the new value of the lock to be. Only
	 * proceed to do direct handoff if there are waiters, and if the
	 * lock would become unowned.
	 */
	RW_MEMBAR_EXIT();
	for (;;) {
		newown = (owner - decr);
		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
			break;
		/* Want spinning enabled if lock is becoming free. */
		if ((newown & RW_THREAD) == 0)
			newown |= RW_SPIN;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			rw_hold_forget(rw, l);
			kpreempt_enable();
			return;
		}
		owner = next;
	}

	/*
	 * Grab the turnstile chain lock.  This gets the interlock
	 * on the sleep queue.  Once we have that, we can adjust the
	 * waiter bits.
	 */
	ts = turnstile_lookup(rw);
	owner = rw->rw_owner;
	RW_ASSERT(rw, ts != NULL);
	RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);

	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
	rcnt = TS_WAITERS(ts, TS_READER_Q);

	/*
	 * Give the lock away.
	 *
	 * If we are releasing a write lock, then prefer to wake all
	 * outstanding readers.  Otherwise, wake one writer if there
	 * are outstanding readers, or all writers if there are no
	 * pending readers.  If waking one specific writer, the writer
	 * is handed the lock here.  If waking multiple writers, we
	 * set WRITE_WANTED to block out new readers, and let them
	 * do the work of acquiring the lock in rw_vector_enter().
	 */
	if (rcnt == 0 || decr == RW_READ_INCR) {
		RW_ASSERT(rw, wcnt != 0);
		RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);

		if (rcnt != 0) {
			/* Give the lock to the longest waiting writer. */
			l = TS_FIRST(ts, TS_WRITER_Q);
			newown = (uintptr_t)l | (owner & RW_NODEBUG);
			newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
			if (wcnt > 1)
				newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			/* Forget our own hold; "l" now names the new owner. */
			rw_hold_forget(rw, curlwp);
			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
		} else {
			/* Wake all writers and let them fight it out. */
			newown = owner & RW_NODEBUG;
			newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			rw_hold_forget(rw, l);
			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
		}
	} else {
		RW_ASSERT(rw, rcnt != 0);

		/*
		 * Give the lock to all blocked readers.  If there
		 * is a writer waiting, new readers that arrive
		 * after the release will be blocked out.
		 */
		newown = owner & RW_NODEBUG;
		newown += rcnt << RW_READ_COUNT_SHIFT;
		if (wcnt != 0)
			newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

		/* Wake up all sleeping readers. */
		rw_swap(rw, owner, newown);
		rw_hold_forget(rw, l);
		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
	}
	kpreempt_enable();
}

/*
 * rw_exit:
 *
 *	The fast path for releasing a lock that considers only the
 *	uncontended case.  Falls back to rw_vector_exit().
 */
void
rw_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/* Now try to release it. */
	RW_MEMBAR_EXIT();
	KPREEMPT_DISABLE(l);
	newown = (owner - decr);
	if (__predict_true((newown & (RW_THREAD | RW_HAS_WAITERS)) !=
	    RW_HAS_WAITERS)) {
		/* Want spinning (re-)enabled if lock is becoming free. */
		if ((newown & RW_THREAD) == 0)
			newown |= RW_SPIN;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			rw_hold_forget(rw, l);
			KPREEMPT_ENABLE(l);
			return;
		}
	}
	rw_vector_exit(rw);
}

/*
 * rw_tryenter:
 *
 *	Try to acquire a rwlock.
 */
int
rw_tryenter(krwlock_t *rw, const krw_t op)
{
	uintptr_t curthread, owner, incr, need_wait, next, mask;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, curthread != 0);

	KPREEMPT_DISABLE(l);
	mask = rw_hold_remember(rw, l);

	if (op == RW_READER) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	for (owner = rw->rw_owner;; owner = next) {
		if (__predict_false((owner & need_wait) != 0)) {
			rw_hold_forget(rw, l);
			KPREEMPT_ENABLE(l);
			return 0;
		}
		next = rw_cas(rw, owner, (owner + incr) & mask);
		if (__predict_true(next == owner)) {
			/* Got it! */
			break;
		}
	}

	RW_WANTLOCK(rw, op);
	RW_LOCKED(rw, op);
	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));

	KPREEMPT_ENABLE(l);
	RW_MEMBAR_ENTER();
	return 1;
}
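
/*
 * A minimal usage sketch for rw_tryenter() (the lock and data names
 * are hypothetical, for illustration only):
 *
 *	if (rw_tryenter(&example_lock, RW_WRITER)) {
 *		example_data++;
 *		rw_exit(&example_lock);
 *	} else {
 *		(defer the work, or block with rw_enter())
 *	}
 */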

/*
 * rw_downgrade:
 *
 *	Downgrade a write lock to a read lock.
 */
void
rw_downgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
	RW_UNLOCKED(rw, RW_WRITER);
#if !defined(DIAGNOSTIC)
	__USE(curthread);
#endif

	RW_MEMBAR_PRODUCER();

	for (owner = rw->rw_owner;; owner = next) {
		/*
		 * If there are no waiters we can do this the easy way.  Try
		 * swapping us down to one read hold.  If it fails, the lock
		 * condition has changed and we most likely now have
		 * waiters.
		 */
		if ((owner & RW_HAS_WAITERS) == 0) {
			newown = (owner & RW_NODEBUG) | RW_SPIN;
			next = rw_cas(rw, owner, newown + RW_READ_INCR);
			if (__predict_true(next == owner)) {
				RW_LOCKED(rw, RW_READER);
				RW_ASSERT(rw,
				    (rw->rw_owner & RW_WRITE_LOCKED) == 0);
				RW_ASSERT(rw, RW_COUNT(rw) != 0);
				return;
			}
			continue;
		}

		/*
		 * Grab the turnstile chain lock.  This gets the interlock
		 * on the sleep queue.  Once we have that, we can adjust the
		 * waiter bits.
		 */
		ts = turnstile_lookup(rw);
		RW_ASSERT(rw, ts != NULL);

		rcnt = TS_WAITERS(ts, TS_READER_Q);
		wcnt = TS_WAITERS(ts, TS_WRITER_Q);

		if (rcnt == 0) {
			/*
			 * If there are no readers, just preserve the
			 * waiters bits, swap us down to one read hold and
			 * return.  Don't set the spin bit as nobody's
			 * running yet.
			 */
			RW_ASSERT(rw, wcnt != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);

			newown = owner & RW_NODEBUG;
			newown |= RW_READ_INCR | RW_HAS_WAITERS |
			    RW_WRITE_WANTED;
			next = rw_cas(rw, owner, newown);
			turnstile_exit(rw);
			if (__predict_true(next == owner))
				break;
		} else {
			/*
			 * Give the lock to all blocked readers.  We may
			 * retain one read hold if downgrading.  If there is
			 * a writer waiting, new readers will be blocked
			 * out.  Don't set the spin bit as nobody's running
			 * yet.
			 */
			newown = owner & RW_NODEBUG;
			newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
			if (wcnt != 0)
				newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

			next = rw_cas(rw, owner, newown);
			if (__predict_true(next == owner)) {
				/* Wake up all sleeping readers. */
				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
				break;
			}
			turnstile_exit(rw);
		}
	}

	RW_WANTLOCK(rw, RW_READER);
	RW_LOCKED(rw, RW_READER);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
	RW_ASSERT(rw, RW_COUNT(rw) != 0);
}

/*
 * rw_tryupgrade:
 *
 *	Try to upgrade a read lock to a write lock.  We must be the only
 *	reader.
 */
int
rw_tryupgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	struct lwp *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, rw_read_held(rw));

	for (owner = RW_READ_INCR;; owner = next) {
		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			RW_MEMBAR_PRODUCER();
			break;
		}
		RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
		if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
			RW_ASSERT(rw, (next & RW_THREAD) != 0);
			return 0;
		}
	}

	RW_UNLOCKED(rw, RW_READER);
	RW_WANTLOCK(rw, RW_WRITER);
	RW_LOCKED(rw, RW_WRITER);
	RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);

	return 1;
}
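
/*
 * A sketch of the optimistic upgrade pattern that rw_tryupgrade() and
 * rw_downgrade() support (names hypothetical).  rw_tryupgrade() fails
 * whenever another reader holds the lock, so a fallback is mandatory:
 *
 *	rw_enter(&example_lock, RW_READER);
 *	if (update_needed && !rw_tryupgrade(&example_lock)) {
 *		rw_exit(&example_lock);
 *		rw_enter(&example_lock, RW_WRITER);
 *		(re-validate: the state may have changed while unlocked)
 *	}
 *	...
 *	rw_downgrade(&example_lock);	(keep reading without blocking
 *					other readers)
 */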

/*
 * rw_read_held:
 *
 *	Returns true if the rwlock is held for reading.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_read_held(krwlock_t *rw)
{
	uintptr_t owner;

	if (rw == NULL)
		return 0;
	owner = rw->rw_owner;
	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}

/*
 * rw_write_held:
 *
 *	Returns true if the rwlock is held for writing.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_write_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
}

/*
 * rw_lock_held:
 *
 *	Returns true if the rwlock is held for reading or writing.  Must
 *	only be used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_lock_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & RW_THREAD) != 0;
}
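
/*
 * The three predicates above are intended only for assertions, e.g.
 *
 *	KASSERT(rw_write_held(&example_lock));
 *
 * (example_lock is hypothetical).  Using them to decide whether to
 * take or drop a lock would be inherently racy.
 */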

/*
 * rw_owner:
 *
 *	Return the current owner of an RW lock, but only if it is write
 *	held.  Used for priority inheritance.
 */
static lwp_t *
rw_owner(wchan_t obj)
{
	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
	uintptr_t owner = rw->rw_owner;

	if ((owner & RW_WRITE_LOCKED) == 0)
		return NULL;

	return (void *)(owner & RW_THREAD);
}

/*
 * rw_owner_running:
 *
 *	Return true if a RW lock is unheld, or held and the owner is running
 *	on a CPU.  For the pagedaemon only - do not document or use in other
 *	code.
 */
bool
rw_owner_running(const krwlock_t *rw)
{
	uintptr_t owner = rw->rw_owner;

	return (owner & RW_THREAD) == 0 || (owner & RW_SPIN) != 0;
}
   1018  1.59.2.3        ad }
   1019