/*	$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Kernel reader/writer lock implementation, modeled after those
 * found in Solaris, a description of which can be found in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	Richard McDougall.
 *
 * The NetBSD implementation is different from that described in the book,
 * in that the locks are adaptive.  Lock waiters spin wait while the lock
 * holders are on CPU (if the holds can be tracked: up to N per-thread).
 *
 * While spin waiting, threads compete for the lock without the assistance
 * of turnstiles.  If a lock holder sleeps for any reason, the lock waiters
 * will also sleep in response and at that point turnstiles, priority
 * inheritance and strong efforts at ensuring fairness come into play.
 *
 * The adaptive behaviour is controlled by the RW_SPIN flag bit, which is
 * cleared by a lock owner that is going off the CPU, and set again by the
 * lock owner that releases the last hold on the lock.
 */
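
/*
 * Illustrative usage sketch (comment only, not part of the implementation).
 * It shows the public API defined below against a hypothetical consumer
 * lock; the name "example_lock" and the surrounding code are assumptions:
 *
 *	krwlock_t example_lock;
 *
 *	rw_init(&example_lock);
 *
 *	rw_enter(&example_lock, RW_READER);	// shared: many readers
 *	... read the protected data ...
 *	rw_exit(&example_lock);
 *
 *	rw_enter(&example_lock, RW_WRITER);	// exclusive: single writer
 *	... modify the protected data ...
 *	rw_exit(&example_lock);
 *
 *	rw_destroy(&example_lock);
 */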

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.59.2.3 2020/01/19 21:08:29 ad Exp $");

#include "opt_lockdebug.h"

#define	__RWLOCK_PRIVATE

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/systm.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lock.h>
#include <sys/pserialize.h>

#include <dev/lockstat.h>

/*
 * LOCKDEBUG
 */

#define	RW_DEBUG_P(rw)		(((rw)->rw_owner & RW_NODEBUG) == 0)

#define	RW_WANTLOCK(rw, op)						\
	LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw),			\
	    (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_LOCKED(rw, op)						\
	LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL,			\
	    (uintptr_t)__builtin_return_address(0), op == RW_READER);
#define	RW_UNLOCKED(rw, op)						\
	LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw),			\
	    (uintptr_t)__builtin_return_address(0), op == RW_READER);

/*
 * DIAGNOSTIC
 */

#if defined(DIAGNOSTIC)
#define	RW_ASSERT(rw, cond) \
do { \
	if (__predict_false(!(cond))) \
		rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define	RW_ASSERT(rw, cond)	/* nothing */
#endif	/* DIAGNOSTIC */

/*
 * Memory barriers.
 */
#ifdef __HAVE_ATOMIC_AS_MEMBAR
#define	RW_MEMBAR_ENTER()
#define	RW_MEMBAR_EXIT()
#define	RW_MEMBAR_PRODUCER()
#else
#define	RW_MEMBAR_ENTER()		membar_enter()
#define	RW_MEMBAR_EXIT()		membar_exit()
#define	RW_MEMBAR_PRODUCER()		membar_producer()
#endif

static void	rw_abort(const char *, size_t, krwlock_t *, const char *);
static void	rw_dump(const volatile void *, lockop_printer_t);
static lwp_t	*rw_owner(wchan_t);

lockops_t rwlock_lockops = {
	.lo_name = "Reader / writer lock",
	.lo_type = LOCKOPS_SLEEP,
	.lo_dump = rw_dump,
};

syncobj_t rw_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= turnstile_unsleep,
	.sobj_changepri	= turnstile_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= rw_owner,
};

/*
 * rw_cas:
 *
 *	Do an atomic compare-and-swap on the lock word.
 */
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
	    (void *)o, (void *)n);
}

/*
 * rw_and:
 *
 *	Do an atomic AND on the lock word.
 */
static inline void
rw_and(krwlock_t *rw, uintptr_t m)
{

#ifdef _LP64
	atomic_and_64(&rw->rw_owner, m);
#else
	atomic_and_32(&rw->rw_owner, m);
#endif
}

/*
 * rw_swap:
 *
 *	Do an atomic swap of the lock word.  This is used only when it's
 *	known that the lock word is set up such that it can't be changed
 *	behind us (assert this), so there's no point considering the result.
 */
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{

	n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
	    (void *)n);

	RW_ASSERT(rw, n == o);
	RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}

/*
 * rw_hold_remember:
 *
 *	Helper - when acquiring a lock, record the new hold.
 */
static inline uintptr_t
rw_hold_remember(krwlock_t *rw, lwp_t *l)
{
	int i;

	KASSERT(kpreempt_disabled());

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (__predict_true(l->l_rwlocks[i] == NULL)) {
			l->l_rwlocks[i] = rw;
			/*
			 * Clear the write wanted flag on every acquire to
			 * give readers a chance once again.
			 */
			return ~RW_WRITE_WANTED;
		}
	}

	/*
	 * Nowhere to track the hold so we lose: temporarily disable
	 * spinning on the lock.
	 */
	return ~(RW_WRITE_WANTED | RW_SPIN);
}
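
/*
 * Note (illustrative): callers use the value returned above as a mask on
 * the lock word when they CAS it, e.g. "rw_cas(rw, owner, (owner + incr)
 * & mask)" in rw_enter() and rw_tryenter(), so a successful acquire also
 * clears RW_WRITE_WANTED (and RW_SPIN when the hold could not be tracked).
 */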

/*
 * rw_hold_forget:
 *
 *	Helper - when releasing a lock, stop tracking the hold.
 */
static inline void
rw_hold_forget(krwlock_t *rw, lwp_t *l)
{
	int i;

	KASSERT(kpreempt_disabled());

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (__predict_true(l->l_rwlocks[i] == rw)) {
			l->l_rwlocks[i] = NULL;
			return;
		}
	}
}

/*
 * rw_switch:
 *
 *	Called by mi_switch() to indicate that an LWP is going off the CPU.
 */
void
rw_switch(void)
{
	lwp_t *l = curlwp;
	int i;

	for (i = 0; i < __arraycount(l->l_rwlocks); i++) {
		if (l->l_rwlocks[i] != NULL) {
			rw_and(l->l_rwlocks[i], ~RW_SPIN);
			/* Leave in place for exit to clear. */
		}
	}
}

/*
 * rw_dump:
 *
 *	Dump the contents of a rwlock structure.
 */
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
	const volatile krwlock_t *rw = cookie;

	pr("owner/count  : %#018lx flags        : %#018x\n",
	    (long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}

/*
 * rw_abort:
 *
 *	Dump information about an error and panic the system.  This
 *	generates a lot of machine code in the DIAGNOSTIC case, so
 *	we ask the compiler to not inline it.
 */
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{

	if (panicstr != NULL)
		return;

	LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}

/*
 * rw_init:
 *
 *	Initialize a rwlock for use.
 */
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{

	if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
		rw->rw_owner = RW_SPIN;
	else
		rw->rw_owner = RW_SPIN | RW_NODEBUG;
}

void
rw_init(krwlock_t *rw)
{

	_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_destroy:
 *
 *	Tear down a rwlock.
 */
void
rw_destroy(krwlock_t *rw)
{

	RW_ASSERT(rw, (rw->rw_owner & ~(RW_NODEBUG | RW_SPIN)) == 0);
	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}

/*
 * rw_vector_enter:
 *
 *	The slow path for acquiring a rwlock, that considers all conditions.
 *	Marked __noinline to prevent the compiler pulling it into rw_enter().
 */
static void __noinline
rw_vector_enter(krwlock_t *rw, const krw_t op, uintptr_t mask, uintptr_t ra)
{
	uintptr_t owner, incr, need_wait, set_wait, curthread, next;
	turnstile_t *ts;
	int queue;
	lwp_t *l;
	LOCKSTAT_TIMER(slptime);
	LOCKSTAT_COUNTER(slpcnt);
	LOCKSTAT_TIMER(spintime);
	LOCKSTAT_COUNTER(spincnt);
	LOCKSTAT_FLAG(lsflag);

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, kpreempt_disabled());
	RW_WANTLOCK(rw, op);

	if (panicstr == NULL) {
		KDASSERT(pserialize_not_in_read_section());
		LOCKDEBUG_BARRIER(&kernel_lock, 1);
	}

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
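	/*
	 * Worked example (illustrative only): with the owner/count bits
	 * all zero, a reader's CAS below adds RW_READ_INCR (making the
	 * read count one), while a writer's CAS adds
	 * curthread | RW_WRITE_LOCKED (installing itself as owner).  The
	 * single "owner + incr" expression therefore serves both cases.
	 */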
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		set_wait = RW_HAS_WAITERS;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
		queue = TS_READER_Q;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
		queue = TS_WRITER_Q;
	}

	LOCKSTAT_ENTER(lsflag);

	for (owner = rw->rw_owner;;) {
		/*
		 * Read the lock owner field.  If the need-to-wait
		 * indicator is clear, then try to acquire the lock.
		 */
		if ((owner & need_wait) == 0) {
			next = rw_cas(rw, owner, (owner + incr) & mask);
			if (__predict_true(next == owner)) {
				/* Got it! */
				RW_MEMBAR_ENTER();
				break;
			}

			/*
			 * Didn't get it -- spin around again (we'll
			 * probably sleep on the next iteration).
			 */
			owner = next;
			continue;
		}
		if (__predict_false(RW_OWNER(rw) == curthread)) {
			rw_abort(__func__, __LINE__, rw,
			    "locking against myself");
		}

		/*
		 * If the lock owner is running on another CPU, and there
		 * are no existing waiters, then spin.  Notes:
		 *
		 * 1) If an LWP on this CPU (possibly curlwp, or an LWP that
		 * curlwp has interrupted) holds kernel_lock, we can't spin
		 * without a deadlock.  The CPU that holds the rwlock may be
		 * blocked trying to acquire kernel_lock, or there may be an
		 * unseen chain of dependent locks.  To defeat the potential
		 * deadlock, this LWP needs to sleep (and thereby directly
		 * drop the kernel_lock, or permit the interrupted LWP that
		 * holds kernel_lock to complete its work).
		 *
		 * 2) If trying to acquire a write lock, and the lock is
		 * currently read held, after a brief wait set the write
		 * wanted bit to block out new readers and try to avoid
		 * starvation.  When the hold is acquired, we'll clear the
		 * WRITE_WANTED flag to give readers a chance again.  With
		 * luck this should nudge things in the direction of
		 * interleaving readers and writers when there is high
		 * contention.
		 *
		 * 3) The spin wait can't be done in soft interrupt context,
		 * because a lock holder could be pinned down underneath the
		 * soft interrupt LWP (i.e. curlwp) on the same CPU.  For
		 * the lock holder to make progress and release the lock,
		 * the soft interrupt needs to sleep.
		 */
		if ((owner & RW_SPIN) != 0 && !cpu_softintr_p()) {
			LOCKSTAT_START_TIMER(lsflag, spintime);
			u_int count = SPINLOCK_BACKOFF_MIN;
			do {
				KPREEMPT_ENABLE(curlwp);
				SPINLOCK_BACKOFF(count);
				KPREEMPT_DISABLE(curlwp);
				owner = rw->rw_owner;
				if ((owner & need_wait) == 0)
					break;
				if (count != SPINLOCK_BACKOFF_MAX)
					continue;
				if (curcpu()->ci_biglock_count != 0)
					break;
				if (op == RW_WRITER &&
				    (owner & RW_WRITE_LOCKED) == 0 &&
				    (owner & RW_WRITE_WANTED) == 0) {
					(void)rw_cas(rw, owner,
					    owner | RW_WRITE_WANTED);
				}
			} while ((owner & RW_SPIN) != 0);
			LOCKSTAT_STOP_TIMER(lsflag, spintime);
			LOCKSTAT_COUNT(spincnt, 1);
			if ((owner & need_wait) == 0)
				continue;
		}

		/*
		 * Grab the turnstile chain lock.  Once we have that, we
		 * can adjust the waiter bits and sleep queue.
		 */
		ts = turnstile_lookup(rw);

		/*
		 * Mark the rwlock as having waiters, and disable spinning.
		 * If the set fails, then we may not need to sleep and
		 * should spin again.  Reload rw_owner now that we own
		 * the turnstile chain lock.
		 */
		owner = rw->rw_owner;
		if ((owner & need_wait) == 0 ||
		    ((owner & RW_SPIN) != 0 && !cpu_softintr_p())) {
			turnstile_exit(rw);
			continue;
		}
		next = rw_cas(rw, owner, (owner | set_wait) & ~RW_SPIN);
		if (__predict_false(next != owner)) {
			turnstile_exit(rw);
			owner = next;
			continue;
		}

		LOCKSTAT_START_TIMER(lsflag, slptime);
		turnstile_block(ts, queue, rw, &rw_syncobj);
		LOCKSTAT_STOP_TIMER(lsflag, slptime);
		LOCKSTAT_COUNT(slpcnt, 1);

		/*
		 * No need for a memory barrier because of context switch.
		 * If not handed the lock, then spin again.
		 */
		if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
			break;

		owner = rw->rw_owner;
	}
	KPREEMPT_ENABLE(curlwp);

	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
	    (op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
	    (l->l_rwcallsite != 0 ? l->l_rwcallsite : ra));
	LOCKSTAT_EXIT(lsflag);

	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));
	RW_LOCKED(rw, op);
}

/*
 * rw_enter:
 *
 *	The fast path for acquiring a lock that considers only the
 *	uncontended case.  Falls back to rw_vector_enter().
 */
void
rw_enter(krwlock_t *rw, const krw_t op)
{
	uintptr_t owner, incr, need_wait, curthread, next, mask;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_WANTLOCK(rw, op);

	KPREEMPT_DISABLE(l);
	mask = rw_hold_remember(rw, l);

	/*
	 * We play a slight trick here.  If we're a reader, we want to
	 * increment the read count.  If we're a writer, we want to
	 * set the owner field and the WRITE_LOCKED bit.
	 *
	 * In the latter case, we expect those bits to be zero,
	 * therefore we can use an add operation to set them, which
	 * means an add operation for both cases.
	 */
	if (__predict_true(op == RW_READER)) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	/*
	 * Read the lock owner field.  If the need-to-wait
	 * indicator is clear, then try to acquire the lock.
	 */
	owner = rw->rw_owner;
	if ((owner & need_wait) == 0) {
		next = rw_cas(rw, owner, (owner + incr) & mask);
		if (__predict_true(next == owner)) {
			/* Got it! */
			KPREEMPT_ENABLE(l);
			RW_MEMBAR_ENTER();
			return;
		}
	}

	rw_vector_enter(rw, op, mask, (uintptr_t)__builtin_return_address(0));
}

/*
 * rw_vector_exit:
 *
 *	The slow path for releasing a rwlock, that considers all conditions.
 *	Marked __noinline to prevent the compiler pulling it into rw_exit().
 */
static void __noinline
rw_vector_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, kpreempt_disabled());

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/*
	 * Compute what we expect the new value of the lock to be.  Only
	 * proceed to do direct handoff if there are waiters, and if the
	 * lock would become unowned.
	 */
	RW_MEMBAR_EXIT();
	for (;;) {
		newown = (owner - decr);
		if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
			break;
		/* Want spinning enabled if lock is becoming free. */
		if ((newown & RW_THREAD) == 0)
			newown |= RW_SPIN;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			rw_hold_forget(rw, l);
			kpreempt_enable();
			return;
		}
		owner = next;
	}

	/*
	 * Grab the turnstile chain lock.  This gets the interlock
	 * on the sleep queue.  Once we have that, we can adjust the
	 * waiter bits.
	 */
	ts = turnstile_lookup(rw);
	owner = rw->rw_owner;
	RW_ASSERT(rw, ts != NULL);
	RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);

	wcnt = TS_WAITERS(ts, TS_WRITER_Q);
	rcnt = TS_WAITERS(ts, TS_READER_Q);

	/*
	 * Give the lock away.
	 *
	 * If we are releasing a write lock, then prefer to wake all
	 * outstanding readers.  Otherwise, wake one writer if there
	 * are outstanding readers, or all writers if there are no
	 * pending readers.  If waking one specific writer, the writer
	 * is handed the lock here.  If waking multiple writers, we
	 * set WRITE_WANTED to block out new readers, and let them
	 * do the work of acquiring the lock in rw_vector_enter().
	 */
	if (rcnt == 0 || decr == RW_READ_INCR) {
		RW_ASSERT(rw, wcnt != 0);
		RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);

		if (rcnt != 0) {
			/* Give the lock to the longest waiting writer. */
			l = TS_FIRST(ts, TS_WRITER_Q);
			newown = (uintptr_t)l | (owner & RW_NODEBUG);
			newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
			if (wcnt > 1)
				newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			rw_hold_forget(rw, l);
			turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
		} else {
			/* Wake all writers and let them fight it out. */
			newown = owner & RW_NODEBUG;
			newown |= RW_WRITE_WANTED;
			rw_swap(rw, owner, newown);
			rw_hold_forget(rw, l);
			turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
		}
	} else {
		RW_ASSERT(rw, rcnt != 0);

		/*
		 * Give the lock to all blocked readers.  If there
		 * is a writer waiting, new readers that arrive
		 * after the release will be blocked out.
		 */
		newown = owner & RW_NODEBUG;
		newown += rcnt << RW_READ_COUNT_SHIFT;
		if (wcnt != 0)
			newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

		/* Wake up all sleeping readers. */
		rw_swap(rw, owner, newown);
		rw_hold_forget(rw, l);
		turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
	}
	kpreempt_enable();
}

/*
 * rw_exit:
 *
 *	The fast path for releasing a lock that considers only the
 *	uncontended case.  Falls back to rw_vector_exit().
 */
void
rw_exit(krwlock_t *rw)
{
	uintptr_t curthread, owner, decr, newown, next;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);

	/*
	 * Again, we use a trick.  Since we used an add operation to
	 * set the required lock bits, we can use a subtract to clear
	 * them, which makes the read-release and write-release path
	 * the same.
	 */
	owner = rw->rw_owner;
	if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
		decr = curthread | RW_WRITE_LOCKED;
	} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
		decr = RW_READ_INCR;
	}

	/* Now try to release it. */
	RW_MEMBAR_EXIT();
	KPREEMPT_DISABLE(l);
	newown = (owner - decr);
	if (__predict_true((newown & (RW_THREAD | RW_HAS_WAITERS)) !=
	    RW_HAS_WAITERS)) {
		/* Want spinning (re-)enabled if lock is becoming free. */
		if ((newown & RW_THREAD) == 0)
			newown |= RW_SPIN;
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			rw_hold_forget(rw, l);
			KPREEMPT_ENABLE(l);
			return;
		}
	}
	rw_vector_exit(rw);
}

/*
 * rw_tryenter:
 *
 *	Try to acquire a rwlock.
 */
int
rw_tryenter(krwlock_t *rw, const krw_t op)
{
	uintptr_t curthread, owner, incr, need_wait, next, mask;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;

	RW_ASSERT(rw, curthread != 0);

	KPREEMPT_DISABLE(l);
	mask = rw_hold_remember(rw, l);

	if (op == RW_READER) {
		incr = RW_READ_INCR;
		need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
	} else {
		RW_ASSERT(rw, op == RW_WRITER);
		incr = curthread | RW_WRITE_LOCKED;
		need_wait = RW_WRITE_LOCKED | RW_THREAD;
	}

	for (owner = rw->rw_owner;; owner = next) {
		if (__predict_false((owner & need_wait) != 0)) {
			rw_hold_forget(rw, l);
			KPREEMPT_ENABLE(l);
			return 0;
		}
		next = rw_cas(rw, owner, (owner + incr) & mask);
		if (__predict_true(next == owner)) {
			/* Got it! */
			break;
		}
	}

	RW_WANTLOCK(rw, op);
	RW_LOCKED(rw, op);
	RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
	    (op == RW_READER && RW_COUNT(rw) != 0));

	KPREEMPT_ENABLE(l);
	RW_MEMBAR_ENTER();
	return 1;
}

/*
 * rw_downgrade:
 *
 *	Downgrade a write lock to a read lock.
 */
void
rw_downgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	turnstile_t *ts;
	int rcnt, wcnt;
	lwp_t *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);
	RW_UNLOCKED(rw, RW_WRITER);
#if !defined(DIAGNOSTIC)
	__USE(curthread);
#endif

	RW_MEMBAR_PRODUCER();

	for (owner = rw->rw_owner;; owner = next) {
		/*
		 * If there are no waiters we can do this the easy way.  Try
		 * swapping us down to one read hold.  If it fails, the lock
		 * condition has changed and we most likely now have
		 * waiters.
		 */
		if ((owner & RW_HAS_WAITERS) == 0) {
			newown = (owner & RW_NODEBUG) | RW_SPIN;
			next = rw_cas(rw, owner, newown + RW_READ_INCR);
			if (__predict_true(next == owner)) {
				RW_LOCKED(rw, RW_READER);
				RW_ASSERT(rw,
				    (rw->rw_owner & RW_WRITE_LOCKED) == 0);
				RW_ASSERT(rw, RW_COUNT(rw) != 0);
				return;
			}
			continue;
		}

		/*
		 * Grab the turnstile chain lock.  This gets the interlock
		 * on the sleep queue.  Once we have that, we can adjust the
		 * waiter bits.
		 */
		ts = turnstile_lookup(rw);
		RW_ASSERT(rw, ts != NULL);

		rcnt = TS_WAITERS(ts, TS_READER_Q);
		wcnt = TS_WAITERS(ts, TS_WRITER_Q);

		if (rcnt == 0) {
			/*
			 * If there are no readers, just preserve the
			 * waiters bits, swap us down to one read hold and
			 * return.  Don't set the spin bit as nobody's
			 * running yet.
			 */
			RW_ASSERT(rw, wcnt != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
			RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);

			newown = owner & RW_NODEBUG;
			newown |= RW_READ_INCR | RW_HAS_WAITERS |
			    RW_WRITE_WANTED;
			next = rw_cas(rw, owner, newown);
			turnstile_exit(rw);
			if (__predict_true(next == owner))
				break;
		} else {
			/*
			 * Give the lock to all blocked readers.  We may
			 * retain one read hold if downgrading.  If there is
			 * a writer waiting, new readers will be blocked
			 * out.  Don't set the spin bit as nobody's running
			 * yet.
			 */
			newown = owner & RW_NODEBUG;
			newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
			if (wcnt != 0)
				newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;

			next = rw_cas(rw, owner, newown);
			if (__predict_true(next == owner)) {
				/* Wake up all sleeping readers. */
				turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
				break;
			}
			turnstile_exit(rw);
		}
	}

	RW_WANTLOCK(rw, RW_READER);
	RW_LOCKED(rw, RW_READER);
	RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
	RW_ASSERT(rw, RW_COUNT(rw) != 0);
}

/*
 * rw_tryupgrade:
 *
 *	Try to upgrade a read lock to a write lock.  We must be the only
 *	reader.
 */
int
rw_tryupgrade(krwlock_t *rw)
{
	uintptr_t owner, curthread, newown, next;
	struct lwp *l;

	l = curlwp;
	curthread = (uintptr_t)l;
	RW_ASSERT(rw, curthread != 0);
	RW_ASSERT(rw, rw_read_held(rw));

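	/*
	 * Illustrative note: the loop below succeeds only while the
	 * thread bits of the lock word show exactly our single read
	 * hold, i.e. (owner & RW_THREAD) == RW_READ_INCR; any flag bits
	 * are carried over into the new owner word.  If another reader
	 * appears, the upgrade fails and 0 is returned.
	 */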
	for (owner = RW_READ_INCR;; owner = next) {
		newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
		next = rw_cas(rw, owner, newown);
		if (__predict_true(next == owner)) {
			RW_MEMBAR_PRODUCER();
			break;
		}
		RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
		if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
			RW_ASSERT(rw, (next & RW_THREAD) != 0);
			return 0;
		}
	}

	RW_UNLOCKED(rw, RW_READER);
	RW_WANTLOCK(rw, RW_WRITER);
	RW_LOCKED(rw, RW_WRITER);
	RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
	RW_ASSERT(rw, RW_OWNER(rw) == curthread);

	return 1;
}

/*
 * rw_read_held:
 *
 *	Returns true if the rwlock is held for reading.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_read_held(krwlock_t *rw)
{
	uintptr_t owner;

	if (rw == NULL)
		return 0;
	owner = rw->rw_owner;
	return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}

/*
 * rw_write_held:
 *
 *	Returns true if the rwlock is held for writing.  Must only be
 *	used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_write_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
	    (RW_WRITE_LOCKED | (uintptr_t)curlwp);
}

/*
 * rw_lock_held:
 *
 *	Returns true if the rwlock is held for reading or writing.  Must
 *	only be used for diagnostic assertions, and never be used to make
 *	decisions about how to use a rwlock.
 */
int
rw_lock_held(krwlock_t *rw)
{

	if (rw == NULL)
		return 0;
	return (rw->rw_owner & RW_THREAD) != 0;
}

/*
 * rw_owner:
 *
 *	Return the current owner of an RW lock, but only if it is write
 *	held.  Used for priority inheritance.
 */
static lwp_t *
rw_owner(wchan_t obj)
{
	krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
	uintptr_t owner = rw->rw_owner;

	if ((owner & RW_WRITE_LOCKED) == 0)
		return NULL;

	return (void *)(owner & RW_THREAD);
}

/*
 * rw_owner_running:
 *
 *	Return true if a RW lock is unheld, or held and the owner is running
 *	on a CPU.  For the pagedaemon only - do not document or use in other
 *	code.
 */
bool
rw_owner_running(const krwlock_t *rw)
{
	uintptr_t owner = rw->rw_owner;

	return (owner & RW_THREAD) == 0 || (owner & RW_SPIN) != 0;
}