/*	$NetBSD: kern_turnstile.c,v 1.37 2020/03/26 19:46:42 ad Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	Richard McDougall.
 *
 * Turnstiles are kept in a hash table.  There are likely to be many more
 * synchronisation objects than there are threads.  Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock.  If no active turnstile exists, then
 * the process lends its turnstile to the lock.  If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue.  If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back.  If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list.  Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are where we do priority inheritance.
 */
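
/*
 * A minimal sketch of the usual caller pattern, modelled on the adaptive
 * mutex path in kern_mutex.c; "lock", "lock_is_free()" and "lock_syncobj"
 * are hypothetical stand-ins for a real lock object, its availability
 * test, and its sync object:
 *
 *	ts = turnstile_lookup(lock);
 *	if (lock_is_free(lock))
 *		turnstile_exit(lock);	(abort: releases the interlock)
 *	else
 *		turnstile_block(ts, TS_WRITER_Q, lock, &lock_syncobj);
 */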

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.37 2020/03/26 19:46:42 ad Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/systm.h>

/*
 * Shift of 6 aligns to typical cache line size of 64 bytes; there's no
 * point having two turnstile locks to back two lock objects that share one
 * cache line.
 */
#define	TS_HASH_SIZE	128
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 6) & TS_HASH_MASK)
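
/*
 * For example, under the definitions above two hypothetical lock objects
 * at 0xffff800012345640 and 0xffff800012345660 share one 64-byte cache
 * line, and both hash to ((addr >> 6) & 127) == 0x59, i.e. the same
 * bucket and thus the same turnstile lock.
 */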

static tschain_t	turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
pool_cache_t		turnstile_cache	__read_mostly;
extern turnstile_t	turnstile0;

static union {
	kmutex_t	lock;
	uint8_t		pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;

static int	turnstile_ctor(void *, void *, int);

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism.
 */
void
turnstile_init(void)
{
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		LIST_INIT(&turnstile_chains[i]);
		mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
	}

	turnstile_cache = pool_cache_init(sizeof(turnstile_t), coherency_unit,
	    0, 0, "tstile", NULL, IPL_NONE, turnstile_ctor, NULL, NULL);
	KASSERT(turnstile_cache != NULL);

	(void)turnstile_ctor(NULL, &turnstile0, 0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles.
 */
static int
turnstile_ctor(void *arg, void *obj, int flags)
{
	turnstile_t *ts = obj;

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
	return (0);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This process is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}
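
/*
 * A worked example of the lend/free-list dance (with hypothetical LWPs):
 * if A, B and C block on the same lock in that order, A's turnstile
 * becomes the active one on the hash chain, and B's and C's go onto its
 * ts_free list.  When a waiter is awoken above, it pops some turnstile
 * off ts_free (not necessarily its own); the last waiter to leave takes
 * the active turnstile back, and the chain entry is removed, deactivating
 * the turnstile.
 */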

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock.  This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	mutex_spin_enter(&turnstile_locks[hash].lock);

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock.  No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation.
 */
void
turnstile_exit(wchan_t obj)
{

	mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to LWPs on the blocking chain.
 *
 *	If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
 *	has a priority lower than ours (lwp_eprio(l)), lend our priority to
 *	it to avoid priority inversions.
 */
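
/*
 * A worked example with hypothetical values: if LWP A (effective priority
 * 50) blocks on a mutex held by LWP B (priority 10), A lends B priority
 * 50 so that a priority-30 LWP C cannot keep B, and therefore A, off the
 * CPU indefinitely.  The loop below follows the whole blocking chain: if
 * B is itself blocked on a lock owned by D, D is boosted as well.
 */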

static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t * l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use.  Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */

	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		if (l->l_wchan == NULL)
			break;

		/*
		 * Ask the syncobj who owns the lock.
		 */
		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/*
		 * The owner may have changed as we have dropped the tc lock.
		 */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		/*
		 * Acquire owner->l_mutex if we don't have it yet.
		 * Because we already have another LWP lock (l->l_mutex) held,
		 * we need to play a try lock dance to avoid deadlock.
		 */
		dolock = l->l_mutex != owner->l_mutex;
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * The owner was changed behind us or trylock failed.
			 * Restart from curlwp.
			 *
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is the
			 * tc lock) while we're trying to grab the owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		/*
		 * If the owner's priority is already higher than ours,
		 * there's nothing to do anymore.
		 */
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		/*
		 * Lend our priority to the 'owner' LWP.
		 *
		 * Update lenders info for turnstile_unlendpri.
		 */
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		if (dolock)
			lwp_unlock(l);
		LOCKDEBUG_BARRIER(owner->l_mutex, 1);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	if (cur->l_mutex != l->l_mutex) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}

/*
 * turnstile_unlendpri: undo turnstile_lendpri
 */
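
/*
 * A worked example with hypothetical values: if curlwp holds three
 * contended locks whose turnstiles lent priorities 30, 50 and 40, all
 * three turnstiles sit on l_pi_lenders.  Releasing the ts_eprio == 50
 * turnstile unlinks it and recomputes prio as the maximum of the rest,
 * 40, which lwp_lendpri() then applies.  Once the last lender is gone,
 * prio remains -1, which clears the inherited priority and lets the
 * LWP's own priority stand.
 */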

static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock;
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * the following loop does two things.
	 *
	 * - remove ts from the list.
	 *
	 * - from the rest of the list, find the highest priority.
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	kmutex_t *lock;
	sleepq_t *sq;
	pri_t obase;
	u_int hash;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];
	lock = &turnstile_locks[hash].lock;

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(lock));
	KASSERT(l != NULL && l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
		    LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(tc, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile.  Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
		    !LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	sleepq_enter(sq, l, lock);
	LOCKDEBUG_BARRIER(lock, 1);
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(lock == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	kmutex_t *lock;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(ts->ts_obj);
	lock = &turnstile_locks[hash].lock;
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(lock));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * restore inherited priority if necessary.
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		LIST_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = LIST_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(lock);
}
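
/*
 * A sketch of a typical release path, modelled on the adaptive mutex code
 * in kern_mutex.c ("lock" is a hypothetical lock object): the holder looks
 * the turnstile back up and wakes every waiter on the writer queue.  Note
 * that turnstile_wakeup() itself drops the chain lock taken by
 * turnstile_lookup(), so no turnstile_exit() follows it:
 *
 *	ts = turnstile_lookup(lock);
 *	if (ts != NULL)
 *		turnstile_wakeup(ts, TS_WRITER_Q,
 *		    TS_WAITERS(ts, TS_WRITER_Q), NULL);
 *	else
 *		turnstile_exit(lock);
 */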

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile.  This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal.  It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	u_int hash;
	lwp_t *l;

	hash = TS_HASH(obj);
	tc = &turnstile_chains[hash];

	LIST_FOREACH(ts, tc, ts_chain)
		if (ts->ts_obj == obj)
			break;

	if (ts == NULL) {
		(*pr)("Turnstile: no active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	(*pr)("Turnstile:\n");
	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	LIST_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	LIST_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */