/*	$NetBSD: kern_turnstile.c,v 1.30.2.1 2012/04/17 00:08:27 yamt Exp $	*/

/*-
 * Copyright (c) 2002, 2006, 2007, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Turnstiles are described in detail in:
 *
 *	Solaris Internals: Core Kernel Architecture, Jim Mauro and
 *	Richard McDougall.
 *
 * Turnstiles are kept in a hash table. There are likely to be many more
 * synchronisation objects than there are threads. Since a thread can block
 * on only one lock at a time, we only need one turnstile per thread, and
 * so they are allocated at thread creation time.
 *
 * When a thread decides it needs to block on a lock, it looks up the
 * active turnstile for that lock. If no active turnstile exists, then
 * the thread lends its turnstile to the lock. If there is already an
 * active turnstile for the lock, the thread places its turnstile on a
 * list of free turnstiles, and references the active one instead.
 *
 * The act of looking up the turnstile acquires an interlock on the sleep
 * queue. If a thread decides it doesn't need to block after all, then this
 * interlock must be released by explicitly aborting the turnstile
 * operation.
 *
 * When a thread is awakened, it needs to get its turnstile back. If there
 * are still other threads waiting in the active turnstile, the thread
 * grabs a free turnstile off the free list. Otherwise, it can take back
 * the active turnstile from the lock (thus deactivating the turnstile).
 *
 * Turnstiles are the place to do priority inheritance.
 */
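
/*
 * Illustrative sketch of how a lock implementation typically drives this
 * interface. example_mutex_enter(), lock_try() and example_syncobj are
 * hypothetical names; turnstile_lookup(), turnstile_exit(),
 * turnstile_block() and TS_WRITER_Q are the real interface defined below.
 *
 *	void
 *	example_mutex_enter(kmutex_t *mtx)
 *	{
 *		turnstile_t *ts;
 *
 *		for (;;) {
 *			if (lock_try(mtx))
 *				return;
 *			ts = turnstile_lookup(mtx);	[takes chain lock]
 *			if (lock_try(mtx)) {
 *				turnstile_exit(mtx);	[abort: drops it]
 *				return;
 *			}
 *			turnstile_block(ts, TS_WRITER_Q, mtx,
 *			    &example_syncobj);		[sleeps, drops it]
 *		}
 *	}
 */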

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.30.2.1 2012/04/17 00:08:27 yamt Exp $");

#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/systm.h>

#define	TS_HASH_SIZE	64
#define	TS_HASH_MASK	(TS_HASH_SIZE - 1)
#define	TS_HASH(obj)	(((uintptr_t)(obj) >> 3) & TS_HASH_MASK)
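
/*
 * Worked example of the hash: lock objects are typically at least 8-byte
 * aligned, so the low three address bits carry no information and are
 * shifted out before masking. An object at 0x12345678 thus lands on chain
 * ((0x12345678 >> 3) & TS_HASH_MASK) == (0x02468acf & 0x3f) == 15.
 */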

static tschain_t	turnstile_tab[TS_HASH_SIZE]	__cacheline_aligned;
pool_cache_t		turnstile_cache			__read_mostly;

static int		turnstile_ctor(void *, void *, int);

extern turnstile_t	turnstile0;

/*
 * turnstile_init:
 *
 *	Initialize the turnstile mechanism.
 */
void
turnstile_init(void)
{
	tschain_t *tc;
	int i;

	for (i = 0; i < TS_HASH_SIZE; i++) {
		tc = &turnstile_tab[i];
		LIST_INIT(&tc->tc_chain);
		tc->tc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	}

	turnstile_cache = pool_cache_init(sizeof(turnstile_t), 0, 0, 0,
	    "tstilepl", NULL, IPL_NONE, turnstile_ctor, NULL, NULL);
	KASSERT(turnstile_cache != NULL);

	(void)turnstile_ctor(NULL, &turnstile0, 0);
}

/*
 * turnstile_ctor:
 *
 *	Constructor for turnstiles.
 */
static int
turnstile_ctor(void *arg, void *obj, int flags)
{
	turnstile_t *ts = obj;

	memset(ts, 0, sizeof(*ts));
	sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
	sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
	return (0);
}

/*
 * turnstile_remove:
 *
 *	Remove an LWP from a turnstile sleep queue and wake it.
 */
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
	turnstile_t *nts;

	KASSERT(l->l_ts == ts);

	/*
	 * This LWP is no longer using the active turnstile.
	 * Find an inactive one on the free list to give to it.
	 */
	if ((nts = ts->ts_free) != NULL) {
		KASSERT(TS_ALL_WAITERS(ts) > 1);
		l->l_ts = nts;
		ts->ts_free = nts->ts_free;
		nts->ts_free = NULL;
	} else {
		/*
		 * If the free list is empty, this is the last
		 * waiter.
		 */
		KASSERT(TS_ALL_WAITERS(ts) == 1);
		LIST_REMOVE(ts, ts_chain);
	}

	ts->ts_waiters[q]--;
	sleepq_remove(&ts->ts_sleepq[q], l);
}

/*
 * turnstile_lookup:
 *
 *	Look up the turnstile for the specified lock. This acquires and
 *	holds the turnstile chain lock (sleep queue interlock).
 */
turnstile_t *
turnstile_lookup(wchan_t obj)
{
	turnstile_t *ts;
	tschain_t *tc;

	tc = &turnstile_tab[TS_HASH(obj)];
	mutex_spin_enter(tc->tc_mutex);

	LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
		if (ts->ts_obj == obj)
			return (ts);

	/*
	 * No turnstile yet for this lock. No problem, turnstile_block()
	 * handles this by fetching the turnstile from the blocking thread.
	 */
	return (NULL);
}
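
/*
 * Usage note: turnstile_lookup() returns with the chain lock held whether
 * or not it found a turnstile, so every call must be balanced by exactly
 * one of turnstile_block() or turnstile_exit(). A sketch, with
 * still_need_to_block standing in for the caller's own test:
 *
 *	ts = turnstile_lookup(obj);
 *	if (still_need_to_block)
 *		turnstile_block(ts, q, obj, sobj);	[sleeps, drops lock]
 *	else
 *		turnstile_exit(obj);			[just drops lock]
 */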

/*
 * turnstile_exit:
 *
 *	Abort a turnstile operation.
 */
void
turnstile_exit(wchan_t obj)
{
	tschain_t *tc;

	tc = &turnstile_tab[TS_HASH(obj)];
	mutex_spin_exit(tc->tc_mutex);
}

/*
 * turnstile_lendpri:
 *
 *	Lend our priority to lwps on the blocking chain.
 */

static void
turnstile_lendpri(lwp_t *cur)
{
	lwp_t *l = cur;
	pri_t prio;

	/*
	 * NOTE: if you get a panic in this code block, it is likely that
	 * a lock has been destroyed or corrupted while still in use. Try
	 * compiling a kernel with LOCKDEBUG to pinpoint the problem.
	 */

	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	KASSERT(l == curlwp);
	prio = lwp_eprio(l);
	for (;;) {
		lwp_t *owner;
		turnstile_t *ts;
		bool dolock;

		if (l->l_wchan == NULL)
			break;

		owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
		if (owner == NULL)
			break;

		/* The owner may have changed as we have dropped the tc lock */
		if (cur == owner) {
			/*
			 * We own the lock: stop here, sleepq_block()
			 * should wake up immediately.
			 */
			break;
		}
		if (l->l_mutex != owner->l_mutex)
			dolock = true;
		else
			dolock = false;
		if (l == owner || (dolock && !lwp_trylock(owner))) {
			/*
			 * Restart from curlwp.
			 * Note that there may be a livelock here:
			 * the owner may try grabbing cur's lock (which is
			 * the tc lock) while we're trying to grab
			 * the owner's lock.
			 */
			lwp_unlock(l);
			l = cur;
			lwp_lock(l);
			prio = lwp_eprio(l);
			continue;
		}
		if (prio <= lwp_eprio(owner)) {
			if (dolock)
				lwp_unlock(owner);
			break;
		}
		ts = l->l_ts;
		KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
		if (ts->ts_inheritor == NULL) {
			ts->ts_inheritor = owner;
			ts->ts_eprio = prio;
			SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
			lwp_lendpri(owner, prio);
		} else if (prio > ts->ts_eprio) {
			ts->ts_eprio = prio;
			lwp_lendpri(owner, prio);
		}
		if (dolock)
			lwp_unlock(l);
		l = owner;
	}
	LOCKDEBUG_BARRIER(l->l_mutex, 1);
	if (cur->l_mutex != l->l_mutex) {
		lwp_unlock(l);
		lwp_lock(cur);
	}
	LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}
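
/*
 * Worked example: suppose LWP A (eprio 50) blocks on a lock owned by B
 * (eprio 30), while B is already blocked on a second lock owned by C
 * (eprio 20). The loop above first lends 50 to B through A's turnstile
 * (ts_eprio = 50, lwp_lendpri(B, 50)), then follows B's l_wchan to the
 * second lock and lends 50 to C as well, so the entire blocking chain
 * runs at A's priority until the locks are released.
 */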

/*
 * turnstile_unlendpri:
 *
 *	Undo turnstile_lendpri().
 */

static void
turnstile_unlendpri(turnstile_t *ts)
{
	lwp_t * const l = curlwp;
	turnstile_t *iter;
	turnstile_t *next;
	turnstile_t *prev = NULL;
	pri_t prio;
	bool dolock;

	KASSERT(ts->ts_inheritor != NULL);
	ts->ts_inheritor = NULL;
	dolock = l->l_mutex == l->l_cpu->ci_schedstate.spc_lwplock;
	if (dolock) {
		lwp_lock(l);
	}

	/*
	 * The following loop does two things:
	 *
	 * - removes ts from the list, and
	 *
	 * - from the rest of the list, finds the highest priority.
	 */

	prio = -1;
	KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
	for (iter = SLIST_FIRST(&l->l_pi_lenders);
	    iter != NULL; iter = next) {
		KASSERT(lwp_eprio(l) >= ts->ts_eprio);
		next = SLIST_NEXT(iter, ts_pichain);
		if (iter == ts) {
			if (prev == NULL) {
				SLIST_REMOVE_HEAD(&l->l_pi_lenders,
				    ts_pichain);
			} else {
				SLIST_REMOVE_AFTER(prev, ts_pichain);
			}
		} else if (prio < iter->ts_eprio) {
			prio = iter->ts_eprio;
		}
		prev = iter;
	}

	lwp_lendpri(l, prio);

	if (dolock) {
		lwp_unlock(l);
	}
}
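
/*
 * Example: if curlwp holds three contended locks whose turnstiles lend
 * eprio 40, 50 and 60, releasing the eprio-60 lock leaves {40, 50} on
 * l_pi_lenders and the loop above recomputes prio = 50. Once the last
 * lender is removed, prio stays -1, i.e. no inherited priority remains.
 */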

/*
 * turnstile_block:
 *
 *	Enter an object into the turnstile chain and prepare the current
 *	LWP for sleep.
 */
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
	lwp_t * const l = curlwp; /* cached curlwp */
	turnstile_t *ots;
	tschain_t *tc;
	sleepq_t *sq;
	pri_t obase;

	tc = &turnstile_tab[TS_HASH(obj)];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(mutex_owned(tc->tc_mutex));
	KASSERT(l != NULL && l->l_ts != NULL);

	if (ts == NULL) {
		/*
		 * We are the first thread to wait for this object;
		 * lend our turnstile to it.
		 */
		ts = l->l_ts;
		KASSERT(TS_ALL_WAITERS(ts) == 0);
		KASSERT(TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) &&
		    TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
		ts->ts_obj = obj;
		ts->ts_inheritor = NULL;
		LIST_INSERT_HEAD(&tc->tc_chain, ts, ts_chain);
	} else {
		/*
		 * Object already has a turnstile. Put our turnstile
		 * onto the free list, and reference the existing
		 * turnstile instead.
		 */
		ots = l->l_ts;
		KASSERT(ots->ts_free == NULL);
		ots->ts_free = ts->ts_free;
		ts->ts_free = ots;
		l->l_ts = ts;

		KASSERT(ts->ts_obj == obj);
		KASSERT(TS_ALL_WAITERS(ts) != 0);
		KASSERT(!TAILQ_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
		    !TAILQ_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
	}

	sq = &ts->ts_sleepq[q];
	ts->ts_waiters[q]++;
	sleepq_enter(sq, l, tc->tc_mutex);
	LOCKDEBUG_BARRIER(tc->tc_mutex, 1);
	l->l_kpriority = true;
	obase = l->l_kpribase;
	if (obase < PRI_KTHREAD)
		l->l_kpribase = PRI_KTHREAD;
	sleepq_enqueue(sq, obj, "tstile", sobj);

	/*
	 * Disable preemption across this entire block, as we may drop
	 * scheduler locks (allowing preemption), and would prefer not
	 * to be interrupted while in a state of flux.
	 */
	KPREEMPT_DISABLE(l);
	KASSERT(tc->tc_mutex == l->l_mutex);
	turnstile_lendpri(l);
	sleepq_block(0, false);
	l->l_kpribase = obase;
	KPREEMPT_ENABLE(l);
}
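
/*
 * Note that the woken LWP does not clean up turnstile state here:
 * turnstile_wakeup() on the waking side has already removed it from the
 * sleep queue and handed it a turnstile back via turnstile_remove(), so
 * when sleepq_block() returns the caller simply retries its lock
 * operation.
 */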

/*
 * turnstile_wakeup:
 *
 *	Wake up the specified number of threads that are blocked
 *	in a turnstile.
 */
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
	sleepq_t *sq;
	tschain_t *tc;
	lwp_t *l;

	tc = &turnstile_tab[TS_HASH(ts->ts_obj)];
	sq = &ts->ts_sleepq[q];

	KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
	KASSERT(count > 0 && count <= TS_WAITERS(ts, q));
	KASSERT(mutex_owned(tc->tc_mutex));
	KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);

	/*
	 * Restore inherited priority if necessary.
	 */

	if (ts->ts_inheritor != NULL) {
		turnstile_unlendpri(ts);
	}

	if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
		TAILQ_FOREACH(l, sq, l_sleepchain) {
			if (l == nl)
				break;
		}
		if (l == NULL)
			panic("turnstile_wakeup: nl not on sleepq");
#endif
		turnstile_remove(ts, nl, q);
	} else {
		while (count-- > 0) {
			l = TAILQ_FIRST(sq);
			KASSERT(l != NULL);
			turnstile_remove(ts, l, q);
		}
	}
	mutex_spin_exit(tc->tc_mutex);
}
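
/*
 * Illustrative release path (helper shape only; rw and the wake policy
 * are hypothetical): an owner releasing a reader/writer lock might wake
 * one writer, else all readers. turnstile_wakeup() drops the chain lock
 * itself, so only the no-waiters path needs turnstile_exit():
 *
 *	ts = turnstile_lookup(rw);
 *	if (ts == NULL)
 *		turnstile_exit(rw);
 *	else if (TS_WAITERS(ts, TS_WRITER_Q) != 0)
 *		turnstile_wakeup(ts, TS_WRITER_Q, 1, NULL);
 *	else
 *		turnstile_wakeup(ts, TS_READER_Q,
 *		    TS_WAITERS(ts, TS_READER_Q), NULL);
 */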

/*
 * turnstile_unsleep:
 *
 *	Remove an LWP from the turnstile. This is called when the LWP has
 *	not been awoken normally but instead interrupted: for example, if it
 *	has received a signal. It's not a valid action for turnstiles,
 *	since LWPs blocking on a turnstile are not interruptible.
 */
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{

	lwp_unlock(l);
	panic("turnstile_unsleep");
}

/*
 * turnstile_changepri:
 *
 *	Adjust the priority of an LWP residing on a turnstile.
 */
void
turnstile_changepri(lwp_t *l, pri_t pri)
{

	/* XXX priority inheritance */
	sleepq_changepri(l, pri);
}

#if defined(LOCKDEBUG)
/*
 * turnstile_print:
 *
 *	Given the address of a lock object, print the contents of a
 *	turnstile.
 */
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
	turnstile_t *ts;
	tschain_t *tc;
	sleepq_t *rsq, *wsq;
	lwp_t *l;

	tc = &turnstile_tab[TS_HASH(obj)];

	LIST_FOREACH(ts, &tc->tc_chain, ts_chain)
		if (ts->ts_obj == obj)
			break;

	(*pr)("Turnstile chain at %p.\n", tc);
	if (ts == NULL) {
		(*pr)("=> No active turnstile for this lock.\n");
		return;
	}

	rsq = &ts->ts_sleepq[TS_READER_Q];
	wsq = &ts->ts_sleepq[TS_WRITER_Q];

	(*pr)("=> Turnstile at %p (wrq=%p, rdq=%p).\n", ts, wsq, rsq);

	(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
	TAILQ_FOREACH(l, rsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");

	(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
	TAILQ_FOREACH(l, wsq, l_sleepchain) {
		(*pr)(" %p", l);
	}
	(*pr)("\n");
}
#endif	/* LOCKDEBUG */