/*	$NetBSD: sys_select.c,v 1.3.2.2 2008/03/29 20:47:01 christos Exp $	*/

/*-
 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls relating to files.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.3.2.2 2008/03/29 20:47:01 christos Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/sleepq.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on select_cv */
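
/*
 * An LWP sets l_selflag to SEL_SCANNING before each pass over the
 * descriptors and to SEL_BLOCKING just before enqueueing itself on the
 * per-CPU sleep queue.  selnotify() resets the flag to SEL_RESET; a
 * poller that has not yet slept notices the change and rescans rather
 * than blocking.
 */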

/* Per-CPU state for select()/poll(). */
#if MAXCPUS > 32
#error adjust this code
#endif
typedef struct selcpu {
	kmutex_t	sc_lock;
	sleepq_t	sc_sleepq;
	int		sc_ncoll;
	uint32_t	sc_mask;
} selcpu_t;
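
/*
 * One selcpu_t exists per CPU: sc_sleepq holds the LWPs blocked in
 * select()/poll() on that CPU, sc_ncoll counts collision wakeups (used
 * to detect activity between a scan and the decision to block), and
 * sc_mask is the CPU's bit in selinfo::sel_collision - hence the
 * 32-CPU limit checked above.
 */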

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

static syncobj_t select_sobj = {
	SOBJ_SLEEPQ_FIFO,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};
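
/*
 * Sync object used by LWPs sleeping in select()/poll(): sleepers are
 * queued FIFO on the per-CPU sleep queue, and there is no lock owner
 * to lend priority to (syncobj_noowner).
 */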

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec	ats;
	struct timeval	atv, *tv = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, mask);
}

int
inittimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	if (itimerfix(tv))
		return -1;
	getmicrouptime(sleeptv);
	return 0;
}

int
gettimeleft(struct timeval *tv, struct timeval *sleeptv)
{
	/*
	 * We have to recalculate the timeout on every retry.
	 */
	struct timeval slepttv;
	/*
	 * reduce tv by elapsed time
	 * based on monotonic time scale
	 */
	getmicrouptime(&slepttv);
	timeradd(tv, sleeptv, tv);
	timersub(tv, &slepttv, tv);
	*sleeptv = slepttv;
	return tvtohz(tv);
}
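
/*
 * The two helpers above work as a pair: inittimeleft() validates the
 * timeout and records the current monotonic time, while gettimeleft()
 * deducts the time already slept on each retry and returns the number
 * of ticks remaining.  Sketch of the pattern used by selcommon(),
 * pollcommon() and pollsock() below:
 *
 *	if (tv && inittimeleft(tv, &sleeptv) == -1)
 *		return EINVAL;
 *	for (;;) {
 *		(scan the descriptors; stop if any are ready)
 *		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
 *			break;			(timed out)
 *		(block for at most timo ticks)
 *	}
 */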

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv, *tv = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv,
			sizeof(atv));
		if (error)
			return error;
		tv = &atv;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), tv, NULL);
}

int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
	  fd_set *u_ou, fd_set *u_ex, struct timeval *tv, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timeval	sleeptv;
	selcpu_t	*sc;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
	else
		bits = smallbits;
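
	/*
	 * Layout of the 'bits' buffer: six regions of 'ni' bytes each.
	 * Regions 0-2 hold the caller's in/ou/ex input sets; regions
	 * 3-5 accumulate the corresponding output sets, which are
	 * copied back to user space on success (see the copyout()
	 * calls at 'done' below).
	 */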

#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	sc = curcpu()->ci_data.cpu_selcpu;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = sc->sc_ncoll;

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		mutex_spin_enter(&sc->sc_lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(&sc->sc_lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		lwp_lock(l);
		lwp_unlock_to(l, &sc->sc_lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);	/* XXX */
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

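/*
 * Scan the three input descriptor sets (read, write, exceptional).
 * Each set is checked against the corresponding entry of flag[]:
 * POLLRDNORM, POLLWRNORM or POLLRDBAND, with POLLHUP and POLLERR also
 * satisfying the first two.  A descriptor whose fo_poll routine reports
 * any of these conditions gets its bit set in the output bitmap and is
 * counted towards the number of ready descriptors returned in *retval.
 */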
int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
	register_t *retval)
{
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
				     POLLWRNORM | POLLHUP | POLLERR,
				     POLLRDBAND };
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	file_t *fp;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
					obits |= (1 << j);
					n++;
				}
				fd_putfile(fd);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timeval	atv, *tv = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		tv = &atv;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    tv, NULL);
}

/*
 * Poll system call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec	ats;
	struct timeval	atv, *tv = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		atv.tv_sec = ats.tv_sec;
		atv.tv_usec = ats.tv_nsec / 1000;
		tv = &atv;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    tv, mask);
}

int
pollcommon(lwp_t *l, register_t *retval,
	struct pollfd *u_fds, u_int nfds,
	struct timeval *tv, sigset_t *mask)
{
	char		smallbits[32 * sizeof(struct pollfd)];
	proc_t		* const p = l->l_proc;
	void *		bits;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timeval	sleeptv;
	selcpu_t	*sc;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmem_alloc(ni, KM_SLEEP);
	else
		bits = smallbits;

	error = copyin(u_fds, bits, ni);
	if (error)
		goto done;

	timo = 0;
	if (tv && inittimeleft(tv, &sleeptv) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(&p->p_smutex);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(&p->p_smutex);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	sc = curcpu()->ci_data.cpu_selcpu;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;

		error = pollscan(l, (struct pollfd *)bits, nfds, retval);

		if (error || *retval)
			break;
		if (tv && (timo = gettimeleft(tv, &sleeptv)) <= 0)
			break;
		mutex_spin_enter(&sc->sc_lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(&sc->sc_lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		lwp_lock(l);
		lwp_unlock_to(l, &sc->sc_lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);	/* XXX */
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (mask) {
		mutex_enter(&p->p_smutex);
		l->l_sigmask = oldmask;
		mutex_exit(&p->p_smutex);
	}
 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(bits, u_fds, ni);
	if (bits != smallbits)
		kmem_free(bits, ni);
	return (error);
}

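/*
 * Scan an array of pollfd structures.  For each valid descriptor, ask
 * its fo_poll routine which of the requested events (plus POLLERR and
 * POLLHUP, which cannot be masked out) are pending.  A negative
 * descriptor is ignored; one that does not name an open file is
 * flagged POLLNVAL.  Both non-zero revents and POLLNVAL count toward
 * the number of ready descriptors returned.
 */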
int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	int i, n;
	file_t *fp;

	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			fds->revents = POLLNVAL;
			n++;
		} else {
			fds->revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			if (fds->revents != 0)
				n++;
			fd_putfile(fds->fd);
		}
	}
	*retval = n;
	return (0);
}

/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selwakeup(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting selcommon() or pollcommon().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions are true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcpu_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcpu;
	other = sip->sel_lwp;

	if (other == selector) {
		/* `selector' has already claimed it. */
		KASSERT(sip->sel_cpu == sc);
	} else if (other == NULL) {
		/*
		 * First named waiter, although there may be unnamed
		 * waiters (collisions).  Issue a memory barrier to
		 * ensure that we access sel_lwp (above) before other
		 * fields - this guards against a call to selclear().
		 */
		membar_enter();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Replace selinfo's lock with our chosen CPU's lock. */
		sip->sel_cpu = sc;
	} else {
		/* Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cpu != NULL);
	}
}
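
/*
 * Illustrative sketch of how a driver ties into the above (hypothetical
 * foo(4) driver; the softc layout and helper names are invented for the
 * example): the driver's poll routine records the caller when the
 * condition is not yet true, and the data path later calls selnotify()
 * with the same object lock held, as required by the comments above.
 *
 *	int
 *	foo_poll(dev_t dev, int events, lwp_t *l)
 *	{
 *		struct foo_softc *sc = foo_lookup(dev);
 *		int revents = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		if ((events & (POLLIN | POLLRDNORM)) != 0) {
 *			if (sc->sc_havedata)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(l, &sc->sc_rsel);
 *		}
 *		mutex_exit(&sc->sc_lock);
 *		return revents;
 *	}
 *
 * and later, when data arrives (sc_lock held):
 *
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 */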

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcpu's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in selcommon() and/or pollcommon().
 *
 * sip->sel_cpu cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcpu_t *sc;
	uint32_t mask;
	int index, oflag, swapin;
	lwp_t *l;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		swapin = 0;
		sc = sip->sel_cpu;
		mutex_spin_enter(&sc->sc_lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			l = sip->sel_lwp;
			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			oflag = l->l_selflag;
			l->l_selflag = SEL_RESET;
			if (oflag == SEL_BLOCKING &&
			    l->l_mutex == &sc->sc_lock) {
				KASSERT(l->l_wchan == sc);
				swapin = sleepq_unsleep(l, false);
			}
		}
		mutex_spin_exit(&sc->sc_lock);
		if (swapin)
			uvm_kick_scheduler();
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs(mask) - 1;
			mask &= ~(1 << index);
			sc = cpu_lookup_byindex(index)->ci_data.cpu_selcpu;
			mutex_spin_enter(&sc->sc_lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1);
		} while (__predict_false(mask != 0));
	}
}

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcpu_t *sc;
	lwp_t *l;

	l = curlwp;
	sc = l->l_selcpu;

	mutex_spin_enter(&sc->sc_lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cpu == l->l_selcpu);
		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		membar_exit();
		/* Release the record for another named waiter to use. */
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(&sc->sc_lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcpu_t *sc;

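	/*
	 * Over-allocate and round the pointer up so that the selcpu_t
	 * starts on a coherency_unit (cache line) boundary, keeping the
	 * per-CPU lock and sleep queue from sharing cache lines with
	 * other CPUs' state.
	 */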
	sc = kmem_alloc(roundup2(sizeof(selcpu_t), coherency_unit) +
	    coherency_unit, KM_SLEEP);
	sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
	mutex_init(&sc->sc_lock, MUTEX_DEFAULT, IPL_SCHED);
	sleepq_init(&sc->sc_sleepq, &sc->sc_lock);
	sc->sc_ncoll = 0;
	sc->sc_mask = (1 << cpu_index(ci));
	ci->ci_data.cpu_selcpu = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need to guard against a call to selclear()
 * by a thread exiting selcommon() and/or pollcommon().  The caller has
 * prevented further references being made to the selinfo record via
 * selrecord(), and it won't call selwakeup() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcpu_t *sc;
	lwp_t *l;

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcpu pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cpu != NULL);
	sc = sip->sel_cpu;
	mutex_spin_enter(&sc->sc_lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcpu == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(&sc->sc_lock);
}

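/*
 * In-kernel equivalent of poll() for a single socket: wait for the
 * requested events to become true on the socket, or for the optional
 * timeout to expire, with no user space pollfd handling involved.
 */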
int
pollsock(struct socket *so, const struct timeval *tvp, int events)
{
	int ncoll, error, timo;
	struct timeval sleeptv, tv;
	selcpu_t *sc;
	lwp_t *l;

	timo = 0;
	if (tvp != NULL) {
		tv = *tvp;
		if (inittimeleft(&tv, &sleeptv) == -1)
			return EINVAL;
	}

	l = curlwp;
	sc = l->l_cpu->ci_data.cpu_selcpu;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	error = 0;
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;
		if (sopoll(so, events) != 0)
			break;
		if (tvp && (timo = gettimeleft(&tv, &sleeptv)) <= 0)
			break;
		mutex_spin_enter(&sc->sc_lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(&sc->sc_lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		lwp_lock(l);
		lwp_unlock_to(l, &sc->sc_lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "pollsock", &select_sobj);
		KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);	/* XXX */
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	return (error);
}