/*	$NetBSD: sys_select.c,v 1.20 2009/12/12 17:47:05 dsl Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls relating to files.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.20 2009/12/12 17:47:05 dsl Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sleepq.h>

/* Flags for lwp::l_selflag. */
#define	SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define	SEL_SCANNING	1	/* polling descriptors */
#define	SEL_BLOCKING	2	/* about to block on the per-CPU sleep queue */
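
/*
 * Life cycle of l_selflag for one pass through sel_do_scan(): the LWP
 * sets SEL_SCANNING before each scan of its descriptors.  If nothing
 * is ready, it re-checks the state under the selcpu lock, moves to
 * SEL_BLOCKING and sleeps.  selnotify() resets the state to SEL_RESET,
 * which either wakes the sleeper or, if the LWP has not yet blocked,
 * causes the locked re-check to fail so that the descriptors are
 * scanned again.
 */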

/* Per-CPU state for select()/poll(). */
#if MAXCPUS > 32
#error adjust this code
#endif
typedef struct selcpu {
	kmutex_t	*sc_lock;	/* lock on sc_sleepq and sc_ncoll */
	sleepq_t	sc_sleepq;	/* LWPs blocked in select()/poll() */
	int		sc_ncoll;	/* collision count, see selnotify() */
	uint32_t	sc_mask;	/* this CPU's bit in sel_collision */
} selcpu_t;

static inline int	selscan(char *, u_int, register_t *);
static inline int	pollscan(struct pollfd *, u_int, register_t *);
static void		selclear(void);

/* Sync object for LWPs sleeping on a select CPU's sleep queue. */
static syncobj_t select_sobj = {
	SOBJ_SLEEPQ_FIFO,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;
		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}

/*
 * sel_do_scan: common code to perform the scan on descriptors.
 */
static int
sel_do_scan(void *fds, u_int nfds, struct timespec *ts, sigset_t *mask,
    register_t *retval, int selpoll)
{
	lwp_t		* const l = curlwp;
	proc_t		* const p = l->l_proc;
	selcpu_t	*sc;
	kmutex_t	*lock;
	sigset_t	oldmask;
	struct timespec	sleepts;
	int		error, timo;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		return EINVAL;
	}

	if (__predict_false(mask)) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(p->p_lock);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(p->p_lock);
	} else {
		/* XXXgcc */
		oldmask = l->l_sigmask;
	}

	sc = curcpu()->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		int ncoll;

		/*
		 * No need to lock.  If this is overwritten by another value
		 * while scanning, we will retry below.  We only need to see
		 * exact state from the descriptors that we are about to poll,
		 * and lock activity resulting from fo_poll is enough to
		 * provide an up to date value for new polling activity.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = sc->sc_ncoll;

		if (selpoll) {
			error = selscan((char *)fds, nfds, retval);
		} else {
			error = pollscan((struct pollfd *)fds, nfds, retval);
		}

		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (__predict_false(mask)) {
		mutex_enter(p->p_lock);
		l->l_sigmask = oldmask;
		mutex_exit(p->p_lock);
	}

	/* select and poll are not restarted after signals... */
	if (error == ERESTART)
		return EINTR;
	if (error == EWOULDBLOCK)
		return 0;
	return error;
}

int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
    fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	char		*bits;
	int		error, nf;
	size_t		ni;

	if (nd < 0)
		return (EINVAL);
	nf = curlwp->l_fd->fd_dt->dt_nfiles;
	if (nd > nf) {
		/* forgiving; slightly wrong */
		nd = nf;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits)) {
		bits = kmem_alloc(ni * 6, KM_SLEEP);
		if (bits == NULL)
			return ENOMEM;
	} else
		bits = smallbits;
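
	/*
	 * Layout of the 'bits' buffer: six arrays of ni bytes each.
	 * Slots 0-2 hold the caller's in/ou/ex input sets, copied in
	 * below; selscan() writes its results into slots 3-5, which
	 * are copied back out to userland on success.
	 */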
#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto fail;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	error = sel_do_scan(bits, nd, ts, mask, retval, 1);
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
 fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

static inline int
selscan(char *bits, u_int nfd, register_t *retval)
{
	/* Poll events checked for each of the in/ou/ex input sets. */
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	fd_mask *ibitp, *obitp;
	int msk, i, j, fd, ni, n;
	fd_mask ibits, obits;
	file_t *fp;

	ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
	ibitp = (fd_mask *)(bits + ni * 0);
	obitp = (fd_mask *)(bits + ni * 3);
	n = 0;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
					obits |= (1 << j);
					n++;
				}
				fd_putfile(fd);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timespec	ats, *ts = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}
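
/*
 * Worked example of the conversion above: a userland poll(fds, nfds,
 * 1500) arrives with timeout == 1500 (milliseconds) and becomes
 * ts = { .tv_sec = 1, .tv_nsec = 500000000 }.  A timeout of INFTIM
 * (-1) leaves ts == NULL, so sel_do_scan() blocks indefinitely.
 */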

/*
 * Poll system call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}

int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd	smallfds[32];
	struct pollfd	*fds;
	int		error;
	size_t		ni;

	if (nfds > 1000 + curlwp->l_fd->fd_dt->dt_nfiles) {
		/*
		 * Either the user passed in a very sparse 'fds' or junk!
		 * The kmem_alloc() call below would be bad news.
		 * We could process the 'fds' array in chunks, but that
		 * is a lot of code that isn't normally useful.
		 * (Or just move the copyin/out into pollscan().)
		 * Historically the code silently truncated 'fds' to
		 * dt_nfiles entries - but that does cause issues.
		 */
		return EINVAL;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds)) {
		fds = kmem_alloc(ni, KM_SLEEP);
		if (fds == NULL)
			return ENOMEM;
	} else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto fail;

	error = sel_do_scan(fds, nfds, ts, mask, retval, 0);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
 fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static inline int
pollscan(struct pollfd *fds, u_int nfd, register_t *retval)
{
	int	i, n;
	file_t	*fp;

	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			fds->revents = POLLNVAL;
			n++;
		} else {
			fds->revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			if (fds->revents != 0)
				n++;
			fd_putfile(fds->fd);
		}
	}
	*retval = n;
	return (0);
}

/*
 * Generic poll routine for devices that are always ready.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting sel_do_scan().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become NULL due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions are true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcpu_t	*sc;
	lwp_t		*other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcpu;
	other = sip->sel_lwp;

	if (other == selector) {
		/* `selector' has already claimed it. */
		KASSERT(sip->sel_cpu == sc);
	} else if (other == NULL) {
		/*
		 * First named waiter, although there may be unnamed
		 * waiters (collisions).  Issue a memory barrier to
		 * ensure that we access sel_lwp (above) before other
		 * fields - this guards against a call to selclear().
		 */
		membar_enter();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Replace selinfo's lock with our chosen CPU's lock. */
		sip->sel_cpu = sc;
	} else {
		/* Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cpu != NULL);
	}
}
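
/*
 * A sketch of the consumer side, not part of this file: a driver's
 * poll routine typically calls selrecord() when no data is ready.
 * The names foo_poll, foo_softc, sc_lock, sc_havedata and sc_rsel
 * below are hypothetical.
 *
 *	int
 *	foo_poll(dev_t dev, int events, lwp_t *l)
 *	{
 *		struct foo_softc *sc = device_lookup_private(...);
 *		int revents = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		if ((events & (POLLIN | POLLRDNORM)) != 0) {
 *			if (sc->sc_havedata)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(l, &sc->sc_rsel);
 *		}
 *		mutex_exit(&sc->sc_lock);
 *		return revents;
 *	}
 *
 * The matching wakeup side calls selnotify() under the same lock;
 * see the sketch after selnotify() below.
 */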

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcpu's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in sel_do_scan().
 *
 * sip->sel_cpu cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcpu_t	*sc;
	uint32_t	mask;
	int		index, oflag;
	lwp_t		*l;
	kmutex_t	*lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		sc = sip->sel_cpu;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			l = sip->sel_lwp;
			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			oflag = l->l_selflag;
			l->l_selflag = SEL_RESET;
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				sleepq_unsleep(l, false);
			}
		}
		mutex_spin_exit(lock);
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs(mask) - 1;
			mask &= ~(1 << index);
			sc = cpu_lookup(index)->ci_data.cpu_selcpu;
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}
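
/*
 * A sketch of the producer side, matching the selrecord() example
 * above (names again hypothetical): an interrupt or soft interrupt
 * handler makes data available and wakes any waiters, holding the
 * same lock that the poll routine held across selrecord():
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_havedata = true;
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 *	mutex_exit(&sc->sc_lock);
 */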

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo	*sip, *next;
	selcpu_t	*sc;
	lwp_t		*l;
	kmutex_t	*lock;

	l = curlwp;
	sc = l->l_selcpu;
	lock = sc->sc_lock;

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cpu == l->l_selcpu);
		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		membar_exit();
		/* Release the record for another named waiter to use. */
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcpu_t *sc;

	/*
	 * Over-allocate and round the pointer up so that the per-CPU
	 * state sits alone on its own cache line(s), avoiding false
	 * sharing between CPUs.
	 */
	sc = kmem_alloc(roundup2(sizeof(selcpu_t), coherency_unit) +
	    coherency_unit, KM_SLEEP);
	sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
	sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	sleepq_init(&sc->sc_sleepq);
	sc->sc_ncoll = 0;
	sc->sc_mask = (1 << cpu_index(ci));
	ci->ci_data.cpu_selcpu = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need guard against a call to selclear()
 * by a thread exiting sel_do_scan().  The caller has prevented further
 * references being made to the selinfo record via selrecord(), and it
 * won't call selnotify() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcpu_t	*sc;
	kmutex_t	*lock;
	lwp_t		*l;

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcpu pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cpu != NULL);
	sc = sip->sel_cpu;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcpu == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
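
/*
 * A minimal lifecycle sketch (hypothetical driver code): selinit()
 * is called before the record can ever be polled, typically at
 * attach time, and seldestroy() once no further selrecord() or
 * selnotify() calls can occur, typically at detach time:
 *
 *	foo_attach(...) { ... selinit(&sc->sc_rsel); ... }
 *	foo_detach(...) { ... seldestroy(&sc->sc_rsel); ... }
 */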

int
pollsock(struct socket *so, const struct timespec *tsp, int events)
{
	int		ncoll, error, timo;
	struct timespec	sleepts, ts;
	selcpu_t	*sc;
	lwp_t		*l;
	kmutex_t	*lock;

	timo = 0;
	if (tsp != NULL) {
		ts = *tsp;
		if (inittimeleft(&ts, &sleepts) == -1)
			return EINVAL;
	}

	l = curlwp;
	sc = l->l_cpu->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	error = 0;
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;
		if (sopoll(so, events) != 0)
			break;
		if (tsp && (timo = gettimeleft(&ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "pollsock", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	return (error);
}