1 1.68 khorben /* $NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $ */ 2 1.1 ad 3 1.1 ad /*- 4 1.62 ad * Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023 5 1.62 ad * The NetBSD Foundation, Inc. 6 1.1 ad * All rights reserved. 7 1.1 ad * 8 1.1 ad * This code is derived from software contributed to The NetBSD Foundation 9 1.23 rmind * by Andrew Doran and Mindaugas Rasiukevicius. 10 1.1 ad * 11 1.1 ad * Redistribution and use in source and binary forms, with or without 12 1.1 ad * modification, are permitted provided that the following conditions 13 1.1 ad * are met: 14 1.1 ad * 1. Redistributions of source code must retain the above copyright 15 1.1 ad * notice, this list of conditions and the following disclaimer. 16 1.1 ad * 2. Redistributions in binary form must reproduce the above copyright 17 1.1 ad * notice, this list of conditions and the following disclaimer in the 18 1.1 ad * documentation and/or other materials provided with the distribution. 19 1.1 ad * 20 1.1 ad * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 1.1 ad * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 1.1 ad * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 1.1 ad * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 1.1 ad * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 1.1 ad * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 1.1 ad * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 1.1 ad * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 1.1 ad * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 1.1 ad * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 1.1 ad * POSSIBILITY OF SUCH DAMAGE. 31 1.1 ad */ 32 1.1 ad 33 1.1 ad /* 34 1.1 ad * Copyright (c) 1982, 1986, 1989, 1993 35 1.1 ad * The Regents of the University of California. All rights reserved. 36 1.1 ad * (c) UNIX System Laboratories, Inc. 37 1.1 ad * All or some portions of this file are derived from material licensed 38 1.1 ad * to the University of California by American Telephone and Telegraph 39 1.1 ad * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 1.1 ad * the permission of UNIX System Laboratories, Inc. 41 1.1 ad * 42 1.1 ad * Redistribution and use in source and binary forms, with or without 43 1.1 ad * modification, are permitted provided that the following conditions 44 1.1 ad * are met: 45 1.1 ad * 1. Redistributions of source code must retain the above copyright 46 1.1 ad * notice, this list of conditions and the following disclaimer. 47 1.1 ad * 2. Redistributions in binary form must reproduce the above copyright 48 1.1 ad * notice, this list of conditions and the following disclaimer in the 49 1.1 ad * documentation and/or other materials provided with the distribution. 50 1.1 ad * 3. Neither the name of the University nor the names of its contributors 51 1.1 ad * may be used to endorse or promote products derived from this software 52 1.1 ad * without specific prior written permission. 53 1.1 ad * 54 1.1 ad * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 1.1 ad * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 1.1 ad * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 1.1 ad * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 1.1 ad * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 1.1 ad * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 1.1 ad * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 1.1 ad * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 1.1 ad * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 1.1 ad * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 1.1 ad * SUCH DAMAGE. 65 1.1 ad * 66 1.1 ad * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 67 1.1 ad */ 68 1.1 ad 69 1.1 ad /* 70 1.21 rmind * System calls of synchronous I/O multiplexing subsystem. 71 1.21 rmind * 72 1.21 rmind * Locking 73 1.21 rmind * 74 1.22 ad * Two locks are used: <object-lock> and selcluster_t::sc_lock. 75 1.21 rmind * 76 1.21 rmind * The <object-lock> might be a device driver or another subsystem, e.g. 77 1.21 rmind * socket or pipe. This lock is not exported, and thus invisible to this 78 1.21 rmind * subsystem. Mainly, synchronisation between selrecord() and selnotify() 79 1.21 rmind * routines depends on this lock, as it will be described in the comments. 80 1.21 rmind * 81 1.21 rmind * Lock order 82 1.21 rmind * 83 1.21 rmind * <object-lock> -> 84 1.22 ad * selcluster_t::sc_lock 85 1.1 ad */ 86 1.1 ad 87 1.1 ad #include <sys/cdefs.h> 88 1.68 khorben __KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.68 2024/11/26 23:10:15 khorben Exp $"); 89 1.1 ad 90 1.1 ad #include <sys/param.h> 91 1.66 riastrad 92 1.66 riastrad #include <sys/atomic.h> 93 1.66 riastrad #include <sys/bitops.h> 94 1.66 riastrad #include <sys/cpu.h> 95 1.66 riastrad #include <sys/file.h> 96 1.1 ad #include <sys/filedesc.h> 97 1.1 ad #include <sys/kernel.h> 98 1.29 rmind #include <sys/lwp.h> 99 1.66 riastrad #include <sys/mount.h> 100 1.1 ad #include <sys/poll.h> 101 1.66 riastrad #include <sys/proc.h> 102 1.66 riastrad #include <sys/signalvar.h> 103 1.66 riastrad #include <sys/sleepq.h> 104 1.66 riastrad #include <sys/socketvar.h> 105 1.66 riastrad #include <sys/socketvar.h> 106 1.66 riastrad #include <sys/syncobj.h> 107 1.1 ad #include <sys/syscallargs.h> 108 1.36 rmind #include <sys/sysctl.h> 109 1.66 riastrad #include <sys/systm.h> 110 1.66 riastrad #include <sys/uio.h> 111 1.1 ad 112 1.1 ad /* Flags for lwp::l_selflag. */ 113 1.1 ad #define SEL_RESET 0 /* awoken, interrupted, or not yet polling */ 114 1.1 ad #define SEL_SCANNING 1 /* polling descriptors */ 115 1.23 rmind #define SEL_BLOCKING 2 /* blocking and waiting for event */ 116 1.23 rmind #define SEL_EVENT 3 /* interrupted, events set directly */ 117 1.23 rmind 118 1.22 ad /* 119 1.22 ad * Per-cluster state for select()/poll(). For a system with fewer 120 1.50 ad * than 64 CPUs, this gives us per-CPU clusters. 
121 1.22 ad */ 122 1.50 ad #define SELCLUSTERS 64 123 1.22 ad #define SELCLUSTERMASK (SELCLUSTERS - 1) 124 1.22 ad 125 1.22 ad typedef struct selcluster { 126 1.13 ad kmutex_t *sc_lock; 127 1.1 ad sleepq_t sc_sleepq; 128 1.49 ad uint64_t sc_mask; 129 1.1 ad int sc_ncoll; 130 1.22 ad } selcluster_t; 131 1.1 ad 132 1.23 rmind static inline int selscan(char *, const int, const size_t, register_t *); 133 1.23 rmind static inline int pollscan(struct pollfd *, const int, register_t *); 134 1.19 rmind static void selclear(void); 135 1.1 ad 136 1.23 rmind static const int sel_flag[] = { 137 1.23 rmind POLLRDNORM | POLLHUP | POLLERR, 138 1.23 rmind POLLWRNORM | POLLHUP | POLLERR, 139 1.23 rmind POLLRDBAND 140 1.23 rmind }; 141 1.23 rmind 142 1.53 ad /* 143 1.53 ad * LWPs are woken using the sleep queue only due to a collision, the case 144 1.53 ad * with the maximum Suck Factor. Save the cost of sorting for named waiters 145 1.53 ad * by inserting in LIFO order. In the future it would be preferable to not 146 1.53 ad * enqueue LWPs at all, unless subject to a collision. 147 1.53 ad */ 148 1.52 ad syncobj_t select_sobj = { 149 1.61 riastrad .sobj_name = "select", 150 1.53 ad .sobj_flag = SOBJ_SLEEPQ_LIFO, 151 1.62 ad .sobj_boostpri = PRI_KERNEL, 152 1.41 ozaki .sobj_unsleep = sleepq_unsleep, 153 1.41 ozaki .sobj_changepri = sleepq_changepri, 154 1.41 ozaki .sobj_lendpri = sleepq_lendpri, 155 1.41 ozaki .sobj_owner = syncobj_noowner, 156 1.1 ad }; 157 1.1 ad 158 1.23 rmind static selcluster_t *selcluster[SELCLUSTERS] __read_mostly; 159 1.36 rmind static int direct_select __read_mostly = 0; 160 1.22 ad 161 1.49 ad /* Operations: either select() or poll(). */ 162 1.49 ad const char selop_select[] = "select"; 163 1.49 ad const char selop_poll[] = "poll"; 164 1.49 ad 165 1.1 ad /* 166 1.1 ad * Select system call. 
167 1.1 ad */ 168 1.1 ad int 169 1.12 christos sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap, 170 1.12 christos register_t *retval) 171 1.1 ad { 172 1.1 ad /* { 173 1.1 ad syscallarg(int) nd; 174 1.1 ad syscallarg(fd_set *) in; 175 1.1 ad syscallarg(fd_set *) ou; 176 1.1 ad syscallarg(fd_set *) ex; 177 1.1 ad syscallarg(const struct timespec *) ts; 178 1.1 ad syscallarg(sigset_t *) mask; 179 1.1 ad } */ 180 1.14 christos struct timespec ats, *ts = NULL; 181 1.1 ad sigset_t amask, *mask = NULL; 182 1.1 ad int error; 183 1.1 ad 184 1.1 ad if (SCARG(uap, ts)) { 185 1.1 ad error = copyin(SCARG(uap, ts), &ats, sizeof(ats)); 186 1.1 ad if (error) 187 1.1 ad return error; 188 1.14 christos ts = &ats; 189 1.1 ad } 190 1.1 ad if (SCARG(uap, mask) != NULL) { 191 1.1 ad error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); 192 1.1 ad if (error) 193 1.1 ad return error; 194 1.1 ad mask = &amask; 195 1.1 ad } 196 1.1 ad 197 1.19 rmind return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), 198 1.14 christos SCARG(uap, ou), SCARG(uap, ex), ts, mask); 199 1.1 ad } 200 1.1 ad 201 1.1 ad int 202 1.12 christos sys___select50(struct lwp *l, const struct sys___select50_args *uap, 203 1.12 christos register_t *retval) 204 1.1 ad { 205 1.1 ad /* { 206 1.1 ad syscallarg(int) nd; 207 1.1 ad syscallarg(fd_set *) in; 208 1.1 ad syscallarg(fd_set *) ou; 209 1.1 ad syscallarg(fd_set *) ex; 210 1.1 ad syscallarg(struct timeval *) tv; 211 1.1 ad } */ 212 1.14 christos struct timeval atv; 213 1.14 christos struct timespec ats, *ts = NULL; 214 1.1 ad int error; 215 1.1 ad 216 1.1 ad if (SCARG(uap, tv)) { 217 1.14 christos error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv)); 218 1.1 ad if (error) 219 1.1 ad return error; 220 1.48 kamil 221 1.48 kamil if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) 222 1.48 kamil return EINVAL; 223 1.48 kamil 224 1.14 christos TIMEVAL_TO_TIMESPEC(&atv, &ats); 225 1.14 christos ts = &ats; 226 1.1 ad } 227 1.1 ad 228 1.19 rmind return selcommon(retval, SCARG(uap, nd), SCARG(uap, in), 229 1.14 christos SCARG(uap, ou), SCARG(uap, ex), ts, NULL); 230 1.1 ad } 231 1.1 ad 232 1.17 rmind /* 233 1.17 rmind * sel_do_scan: common code to perform the scan on descriptors. 234 1.17 rmind */ 235 1.17 rmind static int 236 1.49 ad sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni, 237 1.23 rmind struct timespec *ts, sigset_t *mask, register_t *retval) 238 1.1 ad { 239 1.17 rmind lwp_t * const l = curlwp; 240 1.22 ad selcluster_t *sc; 241 1.13 ad kmutex_t *lock; 242 1.17 rmind struct timespec sleepts; 243 1.17 rmind int error, timo; 244 1.1 ad 245 1.1 ad timo = 0; 246 1.14 christos if (ts && inittimeleft(ts, &sleepts) == -1) { 247 1.17 rmind return EINVAL; 248 1.1 ad } 249 1.1 ad 250 1.32 christos if (__predict_false(mask)) 251 1.31 christos sigsuspendsetup(l, mask); 252 1.1 ad 253 1.49 ad /* 254 1.49 ad * We may context switch during or at any time after picking a CPU 255 1.49 ad * and cluster to associate with, but it doesn't matter. In the 256 1.49 ad * unlikely event we migrate elsewhere all we risk is a little lock 257 1.49 ad * contention; correctness is not sacrificed. 
258 1.49 ad */ 259 1.22 ad sc = curcpu()->ci_data.cpu_selcluster; 260 1.13 ad lock = sc->sc_lock; 261 1.22 ad l->l_selcluster = sc; 262 1.49 ad 263 1.49 ad if (opname == selop_select) { 264 1.30 rmind l->l_selbits = fds; 265 1.23 rmind l->l_selni = ni; 266 1.23 rmind } else { 267 1.23 rmind l->l_selbits = NULL; 268 1.23 rmind } 269 1.34 hannken 270 1.1 ad for (;;) { 271 1.17 rmind int ncoll; 272 1.17 rmind 273 1.34 hannken SLIST_INIT(&l->l_selwait); 274 1.34 hannken l->l_selret = 0; 275 1.34 hannken 276 1.1 ad /* 277 1.17 rmind * No need to lock. If this is overwritten by another value 278 1.17 rmind * while scanning, we will retry below. We only need to see 279 1.17 rmind * exact state from the descriptors that we are about to poll, 280 1.17 rmind * and lock activity resulting from fo_poll is enough to 281 1.17 rmind * provide an up to date value for new polling activity. 282 1.1 ad */ 283 1.49 ad if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) { 284 1.49 ad /* Non-blocking: no need for selrecord()/selclear() */ 285 1.49 ad l->l_selflag = SEL_RESET; 286 1.49 ad } else { 287 1.49 ad l->l_selflag = SEL_SCANNING; 288 1.49 ad } 289 1.1 ad ncoll = sc->sc_ncoll; 290 1.59 riastrad membar_release(); 291 1.1 ad 292 1.49 ad if (opname == selop_select) { 293 1.23 rmind error = selscan((char *)fds, nf, ni, retval); 294 1.17 rmind } else { 295 1.23 rmind error = pollscan((struct pollfd *)fds, nf, retval); 296 1.17 rmind } 297 1.1 ad if (error || *retval) 298 1.1 ad break; 299 1.14 christos if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0) 300 1.1 ad break; 301 1.23 rmind /* 302 1.23 rmind * Acquire the lock and perform the (re)checks. Note, if 303 1.57 andvar * collision has occurred, then our state does not matter, 304 1.23 rmind * as we must perform re-scan. Therefore, check it first. 305 1.23 rmind */ 306 1.23 rmind state_check: 307 1.13 ad mutex_spin_enter(lock); 308 1.23 rmind if (__predict_false(sc->sc_ncoll != ncoll)) { 309 1.23 rmind /* Collision: perform re-scan. */ 310 1.23 rmind mutex_spin_exit(lock); 311 1.34 hannken selclear(); 312 1.23 rmind continue; 313 1.23 rmind } 314 1.23 rmind if (__predict_true(l->l_selflag == SEL_EVENT)) { 315 1.57 andvar /* Events occurred, they are set directly. */ 316 1.23 rmind mutex_spin_exit(lock); 317 1.23 rmind break; 318 1.23 rmind } 319 1.23 rmind if (__predict_true(l->l_selflag == SEL_RESET)) { 320 1.57 andvar /* Events occurred, but re-scan is requested. */ 321 1.13 ad mutex_spin_exit(lock); 322 1.34 hannken selclear(); 323 1.1 ad continue; 324 1.1 ad } 325 1.23 rmind /* Nothing happen, therefore - sleep. */ 326 1.1 ad l->l_selflag = SEL_BLOCKING; 327 1.63 ad KASSERT(l->l_blcnt == 0); 328 1.63 ad (void)sleepq_enter(&sc->sc_sleepq, l, lock); 329 1.54 ad sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true); 330 1.63 ad error = sleepq_block(timo, true, &select_sobj, 0); 331 1.23 rmind if (error != 0) { 332 1.1 ad break; 333 1.23 rmind } 334 1.23 rmind /* Awoken: need to check the state. */ 335 1.23 rmind goto state_check; 336 1.1 ad } 337 1.1 ad selclear(); 338 1.1 ad 339 1.34 hannken /* Add direct events if any. */ 340 1.34 hannken if (l->l_selflag == SEL_EVENT) { 341 1.34 hannken KASSERT(l->l_selret != 0); 342 1.34 hannken *retval += l->l_selret; 343 1.34 hannken } 344 1.34 hannken 345 1.33 christos if (__predict_false(mask)) 346 1.33 christos sigsuspendteardown(l); 347 1.33 christos 348 1.20 dsl /* select and poll are not restarted after signals... 
*/ 349 1.20 dsl if (error == ERESTART) 350 1.20 dsl return EINTR; 351 1.20 dsl if (error == EWOULDBLOCK) 352 1.20 dsl return 0; 353 1.17 rmind return error; 354 1.17 rmind } 355 1.17 rmind 356 1.67 kre /* designed to be compatible with FD_SET() FD_ISSET() ... */ 357 1.67 kre static int 358 1.67 kre anyset(void *p, size_t nbits) 359 1.67 kre { 360 1.67 kre size_t nwords; 361 1.67 kre __fd_mask mask; 362 1.67 kre __fd_mask *f = (__fd_mask *)p; 363 1.67 kre 364 1.67 kre nwords = nbits / __NFDBITS; 365 1.67 kre 366 1.67 kre while (nwords-- > 0) 367 1.67 kre if (*f++ != 0) 368 1.67 kre return 1; 369 1.67 kre 370 1.67 kre nbits &= __NFDMASK; 371 1.67 kre if (nbits != 0) { 372 1.67 kre mask = (1U << nbits) - 1; 373 1.67 kre if ((*f & mask) != 0) 374 1.67 kre return 1; 375 1.67 kre } 376 1.67 kre return 0; 377 1.67 kre } 378 1.67 kre 379 1.17 rmind int 380 1.19 rmind selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou, 381 1.19 rmind fd_set *u_ex, struct timespec *ts, sigset_t *mask) 382 1.17 rmind { 383 1.17 rmind char smallbits[howmany(FD_SETSIZE, NFDBITS) * 384 1.17 rmind sizeof(fd_mask) * 6]; 385 1.17 rmind char *bits; 386 1.67 kre int error, nf, fb, db; 387 1.17 rmind size_t ni; 388 1.17 rmind 389 1.17 rmind if (nd < 0) 390 1.67 kre return EINVAL; 391 1.67 kre 392 1.51 riastrad nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles; 393 1.67 kre 394 1.67 kre /* 395 1.67 kre * Don't allow absurdly large numbers of fds to be selected. 396 1.67 kre * (used to silently truncate, naughty naughty, no more ...) 397 1.67 kre * 398 1.68 khorben * The additional FD_SETSIZE allows for cases where the limit 399 1.67 kre * is not a round binary number, but the fd_set wants to 400 1.67 kre * include all the possible fds, as fd_sets are always 401 1.67 kre * multiples of 32 bits (__NFDBITS extra would be enough). 402 1.67 kre * 403 1.67 kre * The first test handles the case where the res limit has been 404 1.67 kre * set lower after some fds were opened, we always allow selecting 405 1.67 kre * up to the highest currently open fd. 406 1.67 kre */ 407 1.67 kre if (nd > nf + FD_SETSIZE && 408 1.67 kre nd > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + FD_SETSIZE) 409 1.67 kre return EINVAL; 410 1.67 kre 411 1.67 kre fb = howmany(nf, __NFDBITS); /* how many fd_masks */ 412 1.67 kre db = howmany(nd, __NFDBITS); 413 1.67 kre 414 1.67 kre if (db > fb) { 415 1.67 kre size_t off; 416 1.67 kre 417 1.67 kre /* 418 1.67 kre * the application wants to supply more fd masks than can 419 1.67 kre * possibly represent valid file descriptors. 420 1.67 kre * 421 1.67 kre * Check the excess fd_masks, if any bits are set in them 422 1.67 kre * that must be an error (cannot represent valid fd). 423 1.67 kre * 424 1.67 kre * Supplying lots of extra cleared fd_masks is dumb, 425 1.67 kre * but harmless, so allow that. 426 1.67 kre */ 427 1.67 kre ni = (db - fb) * sizeof(fd_mask); /* excess bytes */ 428 1.67 kre bits = smallbits; 429 1.67 kre 430 1.67 kre /* skip over the valid fd_masks, those will be checked below */ 431 1.67 kre off = howmany(nf, __NFDBITS) * sizeof(__fd_mask); 432 1.67 kre 433 1.67 kre nd -= fb * NFDBITS; /* the number of excess fds */ 434 1.67 kre 435 1.67 kre #define checkbits(name, o, sz, fds) \ 436 1.67 kre do { \ 437 1.67 kre if (u_ ## name != NULL) { \ 438 1.67 kre error = copyin((char *)u_ ## name + o, \ 439 1.67 kre bits, sz); \ 440 1.67 kre if (error) \ 441 1.67 kre goto fail; \ 442 1.67 kre if (anyset(bits, (fds) ? 
\ 443 1.67 kre (size_t)(fds) : CHAR_BIT * (sz))) { \ 444 1.67 kre error = EBADF; \ 445 1.67 kre goto fail; \ 446 1.67 kre } \ 447 1.67 kre } \ 448 1.67 kre } while (0) 449 1.67 kre 450 1.67 kre while (ni > sizeof(smallbits)) { 451 1.67 kre checkbits(in, off, sizeof(smallbits), 0); 452 1.67 kre checkbits(ou, off, sizeof(smallbits), 0); 453 1.67 kre checkbits(ex, off, sizeof(smallbits), 0); 454 1.67 kre 455 1.67 kre off += sizeof(smallbits); 456 1.67 kre ni -= sizeof(smallbits); 457 1.67 kre nd -= sizeof(smallbits) * CHAR_BIT; 458 1.67 kre } 459 1.67 kre checkbits(in, off, ni, nd); 460 1.67 kre checkbits(ou, off, ni, nd); 461 1.67 kre checkbits(ex, off, ni, nd); 462 1.67 kre #undef checkbits 463 1.67 kre 464 1.67 kre db = fb; /* now just check the plausible fds */ 465 1.67 kre nd = db * __NFDBITS; 466 1.17 rmind } 467 1.67 kre 468 1.67 kre ni = db * sizeof(fd_mask); 469 1.40 chs if (ni * 6 > sizeof(smallbits)) 470 1.17 rmind bits = kmem_alloc(ni * 6, KM_SLEEP); 471 1.40 chs else 472 1.17 rmind bits = smallbits; 473 1.17 rmind 474 1.17 rmind #define getbits(name, x) \ 475 1.67 kre do { \ 476 1.67 kre if (u_ ## name) { \ 477 1.67 kre error = copyin(u_ ## name, bits + ni * x, ni); \ 478 1.67 kre if (error) \ 479 1.67 kre goto fail; \ 480 1.67 kre } else \ 481 1.67 kre memset(bits + ni * x, 0, ni); \ 482 1.67 kre } while (0) 483 1.67 kre 484 1.17 rmind getbits(in, 0); 485 1.17 rmind getbits(ou, 1); 486 1.17 rmind getbits(ex, 2); 487 1.17 rmind #undef getbits 488 1.1 ad 489 1.49 ad error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval); 490 1.67 kre 491 1.67 kre #define copyback(name, x) \ 492 1.67 kre do { \ 493 1.67 kre if (error == 0 && u_ ## name != NULL) \ 494 1.67 kre error = copyout(bits + ni * x, \ 495 1.67 kre u_ ## name, ni); \ 496 1.67 kre } while (0) 497 1.67 kre 498 1.67 kre copyback(in, 3); 499 1.67 kre copyback(ou, 4); 500 1.67 kre copyback(ex, 5); 501 1.67 kre #undef copyback 502 1.67 kre 503 1.20 dsl fail: 504 1.1 ad if (bits != smallbits) 505 1.1 ad kmem_free(bits, ni * 6); 506 1.1 ad return (error); 507 1.1 ad } 508 1.1 ad 509 1.19 rmind static inline int 510 1.23 rmind selscan(char *bits, const int nfd, const size_t ni, register_t *retval) 511 1.1 ad { 512 1.17 rmind fd_mask *ibitp, *obitp; 513 1.23 rmind int msk, i, j, fd, n; 514 1.1 ad file_t *fp; 515 1.49 ad lwp_t *l; 516 1.1 ad 517 1.17 rmind ibitp = (fd_mask *)(bits + ni * 0); 518 1.17 rmind obitp = (fd_mask *)(bits + ni * 3); 519 1.1 ad n = 0; 520 1.49 ad l = curlwp; 521 1.17 rmind 522 1.34 hannken memset(obitp, 0, ni * 3); 523 1.1 ad for (msk = 0; msk < 3; msk++) { 524 1.1 ad for (i = 0; i < nfd; i += NFDBITS) { 525 1.23 rmind fd_mask ibits, obits; 526 1.23 rmind 527 1.35 hannken ibits = *ibitp; 528 1.1 ad obits = 0; 529 1.1 ad while ((j = ffs(ibits)) && (fd = i + --j) < nfd) { 530 1.47 msaitoh ibits &= ~(1U << j); 531 1.1 ad if ((fp = fd_getfile(fd)) == NULL) 532 1.1 ad return (EBADF); 533 1.23 rmind /* 534 1.23 rmind * Setup an argument to selrecord(), which is 535 1.23 rmind * a file descriptor number. 536 1.23 rmind */ 537 1.49 ad l->l_selrec = fd; 538 1.23 rmind if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) { 539 1.49 ad if (!direct_select) { 540 1.49 ad /* 541 1.49 ad * Have events: do nothing in 542 1.49 ad * selrecord(). 
543 1.49 ad */ 544 1.49 ad l->l_selflag = SEL_RESET; 545 1.49 ad } 546 1.47 msaitoh obits |= (1U << j); 547 1.1 ad n++; 548 1.1 ad } 549 1.1 ad fd_putfile(fd); 550 1.1 ad } 551 1.34 hannken if (obits != 0) { 552 1.36 rmind if (direct_select) { 553 1.36 rmind kmutex_t *lock; 554 1.49 ad lock = l->l_selcluster->sc_lock; 555 1.35 hannken mutex_spin_enter(lock); 556 1.36 rmind *obitp |= obits; 557 1.35 hannken mutex_spin_exit(lock); 558 1.36 rmind } else { 559 1.36 rmind *obitp |= obits; 560 1.36 rmind } 561 1.34 hannken } 562 1.35 hannken ibitp++; 563 1.34 hannken obitp++; 564 1.1 ad } 565 1.1 ad } 566 1.1 ad *retval = n; 567 1.1 ad return (0); 568 1.1 ad } 569 1.1 ad 570 1.1 ad /* 571 1.1 ad * Poll system call. 572 1.1 ad */ 573 1.1 ad int 574 1.1 ad sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval) 575 1.1 ad { 576 1.1 ad /* { 577 1.1 ad syscallarg(struct pollfd *) fds; 578 1.1 ad syscallarg(u_int) nfds; 579 1.1 ad syscallarg(int) timeout; 580 1.1 ad } */ 581 1.14 christos struct timespec ats, *ts = NULL; 582 1.1 ad 583 1.1 ad if (SCARG(uap, timeout) != INFTIM) { 584 1.14 christos ats.tv_sec = SCARG(uap, timeout) / 1000; 585 1.14 christos ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000; 586 1.14 christos ts = &ats; 587 1.1 ad } 588 1.1 ad 589 1.19 rmind return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL); 590 1.1 ad } 591 1.1 ad 592 1.1 ad /* 593 1.1 ad * Poll system call. 594 1.1 ad */ 595 1.1 ad int 596 1.12 christos sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap, 597 1.12 christos register_t *retval) 598 1.1 ad { 599 1.1 ad /* { 600 1.1 ad syscallarg(struct pollfd *) fds; 601 1.1 ad syscallarg(u_int) nfds; 602 1.1 ad syscallarg(const struct timespec *) ts; 603 1.1 ad syscallarg(const sigset_t *) mask; 604 1.1 ad } */ 605 1.14 christos struct timespec ats, *ts = NULL; 606 1.1 ad sigset_t amask, *mask = NULL; 607 1.1 ad int error; 608 1.1 ad 609 1.1 ad if (SCARG(uap, ts)) { 610 1.1 ad error = copyin(SCARG(uap, ts), &ats, sizeof(ats)); 611 1.1 ad if (error) 612 1.1 ad return error; 613 1.14 christos ts = &ats; 614 1.1 ad } 615 1.1 ad if (SCARG(uap, mask)) { 616 1.1 ad error = copyin(SCARG(uap, mask), &amask, sizeof(amask)); 617 1.1 ad if (error) 618 1.1 ad return error; 619 1.1 ad mask = &amask; 620 1.1 ad } 621 1.1 ad 622 1.19 rmind return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask); 623 1.1 ad } 624 1.1 ad 625 1.1 ad int 626 1.19 rmind pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds, 627 1.14 christos struct timespec *ts, sigset_t *mask) 628 1.1 ad { 629 1.11 yamt struct pollfd smallfds[32]; 630 1.11 yamt struct pollfd *fds; 631 1.17 rmind int error; 632 1.20 dsl size_t ni; 633 1.1 ad 634 1.45 christos if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) { 635 1.20 dsl /* 636 1.43 christos * Prevent userland from causing over-allocation. 637 1.43 christos * Raising the default limit too high can still cause 638 1.43 christos * a lot of memory to be allocated, but this also means 639 1.43 christos * that the file descriptor array will also be large. 640 1.43 christos * 641 1.43 christos * To reduce the memory requirements here, we could 642 1.43 christos * process the 'fds' array in chunks, but that 643 1.20 dsl * is a lot of code that isn't normally useful. 644 1.20 dsl * (Or just move the copyin/out into pollscan().) 645 1.43 christos * 646 1.20 dsl * Historically the code silently truncated 'fds' to 647 1.20 dsl * dt_nfiles entries - but that does cause issues. 
648 1.44 christos * 649 1.44 christos * Using the max limit equivalent to sysctl 650 1.44 christos * kern.maxfiles is the moral equivalent of OPEN_MAX 651 1.45 christos * as specified by POSIX. 652 1.45 christos * 653 1.45 christos * We add a slop of 1000 in case the resource limit was 654 1.45 christos * changed after opening descriptors or the same descriptor 655 1.45 christos * was specified more than once. 656 1.20 dsl */ 657 1.20 dsl return EINVAL; 658 1.1 ad } 659 1.1 ad ni = nfds * sizeof(struct pollfd); 660 1.40 chs if (ni > sizeof(smallfds)) 661 1.11 yamt fds = kmem_alloc(ni, KM_SLEEP); 662 1.40 chs else 663 1.11 yamt fds = smallfds; 664 1.1 ad 665 1.11 yamt error = copyin(u_fds, fds, ni); 666 1.1 ad if (error) 667 1.20 dsl goto fail; 668 1.1 ad 669 1.49 ad error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval); 670 1.1 ad if (error == 0) 671 1.11 yamt error = copyout(fds, u_fds, ni); 672 1.20 dsl fail: 673 1.11 yamt if (fds != smallfds) 674 1.11 yamt kmem_free(fds, ni); 675 1.1 ad return (error); 676 1.1 ad } 677 1.1 ad 678 1.19 rmind static inline int 679 1.23 rmind pollscan(struct pollfd *fds, const int nfd, register_t *retval) 680 1.1 ad { 681 1.1 ad file_t *fp; 682 1.34 hannken int i, n = 0, revents; 683 1.1 ad 684 1.1 ad for (i = 0; i < nfd; i++, fds++) { 685 1.34 hannken fds->revents = 0; 686 1.1 ad if (fds->fd < 0) { 687 1.34 hannken revents = 0; 688 1.1 ad } else if ((fp = fd_getfile(fds->fd)) == NULL) { 689 1.34 hannken revents = POLLNVAL; 690 1.1 ad } else { 691 1.23 rmind /* 692 1.23 rmind * Perform poll: registers select request or returns 693 1.23 rmind * the events which are set. Setup an argument for 694 1.23 rmind * selrecord(), which is a pointer to struct pollfd. 695 1.23 rmind */ 696 1.23 rmind curlwp->l_selrec = (uintptr_t)fds; 697 1.34 hannken revents = (*fp->f_ops->fo_poll)(fp, 698 1.1 ad fds->events | POLLERR | POLLHUP); 699 1.1 ad fd_putfile(fds->fd); 700 1.1 ad } 701 1.34 hannken if (revents) { 702 1.49 ad if (!direct_select) { 703 1.49 ad /* Have events: do nothing in selrecord(). */ 704 1.49 ad curlwp->l_selflag = SEL_RESET; 705 1.49 ad } 706 1.34 hannken fds->revents = revents; 707 1.34 hannken n++; 708 1.34 hannken } 709 1.1 ad } 710 1.1 ad *retval = n; 711 1.1 ad return (0); 712 1.1 ad } 713 1.1 ad 714 1.1 ad int 715 1.1 ad seltrue(dev_t dev, int events, lwp_t *l) 716 1.1 ad { 717 1.1 ad 718 1.1 ad return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 719 1.1 ad } 720 1.1 ad 721 1.1 ad /* 722 1.1 ad * Record a select request. Concurrency issues: 723 1.1 ad * 724 1.1 ad * The caller holds the same lock across calls to selrecord() and 725 1.4 yamt * selnotify(), so we don't need to consider a concurrent wakeup 726 1.1 ad * while in this routine. 727 1.1 ad * 728 1.1 ad * The only activity we need to guard against is selclear(), called by 729 1.17 rmind * another thread that is exiting sel_do_scan(). 730 1.1 ad * `sel_lwp' can only become non-NULL while the caller's lock is held, 731 1.1 ad * so it cannot become non-NULL due to a change made by another thread 732 1.1 ad * while we are in this routine. It can only become _NULL_ due to a 733 1.1 ad * call to selclear(). 734 1.1 ad * 735 1.1 ad * If it is non-NULL and != selector there is the potential for 736 1.1 ad * selclear() to be called by another thread. If either of those 737 1.1 ad * conditions are true, we're not interested in touching the `named 738 1.1 ad * waiter' part of the selinfo record because we need to record a 739 1.1 ad * collision. 
Hence there is no need for additional locking in this 740 1.1 ad * routine. 741 1.1 ad */ 742 1.1 ad void 743 1.1 ad selrecord(lwp_t *selector, struct selinfo *sip) 744 1.1 ad { 745 1.22 ad selcluster_t *sc; 746 1.1 ad lwp_t *other; 747 1.1 ad 748 1.1 ad KASSERT(selector == curlwp); 749 1.1 ad 750 1.22 ad sc = selector->l_selcluster; 751 1.1 ad other = sip->sel_lwp; 752 1.1 ad 753 1.49 ad if (selector->l_selflag == SEL_RESET) { 754 1.49 ad /* 0. We're not going to block - will poll again if needed. */ 755 1.49 ad } else if (other == selector) { 756 1.23 rmind /* 1. We (selector) already claimed to be the first LWP. */ 757 1.37 riastrad KASSERT(sip->sel_cluster == sc); 758 1.1 ad } else if (other == NULL) { 759 1.1 ad /* 760 1.23 rmind * 2. No first LWP, therefore we (selector) are the first. 761 1.23 rmind * 762 1.23 rmind * There may be unnamed waiters (collisions). Issue a memory 763 1.23 rmind * barrier to ensure that we access sel_lwp (above) before 764 1.23 rmind * other fields - this guards against a call to selclear(). 765 1.1 ad */ 766 1.59 riastrad membar_acquire(); 767 1.1 ad sip->sel_lwp = selector; 768 1.1 ad SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain); 769 1.23 rmind /* Copy the argument, which is for selnotify(). */ 770 1.23 rmind sip->sel_fdinfo = selector->l_selrec; 771 1.22 ad /* Replace selinfo's lock with the chosen cluster's lock. */ 772 1.22 ad sip->sel_cluster = sc; 773 1.1 ad } else { 774 1.23 rmind /* 3. Multiple waiters: record a collision. */ 775 1.1 ad sip->sel_collision |= sc->sc_mask; 776 1.22 ad KASSERT(sip->sel_cluster != NULL); 777 1.1 ad } 778 1.1 ad } 779 1.1 ad 780 1.1 ad /* 781 1.55 thorpej * Record a knote. 782 1.55 thorpej * 783 1.55 thorpej * The caller holds the same lock as for selrecord(). 784 1.55 thorpej */ 785 1.55 thorpej void 786 1.55 thorpej selrecord_knote(struct selinfo *sip, struct knote *kn) 787 1.55 thorpej { 788 1.58 thorpej klist_insert(&sip->sel_klist, kn); 789 1.55 thorpej } 790 1.55 thorpej 791 1.55 thorpej /* 792 1.55 thorpej * Remove a knote. 793 1.55 thorpej * 794 1.55 thorpej * The caller holds the same lock as for selrecord(). 795 1.56 thorpej * 796 1.56 thorpej * Returns true if the last knote was removed and the list 797 1.56 thorpej * is now empty. 798 1.55 thorpej */ 799 1.56 thorpej bool 800 1.55 thorpej selremove_knote(struct selinfo *sip, struct knote *kn) 801 1.55 thorpej { 802 1.58 thorpej return klist_remove(&sip->sel_klist, kn); 803 1.55 thorpej } 804 1.55 thorpej 805 1.55 thorpej /* 806 1.23 rmind * sel_setevents: a helper function for selnotify(), to set the events 807 1.23 rmind * for LWP sleeping in selcommon() or pollcommon(). 808 1.23 rmind */ 809 1.30 rmind static inline bool 810 1.23 rmind sel_setevents(lwp_t *l, struct selinfo *sip, const int events) 811 1.23 rmind { 812 1.23 rmind const int oflag = l->l_selflag; 813 1.30 rmind int ret = 0; 814 1.23 rmind 815 1.23 rmind /* 816 1.23 rmind * If we require re-scan or it was required by somebody else, 817 1.23 rmind * then just (re)set SEL_RESET and return. 818 1.23 rmind */ 819 1.23 rmind if (__predict_false(events == 0 || oflag == SEL_RESET)) { 820 1.23 rmind l->l_selflag = SEL_RESET; 821 1.30 rmind return true; 822 1.23 rmind } 823 1.23 rmind /* 824 1.23 rmind * Direct set. Note: select state of LWP is locked. First, 825 1.23 rmind * determine whether it is selcommon() or pollcommon(). 
826 1.23 rmind */ 827 1.23 rmind if (l->l_selbits != NULL) { 828 1.30 rmind const size_t ni = l->l_selni; 829 1.23 rmind fd_mask *fds = (fd_mask *)l->l_selbits; 830 1.30 rmind fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3); 831 1.30 rmind const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK); 832 1.25 rmind const int idx = fd >> __NFDSHIFT; 833 1.23 rmind int n; 834 1.23 rmind 835 1.23 rmind for (n = 0; n < 3; n++) { 836 1.34 hannken if ((fds[idx] & fbit) != 0 && 837 1.34 hannken (ofds[idx] & fbit) == 0 && 838 1.34 hannken (sel_flag[n] & events)) { 839 1.30 rmind ofds[idx] |= fbit; 840 1.30 rmind ret++; 841 1.23 rmind } 842 1.23 rmind fds = (fd_mask *)((char *)fds + ni); 843 1.30 rmind ofds = (fd_mask *)((char *)ofds + ni); 844 1.23 rmind } 845 1.23 rmind } else { 846 1.23 rmind struct pollfd *pfd = (void *)sip->sel_fdinfo; 847 1.30 rmind int revents = events & (pfd->events | POLLERR | POLLHUP); 848 1.30 rmind 849 1.30 rmind if (revents) { 850 1.34 hannken if (pfd->revents == 0) 851 1.34 hannken ret = 1; 852 1.30 rmind pfd->revents |= revents; 853 1.30 rmind } 854 1.30 rmind } 855 1.30 rmind /* Check whether there are any events to return. */ 856 1.30 rmind if (!ret) { 857 1.30 rmind return false; 858 1.23 rmind } 859 1.23 rmind /* Indicate direct set and note the event (cluster lock is held). */ 860 1.23 rmind l->l_selflag = SEL_EVENT; 861 1.30 rmind l->l_selret += ret; 862 1.30 rmind return true; 863 1.23 rmind } 864 1.23 rmind 865 1.23 rmind /* 866 1.1 ad * Do a wakeup when a selectable event occurs. Concurrency issues: 867 1.1 ad * 868 1.1 ad * As per selrecord(), the caller's object lock is held. If there 869 1.22 ad * is a named waiter, we must acquire the associated selcluster's lock 870 1.1 ad * in order to synchronize with selclear() and pollers going to sleep 871 1.17 rmind * in sel_do_scan(). 872 1.1 ad * 873 1.22 ad * sip->sel_cluser cannot change at this point, as it is only changed 874 1.1 ad * in selrecord(), and concurrent calls to selrecord() are locked 875 1.1 ad * out by the caller. 876 1.1 ad */ 877 1.1 ad void 878 1.1 ad selnotify(struct selinfo *sip, int events, long knhint) 879 1.1 ad { 880 1.22 ad selcluster_t *sc; 881 1.49 ad uint64_t mask; 882 1.16 rmind int index, oflag; 883 1.1 ad lwp_t *l; 884 1.13 ad kmutex_t *lock; 885 1.1 ad 886 1.1 ad KNOTE(&sip->sel_klist, knhint); 887 1.1 ad 888 1.1 ad if (sip->sel_lwp != NULL) { 889 1.1 ad /* One named LWP is waiting. */ 890 1.22 ad sc = sip->sel_cluster; 891 1.13 ad lock = sc->sc_lock; 892 1.13 ad mutex_spin_enter(lock); 893 1.1 ad /* Still there? */ 894 1.1 ad if (sip->sel_lwp != NULL) { 895 1.23 rmind /* 896 1.23 rmind * Set the events for our LWP and indicate that. 897 1.23 rmind * Otherwise, request for a full re-scan. 898 1.23 rmind */ 899 1.1 ad l = sip->sel_lwp; 900 1.23 rmind oflag = l->l_selflag; 901 1.36 rmind 902 1.36 rmind if (!direct_select) { 903 1.36 rmind l->l_selflag = SEL_RESET; 904 1.36 rmind } else if (!sel_setevents(l, sip, events)) { 905 1.30 rmind /* No events to return. */ 906 1.30 rmind mutex_spin_exit(lock); 907 1.30 rmind return; 908 1.30 rmind } 909 1.36 rmind 910 1.1 ad /* 911 1.1 ad * If thread is sleeping, wake it up. If it's not 912 1.1 ad * yet asleep, it will notice the change in state 913 1.1 ad * and will re-poll the descriptors. 
914 1.1 ad */ 915 1.13 ad if (oflag == SEL_BLOCKING && l->l_mutex == lock) { 916 1.1 ad KASSERT(l->l_wchan == sc); 917 1.64 ad sleepq_remove(l->l_sleepq, l, true); 918 1.1 ad } 919 1.1 ad } 920 1.13 ad mutex_spin_exit(lock); 921 1.1 ad } 922 1.1 ad 923 1.1 ad if ((mask = sip->sel_collision) != 0) { 924 1.1 ad /* 925 1.1 ad * There was a collision (multiple waiters): we must 926 1.1 ad * inform all potentially interested waiters. 927 1.1 ad */ 928 1.1 ad sip->sel_collision = 0; 929 1.3 ad do { 930 1.49 ad index = ffs64(mask) - 1; 931 1.49 ad mask ^= __BIT(index); 932 1.22 ad sc = selcluster[index]; 933 1.13 ad lock = sc->sc_lock; 934 1.13 ad mutex_spin_enter(lock); 935 1.1 ad sc->sc_ncoll++; 936 1.13 ad sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock); 937 1.3 ad } while (__predict_false(mask != 0)); 938 1.1 ad } 939 1.1 ad } 940 1.1 ad 941 1.1 ad /* 942 1.1 ad * Remove an LWP from all objects that it is waiting for. Concurrency 943 1.1 ad * issues: 944 1.1 ad * 945 1.1 ad * The object owner's (e.g. device driver) lock is not held here. Calls 946 1.1 ad * can be made to selrecord() and we do not synchronize against those 947 1.1 ad * directly using locks. However, we use `sel_lwp' to lock out changes. 948 1.1 ad * Before clearing it we must use memory barriers to ensure that we can 949 1.1 ad * safely traverse the list of selinfo records. 950 1.1 ad */ 951 1.1 ad static void 952 1.1 ad selclear(void) 953 1.1 ad { 954 1.1 ad struct selinfo *sip, *next; 955 1.22 ad selcluster_t *sc; 956 1.1 ad lwp_t *l; 957 1.13 ad kmutex_t *lock; 958 1.1 ad 959 1.1 ad l = curlwp; 960 1.22 ad sc = l->l_selcluster; 961 1.13 ad lock = sc->sc_lock; 962 1.1 ad 963 1.49 ad /* 964 1.49 ad * If the request was non-blocking, or we found events on the first 965 1.49 ad * descriptor, there will be no need to clear anything - avoid 966 1.49 ad * taking the lock. 967 1.49 ad */ 968 1.49 ad if (SLIST_EMPTY(&l->l_selwait)) { 969 1.49 ad return; 970 1.49 ad } 971 1.49 ad 972 1.13 ad mutex_spin_enter(lock); 973 1.1 ad for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) { 974 1.1 ad KASSERT(sip->sel_lwp == l); 975 1.22 ad KASSERT(sip->sel_cluster == l->l_selcluster); 976 1.22 ad 977 1.1 ad /* 978 1.1 ad * Read link to next selinfo record, if any. 979 1.1 ad * It's no longer safe to touch `sip' after clearing 980 1.1 ad * `sel_lwp', so ensure that the read of `sel_chain' 981 1.1 ad * completes before the clearing of sel_lwp becomes 982 1.1 ad * globally visible. 983 1.1 ad */ 984 1.1 ad next = SLIST_NEXT(sip, sel_chain); 985 1.1 ad /* Release the record for another named waiter to use. */ 986 1.59 riastrad atomic_store_release(&sip->sel_lwp, NULL); 987 1.1 ad } 988 1.13 ad mutex_spin_exit(lock); 989 1.1 ad } 990 1.1 ad 991 1.1 ad /* 992 1.1 ad * Initialize the select/poll system calls. Called once for each 993 1.1 ad * CPU in the system, as they are attached. 994 1.1 ad */ 995 1.1 ad void 996 1.1 ad selsysinit(struct cpu_info *ci) 997 1.1 ad { 998 1.22 ad selcluster_t *sc; 999 1.22 ad u_int index; 1000 1.1 ad 1001 1.22 ad /* If already a cluster in place for this bit, re-use. 
*/ 1002 1.22 ad index = cpu_index(ci) & SELCLUSTERMASK; 1003 1.22 ad sc = selcluster[index]; 1004 1.22 ad if (sc == NULL) { 1005 1.22 ad sc = kmem_alloc(roundup2(sizeof(selcluster_t), 1006 1.22 ad coherency_unit) + coherency_unit, KM_SLEEP); 1007 1.22 ad sc = (void *)roundup2((uintptr_t)sc, coherency_unit); 1008 1.22 ad sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED); 1009 1.22 ad sleepq_init(&sc->sc_sleepq); 1010 1.22 ad sc->sc_ncoll = 0; 1011 1.46 msaitoh sc->sc_mask = __BIT(index); 1012 1.22 ad selcluster[index] = sc; 1013 1.22 ad } 1014 1.22 ad ci->ci_data.cpu_selcluster = sc; 1015 1.1 ad } 1016 1.1 ad 1017 1.1 ad /* 1018 1.1 ad * Initialize a selinfo record. 1019 1.1 ad */ 1020 1.1 ad void 1021 1.1 ad selinit(struct selinfo *sip) 1022 1.1 ad { 1023 1.1 ad 1024 1.1 ad memset(sip, 0, sizeof(*sip)); 1025 1.58 thorpej klist_init(&sip->sel_klist); 1026 1.1 ad } 1027 1.1 ad 1028 1.1 ad /* 1029 1.1 ad * Destroy a selinfo record. The owning object must not gain new 1030 1.1 ad * references while this is in progress: all activity on the record 1031 1.1 ad * must be stopped. 1032 1.1 ad * 1033 1.1 ad * Concurrency issues: we only need guard against a call to selclear() 1034 1.17 rmind * by a thread exiting sel_do_scan(). The caller has prevented further 1035 1.17 rmind * references being made to the selinfo record via selrecord(), and it 1036 1.23 rmind * will not call selnotify() again. 1037 1.1 ad */ 1038 1.1 ad void 1039 1.1 ad seldestroy(struct selinfo *sip) 1040 1.1 ad { 1041 1.22 ad selcluster_t *sc; 1042 1.13 ad kmutex_t *lock; 1043 1.1 ad lwp_t *l; 1044 1.1 ad 1045 1.58 thorpej klist_fini(&sip->sel_klist); 1046 1.58 thorpej 1047 1.1 ad if (sip->sel_lwp == NULL) 1048 1.1 ad return; 1049 1.1 ad 1050 1.1 ad /* 1051 1.22 ad * Lock out selclear(). The selcluster pointer can't change while 1052 1.1 ad * we are here since it is only ever changed in selrecord(), 1053 1.1 ad * and that will not be entered again for this record because 1054 1.1 ad * it is dying. 1055 1.1 ad */ 1056 1.22 ad KASSERT(sip->sel_cluster != NULL); 1057 1.22 ad sc = sip->sel_cluster; 1058 1.13 ad lock = sc->sc_lock; 1059 1.13 ad mutex_spin_enter(lock); 1060 1.1 ad if ((l = sip->sel_lwp) != NULL) { 1061 1.1 ad /* 1062 1.1 ad * This should rarely happen, so although SLIST_REMOVE() 1063 1.1 ad * is slow, using it here is not a problem. 1064 1.1 ad */ 1065 1.22 ad KASSERT(l->l_selcluster == sc); 1066 1.1 ad SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain); 1067 1.1 ad sip->sel_lwp = NULL; 1068 1.1 ad } 1069 1.13 ad mutex_spin_exit(lock); 1070 1.1 ad } 1071 1.1 ad 1072 1.36 rmind /* 1073 1.36 rmind * System control nodes. 1074 1.36 rmind */ 1075 1.36 rmind SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup") 1076 1.36 rmind { 1077 1.36 rmind 1078 1.38 pooka sysctl_createv(clog, 0, NULL, NULL, 1079 1.36 rmind CTLFLAG_PERMANENT | CTLFLAG_READWRITE, 1080 1.36 rmind CTLTYPE_INT, "direct_select", 1081 1.36 rmind SYSCTL_DESCR("Enable/disable direct select (for testing)"), 1082 1.36 rmind NULL, 0, &direct_select, 0, 1083 1.38 pooka CTL_KERN, CTL_CREATE, CTL_EOL); 1084 1.36 rmind } 1085
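/*
 * Sketch of the driver-side contract documented above (selrecord(),
 * selnotify(), selinit()/seldestroy() and the shared <object-lock>).
 * Illustrative only and not compiled as part of this file: the
 * xx_softc layout, sc_lock and xx_data_ready() are hypothetical names
 * standing in for a driver's own state, not part of this subsystem.
 */
#if 0	/* illustrative sketch */
struct xx_softc {
	kmutex_t	sc_lock;	/* the <object-lock>; initialised at attach */
	struct selinfo	sc_rsel;	/* selinit() at attach, seldestroy() at detach */
	/* ... device state ... */
};

/*
 * Poll handler, reached via fo_poll()/d_poll() from selscan() or
 * pollscan() above.  xx_data_ready() is a stand-in for the driver's
 * "is there input?" predicate.  With no data pending, selrecord()
 * registers the calling LWP; the object lock held here is what the
 * selrecord()/selnotify() comments above rely on.
 */
static int
xx_poll(struct xx_softc *sc, int events, struct lwp *l)
{
	int revents = 0;

	mutex_enter(&sc->sc_lock);
	if ((events & (POLLIN | POLLRDNORM)) != 0) {
		if (xx_data_ready(sc))
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);
	}
	mutex_exit(&sc->sc_lock);
	return revents;
}

/*
 * Input path: the same lock is held across selnotify(), so no extra
 * synchronisation with selrecord() is needed.  The last argument is
 * the hint handed to KNOTE(); 0 here since this sketch ignores kqueue
 * consumers.
 */
static void
xx_input(struct xx_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	/* ... enqueue the new data ... */
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_lock);
}
#endif	/* illustrative sketch */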