/*	$NetBSD: kern_descrip.c,v 1.268 2026/01/04 01:32:23 riastradh Exp $	*/

/*-
 * Copyright (c) 2008, 2009, 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
 */

/*
 * File descriptor management.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.268 2026/01/04 01:32:23 riastradh Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/unistd.h>
#include <sys/vnode.h>

/*
 * A list (head) of open files, counter, and lock protecting them.
 */
struct filelist	filehead	__cacheline_aligned;
static u_int	nfiles		__cacheline_aligned;
kmutex_t	filelist_lock	__cacheline_aligned;

static pool_cache_t filedesc_cache	__read_mostly;
static pool_cache_t file_cache		__read_mostly;

static int	file_ctor(void *, void *, int);
static void	file_dtor(void *, void *);
static void	fdfile_ctor(fdfile_t *);
static void	fdfile_dtor(fdfile_t *);
static int	filedesc_ctor(void *, void *, int);
static void	filedesc_dtor(void *, void *);
static int	filedescopen(dev_t, int, int, lwp_t *);

static int sysctl_kern_file(SYSCTLFN_PROTO);
static int sysctl_kern_file2(SYSCTLFN_PROTO);
static void fill_file(struct file *, const struct file *);
static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *,
    int, pid_t);

const struct cdevsw filedesc_cdevsw = {
	.d_open = filedescopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};

/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)

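/*
 * Rough shape of the data structures used below (see sys/filedesc.h
 * for the authoritative definitions):
 *
 *	proc_t::p_fd -> filedesc_t	per-process descriptor state
 *	    fd_dt -> fdtab_t		table of open descriptors
 *		dt_ff[fd] -> fdfile_t	per-descriptor state
 *		    ff_file -> file_t	the open file itself
 *
 * A filedesc_t may be shared between processes (fd_share()/fd_refcnt),
 * and a file_t may be referenced by many descriptors (f_count).
 */
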
/*
 * Initialize the descriptor system.
 */
void
fd_sys_init(void)
{
	static struct sysctllog *clog;

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	LIST_INIT(&filehead);

	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);

	sysctl_createv(&clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "file",
	    SYSCTL_DESCR("System open file table"),
	    sysctl_kern_file, 0, NULL, 0,
	    CTL_KERN, KERN_FILE, CTL_EOL);
	sysctl_createv(&clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_STRUCT, "file2",
	    SYSCTL_DESCR("System open file table"),
	    sysctl_kern_file2, 0, NULL, 0,
	    CTL_KERN, KERN_FILE2, CTL_EOL);
}

static bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);

	return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0;
}

/*
 * Verify that the bitmaps match the descriptor table.
 */
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
	fdtab_t *dt;
	u_int fd;

	KASSERT(fdp->fd_refcnt <= 1 || mutex_owned(&fdp->fd_lock));

	dt = fdp->fd_dt;
	if (fdp->fd_refcnt == -1) {
		/*
		 * fd_free tears down the table without maintaining its
		 * bitmap.
		 */
		return;
	}
	for (fd = 0; fd < dt->dt_nfiles; fd++) {
		if (fd < NDFDFILE) {
			KASSERT(dt->dt_ff[fd] ==
			    (fdfile_t *)fdp->fd_dfdfile[fd]);
		}
		if (dt->dt_ff[fd] == NULL) {
			KASSERT(!fd_isused(fdp, fd));
		} else if (dt->dt_ff[fd]->ff_file != NULL) {
			KASSERT(fd_isused(fdp, fd));
		}
	}
#endif
}

static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	fd_checkmaps(fdp);

	if (want > bits)
		return -1;

	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return -1;

found:
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}

static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ff = fd->fd_dt->dt_ff;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	fd_checkmaps(fd);

	off = (last - 1) >> NDENTRYSHIFT;

	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return -1;

	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* XXX should use bitmap */
	while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
		i--;

	return i;
}

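/*
 * The free-descriptor bitmaps form a two-level structure: each bit of
 * fd_lomap tracks one descriptor, and a bit of fd_himap is set once
 * the corresponding fd_lomap word is completely full (all ones).  For
 * example, with NDENTRIES == 32, descriptor 70 maps to bit
 * (70 & NDENTRYMASK) == 6 of fd_lomap[70 >> NDENTRYSHIFT] ==
 * fd_lomap[2], and bit 2 of fd_himap[0] is set only once fd_lomap[2]
 * reaches ~0.  fd_used()/fd_unused() below keep the two levels in
 * sync.
 */
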
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = true;
	fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK);
	if (__predict_false(fdp->fd_lomap[off] == ~0)) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1U << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	fd_checkmaps(fdp);
}

static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1U << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1U << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK));
	ff->ff_allocated = false;

	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}
	fd_checkmaps(fdp);
}

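/*
 * The common calling pattern for the descriptor lookup routines below
 * is (a sketch, error handling elided; compare fd_getvnode()):
 *
 *	file_t *fp;
 *
 *	if ((fp = fd_getfile(fd)) == NULL)
 *		return SET_ERROR(EBADF);
 *	... use fp ...
 *	fd_putfile(fd);
 */
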
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 */
file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	fdtab_t *dt;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * We are doing this unlocked.  See fd_tryexpand().
	 */
	fdp = curlwp->l_fd;
	dt = atomic_load_consume(&fdp->fd_dt);
	if (__predict_false(fd >= dt->dt_nfiles)) {
		return NULL;
	}
	ff = dt->dt_ff[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/* Now get a reference to the descriptor. */
	if (fdp->fd_refcnt == 1) {
		/*
		 * Single threaded: don't need to worry about concurrent
		 * access (other than earlier calls to kqueue, which may
		 * hold a reference to the descriptor).
		 */
		ff->ff_refcnt++;
	} else {
		/*
		 * Multi threaded: issue a memory barrier to ensure that we
		 * acquire the file pointer _after_ adding a reference.  If
		 * no memory barrier, we could fetch a stale pointer.
		 *
		 * In particular, we must coordinate the following four
		 * memory operations:
		 *
		 *	A. fd_close store ff->ff_file = NULL
		 *	B. fd_close refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
		 *	C. fd_getfile atomic_inc_uint(&ff->ff_refcnt)
		 *	D. fd_getfile load fp = ff->ff_file
		 *
		 * If the order is D; A; B; C:
		 *
		 *	1. D: fp = ff->ff_file
		 *	2. A: ff->ff_file = NULL
		 *	3. B: refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
		 *	4. C: atomic_inc_uint(&ff->ff_refcnt)
		 *
		 * then fd_close determines that there are no more
		 * references and decides to free fp immediately, at
		 * the same time that fd_getfile ends up with an fp
		 * that's about to be freed.  *boom*
		 *
		 * By making B a release operation in fd_close, and by
		 * making C an acquire operation in fd_getfile, since
		 * they are atomic operations on the same object, which
		 * has a total modification order, we guarantee either:
		 *
		 *	- B happens before C.  Then since A is
		 *	  sequenced before B in fd_close, and C is
		 *	  sequenced before D in fd_getfile, we
		 *	  guarantee A happens before D, so fd_getfile
		 *	  reads a null fp and safely fails.
		 *
		 *	- C happens before B.  Then fd_getfile may read
		 *	  null or nonnull, but either way, fd_close
		 *	  will safely wait for references to drain.
		 */
		atomic_inc_uint(&ff->ff_refcnt);
		membar_acquire();
	}

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = atomic_load_consume(&ff->ff_file);
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}

/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];

	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	if (fdp->fd_refcnt == 1) {
		/*
		 * Single threaded: don't need to worry about concurrent
		 * access (other than earlier calls to kqueue, which may
		 * hold a reference to the descriptor).
		 */
		if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
			fd_close(fd);
			return;
		}
		ff->ff_refcnt--;
		return;
	}

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
	membar_release();

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}

/*
 * Convenience wrapper around fd_getfile() that returns reference
 * to a vnode.
 */
int
fd_getvnode(unsigned fd, file_t **fpp)
{
	vnode_t *vp;
	file_t *fp;

	fp = fd_getfile(fd);
	if (__predict_false(fp == NULL)) {
		return SET_ERROR(EBADF);
	}
	if (__predict_false(fp->f_type != DTYPE_VNODE)) {
		fd_putfile(fd);
		return SET_ERROR(EINVAL);
	}
	vp = fp->f_vnode;
	if (__predict_false(vp->v_type == VBAD)) {
		/* XXX Is this case really necessary? */
		fd_putfile(fd);
		return SET_ERROR(EBADF);
	}
	*fpp = fp;
	return 0;
}

/*
 * Convenience wrapper around fd_getfile() that returns reference
 * to a socket.
 */
int
fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
{

	*fp = fd_getfile(fd);
	if (__predict_false(*fp == NULL)) {
		return SET_ERROR(EBADF);
	}
	if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) {
		fd_putfile(fd);
		return SET_ERROR(ENOTSOCK);
	}
	*sop = (*fp)->f_socket;
	return 0;
}

int
fd_getsock(unsigned fd, struct socket **sop)
{
	file_t *fp;

	return fd_getsock1(fd, sop, &fp);
}

/*
 * Look up the file structure corresponding to a file descriptor
 * and return it with a reference held on the file, not the
 * descriptor.
 *
 * This is heavyweight and only used when accessing descriptors
 * from a foreign process.  The caller must ensure that `p' does
 * not exit or fork across this call.
 *
 * To release the file (not descriptor) reference, use closef().
 */
file_t *
fd_getfile2(proc_t *p, unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	fdtab_t *dt;

	fdp = p->p_fd;
	mutex_enter(&fdp->fd_lock);
	dt = fdp->fd_dt;
	if (fd >= dt->dt_nfiles) {
		mutex_exit(&fdp->fd_lock);
		return NULL;
	}
	if ((ff = dt->dt_ff[fd]) == NULL) {
		mutex_exit(&fdp->fd_lock);
		return NULL;
	}
	if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
		mutex_exit(&fdp->fd_lock);
		return NULL;
	}
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);
	mutex_exit(&fdp->fd_lock);

	return fp;
}

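/*
 * Outline of the close protocol implemented by fd_close() below,
 * together with fd_getfile()/fd_putfile() above:
 *
 *	1. ff_file is set to NULL, so new fd_getfile() calls fail.
 *	2. The caller's descriptor reference is dropped.  If other
 *	   references remain, FR_CLOSING is set, fo_restart() unblocks
 *	   long-term sleepers, and fd_close() waits on ff_closing.
 *	3. fd_putfile()/fd_getfile() callers that see FR_CLOSING join
 *	   in via fd_close(), which wakes the closer as the count
 *	   drains.
 *	4. The slot is released (fd_unused()) and the file reference
 *	   is dropped with closef().
 */
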
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;
	u_int refcnt;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&fdp->fd_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	fp = atomic_load_consume(&ff->ff_file);
	if (__predict_false(fp == NULL)) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		membar_release();
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&fdp->fd_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return SET_ERROR(EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	atomic_store_relaxed(&ff->ff_file, NULL);
	ff->ff_exclose = false;
	ff->ff_foclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
	if (fdp->fd_refcnt == 1) {
		/* Single threaded. */
		refcnt = --(ff->ff_refcnt);
	} else {
		/* Multi threaded. */
		membar_release();
		refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
		membar_acquire();
	}
	if (__predict_false(refcnt != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 * (Or just a threaded application trying to unblock its
		 * thread that sleeps in (say) accept()).
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);

		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		mutex_exit(&fdp->fd_lock);
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			knote_fdclose(fd);
		}

		/*
		 * Since the file system code doesn't know which fd
		 * each request came from (think dup()), we have to
		 * ask it to return ERESTART for any long-term blocks.
		 * The re-entry through read/write/etc will detect the
		 * closed fd and return EBADF.
		 * Blocked partial writes may return a short length.
		 */
		(*fp->f_ops->fo_restart)(fp);
		mutex_enter(&fdp->fd_lock);

		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 * XXX (dsl) this may need to call fo_restart() after a
		 * timeout to guarantee that all the system calls exit.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &fdp->fd_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (__predict_false((p->p_flag & PK_ADVLOCK) != 0) &&
	    fp->f_ops->fo_advlock != NULL) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		mutex_exit(&fdp->fd_lock);
		(void)(*fp->f_ops->fo_advlock)(fp, p, F_UNLCK, &lf, F_POSIX);
		mutex_enter(&fdp->fd_lock);
	}

	/* Free descriptor slot. */
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}

/*
 * Duplicate a file descriptor.
 */
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose, bool foclose)
{
	proc_t *p = curproc;
	int error;

	while ((error = fd_alloc(p, minfd, newp)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fd_set_exclose(curlwp, *newp, exclose);
	fd_set_foclose(curlwp, *newp, foclose);
	fd_affix(p, fp, *newp);
	return 0;
}

/*
 * dup2 operation.
 */
int
fd_dup2(file_t *fp, unsigned newfd, int flags)
{
	filedesc_t *fdp = curlwp->l_fd;
	fdfile_t *ff;
	fdtab_t *dt;

	if (flags & ~(O_CLOEXEC|O_CLOFORK|O_NONBLOCK|O_NOSIGPIPE))
		return SET_ERROR(EINVAL);

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (newfd >= atomic_load_consume(&fdp->fd_dt)->dt_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = kmem_alloc(sizeof(*ff), KM_SLEEP);
	fdfile_ctor(ff);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, newfd)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(newfd) != NULL) {
			(void)fd_close(newfd);
		} else {
			/*
			 * Crummy, but unlikely to happen.
			 * Can occur if we interrupt another
			 * thread while it is opening a file.
			 */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	dt = fdp->fd_dt;
	if (dt->dt_ff[newfd] == NULL) {
		KASSERT(newfd >= NDFDFILE);
		dt->dt_ff[newfd] = ff;
		ff = NULL;
	}
	fd_used(fdp, newfd);
	mutex_exit(&fdp->fd_lock);

	fd_set_exclose(curlwp, newfd, (flags & O_CLOEXEC) != 0);
	fd_set_foclose(curlwp, newfd, (flags & O_CLOFORK) != 0);
	fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE);
	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, newfd);
	if (ff != NULL) {
		cv_destroy(&ff->ff_closing);
		kmem_free(ff, sizeof(*ff));
	}
	return 0;
}

/*
 * Drop reference to a file structure.
 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
	if (fp->f_ops->fo_advlock == NULL) {
		KASSERT((fp->f_flag & FHASLOCK) == 0);
	} else if (fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)(*fp->f_ops->fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);

		/*
		 * .fo_close is final, so real errors are frowned on
		 * (but allowed and passed on to close(2)), and
		 * ERESTART is absolutely forbidden because the file
		 * descriptor is gone and there is no chance to retry.
		 */
		KASSERTMSG(error != ERESTART,
		    "file %p f_ops %p fo_close %p returned ERESTART",
		    fp, fp->f_ops, fp->f_ops->fo_close);
	} else {
		error = 0;
	}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_cred != NULL);
	pool_cache_put(file_cache, fp);

	return error;
}

/*
 * Allocate a file descriptor for the process.
 *
 * Future idea for experimentation: replace all of this with radixtree.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp = p->p_fd;
	int i, lim, last, error, hi;
	u_int off;
	fdtab_t *dt;

	KASSERT(p == curproc || p == &proc0);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	fd_checkmaps(fdp);
	dt = fdp->fd_dt;
	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = uimin(dt->dt_nfiles, lim);

	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		hi = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (hi == -1)
			break;
		i = fd_next_zero(fdp, &fdp->fd_lomap[hi],
		    hi > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (hi + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (hi << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		if (dt->dt_ff[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			dt->dt_ff[i] = kmem_alloc(sizeof(fdfile_t), KM_SLEEP);
			fdfile_ctor(dt->dt_ff[i]);
		}
		KASSERT(dt->dt_ff[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		KASSERT(i >= NDFDFILE ||
		    dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		fd_checkmaps(fdp);
		mutex_exit(&fdp->fd_lock);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (dt->dt_nfiles >= lim) ? SET_ERROR(EMFILE) : SET_ERROR(ENOSPC);
	mutex_exit(&fdp->fd_lock);
	return error;
}

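/*
 * An fdtab_t carries NDFILE descriptor slots within the structure
 * itself (see sys/filedesc.h); larger tables are allocated with the
 * extra slots appended, so a table for n descriptors needs
 * sizeof(fdtab_t) + (n - NDFILE) * sizeof(fdfile_t *) bytes, as
 * computed below.
 */
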
/*
 * Allocate memory for a descriptor table.
 */
static fdtab_t *
fd_dtab_alloc(int n)
{
	fdtab_t *dt;
	size_t sz;

	KASSERT(n > NDFILE);

	sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
	dt = kmem_alloc(sz, KM_SLEEP);
#ifdef DIAGNOSTIC
	memset(dt, 0xff, sz);
#endif
	dt->dt_nfiles = n;
	dt->dt_link = NULL;
	return dt;
}

/*
 * Free a descriptor table, and all tables linked for deferred free.
 */
static void
fd_dtab_free(fdtab_t *dt)
{
	fdtab_t *next;
	size_t sz;

	do {
		next = dt->dt_link;
		KASSERT(dt->dt_nfiles > NDFILE);
		sz = sizeof(*dt) +
		    (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
#ifdef DIAGNOSTIC
		memset(dt, 0xff, sz);
#endif
		kmem_free(dt, sz);
		dt = next;
	} while (dt != NULL);
}

/*
 * Allocate descriptor bitmap.
 */
static void
fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
{
	uint8_t *ptr;
	size_t szlo, szhi;

	KASSERT(n > NDENTRIES);

	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
	szhi = NDHISLOTS(n) * sizeof(uint32_t);
	ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
	*lo = (uint32_t *)ptr;
	*hi = (uint32_t *)(ptr + szlo);
}

/*
 * Free descriptor bitmap.
 */
static void
fd_map_free(int n, uint32_t *lo, uint32_t *hi)
{
	size_t szlo, szhi;

	KASSERT(n > NDENTRIES);

	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
	szhi = NDHISLOTS(n) * sizeof(uint32_t);
	KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
	kmem_free(lo, szlo + szhi);
}

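/*
 * Descriptor table expansion is done without disturbing concurrent
 * lookups: fd_getfile() reads fd_dt with atomic_load_consume() and no
 * lock, so fd_tryexpand() builds a complete new table, publishes it
 * with atomic_store_release(), and defers freeing the old table (via
 * dt_link) until the filedesc_t itself is torn down in fd_free().
 */
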
/*
 * Expand a process' descriptor table.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdtab_t *newdt, *dt;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = atomic_load_consume(&fdp->fd_dt)->dt_nfiles;

	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	newdt = fd_dtab_alloc(numfiles);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		fd_map_alloc(numfiles, &newlomap, &newhimap);
	}

	mutex_enter(&fdp->fd_lock);
	dt = fdp->fd_dt;
	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (dt->dt_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		fd_dtab_free(newdt);
		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
			fd_map_free(numfiles, newlomap, newhimap);
		}
		return;
	}

	/* Copy the existing descriptor table and zero the new portion. */
	i = sizeof(fdfile_t *) * oldnfiles;
	memcpy(newdt->dt_ff, dt->dt_ff, i);
	memset((uint8_t *)newdt->dt_ff + i, 0,
	    numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old descriptor array into list to be discarded.  We defer
	 * freeing until the last reference to the descriptor table goes
	 * away (usually process exit).  This allows us to do lockless
	 * lookups in fd_getfile().
	 */
	if (oldnfiles > NDFILE) {
		if (fdp->fd_refcnt > 1) {
			newdt->dt_link = dt;
		} else {
			fd_dtab_free(dt);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_dt.  See fd_getfile().
	 */
	atomic_store_release(&fdp->fd_dt, newdt);
	KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	fd_checkmaps(fdp);
	mutex_exit(&fdp->fd_lock);
}

/*
 * Create a new open file structure and allocate a file descriptor
 * for the current process.
 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	proc_t *p = curproc;
	kauth_cred_t cred;
	file_t *fp;
	int error;

	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	if (fp == NULL) {
		fd_abort(p, NULL, *resultfd);
		return SET_ERROR(ENFILE);
	}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_msgcount == 0);
	KASSERT(fp->f_unpcount == 0);

	/* Replace cached credentials if not what we need. */
	cred = curlwp->l_cred;
	if (__predict_false(cred != fp->f_cred)) {
		kauth_cred_free(fp->f_cred);
		fp->f_cred = kauth_cred_hold(cred);
	}

	/*
	 * Don't allow recycled files to be scanned.
	 * See uipc_usrreq.c.
	 */
	if (__predict_false((fp->f_flag & FSCAN) != 0)) {
		mutex_enter(&fp->f_lock);
		atomic_and_uint(&fp->f_flag, ~FSCAN);
		mutex_exit(&fp->f_lock);
	}

	fp->f_advice = 0;
	fp->f_offset = 0;
	*resultfp = fp;

	return 0;
}

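/*
 * Opening a file is a two-step operation, so that the descriptor can
 * be backed out if the open fails (a sketch, error handling elided):
 *
 *	file_t *fp;
 *	int fd, error;
 *
 *	if ((error = fd_allocfile(&fp, &fd)) != 0)
 *		return error;
 *	... initialize fp; on failure fd_abort(curproc, fp, fd) ...
 *	fd_affix(curproc, fp, fd);
 */
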
/*
 * Successful creation of a new descriptor: make visible to the process.
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;
	fdtab_t *dt;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 */
	fdp = p->p_fd;
	dt = atomic_load_consume(&fdp->fd_dt);
	ff = dt->dt_ff[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	atomic_store_release(&ff->ff_file, fp);
}

/*
 * Abort creation of a new descriptor: free descriptor slot and file.
 */
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
	ff->ff_exclose = false;
	ff->ff_foclose = false;

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&fdp->fd_lock);
	KASSERT(fd_isused(fdp, fd));
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	if (fp != NULL) {
		KASSERT(fp->f_count == 0);
		KASSERT(fp->f_cred != NULL);
		pool_cache_put(file_cache, fp);
	}
}

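/*
 * Each CPU's pool cache can hold a group of constructed file_t's that
 * still count against nfiles but are not in use.  The limit check in
 * file_ctor() therefore allows nfiles to exceed maxfiles by up to
 * PCG_NOBJECTS_NORMAL objects per additional CPU, so that cached
 * objects do not cause spurious ENFILE failures.
 */
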
static int
file_ctor(void *arg, void *obj, int flags)
{
	/*
	 * It's easy to exhaust the open file limit on a system with many
	 * CPUs due to caching.  Allow a bit of leeway to reduce the element
	 * of surprise.
	 */
	u_int slop = PCG_NOBJECTS_NORMAL * (ncpu - 1);
	file_t *fp = obj;

	memset(fp, 0, sizeof(*fp));

	mutex_enter(&filelist_lock);
	if (__predict_false(nfiles >= slop + maxfiles)) {
		mutex_exit(&filelist_lock);
		tablefull("file", "increase kern.maxfiles or MAXFILES");
		return SET_ERROR(ENFILE);
	}
	nfiles++;
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
	fp->f_cred = kauth_cred_hold(curlwp->l_cred);
	mutex_exit(&filelist_lock);

	return 0;
}

static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	nfiles--;
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	KASSERT(fp->f_count == 0);
	kauth_cred_free(fp->f_cred);
	mutex_destroy(&fp->f_lock);
}

static void
fdfile_ctor(fdfile_t *ff)
{

	memset(ff, 0, sizeof(*ff));
	cv_init(&ff->ff_closing, "fdclose");
}

static void
fdfile_dtor(fdfile_t *ff)
{

	cv_destroy(&ff->ff_closing);
}

file_t *
fgetdummy(void)
{
	file_t *fp;

	fp = kmem_zalloc(sizeof(*fp), KM_SLEEP);
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
	return fp;
}

void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}

/*
 * Create an initial filedesc structure.
 */
filedesc_t *
fd_init(filedesc_t *fdp)
{
#ifdef DIAGNOSTIC
	unsigned fd;
#endif

	if (__predict_true(fdp == NULL)) {
		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
	} else {
		KASSERT(fdp == &filedesc0);
		filedesc_ctor(NULL, fdp, PR_WAITOK);
	}

#ifdef DIAGNOSTIC
	KASSERT(fdp->fd_lastfile == -1);
	KASSERT(fdp->fd_lastkqfile == -1);
	KASSERT(fdp->fd_knhash == NULL);
	KASSERT(fdp->fd_freefile == 0);
	KASSERT(fdp->fd_exclose == false);
	KASSERT(fdp->fd_foclose == false);
	KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
	for (fd = 0; fd < NDFDFILE; fd++) {
		KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
		    (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
	for (fd = NDFDFILE; fd < NDFILE; fd++) {
		KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
	}
	KASSERT(fdp->fd_himap == fdp->fd_dhimap);
	KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
#endif /* DIAGNOSTIC */

	fdp->fd_refcnt = 1;
	fd_checkmaps(fdp);

	return fdp;
}

/*
 * Initialize a file descriptor table.
 */
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
	filedesc_t *fdp = obj;
	fdfile_t **ffp;
	int i;

	memset(fdp, 0, sizeof(*fdp));
	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
	fdp->fd_lastfile = -1;
	fdp->fd_lastkqfile = -1;
	fdp->fd_dt = &fdp->fd_dtbuiltin;
	fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
	fdp->fd_himap = fdp->fd_dhimap;
	fdp->fd_lomap = fdp->fd_dlomap;

	CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
	for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
		fdfile_ctor(*ffp = (fdfile_t *)fdp->fd_dfdfile[i]);
	}

	return 0;
}

static void
filedesc_dtor(void *arg, void *obj)
{
	filedesc_t *fdp = obj;
	int i;

	for (i = 0; i < NDFDFILE; i++) {
		fdfile_dtor((fdfile_t *)fdp->fd_dfdfile[i]);
	}

	mutex_destroy(&fdp->fd_lock);
}

/*
 * Make p share curproc's filedesc structure.
 */
void
fd_share(struct proc *p)
{
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	p->p_fd = fdp;
	atomic_inc_uint(&fdp->fd_refcnt);
}

/*
 * Acquire a hold on a filedesc structure.
 */
void
fd_hold(lwp_t *l)
{
	filedesc_t *fdp = l->l_fd;

	atomic_inc_uint(&fdp->fd_refcnt);
}

/*
 * Copy a filedesc structure.
 */
filedesc_t *
fd_copy(void)
{
	filedesc_t *newfdp, *fdp;
	fdfile_t *ff, **ffp, **nffp, *ff2;
	int i, j, numfiles, lastfile, newlast;
	file_t *fp;
	fdtab_t *newdt;

	fdp = curproc->p_fd;
	newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
	newfdp->fd_refcnt = 1;

#ifdef DIAGNOSTIC
	KASSERT(newfdp->fd_lastfile == -1);
	KASSERT(newfdp->fd_lastkqfile == -1);
	KASSERT(newfdp->fd_knhash == NULL);
	KASSERT(newfdp->fd_freefile == 0);
	KASSERT(newfdp->fd_exclose == false);
	KASSERT(newfdp->fd_foclose == false);
	KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
	KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
	for (i = 0; i < NDFDFILE; i++) {
		KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
		    (fdfile_t *)&newfdp->fd_dfdfile[i]);
	}
	for (i = NDFDFILE; i < NDFILE; i++) {
		KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
	}
#endif /* DIAGNOSTIC */

	mutex_enter(&fdp->fd_lock);
	fd_checkmaps(fdp);
	numfiles = fdp->fd_dt->dt_nfiles;
	lastfile = fdp->fd_lastfile;

	/*
	 * If the number of open files fits in the internal arrays
	 * of the open file structure, use them, otherwise allocate
	 * additional memory for the number of descriptors currently
	 * in use.
	 */
	if (lastfile < NDFILE) {
		i = NDFILE;
		newdt = newfdp->fd_dt;
		KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
	} else {
		/*
		 * Compute the smallest multiple of NDEXTENT needed
		 * for the file descriptors currently in use,
		 * allowing the table to shrink.
		 */
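		/*
		 * For example, with numfiles == 1024 and lastfile == 100
		 * the loop below halves 1024 -> 512 -> 256 -> 128 and
		 * stops there, since 128 <= 2 * lastfile (assuming
		 * 2 * NDEXTENT <= 256).
		 */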
		i = numfiles;
		while (i >= 2 * NDEXTENT && i > lastfile * 2) {
			i /= 2;
		}
		KASSERT(i > NDFILE);
		newdt = fd_dtab_alloc(i);
		newfdp->fd_dt = newdt;
		memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
		    NDFDFILE * sizeof(fdfile_t **));
		memset(newdt->dt_ff + NDFDFILE, 0,
		    (i - NDFDFILE) * sizeof(fdfile_t **));
	}
	if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
		newfdp->fd_himap = newfdp->fd_dhimap;
		newfdp->fd_lomap = newfdp->fd_dlomap;
	} else {
		fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
		KASSERT(i >= NDENTRIES * NDENTRIES);
		memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
		memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
	}
	newfdp->fd_freefile = fdp->fd_freefile;
	newfdp->fd_exclose = fdp->fd_exclose;
	newfdp->fd_foclose = false;	/* no close-on-fork will be copied */

	ffp = fdp->fd_dt->dt_ff;
	nffp = newdt->dt_ff;
	newlast = -1;
	for (i = 0; i <= lastfile; i++, ffp++, nffp++) {
		KASSERT(i >= NDFDFILE ||
		    *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
		ff = *ffp;
		if (ff == NULL ||
		    (fp = atomic_load_consume(&ff->ff_file)) == NULL) {
			/* Descriptor unused, or descriptor half open. */
			KASSERT(!fd_isused(newfdp, i));
			continue;
		}
		if (__predict_false(ff->ff_foclose ||
		    fp->f_type == DTYPE_KQUEUE)) {
			/* kqueue descriptors cannot be copied. */
			/* close-on-fork descriptors aren't either */
			if (i < newfdp->fd_freefile) {
				newfdp->fd_freefile = i;
			}
			continue;
		}
		/* It's active: add a reference to the file. */
		mutex_enter(&fp->f_lock);
		fp->f_count++;
		mutex_exit(&fp->f_lock);

		/* Allocate an fdfile_t to represent it. */
		if (i >= NDFDFILE) {
			ff2 = kmem_alloc(sizeof(*ff2), KM_SLEEP);
			fdfile_ctor(ff2);
			*nffp = ff2;
		} else {
			ff2 = newdt->dt_ff[i];
		}
		ff2->ff_file = fp;
		ff2->ff_exclose = ff->ff_exclose;
		ff2->ff_foclose = false;
		ff2->ff_allocated = true;

		/* Fix up bitmaps. */
		j = i >> NDENTRYSHIFT;
		KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0);
		newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK);
		if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
			KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
			    (1U << (j & NDENTRYMASK))) == 0);
			newfdp->fd_himap[j >> NDENTRYSHIFT] |=
			    1U << (j & NDENTRYMASK);
		}
		newlast = i;
	}
	KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
	newfdp->fd_lastfile = newlast;
	fd_checkmaps(newfdp);
	mutex_exit(&fdp->fd_lock);

	return newfdp;
}

/*
 * Release a filedesc structure.
 */
void
fd_free(void)
{
	fdfile_t *ff;
	file_t *fp;
	int fd, nf;
	fdtab_t *dt;
	lwp_t * const l = curlwp;
	filedesc_t * const fdp = l->l_fd;
	const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;

	KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] ==
	    (fdfile_t *)fdp->fd_dfdfile[0]);
	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);

	membar_release();
	if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
		return;
	membar_acquire();

	/*
	 * Close any files that the process holds open.
	 */
	dt = fdp->fd_dt;
	fd_checkmaps(fdp);
#ifdef DEBUG
	fdp->fd_refcnt = -1;	/* see fd_checkmaps */
#endif
	for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
		ff = dt->dt_ff[fd];
		KASSERT(fd >= NDFDFILE ||
		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
		if (ff == NULL)
			continue;
		if ((fp = atomic_load_consume(&ff->ff_file)) != NULL) {
			/*
			 * Must use fd_close() here if there is
			 * a reference from kqueue or we might have posix
			 * advisory locks.
			 */
			if (__predict_true(ff->ff_refcnt == 0) &&
			    (noadvlock || fp->f_type != DTYPE_VNODE)) {
				ff->ff_file = NULL;
				ff->ff_exclose = false;
				ff->ff_foclose = false;
				ff->ff_allocated = false;
				closef(fp);
			} else {
				ff->ff_refcnt++;
				fd_close(fd);
			}
		}
		KASSERT(ff->ff_refcnt == 0);
		KASSERT(ff->ff_file == NULL);
		KASSERT(!ff->ff_exclose);
		KASSERT(!ff->ff_foclose);
		KASSERT(!ff->ff_allocated);
		if (fd >= NDFDFILE) {
			cv_destroy(&ff->ff_closing);
			kmem_free(ff, sizeof(*ff));
			dt->dt_ff[fd] = NULL;
		}
	}

	/*
	 * Clean out the descriptor table for the next user and return
	 * to the cache.
	 */
	if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
		fd_dtab_free(fdp->fd_dt);
		/* Otherwise, done above. */
		memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
		    (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
		fdp->fd_dt = &fdp->fd_dtbuiltin;
	}
	if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
		fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
	}
	if (__predict_false(fdp->fd_knhash != NULL)) {
		hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
		fdp->fd_knhash = NULL;
		fdp->fd_knhashmask = 0;
	} else {
		KASSERT(fdp->fd_knhashmask == 0);
	}
	fdp->fd_dt = &fdp->fd_dtbuiltin;
	fdp->fd_lastkqfile = -1;
	fdp->fd_lastfile = -1;
	fdp->fd_freefile = 0;
	fdp->fd_exclose = false;
	fdp->fd_foclose = false;
	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
	    offsetof(filedesc_t, fd_startzero));
	fdp->fd_himap = fdp->fd_dhimap;
	fdp->fd_lomap = fdp->fd_dlomap;
	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
	KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
#ifdef DEBUG
	fdp->fd_refcnt = 0;	/* see fd_checkmaps */
#endif
	fd_checkmaps(fdp);
	pool_cache_put(filedesc_cache, fdp);
}

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 */
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
{

	/*
	 * XXX Kludge: set dupfd to contain the value of the file
	 * descriptor being sought for duplication.  The error return
	 * ensures that the vnode for this device will be released by
	 * vn_open.  Open will detect this special error and take the
	 * actions in fd_dupopen below.  Other callers of vn_open or
	 * VOP_OPEN will simply report the error.
	 */
	l->l_dupfd = minor(dev);	/* XXX */
	return SET_ERROR(EDUPFD);
}

/*
 * Duplicate the specified descriptor to a free descriptor.
 *
 * old is the original fd.
 * moveit is true if we should move rather than duplicate.
 * flags are the open flags (converted from O_* to F*).
 * newp returns the new fd on success.
 *
 * These two cases are produced by the EDUPFD and EMOVEFD magic
 * errnos, but in the interest of removing that regrettable interface,
 * vn_open has been changed to intercept them.  Now vn_open returns
 * either a vnode or a filehandle, and the filehandle is accompanied
 * by a boolean that says whether we should dup (moveit == false) or
 * move (moveit == true) the fd.
 *
 * The dup case is used by /dev/stderr, /proc/self/fd, and such.  The
 * move case is used by cloner devices that allocate a fd of their
 * own (a layering violation that should go away eventually) that
 * then needs to be put in the place open() expects it.
 */
int
fd_dupopen(int old, bool moveit, int flags, int *newp)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	fdtab_t *dt;
	int error;

	if ((fp = fd_getfile(old)) == NULL) {
		return SET_ERROR(EBADF);
	}
	fdp = curlwp->l_fd;
	dt = atomic_load_consume(&fdp->fd_dt);
	ff = dt->dt_ff[old];

	/*
	 * There are two cases of interest here.
	 *
	 * 1. moveit == false (used to be the EDUPFD magic errno):
	 *    simply dup (old) to file descriptor (new) and return.
	 *
	 * 2. moveit == true (used to be the EMOVEFD magic errno):
	 *    steal away the file structure from (old) and store it in
	 *    (new).  (old) is effectively closed by this operation.
	 */
	if (moveit == false) {
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		if (((flags & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
			error = SET_ERROR(EACCES);
			goto out;
		}

		/* Copy it. */
		error = fd_dup(fp, 0, newp, ff->ff_exclose, ff->ff_foclose);
	} else {
		/* Copy it. */
		error = fd_dup(fp, 0, newp, ff->ff_exclose, ff->ff_foclose);
		if (error != 0) {
			goto out;
		}

		/* Steal away the file pointer from 'old'. */
		(void)fd_close(old);
		return 0;
	}

out:
	fd_putfile(old);
	return error;
}

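/*
 * Close-on-exec (FD_CLOEXEC) descriptors are closed here at exec
 * time; close-on-fork (FD_CLOFORK) descriptors were never copied by
 * fd_copy() at fork in the first place, and the flag itself is also
 * cleared on exec (see the Austin Group reference below).
 */
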
/*
 * Close open files on exec.
 */
void
fd_closeexec(void)
{
	proc_t *p;
	filedesc_t *fdp;
	fdfile_t *ff;
	lwp_t *l;
	fdtab_t *dt;
	int fd;

	l = curlwp;
	p = l->l_proc;
	fdp = p->p_fd;

	if (fdp->fd_refcnt > 1) {
		/*
		 * Always unshare the fd table on exec.
		 */
		fdp = fd_copy();
		fd_free();
		p->p_fd = fdp;
		l->l_fd = fdp;
	}

	/*
	 * If there are no "close-on" fd's, nothing more to do.
	 */
	if (!(fdp->fd_exclose || fdp->fd_foclose))
		return;

	fdp->fd_exclose = false;	/* there will be none when done */
	fdp->fd_foclose = false;

	dt = atomic_load_consume(&fdp->fd_dt);

	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
		if ((ff = dt->dt_ff[fd]) == NULL) {
			KASSERT(fd >= NDFDFILE);
			continue;
		}
		KASSERT(fd >= NDFDFILE ||
		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
		if (ff->ff_file == NULL)
			continue;
		if (ff->ff_exclose) {
			/*
			 * We need a reference to close the file.
			 * No other threads can see the fdfile_t at
			 * this point, so don't bother locking.
			 */
			KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
			ff->ff_refcnt++;
			fd_close(fd);
		} else if (ff->ff_foclose) {
			/*
			 * https://austingroupbugs.net/view.php?id=1851
			 * (not yet approved, but probably will be: 202507)
			 * FD_CLOFORK should not be preserved across exec.
			 */
			ff->ff_foclose = false;
		}
	}
}

/*
 * Set descriptor owner.  If the owner is a process, 'pgid' is set to
 * the positive process ID.  If the owner is a process group, 'pgid'
 * is set to -pg_id.
 */
int
fsetown(pid_t *pgid, u_long cmd, const void *data)
{
	pid_t id = *(const pid_t *)data;
	int error;

	if (id <= INT_MIN)
		return SET_ERROR(EINVAL);

	switch (cmd) {
	case TIOCSPGRP:
		if (id < 0)
			return SET_ERROR(EINVAL);
		id = -id;
		break;
	default:
		break;
	}
	if (id > 0) {
		mutex_enter(&proc_lock);
		error = proc_find(id) ? 0 : SET_ERROR(ESRCH);
		mutex_exit(&proc_lock);
	} else if (id < 0) {
		error = pgid_in_session(curproc, -id);
	} else {
		error = 0;
	}
	if (!error) {
		*pgid = id;
	}
	return error;
}

void
fd_set_exclose(struct lwp *l, int fd, bool exclose)
{
	filedesc_t *fdp = l->l_fd;
	fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];

	ff->ff_exclose = exclose;
	if (exclose)
		fdp->fd_exclose = true;
}

void
fd_set_foclose(struct lwp *l, int fd, bool foclose)
{
	filedesc_t *fdp = l->l_fd;
	fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];

	ff->ff_foclose = foclose;
	if (foclose)
		fdp->fd_foclose = true;
}

/*
 * Return descriptor owner information.  If the value is positive,
 * it's a process ID.  If it's negative, it's a process group ID and
 * needs the sign removed before use.
 */
int
fgetown(pid_t pgid, u_long cmd, void *data)
{

	switch (cmd) {
	case TIOCGPGRP:
		KASSERT(pgid > INT_MIN);
		*(int *)data = -pgid;
		break;
	default:
		*(int *)data = pgid;
		break;
	}
	return 0;
}

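/*
 * For example, F_SETOWN with a value of 1234 records a process owner
 * as pgid = 1234, while TIOCSPGRP with process group 1234 records a
 * process group owner as pgid = -1234; fgetown() above and
 * fownsignal() below undo the encoding the same way.
 */
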
/*
 * Send signal to descriptor owner, either process or process group.
 */
void
fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
{
	ksiginfo_t ksi;

	KASSERT(!cpu_intr_p());

	if (pgid == 0) {
		return;
	}

	KSI_INIT(&ksi);
	ksi.ksi_signo = signo;
	ksi.ksi_code = code;
	ksi.ksi_band = band;

	mutex_enter(&proc_lock);
	if (pgid > 0) {
		struct proc *p1;

		p1 = proc_find(pgid);
		if (p1 != NULL) {
			kpsignal(p1, &ksi, fdescdata);
		}
	} else {
		struct pgrp *pgrp;

		KASSERT(pgid < 0 && pgid > INT_MIN);
		pgrp = pgrp_find(-pgid);
		if (pgrp != NULL) {
			kpgsignal(pgrp, &ksi, fdescdata, 0);
		}
	}
	mutex_exit(&proc_lock);
}

int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
    void *data)
{

	fp->f_flag = flag & FMASK;
	fd_set_exclose(curlwp, fd, (flag & O_CLOEXEC) != 0);
	fd_set_foclose(curlwp, fd, (flag & O_CLOFORK) != 0);
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return SET_ERROR(EMOVEFD);
}

int
fnullop_fcntl(file_t *fp, u_int cmd, void *data)
{

	if (cmd == F_SETFL)
		return 0;

	return SET_ERROR(EOPNOTSUPP);
}

int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}

int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return SET_ERROR(EOPNOTSUPP);
}

void
fnullop_restart(file_t *fp)
{

}

int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags)
{

	return SET_ERROR(EOPNOTSUPP);
}

int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags)
{

	return SET_ERROR(EOPNOTSUPP);
}

int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return SET_ERROR(EOPNOTSUPP);
}

int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return SET_ERROR(EOPNOTSUPP);
}

int
fbadop_close(file_t *fp)
{

	return SET_ERROR(EOPNOTSUPP);
}

/*
 * sysctl routines pertaining to file descriptors
 */

/* Initialized in sysctl_init() for now... */
extern kmutex_t sysctl_file_marker_lock;
static u_int sysctl_file_marker = 1;

/*
 * Expects to be called with proc_lock and sysctl_file_marker_lock locked.
 */
static void
sysctl_file_marker_reset(void)
{
	struct proc *p;

	PROCLIST_FOREACH(p, &allproc) {
		struct filedesc *fd = p->p_fd;
		fdtab_t *dt;
		u_int i;

		mutex_enter(&fd->fd_lock);
		dt = fd->fd_dt;
		for (i = 0; i < dt->dt_nfiles; i++) {
			struct file *fp;
			fdfile_t *ff;

			if ((ff = dt->dt_ff[i]) == NULL) {
				continue;
			}
			if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
				continue;
			}
			fp->f_marker = 0;
		}
		mutex_exit(&fd->fd_lock);
	}
}

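/*
 * A file may be visible via several processes and descriptors, so a
 * sweep over allproc could report it more than once.  The handlers
 * below use f_marker to deduplicate: each sweep takes a new value
 * from sysctl_file_marker, stamps every file it emits, and skips
 * files already carrying the current value.  When the counter wraps
 * to zero, sysctl_file_marker_reset() clears the stale stamps.
 */
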

/*
 * sysctl helper routine for kern.file pseudo-subtree.
 */
static int
sysctl_kern_file(SYSCTLFN_ARGS)
{
	const bool allowaddr = get_expose_address(curproc);
	struct filelist flist;
	int error;
	size_t buflen;
	struct file *fp, fbuf;
	char *start, *where;
	struct proc *p;

	start = where = oldp;
	buflen = *oldlenp;

	if (where == NULL) {
		/*
		 * Overestimate by 10 files.
		 */
		*oldlenp = sizeof(filehead) + (nfiles + 10) *
		    sizeof(struct file);
		return 0;
	}

	/*
	 * First sysctl_copyout the filehead ...
	 */
	if (buflen < sizeof(filehead)) {
		*oldlenp = 0;
		return 0;
	}
	sysctl_unlock();
	if (allowaddr) {
		memcpy(&flist, &filehead, sizeof(flist));
	} else {
		memset(&flist, 0, sizeof(flist));
	}
	error = sysctl_copyout(l, &flist, where, sizeof(flist));
	if (error) {
		sysctl_relock();
		return error;
	}
	buflen -= sizeof(flist);
	where += sizeof(flist);

	/*
	 * ... followed by an array of file structures.
	 */
	mutex_enter(&sysctl_file_marker_lock);
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct filedesc *fd;
		fdtab_t *dt;
		u_int i;

		if (p->p_stat == SIDL) {
			/* skip embryonic processes */
			continue;
		}
		mutex_enter(p->p_lock);
		error = kauth_authorize_process(l->l_cred,
		    KAUTH_PROCESS_CANSEE, p,
		    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
		    NULL, NULL);
		mutex_exit(p->p_lock);
		if (error != 0) {
			/*
			 * Don't leak kauth retval if we're silently
			 * skipping this entry.
			 */
			error = 0;
			continue;
		}

		/*
		 * Grab a hold on the process.
		 */
		if (!rw_tryenter(&p->p_reflock, RW_READER)) {
			continue;
		}
		mutex_exit(&proc_lock);

		fd = p->p_fd;
		mutex_enter(&fd->fd_lock);
		dt = fd->fd_dt;
		for (i = 0; i < dt->dt_nfiles; i++) {
			fdfile_t *ff;

			if ((ff = dt->dt_ff[i]) == NULL) {
				continue;
			}
			if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
				continue;
			}

			mutex_enter(&fp->f_lock);

			if ((fp->f_count == 0) ||
			    (fp->f_marker == sysctl_file_marker)) {
				mutex_exit(&fp->f_lock);
				continue;
			}

			/* Check that we have enough space. */
			if (buflen < sizeof(struct file)) {
				*oldlenp = where - start;
				mutex_exit(&fp->f_lock);
				error = SET_ERROR(ENOMEM);
				break;
			}

			fill_file(&fbuf, fp);
			mutex_exit(&fp->f_lock);
			error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
			if (error) {
				break;
			}
			buflen -= sizeof(struct file);
			where += sizeof(struct file);

			fp->f_marker = sysctl_file_marker;
		}
		mutex_exit(&fd->fd_lock);

		/*
		 * Release reference to process.
		 */
		mutex_enter(&proc_lock);
		rw_exit(&p->p_reflock);

		if (error)
			break;
	}
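
	/*
	 * Advancing the generation number un-marks, in O(1), every file
	 * stamped during this scan; the per-file zeroing in
	 * sysctl_file_marker_reset() is needed only when the counter
	 * wraps back to zero.
	 */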

	sysctl_file_marker++;
	/* Reset all markers if wrapped. */
	if (sysctl_file_marker == 0) {
		sysctl_file_marker_reset();
		sysctl_file_marker++;
	}

	mutex_exit(&proc_lock);
	mutex_exit(&sysctl_file_marker_lock);

	*oldlenp = where - start;
	sysctl_relock();
	return error;
}

/*
 * sysctl helper function for kern.file2
 */
static int
sysctl_kern_file2(SYSCTLFN_ARGS)
{
	struct proc *p;
	struct file *fp;
	struct filedesc *fd;
	struct kinfo_file kf;
	char *dp;
	u_int i, op;
	size_t len, needed, elem_size, out_size;
	int error, arg, elem_count;
	fdfile_t *ff;
	fdtab_t *dt;

	if (namelen == 1 && name[0] == CTL_QUERY)
		return sysctl_query(SYSCTLFN_CALL(rnode));

	if (namelen != 4)
		return SET_ERROR(EINVAL);

	error = 0;
	dp = oldp;
	len = (oldp != NULL) ? *oldlenp : 0;
	op = name[0];
	arg = name[1];
	elem_size = name[2];
	elem_count = name[3];
	out_size = MIN(sizeof(kf), elem_size);
	needed = 0;

	if (elem_size < 1 || elem_count < 0)
		return SET_ERROR(EINVAL);

	switch (op) {
	case KERN_FILE_BYFILE:
	case KERN_FILE_BYPID:
		/*
		 * We're traversing the process list in both cases; the
		 * BYFILE case does the additional work of keeping track
		 * of files already looked at.
		 */

		/* BYFILE doesn't use arg, so it must be zero. */
		if ((op == KERN_FILE_BYFILE) && (arg != 0))
			return SET_ERROR(EINVAL);

		if ((op == KERN_FILE_BYPID) && (arg < -1))
			/* -1 means all processes */
			return SET_ERROR(EINVAL);

		sysctl_unlock();
		if (op == KERN_FILE_BYFILE)
			mutex_enter(&sysctl_file_marker_lock);
		mutex_enter(&proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if (p->p_stat == SIDL) {
				/* skip embryonic processes */
				continue;
			}
			if (arg > 0 && p->p_pid != arg) {
				/* pick only the one we want */
				/* XXX want 0 to mean "kernel files" */
				continue;
			}
			mutex_enter(p->p_lock);
			error = kauth_authorize_process(l->l_cred,
			    KAUTH_PROCESS_CANSEE, p,
			    KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
			    NULL, NULL);
			mutex_exit(p->p_lock);
			if (error != 0) {
				/*
				 * Don't leak kauth retval if we're silently
				 * skipping this entry.
				 */
				error = 0;
				continue;
			}

			/*
			 * Grab a hold on the process.
			 */
			if (!rw_tryenter(&p->p_reflock, RW_READER)) {
				continue;
			}
			mutex_exit(&proc_lock);

			fd = p->p_fd;
			mutex_enter(&fd->fd_lock);
			dt = fd->fd_dt;
			for (i = 0; i < dt->dt_nfiles; i++) {
				if ((ff = dt->dt_ff[i]) == NULL) {
					continue;
				}
				if ((fp = atomic_load_consume(&ff->ff_file)) ==
				    NULL) {
					continue;
				}

				if ((op == KERN_FILE_BYFILE) &&
				    (fp->f_marker == sysctl_file_marker)) {
					continue;
				}
				if (len >= elem_size && elem_count > 0) {
					mutex_enter(&fp->f_lock);
					fill_file2(&kf, fp, ff, i, p->p_pid);
					mutex_exit(&fp->f_lock);
					mutex_exit(&fd->fd_lock);
					error = sysctl_copyout(l,
					    &kf, dp, out_size);
					mutex_enter(&fd->fd_lock);
					if (error)
						break;
					dp += elem_size;
					len -= elem_size;
				}
				if (op == KERN_FILE_BYFILE)
					fp->f_marker = sysctl_file_marker;
				needed += elem_size;
				if (elem_count > 0 && elem_count != INT_MAX)
					elem_count--;
			}
			mutex_exit(&fd->fd_lock);
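
			/*
			 * The read hold on p_reflock taken above is what
			 * keeps 'p' from going away while proc_lock is
			 * dropped; proc_lock is retaken before the hold
			 * is released so the list iteration stays valid.
			 */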

			/*
			 * Release reference to process.
			 */
			mutex_enter(&proc_lock);
			rw_exit(&p->p_reflock);
		}
		if (op == KERN_FILE_BYFILE) {
			sysctl_file_marker++;

			/* Reset all markers if wrapped. */
			if (sysctl_file_marker == 0) {
				sysctl_file_marker_reset();
				sysctl_file_marker++;
			}
		}
		mutex_exit(&proc_lock);
		if (op == KERN_FILE_BYFILE)
			mutex_exit(&sysctl_file_marker_lock);
		sysctl_relock();
		break;
	default:
		return SET_ERROR(EINVAL);
	}

	if (oldp == NULL)
		needed += KERN_FILESLOP * elem_size;
	*oldlenp = needed;

	return error;
}

static void
fill_file(struct file *fp, const struct file *fpsrc)
{
	const bool allowaddr = get_expose_address(curproc);

	memset(fp, 0, sizeof(*fp));

	fp->f_offset = fpsrc->f_offset;
	COND_SET_PTR(fp->f_cred, fpsrc->f_cred, allowaddr);
	COND_SET_CPTR(fp->f_ops, fpsrc->f_ops, allowaddr);
	COND_SET_STRUCT(fp->f_undata, fpsrc->f_undata, allowaddr);
	COND_SET_STRUCT(fp->f_list, fpsrc->f_list, allowaddr);
	fp->f_flag = fpsrc->f_flag;
	fp->f_marker = fpsrc->f_marker;
	fp->f_type = fpsrc->f_type;
	fp->f_advice = fpsrc->f_advice;
	fp->f_count = fpsrc->f_count;
	fp->f_msgcount = fpsrc->f_msgcount;
	fp->f_unpcount = fpsrc->f_unpcount;
	COND_SET_STRUCT(fp->f_unplist, fpsrc->f_unplist, allowaddr);
}

static void
fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
    int i, pid_t pid)
{
	const bool allowaddr = get_expose_address(curproc);

	memset(kp, 0, sizeof(*kp));

	COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr);
	kp->ki_flag = fp->f_flag;
	kp->ki_iflags = 0;
	kp->ki_ftype = fp->f_type;
	kp->ki_count = fp->f_count;
	kp->ki_msgcount = fp->f_msgcount;
	COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr);
	kp->ki_fuid = kauth_cred_geteuid(fp->f_cred);
	kp->ki_fgid = kauth_cred_getegid(fp->f_cred);
	COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr);
	kp->ki_foffset = fp->f_offset;
	COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr);

	/* vnode information to glue this file to something */
	if (fp->f_type == DTYPE_VNODE) {
		struct vnode *vp = fp->f_vnode;

		COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket),
		    allowaddr);
		kp->ki_vsize = vp->v_size;
		kp->ki_vtype = vp->v_type;
		kp->ki_vtag = vp->v_tag;
		COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data),
		    allowaddr);
	}

	/* process information when retrieved via KERN_FILE_BYPID */
	if (ff != NULL) {
		kp->ki_pid = pid;
		kp->ki_fd = i;
		kp->ki_ofileflags = (ff->ff_exclose ? FD_CLOEXEC : 0) |
		    (ff->ff_foclose ? FD_CLOFORK : 0);
		kp->ki_usecount = ff->ff_refcnt;
	}
}
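
/*
 * Illustration only, a rough sketch of a userland consumer: the
 * kern.file2 node is typically sized with a NULL 'oldp' probe and
 * then read as an array of kinfo_file records, e.g. for the current
 * process:
 *
 *	int mib[6] = { CTL_KERN, KERN_FILE2, KERN_FILE_BYPID,
 *	    (int)getpid(), (int)sizeof(struct kinfo_file), INT_MAX };
 *	size_t len = 0;
 *	if (sysctl(mib, 6, NULL, &len, NULL, 0) == -1)	// size probe
 *		err(1, "sysctl");
 *	struct kinfo_file *kf = malloc(len);
 *	if (kf == NULL || sysctl(mib, 6, kf, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 *
 * The KERN_FILESLOP slack added when oldp == NULL gives such callers
 * headroom against files opened between the two calls.
 */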