kern_descrip.c revision 1.223 1 /* $NetBSD: kern_descrip.c,v 1.223 2014/02/25 18:30:11 pooka Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1982, 1986, 1989, 1991, 1993
34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
66 */
67
68 /*
69 * File descriptor management.
70 */
71
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.223 2014/02/25 18:30:11 pooka Exp $");
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/kernel.h>
79 #include <sys/proc.h>
80 #include <sys/file.h>
81 #include <sys/socket.h>
82 #include <sys/socketvar.h>
83 #include <sys/stat.h>
84 #include <sys/ioctl.h>
85 #include <sys/fcntl.h>
86 #include <sys/pool.h>
87 #include <sys/unistd.h>
88 #include <sys/resourcevar.h>
89 #include <sys/conf.h>
90 #include <sys/event.h>
91 #include <sys/kauth.h>
92 #include <sys/atomic.h>
93 #include <sys/syscallargs.h>
94 #include <sys/cpu.h>
95 #include <sys/kmem.h>
96 #include <sys/vnode.h>
97 #include <sys/sysctl.h>
98 #include <sys/ktrace.h>
99
/*
 * A list (head) of open files, counter, and lock protecting them.
 * filelist_lock must be held to walk or modify filehead, or to
 * update nfiles.
 */
struct filelist filehead __cacheline_aligned;
static u_int nfiles __cacheline_aligned;
kmutex_t filelist_lock __cacheline_aligned;

/* Pool caches backing filedesc_t, file_t and fdfile_t allocations. */
static pool_cache_t filedesc_cache __read_mostly;
static pool_cache_t file_cache __read_mostly;
static pool_cache_t fdfile_cache __read_mostly;

/* Pool cache constructors/destructors (see definitions below). */
static int file_ctor(void *, void *, int);
static void file_dtor(void *, void *);
static int fdfile_ctor(void *, void *, int);
static void fdfile_dtor(void *, void *);
static int filedesc_ctor(void *, void *, int);
static void filedesc_dtor(void *, void *);
static int filedescopen(dev_t, int, int, lwp_t *);

/* Handlers for the kern.file and kern.file2 sysctl nodes. */
static int sysctl_kern_file(SYSCTLFN_PROTO);
static int sysctl_kern_file2(SYSCTLFN_PROTO);
static void fill_file(struct kinfo_file *, const file_t *, const fdfile_t *,
		      int, pid_t);

/*
 * Character device switch for the file descriptor pseudo-device;
 * only open is implemented (see filedescopen), everything else is
 * a no-op stub.
 */
const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};

/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
132
/*
 * Initialize the descriptor system: global lock, pool caches, and
 * the kern.file / kern.file2 sysctl nodes.  Called once at boot.
 */
void
fd_sys_init(void)
{
	static struct sysctllog *clog;

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	/* file_t cache; ctor/dtor maintain filehead and nfiles. */
	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	/* fdfile_t cache; PR_LARGECACHE as these are allocated often. */
	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
	    NULL);
	KASSERT(fdfile_cache != NULL);

	/* filedesc_t (per-process descriptor table) cache. */
	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);

	sysctl_createv(&clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "file",
		       SYSCTL_DESCR("System open file table"),
		       sysctl_kern_file, 0, NULL, 0,
		       CTL_KERN, KERN_FILE, CTL_EOL);
	sysctl_createv(&clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_STRUCT, "file2",
		       SYSCTL_DESCR("System open file table"),
		       sysctl_kern_file2, 0, NULL, 0,
		       CTL_KERN, KERN_FILE2, CTL_EOL);
}
170
171 static bool
172 fd_isused(filedesc_t *fdp, unsigned fd)
173 {
174 u_int off = fd >> NDENTRYSHIFT;
175
176 KASSERT(fd < fdp->fd_dt->dt_nfiles);
177
178 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
179 }
180
/*
 * Verify that the bitmaps match the descriptor table.
 *
 * DEBUG-only consistency check; compiles to nothing otherwise.
 * Caller normally holds fd_lock, but the check is also safe during
 * teardown (fd_refcnt == -1) where it simply returns.
 */
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
	fdtab_t *dt;
	u_int fd;

	dt = fdp->fd_dt;
	if (fdp->fd_refcnt == -1) {
		/*
		 * fd_free tears down the table without maintaining its bitmap.
		 */
		return;
	}
	for (fd = 0; fd < dt->dt_nfiles; fd++) {
		/* The first NDFDFILE slots are embedded in the filedesc. */
		if (fd < NDFDFILE) {
			KASSERT(dt->dt_ff[fd] ==
			    (fdfile_t *)fdp->fd_dfdfile[fd]);
		}
		/* No fdfile => must be free; open file => must be used. */
		if (dt->dt_ff[fd] == NULL) {
			KASSERT(!fd_isused(fdp, fd));
		} else if (dt->dt_ff[fd]->ff_file != NULL) {
			KASSERT(fd_isused(fdp, fd));
		}
	}
#endif
}
211
/*
 * Find the first zero bit in `bitmap' at index >= `want', where the
 * map covers `bits' bits in 32-bit (NDENTRIES-wide) words.  Returns
 * the bit index, or -1 if no zero bit exists in range.  Used for both
 * the low map (one bit per fd) and the high map (one bit per full
 * low-map word).  Caller must hold fd_lock.
 */
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	fd_checkmaps(fdp);

	if (want > bits)
		return -1;

	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		/*
		 * Partial first word: mask off the bits below `want' by
		 * treating them as already set, so they can't match.
		 */
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	/* Scan whole words for one that is not completely full. */
	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return -1;

 found:
	/* ffs() on the complement locates the lowest zero bit. */
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
246
/*
 * Find the highest allocated descriptor strictly below `last'.
 * Returns -1 if none.  Used to maintain fd_lastfile when a
 * descriptor is released.  Caller must hold fd_lock.
 */
static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ff = fd->fd_dt->dt_ff;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	fd_checkmaps(fd);

	off = (last - 1) >> NDENTRYSHIFT;

	/* Skip over completely-empty low-map words. */
	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return -1;

	/* Highest candidate index within the non-empty word. */
	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* XXX should use bitmap */
	while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
		i--;

	return i;
}
276
/*
 * Mark descriptor `fd' as allocated: set its bit in the low map,
 * update the high map if the low-map word becomes full, and advance
 * fd_lastfile if needed.  Caller must hold fd_lock, and the slot's
 * fdfile must already exist but have no open file.
 */
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = true;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	if (__predict_false(fdp->fd_lomap[off] == ~0)) {
		/* Low-map word now full: record that in the high map. */
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	fd_checkmaps(fdp);
}
305
/*
 * Mark descriptor `fd' as free: clear its bitmap bits and update
 * fd_freefile / fd_lastfile.  The slot's file must already have been
 * detached (ff_file == NULL).
 */
static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_dt->dt_ff[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* If the low-map word was full, clear its bit in the high map. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = false;

	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}
	fd_checkmaps(fdp);
}
345
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 * Returns NULL if the descriptor is invalid, unused, or closing.
 * The reference must be released with fd_putfile().
 */
file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	fdtab_t *dt;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * We are doing this unlocked.  See fd_tryexpand().
	 */
	fdp = curlwp->l_fd;
	dt = fdp->fd_dt;
	if (__predict_false(fd >= dt->dt_nfiles)) {
		return NULL;
	}
	ff = dt->dt_ff[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/* Now get a reference to the descriptor. */
	if (fdp->fd_refcnt == 1) {
		/*
		 * Single threaded: don't need to worry about concurrent
		 * access (other than earlier calls to kqueue, which may
		 * hold a reference to the descriptor).
		 */
		ff->ff_refcnt++;
	} else {
		/*
		 * Multi threaded: issue a memory barrier to ensure that we
		 * acquire the file pointer _after_ adding a reference.  If
		 * no memory barrier, we could fetch a stale pointer.
		 */
		atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
		membar_enter();
#endif
	}

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}
404
/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 * If another thread has started closing the descriptor, this caller
 * joins the close and may complete it.
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(fd < fdp->fd_dt->dt_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	if (fdp->fd_refcnt == 1) {
		/*
		 * Single threaded: don't need to worry about concurrent
		 * access (other than earlier calls to kqueue, which may
		 * hold a reference to the descriptor).
		 */
		if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
			/* Complete the deferred close. */
			fd_close(fd);
			return;
		}
		ff->ff_refcnt--;
		return;
	}

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			/* CAS succeeded: reference dropped. */
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
		/* Lost a benign race with another get/put: retry with v. */
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
466
467 /*
468 * Convenience wrapper around fd_getfile() that returns reference
469 * to a vnode.
470 */
471 int
472 fd_getvnode(unsigned fd, file_t **fpp)
473 {
474 vnode_t *vp;
475 file_t *fp;
476
477 fp = fd_getfile(fd);
478 if (__predict_false(fp == NULL)) {
479 return EBADF;
480 }
481 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
482 fd_putfile(fd);
483 return EINVAL;
484 }
485 vp = fp->f_data;
486 if (__predict_false(vp->v_type == VBAD)) {
487 /* XXX Is this case really necessary? */
488 fd_putfile(fd);
489 return EBADF;
490 }
491 *fpp = fp;
492 return 0;
493 }
494
495 /*
496 * Convenience wrapper around fd_getfile() that returns reference
497 * to a socket.
498 */
499 int
500 fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
501 {
502 *fp = fd_getfile(fd);
503 if (__predict_false(*fp == NULL)) {
504 return EBADF;
505 }
506 if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) {
507 fd_putfile(fd);
508 return ENOTSOCK;
509 }
510 *sop = (*fp)->f_data;
511 return 0;
512 }
513
514 int
515 fd_getsock(unsigned fd, struct socket **sop)
516 {
517 file_t *fp;
518 return fd_getsock1(fd, sop, &fp);
519 }
520
521 /*
522 * Look up the file structure corresponding to a file descriptor
523 * and return it with a reference held on the file, not the
524 * descriptor.
525 *
526 * This is heavyweight and only used when accessing descriptors
527 * from a foreign process. The caller must ensure that `p' does
528 * not exit or fork across this call.
529 *
530 * To release the file (not descriptor) reference, use closef().
531 */
532 file_t *
533 fd_getfile2(proc_t *p, unsigned fd)
534 {
535 filedesc_t *fdp;
536 fdfile_t *ff;
537 file_t *fp;
538 fdtab_t *dt;
539
540 fdp = p->p_fd;
541 mutex_enter(&fdp->fd_lock);
542 dt = fdp->fd_dt;
543 if (fd >= dt->dt_nfiles) {
544 mutex_exit(&fdp->fd_lock);
545 return NULL;
546 }
547 if ((ff = dt->dt_ff[fd]) == NULL) {
548 mutex_exit(&fdp->fd_lock);
549 return NULL;
550 }
551 if ((fp = ff->ff_file) == NULL) {
552 mutex_exit(&fdp->fd_lock);
553 return NULL;
554 }
555 mutex_enter(&fp->f_lock);
556 fp->f_count++;
557 mutex_exit(&fp->f_lock);
558 mutex_exit(&fdp->fd_lock);
559
560 return fp;
561 }
562
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;
	u_int refcnt;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&fdp->fd_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (__predict_false(ff->ff_file == NULL)) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&fdp->fd_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
	if (fdp->fd_refcnt == 1) {
		/* Single threaded. */
		refcnt = --(ff->ff_refcnt);
	} else {
		/* Multi threaded. */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
		membar_producer();
#endif
		refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
	}
	if (__predict_false(refcnt != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 * (Or just a threaded application trying to unblock its
		 * thread that sleeps in (say) accept()).
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);

		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		mutex_exit(&fdp->fd_lock);
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			knote_fdclose(fd);
		}

		/*
		 * Since the file system code doesn't know which fd
		 * each request came from (think dup()), we have to
		 * ask it to return ERESTART for any long-term blocks.
		 * The re-entry through read/write/etc will detect the
		 * closed fd and return EBADF.
		 * Blocked partial writes may return a short length.
		 */
		(*fp->f_ops->fo_restart)(fp);
		mutex_enter(&fdp->fd_lock);

		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 * XXX (dsl) this may need to call fo_restart() after a
		 * timeout to guarantee that all the system calls exit.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &fdp->fd_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 &&
	    fp->f_type == DTYPE_VNODE)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		mutex_exit(&fdp->fd_lock);
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
		mutex_enter(&fdp->fd_lock);
	}

	/* Free descriptor slot. */
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
708
709 /*
710 * Duplicate a file descriptor.
711 */
712 int
713 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
714 {
715 proc_t *p = curproc;
716 int error;
717
718 while ((error = fd_alloc(p, minfd, newp)) != 0) {
719 if (error != ENOSPC) {
720 return error;
721 }
722 fd_tryexpand(p);
723 }
724
725 curlwp->l_fd->fd_dt->dt_ff[*newp]->ff_exclose = exclose;
726 fd_affix(p, fp, *newp);
727 return 0;
728 }
729
/*
 * dup2 operation: install `fp' at descriptor `new', closing any file
 * already open there.  `flags' may contain O_CLOEXEC and O_NONBLOCK;
 * anything else is rejected with EINVAL.
 */
int
fd_dup2(file_t *fp, unsigned new, int flags)
{
	filedesc_t *fdp = curlwp->l_fd;
	fdfile_t *ff;
	fdtab_t *dt;

	if (flags & ~(O_CLOEXEC|O_NONBLOCK))
		return EINVAL;
	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_dt->dt_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/*
			 * Crummy, but unlikely to happen.
			 * Can occur if we interrupt another
			 * thread while it is opening a file.
			 */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	dt = fdp->fd_dt;
	if (dt->dt_ff[new] == NULL) {
		/* Slot had no fdfile yet: install the preallocated one. */
		KASSERT(new >= NDFDFILE);
		dt->dt_ff[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	dt->dt_ff[new]->ff_exclose = (flags & O_CLOEXEC) != 0;
	fp->f_flag |= flags & FNONBLOCK;
	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	/* Preallocated fdfile not needed after all: return it. */
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
789
/*
 * Drop reference to a file structure.  When the last reference goes
 * away, release any BSD-style (flock) locks, call the file's close
 * operation, and free the structure.  Returns the close error (0 if
 * this was not the last reference or there are no file ops).
 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
	} else {
		error = 0;
	}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_cred != NULL);
	pool_cache_put(file_cache, fp);

	return error;
}
831
/*
 * Allocate a file descriptor for the process: find the lowest free
 * slot >= max(want, fd_freefile) using the two-level bitmap, mark it
 * used, and return it via `result'.  Returns ENOSPC if the current
 * table is full but may be expanded (caller should fd_tryexpand()
 * and retry), or EMFILE if the resource limit has been reached.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp = p->p_fd;
	int i, lim, last, error;
	u_int off, new;
	fdtab_t *dt;

	KASSERT(p == curproc || p == &proc0);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	fd_checkmaps(fdp);
	dt = fdp->fd_dt;
	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(dt->dt_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		/* High map: find a low-map word with at least one free bit. */
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		/* Low map: find the free bit within that word. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		/* Lazily allocate the fdfile for this slot. */
		if (dt->dt_ff[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK);
		}
		KASSERT(dt->dt_ff[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		KASSERT(i >= NDFDFILE ||
		    dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		fd_checkmaps(fdp);
		mutex_exit(&fdp->fd_lock);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	return error;
}
899
900 /*
901 * Allocate memory for a descriptor table.
902 */
903 static fdtab_t *
904 fd_dtab_alloc(int n)
905 {
906 fdtab_t *dt;
907 size_t sz;
908
909 KASSERT(n > NDFILE);
910
911 sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
912 dt = kmem_alloc(sz, KM_SLEEP);
913 #ifdef DIAGNOSTIC
914 memset(dt, 0xff, sz);
915 #endif
916 dt->dt_nfiles = n;
917 dt->dt_link = NULL;
918 return dt;
919 }
920
921 /*
922 * Free a descriptor table, and all tables linked for deferred free.
923 */
924 static void
925 fd_dtab_free(fdtab_t *dt)
926 {
927 fdtab_t *next;
928 size_t sz;
929
930 do {
931 next = dt->dt_link;
932 KASSERT(dt->dt_nfiles > NDFILE);
933 sz = sizeof(*dt) +
934 (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
935 #ifdef DIAGNOSTIC
936 memset(dt, 0xff, sz);
937 #endif
938 kmem_free(dt, sz);
939 dt = next;
940 } while (dt != NULL);
941 }
942
943 /*
944 * Allocate descriptor bitmap.
945 */
946 static void
947 fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
948 {
949 uint8_t *ptr;
950 size_t szlo, szhi;
951
952 KASSERT(n > NDENTRIES);
953
954 szlo = NDLOSLOTS(n) * sizeof(uint32_t);
955 szhi = NDHISLOTS(n) * sizeof(uint32_t);
956 ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
957 *lo = (uint32_t *)ptr;
958 *hi = (uint32_t *)(ptr + szlo);
959 }
960
961 /*
962 * Free descriptor bitmap.
963 */
964 static void
965 fd_map_free(int n, uint32_t *lo, uint32_t *hi)
966 {
967 size_t szlo, szhi;
968
969 KASSERT(n > NDENTRIES);
970
971 szlo = NDLOSLOTS(n) * sizeof(uint32_t);
972 szhi = NDHISLOTS(n) * sizeof(uint32_t);
973 KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
974 kmem_free(lo, szlo + szhi);
975 }
976
/*
 * Expand a process' descriptor table.  Allocates a new table (and,
 * if needed, new bitmaps) outside the lock, then installs it if the
 * table has not changed in the meantime.  The old table may be kept
 * on a deferred-free list so that concurrent lockless readers in
 * fd_getfile() remain safe.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdtab_t *newdt, *dt;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_dt->dt_nfiles;

	/* Grow to NDEXTENT first, then double each time. */
	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	/* Allocate before taking the lock; kmem_alloc may sleep. */
	newdt = fd_dtab_alloc(numfiles);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		fd_map_alloc(numfiles, &newlomap, &newhimap);
	}

	mutex_enter(&fdp->fd_lock);
	dt = fdp->fd_dt;
	KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (dt->dt_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		fd_dtab_free(newdt);
		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
			fd_map_free(numfiles, newlomap, newhimap);
		}
		return;
	}

	/* Copy the existing descriptor table and zero the new portion. */
	i = sizeof(fdfile_t *) * oldnfiles;
	memcpy(newdt->dt_ff, dt->dt_ff, i);
	memset((uint8_t *)newdt->dt_ff + i, 0,
	    numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old descriptor array into list to be discarded.  We defer
	 * freeing until the last reference to the descriptor table goes
	 * away (usually process exit).  This allows us to do lockless
	 * lookups in fd_getfile().
	 */
	if (oldnfiles > NDFILE) {
		if (fdp->fd_refcnt > 1) {
			newdt->dt_link = dt;
		} else {
			/* Single threaded: safe to free immediately. */
			fd_dtab_free(dt);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		/* Copy old bitmap contents; zero the newly added slots. */
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Old maps are only heap-allocated above the NDFILE size. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_dt.  See fd_getfile().
	 */
	membar_producer();
	fdp->fd_dt = newdt;
	KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	fd_checkmaps(fdp);
	mutex_exit(&fdp->fd_lock);
}
1066
/*
 * Create a new open file structure and allocate a file descriptor
 * for the current process.  On success, *resultfp holds a freshly
 * initialized file_t (with f_count == 0 until fd_affix()) and
 * *resultfd the reserved descriptor.  On failure the descriptor is
 * released again via fd_abort().
 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	proc_t *p = curproc;
	kauth_cred_t cred;
	file_t *fp;
	int error;

	/* Reserve a descriptor slot, expanding the table as needed. */
	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	if (fp == NULL) {
		/* file_ctor failed (global file table full). */
		fd_abort(p, NULL, *resultfd);
		return ENFILE;
	}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_msgcount == 0);
	KASSERT(fp->f_unpcount == 0);

	/* Replace cached credentials if not what we need. */
	cred = curlwp->l_cred;
	if (__predict_false(cred != fp->f_cred)) {
		kauth_cred_free(fp->f_cred);
		kauth_cred_hold(cred);
		fp->f_cred = cred;
	}

	/*
	 * Don't allow recycled files to be scanned.
	 * See uipc_usrreq.c.
	 */
	if (__predict_false((fp->f_flag & FSCAN) != 0)) {
		mutex_enter(&fp->f_lock);
		atomic_and_uint(&fp->f_flag, ~FSCAN);
		mutex_exit(&fp->f_lock);
	}

	fp->f_advice = 0;
	fp->f_offset = 0;
	*resultfp = fp;

	return 0;
}
1119
/*
 * Successful creation of a new descriptor: make visible to the process.
 * Takes a file reference and stores `fp' in the slot previously
 * reserved by fd_alloc()/fd_allocfile().
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_dt->dt_ff[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}
1156
1157 /*
1158 * Abort creation of a new descriptor: free descriptor slot and file.
1159 */
1160 void
1161 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1162 {
1163 filedesc_t *fdp;
1164 fdfile_t *ff;
1165
1166 KASSERT(p == curproc || p == &proc0);
1167
1168 fdp = p->p_fd;
1169 ff = fdp->fd_dt->dt_ff[fd];
1170 ff->ff_exclose = false;
1171
1172 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1173
1174 mutex_enter(&fdp->fd_lock);
1175 KASSERT(fd_isused(fdp, fd));
1176 fd_unused(fdp, fd);
1177 mutex_exit(&fdp->fd_lock);
1178
1179 if (fp != NULL) {
1180 KASSERT(fp->f_count == 0);
1181 KASSERT(fp->f_cred != NULL);
1182 pool_cache_put(file_cache, fp);
1183 }
1184 }
1185
/*
 * Pool cache constructor for file structures: enforce the global open
 * file limit, link the file onto the global list, and give it a
 * reference to the current LWP's credentials.
 *
 * Returns 0 on success or ENFILE when the kern.maxfiles limit is hit.
 */
static int
file_ctor(void *arg, void *obj, int flags)
{
	file_t *fp = obj;

	memset(fp, 0, sizeof(*fp));

	mutex_enter(&filelist_lock);
	if (__predict_false(nfiles >= maxfiles)) {
		/* Global limit reached: complain and fail. */
		mutex_exit(&filelist_lock);
		tablefull("file", "increase kern.maxfiles or MAXFILES");
		return ENFILE;
	}
	nfiles++;
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
	fp->f_cred = curlwp->l_cred;
	kauth_cred_hold(fp->f_cred);
	mutex_exit(&filelist_lock);

	return 0;
}
1208
/*
 * Pool cache destructor for file structures: undo file_ctor() by
 * unlinking from the global file list and releasing the credential
 * reference and the per-file lock.
 */
static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	nfiles--;
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	kauth_cred_free(fp->f_cred);
	mutex_destroy(&fp->f_lock);
}
1222
1223 static int
1224 fdfile_ctor(void *arg, void *obj, int flags)
1225 {
1226 fdfile_t *ff = obj;
1227
1228 memset(ff, 0, sizeof(*ff));
1229 cv_init(&ff->ff_closing, "fdclose");
1230
1231 return 0;
1232 }
1233
/*
 * Pool cache destructor for fdfile_t records: tear down the condition
 * variable set up by fdfile_ctor().
 */
static void
fdfile_dtor(void *arg, void *obj)
{
	fdfile_t *ff = obj;

	cv_destroy(&ff->ff_closing);
}
1241
/*
 * Allocate a bare, zeroed file structure that is not linked to the
 * global file list and is not charged against the open file limit.
 * Release with fputdummy().
 */
file_t *
fgetdummy(void)
{
	file_t *fp;

	fp = kmem_zalloc(sizeof(*fp), KM_SLEEP);
	/* NOTE(review): KM_SLEEP allocations should not fail; the NULL
	 * check appears defensive — confirm before relying on it. */
	if (fp != NULL) {
		mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
	}
	return fp;
}
1253
/*
 * Release a file structure obtained with fgetdummy().
 */
void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}
1261
1262 /*
1263 * Create an initial filedesc structure.
1264 */
1265 filedesc_t *
1266 fd_init(filedesc_t *fdp)
1267 {
1268 #ifdef DIAGNOSTIC
1269 unsigned fd;
1270 #endif
1271
1272 if (__predict_true(fdp == NULL)) {
1273 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1274 } else {
1275 KASSERT(fdp == &filedesc0);
1276 filedesc_ctor(NULL, fdp, PR_WAITOK);
1277 }
1278
1279 #ifdef DIAGNOSTIC
1280 KASSERT(fdp->fd_lastfile == -1);
1281 KASSERT(fdp->fd_lastkqfile == -1);
1282 KASSERT(fdp->fd_knhash == NULL);
1283 KASSERT(fdp->fd_freefile == 0);
1284 KASSERT(fdp->fd_exclose == false);
1285 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
1286 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1287 for (fd = 0; fd < NDFDFILE; fd++) {
1288 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
1289 (fdfile_t *)fdp->fd_dfdfile[fd]);
1290 }
1291 for (fd = NDFDFILE; fd < NDFILE; fd++) {
1292 KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
1293 }
1294 KASSERT(fdp->fd_himap == fdp->fd_dhimap);
1295 KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
1296 #endif /* DIAGNOSTIC */
1297
1298 fdp->fd_refcnt = 1;
1299 fd_checkmaps(fdp);
1300
1301 return fdp;
1302 }
1303
1304 /*
1305 * Initialize a file descriptor table.
1306 */
1307 static int
1308 filedesc_ctor(void *arg, void *obj, int flag)
1309 {
1310 filedesc_t *fdp = obj;
1311 fdfile_t **ffp;
1312 int i;
1313
1314 memset(fdp, 0, sizeof(*fdp));
1315 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1316 fdp->fd_lastfile = -1;
1317 fdp->fd_lastkqfile = -1;
1318 fdp->fd_dt = &fdp->fd_dtbuiltin;
1319 fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
1320 fdp->fd_himap = fdp->fd_dhimap;
1321 fdp->fd_lomap = fdp->fd_dlomap;
1322
1323 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1324 for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
1325 *ffp = (fdfile_t *)fdp->fd_dfdfile[i];
1326 (void)fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1327 }
1328
1329 return 0;
1330 }
1331
1332 static void
1333 filedesc_dtor(void *arg, void *obj)
1334 {
1335 filedesc_t *fdp = obj;
1336 int i;
1337
1338 for (i = 0; i < NDFDFILE; i++) {
1339 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1340 }
1341
1342 mutex_destroy(&fdp->fd_lock);
1343 }
1344
1345 /*
1346 * Make p share curproc's filedesc structure.
1347 */
1348 void
1349 fd_share(struct proc *p)
1350 {
1351 filedesc_t *fdp;
1352
1353 fdp = curlwp->l_fd;
1354 p->p_fd = fdp;
1355 atomic_inc_uint(&fdp->fd_refcnt);
1356 }
1357
1358 /*
1359 * Acquire a hold on a filedesc structure.
1360 */
1361 void
1362 fd_hold(lwp_t *l)
1363 {
1364 filedesc_t *fdp = l->l_fd;
1365
1366 atomic_inc_uint(&fdp->fd_refcnt);
1367 }
1368
1369 /*
1370 * Copy a filedesc structure.
1371 */
1372 filedesc_t *
1373 fd_copy(void)
1374 {
1375 filedesc_t *newfdp, *fdp;
1376 fdfile_t *ff, **ffp, **nffp, *ff2;
1377 int i, j, numfiles, lastfile, newlast;
1378 file_t *fp;
1379 fdtab_t *newdt;
1380
1381 fdp = curproc->p_fd;
1382 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1383 newfdp->fd_refcnt = 1;
1384
1385 #ifdef DIAGNOSTIC
1386 KASSERT(newfdp->fd_lastfile == -1);
1387 KASSERT(newfdp->fd_lastkqfile == -1);
1388 KASSERT(newfdp->fd_knhash == NULL);
1389 KASSERT(newfdp->fd_freefile == 0);
1390 KASSERT(newfdp->fd_exclose == false);
1391 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
1392 KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1393 for (i = 0; i < NDFDFILE; i++) {
1394 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
1395 (fdfile_t *)&newfdp->fd_dfdfile[i]);
1396 }
1397 for (i = NDFDFILE; i < NDFILE; i++) {
1398 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
1399 }
1400 #endif /* DIAGNOSTIC */
1401
1402 mutex_enter(&fdp->fd_lock);
1403 fd_checkmaps(fdp);
1404 numfiles = fdp->fd_dt->dt_nfiles;
1405 lastfile = fdp->fd_lastfile;
1406
1407 /*
1408 * If the number of open files fits in the internal arrays
1409 * of the open file structure, use them, otherwise allocate
1410 * additional memory for the number of descriptors currently
1411 * in use.
1412 */
1413 if (lastfile < NDFILE) {
1414 i = NDFILE;
1415 newdt = newfdp->fd_dt;
1416 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
1417 } else {
1418 /*
1419 * Compute the smallest multiple of NDEXTENT needed
1420 * for the file descriptors currently in use,
1421 * allowing the table to shrink.
1422 */
1423 i = numfiles;
1424 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1425 i /= 2;
1426 }
1427 KASSERT(i > NDFILE);
1428 newdt = fd_dtab_alloc(i);
1429 newfdp->fd_dt = newdt;
1430 memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
1431 NDFDFILE * sizeof(fdfile_t **));
1432 memset(newdt->dt_ff + NDFDFILE, 0,
1433 (i - NDFDFILE) * sizeof(fdfile_t **));
1434 }
1435 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1436 newfdp->fd_himap = newfdp->fd_dhimap;
1437 newfdp->fd_lomap = newfdp->fd_dlomap;
1438 } else {
1439 fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
1440 KASSERT(i >= NDENTRIES * NDENTRIES);
1441 memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
1442 memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
1443 }
1444 newfdp->fd_freefile = fdp->fd_freefile;
1445 newfdp->fd_exclose = fdp->fd_exclose;
1446
1447 ffp = fdp->fd_dt->dt_ff;
1448 nffp = newdt->dt_ff;
1449 newlast = -1;
1450 for (i = 0; i <= (int)lastfile; i++, ffp++, nffp++) {
1451 KASSERT(i >= NDFDFILE ||
1452 *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
1453 ff = *ffp;
1454 if (ff == NULL || (fp = ff->ff_file) == NULL) {
1455 /* Descriptor unused, or descriptor half open. */
1456 KASSERT(!fd_isused(newfdp, i));
1457 continue;
1458 }
1459 if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
1460 /* kqueue descriptors cannot be copied. */
1461 if (i < newfdp->fd_freefile) {
1462 newfdp->fd_freefile = i;
1463 }
1464 continue;
1465 }
1466 /* It's active: add a reference to the file. */
1467 mutex_enter(&fp->f_lock);
1468 fp->f_count++;
1469 mutex_exit(&fp->f_lock);
1470
1471 /* Allocate an fdfile_t to represent it. */
1472 if (i >= NDFDFILE) {
1473 ff2 = pool_cache_get(fdfile_cache, PR_WAITOK);
1474 *nffp = ff2;
1475 } else {
1476 ff2 = newdt->dt_ff[i];
1477 }
1478 ff2->ff_file = fp;
1479 ff2->ff_exclose = ff->ff_exclose;
1480 ff2->ff_allocated = true;
1481
1482 /* Fix up bitmaps. */
1483 j = i >> NDENTRYSHIFT;
1484 KASSERT((newfdp->fd_lomap[j] & (1 << (i & NDENTRYMASK))) == 0);
1485 newfdp->fd_lomap[j] |= 1 << (i & NDENTRYMASK);
1486 if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
1487 KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
1488 (1 << (j & NDENTRYMASK))) == 0);
1489 newfdp->fd_himap[j >> NDENTRYSHIFT] |=
1490 1 << (j & NDENTRYMASK);
1491 }
1492 newlast = i;
1493 }
1494 KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1495 newfdp->fd_lastfile = newlast;
1496 fd_checkmaps(newfdp);
1497 mutex_exit(&fdp->fd_lock);
1498
1499 return newfdp;
1500 }
1501
1502 /*
1503 * Release a filedesc structure.
1504 */
1505 void
1506 fd_free(void)
1507 {
1508 fdfile_t *ff;
1509 file_t *fp;
1510 int fd, nf;
1511 fdtab_t *dt;
1512 lwp_t * const l = curlwp;
1513 filedesc_t * const fdp = l->l_fd;
1514 const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
1515
1516 KASSERT(fdp->fd_dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1517 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1518 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
1519
1520 #ifndef __HAVE_ATOMIC_AS_MEMBAR
1521 membar_exit();
1522 #endif
1523 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1524 return;
1525
1526 /*
1527 * Close any files that the process holds open.
1528 */
1529 dt = fdp->fd_dt;
1530 fd_checkmaps(fdp);
1531 #ifdef DEBUG
1532 fdp->fd_refcnt = -1; /* see fd_checkmaps */
1533 #endif
1534 for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
1535 ff = dt->dt_ff[fd];
1536 KASSERT(fd >= NDFDFILE ||
1537 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1538 if (ff == NULL)
1539 continue;
1540 if ((fp = ff->ff_file) != NULL) {
1541 /*
1542 * Must use fd_close() here if there is
1543 * a reference from kqueue or we might have posix
1544 * advisory locks.
1545 */
1546 if (__predict_true(ff->ff_refcnt == 0) &&
1547 (noadvlock || fp->f_type != DTYPE_VNODE)) {
1548 ff->ff_file = NULL;
1549 ff->ff_exclose = false;
1550 ff->ff_allocated = false;
1551 closef(fp);
1552 } else {
1553 ff->ff_refcnt++;
1554 fd_close(fd);
1555 }
1556 }
1557 KASSERT(ff->ff_refcnt == 0);
1558 KASSERT(ff->ff_file == NULL);
1559 KASSERT(!ff->ff_exclose);
1560 KASSERT(!ff->ff_allocated);
1561 if (fd >= NDFDFILE) {
1562 pool_cache_put(fdfile_cache, ff);
1563 dt->dt_ff[fd] = NULL;
1564 }
1565 }
1566
1567 /*
1568 * Clean out the descriptor table for the next user and return
1569 * to the cache.
1570 */
1571 if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
1572 fd_dtab_free(fdp->fd_dt);
1573 /* Otherwise, done above. */
1574 memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
1575 (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
1576 fdp->fd_dt = &fdp->fd_dtbuiltin;
1577 }
1578 if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
1579 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1580 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1581 fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
1582 }
1583 if (__predict_false(fdp->fd_knhash != NULL)) {
1584 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1585 fdp->fd_knhash = NULL;
1586 fdp->fd_knhashmask = 0;
1587 } else {
1588 KASSERT(fdp->fd_knhashmask == 0);
1589 }
1590 fdp->fd_dt = &fdp->fd_dtbuiltin;
1591 fdp->fd_lastkqfile = -1;
1592 fdp->fd_lastfile = -1;
1593 fdp->fd_freefile = 0;
1594 fdp->fd_exclose = false;
1595 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1596 offsetof(filedesc_t, fd_startzero));
1597 fdp->fd_himap = fdp->fd_dhimap;
1598 fdp->fd_lomap = fdp->fd_dlomap;
1599 KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
1600 KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
1601 KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
1602 #ifdef DEBUG
1603 fdp->fd_refcnt = 0; /* see fd_checkmaps */
1604 #endif
1605 fd_checkmaps(fdp);
1606 pool_cache_put(filedesc_cache, fdp);
1607 }
1608
1609 /*
1610 * File Descriptor pseudo-device driver (/dev/fd/).
1611 *
1612 * Opening minor device N dup()s the file (if any) connected to file
1613 * descriptor N belonging to the calling process. Note that this driver
1614 * consists of only the ``open()'' routine, because all subsequent
1615 * references to this file will be direct to the other driver.
1616 */
1617 static int
1618 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1619 {
1620
1621 /*
1622 * XXX Kludge: set dupfd to contain the value of the
1623 * the file descriptor being sought for duplication. The error
1624 * return ensures that the vnode for this device will be released
1625 * by vn_open. Open will detect this special error and take the
1626 * actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN
1627 * will simply report the error.
1628 */
1629 l->l_dupfd = minor(dev); /* XXX */
1630 return EDUPFD;
1631 }
1632
1633 /*
1634 * Duplicate the specified descriptor to a free descriptor.
1635 */
1636 int
1637 fd_dupopen(int old, int *new, int mode, int error)
1638 {
1639 filedesc_t *fdp;
1640 fdfile_t *ff;
1641 file_t *fp;
1642 fdtab_t *dt;
1643
1644 if ((fp = fd_getfile(old)) == NULL) {
1645 return EBADF;
1646 }
1647 fdp = curlwp->l_fd;
1648 dt = fdp->fd_dt;
1649 ff = dt->dt_ff[old];
1650
1651 /*
1652 * There are two cases of interest here.
1653 *
1654 * For EDUPFD simply dup (old) to file descriptor
1655 * (new) and return.
1656 *
1657 * For EMOVEFD steal away the file structure from (old) and
1658 * store it in (new). (old) is effectively closed by
1659 * this operation.
1660 *
1661 * Any other error code is just returned.
1662 */
1663 switch (error) {
1664 case EDUPFD:
1665 /*
1666 * Check that the mode the file is being opened for is a
1667 * subset of the mode of the existing descriptor.
1668 */
1669 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1670 error = EACCES;
1671 break;
1672 }
1673
1674 /* Copy it. */
1675 error = fd_dup(fp, 0, new, ff->ff_exclose);
1676 break;
1677
1678 case EMOVEFD:
1679 /* Copy it. */
1680 error = fd_dup(fp, 0, new, ff->ff_exclose);
1681 if (error != 0) {
1682 break;
1683 }
1684
1685 /* Steal away the file pointer from 'old'. */
1686 (void)fd_close(old);
1687 return 0;
1688 }
1689
1690 fd_putfile(old);
1691 return error;
1692 }
1693
1694 /*
1695 * Close open files on exec.
1696 */
1697 void
1698 fd_closeexec(void)
1699 {
1700 proc_t *p;
1701 filedesc_t *fdp;
1702 fdfile_t *ff;
1703 lwp_t *l;
1704 fdtab_t *dt;
1705 int fd;
1706
1707 l = curlwp;
1708 p = l->l_proc;
1709 fdp = p->p_fd;
1710
1711 if (fdp->fd_refcnt > 1) {
1712 fdp = fd_copy();
1713 fd_free();
1714 p->p_fd = fdp;
1715 l->l_fd = fdp;
1716 }
1717 if (!fdp->fd_exclose) {
1718 return;
1719 }
1720 fdp->fd_exclose = false;
1721 dt = fdp->fd_dt;
1722
1723 for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1724 if ((ff = dt->dt_ff[fd]) == NULL) {
1725 KASSERT(fd >= NDFDFILE);
1726 continue;
1727 }
1728 KASSERT(fd >= NDFDFILE ||
1729 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1730 if (ff->ff_file == NULL)
1731 continue;
1732 if (ff->ff_exclose) {
1733 /*
1734 * We need a reference to close the file.
1735 * No other threads can see the fdfile_t at
1736 * this point, so don't bother locking.
1737 */
1738 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1739 ff->ff_refcnt++;
1740 fd_close(fd);
1741 }
1742 }
1743 }
1744
1745 /*
1746 * Sets descriptor owner. If the owner is a process, 'pgid'
1747 * is set to positive value, process ID. If the owner is process group,
1748 * 'pgid' is set to -pg_id.
1749 */
1750 int
1751 fsetown(pid_t *pgid, u_long cmd, const void *data)
1752 {
1753 pid_t id = *(const pid_t *)data;
1754 int error;
1755
1756 switch (cmd) {
1757 case TIOCSPGRP:
1758 if (id < 0)
1759 return EINVAL;
1760 id = -id;
1761 break;
1762 default:
1763 break;
1764 }
1765 if (id > 0) {
1766 mutex_enter(proc_lock);
1767 error = proc_find(id) ? 0 : ESRCH;
1768 mutex_exit(proc_lock);
1769 } else if (id < 0) {
1770 error = pgid_in_session(curproc, -id);
1771 } else {
1772 error = 0;
1773 }
1774 if (!error) {
1775 *pgid = id;
1776 }
1777 return error;
1778 }
1779
1780 void
1781 fd_set_exclose(struct lwp *l, int fd, bool exclose)
1782 {
1783 filedesc_t *fdp = l->l_fd;
1784 fdfile_t *ff = fdp->fd_dt->dt_ff[fd];
1785
1786 ff->ff_exclose = exclose;
1787 if (exclose)
1788 fdp->fd_exclose = true;
1789 }
1790
1791 /*
1792 * Return descriptor owner information. If the value is positive,
1793 * it's process ID. If it's negative, it's process group ID and
1794 * needs the sign removed before use.
1795 */
1796 int
1797 fgetown(pid_t pgid, u_long cmd, void *data)
1798 {
1799
1800 switch (cmd) {
1801 case TIOCGPGRP:
1802 *(int *)data = -pgid;
1803 break;
1804 default:
1805 *(int *)data = pgid;
1806 break;
1807 }
1808 return 0;
1809 }
1810
1811 /*
1812 * Send signal to descriptor owner, either process or process group.
1813 */
1814 void
1815 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1816 {
1817 ksiginfo_t ksi;
1818
1819 KASSERT(!cpu_intr_p());
1820
1821 if (pgid == 0) {
1822 return;
1823 }
1824
1825 KSI_INIT(&ksi);
1826 ksi.ksi_signo = signo;
1827 ksi.ksi_code = code;
1828 ksi.ksi_band = band;
1829
1830 mutex_enter(proc_lock);
1831 if (pgid > 0) {
1832 struct proc *p1;
1833
1834 p1 = proc_find(pgid);
1835 if (p1 != NULL) {
1836 kpsignal(p1, &ksi, fdescdata);
1837 }
1838 } else {
1839 struct pgrp *pgrp;
1840
1841 KASSERT(pgid < 0);
1842 pgrp = pgrp_find(-pgid);
1843 if (pgrp != NULL) {
1844 kpgsignal(pgrp, &ksi, fdescdata, 0);
1845 }
1846 }
1847 mutex_exit(proc_lock);
1848 }
1849
/*
 * Attach a "cloning" device's state to a freshly opened file and make
 * it visible on descriptor 'fd'.  Always returns EMOVEFD, which the
 * open path interprets via fd_dupopen() (see above).
 */
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
	 void *data)
{

	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}
1864
1865 int
1866 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1867 {
1868
1869 if (cmd == F_SETFL)
1870 return 0;
1871
1872 return EOPNOTSUPP;
1873 }
1874
/*
 * Generic poll hook for file types that never report activity.
 */
int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}
1881
/*
 * Generic kqfilter hook for file types that do not support kqueue.
 */
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return EOPNOTSUPP;
}
1888
/*
 * Generic restart hook for file types with no blocked operations to
 * interrupt: deliberately does nothing.
 */
void
fnullop_restart(file_t *fp)
{

}
1894
/*
 * Read stub for file types that cannot be read.
 */
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1902
/*
 * Write stub for file types that cannot be written.
 */
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
	     kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1910
/*
 * Ioctl stub for file types that support no ioctls.
 */
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}
1917
/*
 * Stat stub for file types that cannot be stat'ed.
 */
int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}
1924
/*
 * Close stub for file types that should never reach close.
 */
int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
1931
1932 /*
1933 * sysctl routines pertaining to file descriptors
1934 */
1935
1936 /* Initialized in sysctl_init() for now... */
1937 extern kmutex_t sysctl_file_marker_lock;
1938 static u_int sysctl_file_marker = 1;
1939
1940 /*
1941 * Expects to be called with proc_lock and sysctl_file_marker_lock locked.
1942 */
1943 static void
1944 sysctl_file_marker_reset(void)
1945 {
1946 struct proc *p;
1947
1948 PROCLIST_FOREACH(p, &allproc) {
1949 struct filedesc *fd = p->p_fd;
1950 fdtab_t *dt;
1951 u_int i;
1952
1953 mutex_enter(&fd->fd_lock);
1954 dt = fd->fd_dt;
1955 for (i = 0; i < dt->dt_nfiles; i++) {
1956 struct file *fp;
1957 fdfile_t *ff;
1958
1959 if ((ff = dt->dt_ff[i]) == NULL) {
1960 continue;
1961 }
1962 if ((fp = ff->ff_file) == NULL) {
1963 continue;
1964 }
1965 fp->f_marker = 0;
1966 }
1967 mutex_exit(&fd->fd_lock);
1968 }
1969 }
1970
1971 /*
1972 * sysctl helper routine for kern.file pseudo-subtree.
1973 */
1974 static int
1975 sysctl_kern_file(SYSCTLFN_ARGS)
1976 {
1977 int error;
1978 size_t buflen;
1979 struct file *fp, fbuf;
1980 char *start, *where;
1981 struct proc *p;
1982
1983 start = where = oldp;
1984 buflen = *oldlenp;
1985
1986 if (where == NULL) {
1987 /*
1988 * overestimate by 10 files
1989 */
1990 *oldlenp = sizeof(filehead) + (nfiles + 10) *
1991 sizeof(struct file);
1992 return 0;
1993 }
1994
1995 /*
1996 * first sysctl_copyout filehead
1997 */
1998 if (buflen < sizeof(filehead)) {
1999 *oldlenp = 0;
2000 return 0;
2001 }
2002 sysctl_unlock();
2003 error = sysctl_copyout(l, &filehead, where, sizeof(filehead));
2004 if (error) {
2005 sysctl_relock();
2006 return error;
2007 }
2008 buflen -= sizeof(filehead);
2009 where += sizeof(filehead);
2010
2011 /*
2012 * followed by an array of file structures
2013 */
2014 mutex_enter(&sysctl_file_marker_lock);
2015 mutex_enter(proc_lock);
2016 PROCLIST_FOREACH(p, &allproc) {
2017 struct filedesc *fd;
2018 fdtab_t *dt;
2019 u_int i;
2020
2021 if (p->p_stat == SIDL) {
2022 /* skip embryonic processes */
2023 continue;
2024 }
2025 mutex_enter(p->p_lock);
2026 error = kauth_authorize_process(l->l_cred,
2027 KAUTH_PROCESS_CANSEE, p,
2028 KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
2029 NULL, NULL);
2030 mutex_exit(p->p_lock);
2031 if (error != 0) {
2032 /*
2033 * Don't leak kauth retval if we're silently
2034 * skipping this entry.
2035 */
2036 error = 0;
2037 continue;
2038 }
2039
2040 /*
2041 * Grab a hold on the process.
2042 */
2043 if (!rw_tryenter(&p->p_reflock, RW_READER)) {
2044 continue;
2045 }
2046 mutex_exit(proc_lock);
2047
2048 fd = p->p_fd;
2049 mutex_enter(&fd->fd_lock);
2050 dt = fd->fd_dt;
2051 for (i = 0; i < dt->dt_nfiles; i++) {
2052 fdfile_t *ff;
2053
2054 if ((ff = dt->dt_ff[i]) == NULL) {
2055 continue;
2056 }
2057 if ((fp = ff->ff_file) == NULL) {
2058 continue;
2059 }
2060
2061 mutex_enter(&fp->f_lock);
2062
2063 if ((fp->f_count == 0) ||
2064 (fp->f_marker == sysctl_file_marker)) {
2065 mutex_exit(&fp->f_lock);
2066 continue;
2067 }
2068
2069 /* Check that we have enough space. */
2070 if (buflen < sizeof(struct file)) {
2071 *oldlenp = where - start;
2072 mutex_exit(&fp->f_lock);
2073 error = ENOMEM;
2074 break;
2075 }
2076
2077 memcpy(&fbuf, fp, sizeof(fbuf));
2078 mutex_exit(&fp->f_lock);
2079 error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
2080 if (error) {
2081 break;
2082 }
2083 buflen -= sizeof(struct file);
2084 where += sizeof(struct file);
2085
2086 fp->f_marker = sysctl_file_marker;
2087 }
2088 mutex_exit(&fd->fd_lock);
2089
2090 /*
2091 * Release reference to process.
2092 */
2093 mutex_enter(proc_lock);
2094 rw_exit(&p->p_reflock);
2095
2096 if (error)
2097 break;
2098 }
2099
2100 sysctl_file_marker++;
2101 /* Reset all markers if wrapped. */
2102 if (sysctl_file_marker == 0) {
2103 sysctl_file_marker_reset();
2104 sysctl_file_marker++;
2105 }
2106
2107 mutex_exit(proc_lock);
2108 mutex_exit(&sysctl_file_marker_lock);
2109
2110 *oldlenp = where - start;
2111 sysctl_relock();
2112 return error;
2113 }
2114
2115 /*
2116 * sysctl helper function for kern.file2
2117 */
2118 static int
2119 sysctl_kern_file2(SYSCTLFN_ARGS)
2120 {
2121 struct proc *p;
2122 struct file *fp;
2123 struct filedesc *fd;
2124 struct kinfo_file kf;
2125 char *dp;
2126 u_int i, op;
2127 size_t len, needed, elem_size, out_size;
2128 int error, arg, elem_count;
2129 fdfile_t *ff;
2130 fdtab_t *dt;
2131
2132 if (namelen == 1 && name[0] == CTL_QUERY)
2133 return sysctl_query(SYSCTLFN_CALL(rnode));
2134
2135 if (namelen != 4)
2136 return EINVAL;
2137
2138 error = 0;
2139 dp = oldp;
2140 len = (oldp != NULL) ? *oldlenp : 0;
2141 op = name[0];
2142 arg = name[1];
2143 elem_size = name[2];
2144 elem_count = name[3];
2145 out_size = MIN(sizeof(kf), elem_size);
2146 needed = 0;
2147
2148 if (elem_size < 1 || elem_count < 0)
2149 return EINVAL;
2150
2151 switch (op) {
2152 case KERN_FILE_BYFILE:
2153 case KERN_FILE_BYPID:
2154 /*
2155 * We're traversing the process list in both cases; the BYFILE
2156 * case does additional work of keeping track of files already
2157 * looked at.
2158 */
2159
2160 /* doesn't use arg so it must be zero */
2161 if ((op == KERN_FILE_BYFILE) && (arg != 0))
2162 return EINVAL;
2163
2164 if ((op == KERN_FILE_BYPID) && (arg < -1))
2165 /* -1 means all processes */
2166 return EINVAL;
2167
2168 sysctl_unlock();
2169 if (op == KERN_FILE_BYFILE)
2170 mutex_enter(&sysctl_file_marker_lock);
2171 mutex_enter(proc_lock);
2172 PROCLIST_FOREACH(p, &allproc) {
2173 if (p->p_stat == SIDL) {
2174 /* skip embryonic processes */
2175 continue;
2176 }
2177 if (arg > 0 && p->p_pid != arg) {
2178 /* pick only the one we want */
2179 /* XXX want 0 to mean "kernel files" */
2180 continue;
2181 }
2182 mutex_enter(p->p_lock);
2183 error = kauth_authorize_process(l->l_cred,
2184 KAUTH_PROCESS_CANSEE, p,
2185 KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
2186 NULL, NULL);
2187 mutex_exit(p->p_lock);
2188 if (error != 0) {
2189 /*
2190 * Don't leak kauth retval if we're silently
2191 * skipping this entry.
2192 */
2193 error = 0;
2194 continue;
2195 }
2196
2197 /*
2198 * Grab a hold on the process.
2199 */
2200 if (!rw_tryenter(&p->p_reflock, RW_READER)) {
2201 continue;
2202 }
2203 mutex_exit(proc_lock);
2204
2205 fd = p->p_fd;
2206 mutex_enter(&fd->fd_lock);
2207 dt = fd->fd_dt;
2208 for (i = 0; i < dt->dt_nfiles; i++) {
2209 if ((ff = dt->dt_ff[i]) == NULL) {
2210 continue;
2211 }
2212 if ((fp = ff->ff_file) == NULL) {
2213 continue;
2214 }
2215
2216 if ((op == KERN_FILE_BYFILE) &&
2217 (fp->f_marker == sysctl_file_marker)) {
2218 continue;
2219 }
2220 if (len >= elem_size && elem_count > 0) {
2221 mutex_enter(&fp->f_lock);
2222 fill_file(&kf, fp, ff, i, p->p_pid);
2223 mutex_exit(&fp->f_lock);
2224 mutex_exit(&fd->fd_lock);
2225 error = sysctl_copyout(l,
2226 &kf, dp, out_size);
2227 mutex_enter(&fd->fd_lock);
2228 if (error)
2229 break;
2230 dp += elem_size;
2231 len -= elem_size;
2232 }
2233 if (op == KERN_FILE_BYFILE)
2234 fp->f_marker = sysctl_file_marker;
2235 needed += elem_size;
2236 if (elem_count > 0 && elem_count != INT_MAX)
2237 elem_count--;
2238 }
2239 mutex_exit(&fd->fd_lock);
2240
2241 /*
2242 * Release reference to process.
2243 */
2244 mutex_enter(proc_lock);
2245 rw_exit(&p->p_reflock);
2246 }
2247 if (op == KERN_FILE_BYFILE) {
2248 sysctl_file_marker++;
2249
2250 /* Reset all markers if wrapped. */
2251 if (sysctl_file_marker == 0) {
2252 sysctl_file_marker_reset();
2253 sysctl_file_marker++;
2254 }
2255 }
2256 mutex_exit(proc_lock);
2257 if (op == KERN_FILE_BYFILE)
2258 mutex_exit(&sysctl_file_marker_lock);
2259 sysctl_relock();
2260 break;
2261 default:
2262 return EINVAL;
2263 }
2264
2265 if (oldp == NULL)
2266 needed += KERN_FILESLOP * elem_size;
2267 *oldlenp = needed;
2268
2269 return error;
2270 }
2271
/*
 * Fill in a kinfo_file record from a file_t for the kern.file2 sysctl.
 *
 * Caller holds fp->f_lock (see sysctl_kern_file2()).  'ff' may be NULL
 * when the record is not tied to a particular descriptor; in that case
 * the per-descriptor fields are left zeroed by the memset.
 */
static void
fill_file(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
	  int i, pid_t pid)
{

	memset(kp, 0, sizeof(*kp));

	kp->ki_fileaddr =	PTRTOUINT64(fp);
	kp->ki_flag =		fp->f_flag;
	kp->ki_iflags =		0;
	kp->ki_ftype =		fp->f_type;
	kp->ki_count =		fp->f_count;
	kp->ki_msgcount =	fp->f_msgcount;
	kp->ki_fucred =		PTRTOUINT64(fp->f_cred);
	kp->ki_fuid =		kauth_cred_geteuid(fp->f_cred);
	kp->ki_fgid =		kauth_cred_getegid(fp->f_cred);
	kp->ki_fops =		PTRTOUINT64(fp->f_ops);
	kp->ki_foffset =	fp->f_offset;
	kp->ki_fdata =		PTRTOUINT64(fp->f_data);

	/* vnode information to glue this file to something */
	if (fp->f_type == DTYPE_VNODE) {
		struct vnode *vp = (struct vnode *)fp->f_data;

		kp->ki_vun =	PTRTOUINT64(vp->v_un.vu_socket);
		kp->ki_vsize =	vp->v_size;
		kp->ki_vtype =	vp->v_type;
		kp->ki_vtag =	vp->v_tag;
		kp->ki_vdata =	PTRTOUINT64(vp->v_data);
	}

	/* process information when retrieved via KERN_FILE_BYPID */
	if (ff != NULL) {
		kp->ki_pid =		pid;
		kp->ki_fd =		i;
		kp->ki_ofileflags =	ff->ff_exclose;
		kp->ki_usecount =	ff->ff_refcnt;
	}
}
2311