1 /* $NetBSD: kern_descrip.c,v 1.183 2008/11/18 11:36:58 pooka Exp $ */
2
3 /*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
63 */
64
65 /*
66 * File descriptor management.
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.183 2008/11/18 11:36:58 pooka Exp $");
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/filedesc.h>
75 #include <sys/kernel.h>
76 #include <sys/vnode.h>
77 #include <sys/proc.h>
78 #include <sys/file.h>
79 #include <sys/namei.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/stat.h>
83 #include <sys/ioctl.h>
84 #include <sys/fcntl.h>
85 #include <sys/malloc.h>
86 #include <sys/pool.h>
87 #include <sys/syslog.h>
88 #include <sys/unistd.h>
89 #include <sys/resourcevar.h>
90 #include <sys/conf.h>
91 #include <sys/event.h>
92 #include <sys/kauth.h>
93 #include <sys/atomic.h>
94 #include <sys/mount.h>
95 #include <sys/syscallargs.h>
96 #include <sys/cpu.h>
#include <sys/kmem.h>	/* kmem_alloc()/kmem_free() for fgetdummy()/fputdummy() */
97
98 static int file_ctor(void *, void *, int);
99 static void file_dtor(void *, void *);
100 static int fdfile_ctor(void *, void *, int);
101 static void fdfile_dtor(void *, void *);
102 static int filedesc_ctor(void *, void *, int);
103 static void filedesc_dtor(void *, void *);
104 static int filedescopen(dev_t, int, int, lwp_t *);
105
106 kmutex_t filelist_lock; /* lock on filehead */
107 struct filelist filehead; /* head of list of open files */
108 u_int nfiles; /* actual number of open files */
109
110 static pool_cache_t filedesc_cache;
111 static pool_cache_t file_cache;
112 static pool_cache_t fdfile_cache;
113
114 MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
115
116 const struct cdevsw filedesc_cdevsw = {
117 filedescopen, noclose, noread, nowrite, noioctl,
118 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
119 };
120
121 /* For ease of reading. */
122 __strong_alias(fd_putvnode,fd_putfile)
123 __strong_alias(fd_putsock,fd_putfile)
124
125 /*
126 * Initialize the descriptor system.
127 */
128 void
129 fd_sys_init(void)
130 {
131
132 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
133
134 file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
135 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
136 KASSERT(file_cache != NULL);
137
138 fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
139 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
140 NULL);
141 KASSERT(fdfile_cache != NULL);
142
143 filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
144 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
145 NULL);
146 KASSERT(filedesc_cache != NULL);
147 }
148
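/*
 * Find the next zero bit (i.e. free descriptor slot) in the given
 * allocation bitmap, starting the search at 'want' and considering
 * 'bits' bits in total.  Returns -1 if no free slot is found.
 */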
149 static int
150 fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
151 {
152 int i, off, maxoff;
153 uint32_t sub;
154
155 KASSERT(mutex_owned(&fdp->fd_lock));
156
157 if (want > bits)
158 return -1;
159
160 off = want >> NDENTRYSHIFT;
161 i = want & NDENTRYMASK;
162 if (i) {
163 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
164 if (sub != ~0)
165 goto found;
166 off++;
167 }
168
169 maxoff = NDLOSLOTS(bits);
170 while (off < maxoff) {
171 if ((sub = bitmap[off]) != ~0)
172 goto found;
173 off++;
174 }
175
176 return (-1);
177
178 found:
179 return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
180 }
181
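/*
 * Find the highest descriptor number below 'last' that is still
 * allocated.  Used to maintain fd_lastfile when a descriptor is freed.
 */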
182 static int
183 fd_last_set(filedesc_t *fd, int last)
184 {
185 int off, i;
186 fdfile_t **ofiles = fd->fd_ofiles;
187 uint32_t *bitmap = fd->fd_lomap;
188
189 KASSERT(mutex_owned(&fd->fd_lock));
190
191 off = (last - 1) >> NDENTRYSHIFT;
192
193 while (off >= 0 && !bitmap[off])
194 off--;
195
196 if (off < 0)
197 return (-1);
198
199 i = ((off + 1) << NDENTRYSHIFT) - 1;
200 if (i >= last)
201 i = last - 1;
202
203 /* XXX should use bitmap */
204 /* XXXAD does not work for fd_copy() */
205 while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
206 i--;
207
208 return (i);
209 }
210
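/*
 * Mark a descriptor as allocated: set its bit in the in-use bitmaps
 * and update fd_lastfile and fd_nused accordingly.
 */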
211 void
212 fd_used(filedesc_t *fdp, unsigned fd)
213 {
214 u_int off = fd >> NDENTRYSHIFT;
215 fdfile_t *ff;
216
217 ff = fdp->fd_ofiles[fd];
218
219 KASSERT(mutex_owned(&fdp->fd_lock));
220 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
221 KASSERT(ff != NULL);
222 KASSERT(ff->ff_file == NULL);
223 KASSERT(!ff->ff_allocated);
224
225 ff->ff_allocated = 1;
226 fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
227 if (fdp->fd_lomap[off] == ~0) {
228 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
229 (1 << (off & NDENTRYMASK))) == 0);
230 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
231 }
232
233 if ((int)fd > fdp->fd_lastfile) {
234 fdp->fd_lastfile = fd;
235 }
236
237 if (fd >= NDFDFILE) {
238 fdp->fd_nused++;
239 } else {
240 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
241 }
242 }
243
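/*
 * Mark a descriptor as free: clear its bit in the in-use bitmaps and
 * update fd_freefile, fd_lastfile and fd_nused accordingly.
 */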
244 void
245 fd_unused(filedesc_t *fdp, unsigned fd)
246 {
247 u_int off = fd >> NDENTRYSHIFT;
248 fdfile_t *ff;
249
250 ff = fdp->fd_ofiles[fd];
251
252 /*
253 * Don't assert the lock is held here, as we may be copying
254 * the table during exec() and it is not needed there.
255 * procfs and sysctl are locked out by proc::p_reflock.
256 *
257 * KASSERT(mutex_owned(&fdp->fd_lock));
258 */
259 KASSERT(ff != NULL);
260 KASSERT(ff->ff_file == NULL);
261 KASSERT(ff->ff_allocated);
262
263 if (fd < fdp->fd_freefile) {
264 fdp->fd_freefile = fd;
265 }
266
267 if (fdp->fd_lomap[off] == ~0) {
268 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
269 (1 << (off & NDENTRYMASK))) != 0);
270 fdp->fd_himap[off >> NDENTRYSHIFT] &=
271 ~(1 << (off & NDENTRYMASK));
272 }
273 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
274 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
275 ff->ff_allocated = 0;
276
277 KASSERT(fd <= fdp->fd_lastfile);
278 if (fd == fdp->fd_lastfile) {
279 fdp->fd_lastfile = fd_last_set(fdp, fd);
280 }
281
282 if (fd >= NDFDFILE) {
283 KASSERT(fdp->fd_nused > 0);
284 fdp->fd_nused--;
285 } else {
286 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
287 }
288 }
289
290 /*
291 * Custom version of fd_unused() for fd_copy(), where the descriptor
292 * table is not yet fully initialized.
293 */
294 static inline void
295 fd_zap(filedesc_t *fdp, unsigned fd)
296 {
297 u_int off = fd >> NDENTRYSHIFT;
298
299 if (fd < fdp->fd_freefile) {
300 fdp->fd_freefile = fd;
301 }
302
303 if (fdp->fd_lomap[off] == ~0) {
304 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
305 (1 << (off & NDENTRYMASK))) != 0);
306 fdp->fd_himap[off >> NDENTRYSHIFT] &=
307 ~(1 << (off & NDENTRYMASK));
308 }
309 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
310 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
311 }
312
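/*
 * Return true if the given descriptor is currently allocated.
 */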
313 bool
314 fd_isused(filedesc_t *fdp, unsigned fd)
315 {
316 u_int off = fd >> NDENTRYSHIFT;
317
318 KASSERT(fd < fdp->fd_nfiles);
319
320 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
321 }
322
323 /*
324 * Look up the file structure corresponding to a file descriptor
325 * and return the file, holding a reference on the descriptor.
326 */
327 inline file_t *
328 fd_getfile(unsigned fd)
329 {
330 filedesc_t *fdp;
331 fdfile_t *ff;
332 file_t *fp;
333
334 fdp = curlwp->l_fd;
335
336 /*
337 * Look up the fdfile structure representing this descriptor.
338 * Ensure that we see fd_nfiles before fd_ofiles since we
339 * are doing this unlocked. See fd_tryexpand().
340 */
341 if (__predict_false(fd >= fdp->fd_nfiles)) {
342 return NULL;
343 }
344 membar_consumer();
345 ff = fdp->fd_ofiles[fd];
346 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
347 if (__predict_false(ff == NULL)) {
348 return NULL;
349 }
350
351 /*
352 * Now get a reference to the descriptor. Issue a memory
353 * barrier to ensure that we acquire the file pointer _after_
354 * adding a reference. If no memory barrier, we could fetch
355 * a stale pointer.
356 */
357 atomic_inc_uint(&ff->ff_refcnt);
358 #ifndef __HAVE_ATOMIC_AS_MEMBAR
359 membar_enter();
360 #endif
361
362 /*
363 * If the file is not open or is being closed then put the
364 * reference back.
365 */
366 fp = ff->ff_file;
367 if (__predict_true(fp != NULL)) {
368 return fp;
369 }
370 fd_putfile(fd);
371 return NULL;
372 }
373
374 /*
375 * Release a reference to a file descriptor acquired with fd_getfile().
376 */
377 void
378 fd_putfile(unsigned fd)
379 {
380 filedesc_t *fdp;
381 fdfile_t *ff;
382 u_int u, v;
383
384 fdp = curlwp->l_fd;
385 ff = fdp->fd_ofiles[fd];
386
387 KASSERT(fd < fdp->fd_nfiles);
388 KASSERT(ff != NULL);
389 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
390 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
391
392 /*
393 * Ensure that any use of the file is complete and globally
394 * visible before dropping the final reference. If no membar,
395 * the current CPU could still access memory associated with
396 * the file after it has been freed or recycled by another
397 * CPU.
398 */
399 #ifndef __HAVE_ATOMIC_AS_MEMBAR
400 membar_exit();
401 #endif
402
403 /*
404 * Be optimistic and start out with the assumption that no other
405 * threads are trying to close the descriptor. If the CAS fails,
406 * we lost a race and/or it's being closed.
407 */
408 for (u = ff->ff_refcnt & FR_MASK;; u = v) {
409 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
410 if (__predict_true(u == v)) {
411 return;
412 }
413 if (__predict_false((v & FR_CLOSING) != 0)) {
414 break;
415 }
416 }
417
418 /* Another thread is waiting to close the file: join it. */
419 (void)fd_close(fd);
420 }
421
422 /*
423 * Convenience wrapper around fd_getfile() that returns reference
424 * to a vnode.
425 */
426 int
427 fd_getvnode(unsigned fd, file_t **fpp)
428 {
429 vnode_t *vp;
430 file_t *fp;
431
432 fp = fd_getfile(fd);
433 if (__predict_false(fp == NULL)) {
434 return EBADF;
435 }
436 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
437 fd_putfile(fd);
438 return EINVAL;
439 }
440 vp = fp->f_data;
441 if (__predict_false(vp->v_type == VBAD)) {
442 /* XXX Is this case really necessary? */
443 fd_putfile(fd);
444 return EBADF;
445 }
446 *fpp = fp;
447 return 0;
448 }
449
450 /*
451 * Convenience wrapper around fd_getfile() that returns reference
452 * to a socket.
453 */
454 int
455 fd_getsock(unsigned fd, struct socket **sop)
456 {
457 file_t *fp;
458
459 fp = fd_getfile(fd);
460 if (__predict_false(fp == NULL)) {
461 return EBADF;
462 }
463 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
464 fd_putfile(fd);
465 return ENOTSOCK;
466 }
467 *sop = fp->f_data;
468 return 0;
469 }
470
471 /*
472 * Look up the file structure corresponding to a file descriptor
473 * and return it with a reference held on the file, not the
474 * descriptor.
475 *
476 * This is heavyweight and only used when accessing descriptors
477 * from a foreign process. The caller must ensure that `p' does
478 * not exit or fork across this call.
479 *
480 * To release the file (not descriptor) reference, use closef().
481 */
482 file_t *
483 fd_getfile2(proc_t *p, unsigned fd)
484 {
485 filedesc_t *fdp;
486 fdfile_t *ff;
487 file_t *fp;
488
489 fdp = p->p_fd;
490 mutex_enter(&fdp->fd_lock);
491 if (fd >= fdp->fd_nfiles) {
492 mutex_exit(&fdp->fd_lock);
493 return NULL;
494 }
495 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
496 mutex_exit(&fdp->fd_lock);
497 return NULL;
498 }
499 mutex_enter(&ff->ff_lock);
500 if ((fp = ff->ff_file) == NULL) {
501 mutex_exit(&ff->ff_lock);
502 mutex_exit(&fdp->fd_lock);
503 return NULL;
504 }
505 mutex_enter(&fp->f_lock);
506 fp->f_count++;
507 mutex_exit(&fp->f_lock);
508 mutex_exit(&ff->ff_lock);
509 mutex_exit(&fdp->fd_lock);
510
511 return fp;
512 }
513
514 /*
515 * Internal form of close. Must be called with a reference to the
516 * descriptor, and will drop the reference. When all descriptor
517 * references are dropped, releases the descriptor slot and a single
518 * reference to the file structure.
519 */
520 int
521 fd_close(unsigned fd)
522 {
523 struct flock lf;
524 filedesc_t *fdp;
525 fdfile_t *ff;
526 file_t *fp;
527 proc_t *p;
528 lwp_t *l;
529
530 l = curlwp;
531 p = l->l_proc;
532 fdp = l->l_fd;
533 ff = fdp->fd_ofiles[fd];
534
535 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
536
537 mutex_enter(&ff->ff_lock);
538 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
539 if (ff->ff_file == NULL) {
540 /*
541 * Another user of the file is already closing, and is
542 * waiting for other users of the file to drain. Release
543 * our reference, and wake up the closer.
544 */
545 atomic_dec_uint(&ff->ff_refcnt);
546 cv_broadcast(&ff->ff_closing);
547 mutex_exit(&ff->ff_lock);
548
549 /*
550 * An application error, so pretend that the descriptor
551 * was already closed. We can't safely wait for it to
552 * be closed without potentially deadlocking.
553 */
554 return (EBADF);
555 }
556 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
557
558 /*
559 * There may be multiple users of this file within the process.
560 * Notify existing and new users that the file is closing. This
561 * will prevent them from adding additional uses to this file
562 * while we are closing it.
563 */
564 fp = ff->ff_file;
565 ff->ff_file = NULL;
566 ff->ff_exclose = false;
567
568 /*
569 * We expect the caller to hold a descriptor reference - drop it.
570 * The reference count may increase beyond zero at this point due
571 * to an erroneous descriptor reference by an application, but
572 * fd_getfile() will notice that the file is being closed and drop
573 * the reference again.
574 */
575 #ifndef __HAVE_ATOMIC_AS_MEMBAR
576 membar_producer();
577 #endif
578 if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
579 /*
580 * Wait for other references to drain. This is typically
581 * an application error - the descriptor is being closed
582 * while still in use.
583 *
584 */
585 atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
586 /*
587 * Remove any knotes attached to the file. A knote
588 * attached to the descriptor can hold references on it.
589 */
590 if (!SLIST_EMPTY(&ff->ff_knlist)) {
591 mutex_exit(&ff->ff_lock);
592 knote_fdclose(fd);
593 mutex_enter(&ff->ff_lock);
594 }
595 /*
596 * We need to see the count drop to zero at least once,
597 * in order to ensure that all pre-existing references
598 * have been drained. New references past this point are
599 * of no interest.
600 */
601 while ((ff->ff_refcnt & FR_MASK) != 0) {
602 cv_wait(&ff->ff_closing, &ff->ff_lock);
603 }
604 atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
605 } else {
606 /* If no references, there must be no knotes. */
607 KASSERT(SLIST_EMPTY(&ff->ff_knlist));
608 }
609 mutex_exit(&ff->ff_lock);
610
611 /*
612 * POSIX record locking dictates that any close releases ALL
613 * locks owned by this process. This is handled by setting
614 * a flag in the unlock to free ONLY locks obeying POSIX
615 * semantics, and not to free BSD-style file locks.
616 * If the descriptor was in a message, POSIX-style locks
617 * aren't passed with the descriptor.
618 */
619 if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
620 lf.l_whence = SEEK_SET;
621 lf.l_start = 0;
622 lf.l_len = 0;
623 lf.l_type = F_UNLCK;
624 (void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
625 }
626
627
628 /* Free descriptor slot. */
629 mutex_enter(&fdp->fd_lock);
630 fd_unused(fdp, fd);
631 mutex_exit(&fdp->fd_lock);
632
633 /* Now drop reference to the file itself. */
634 return closef(fp);
635 }
636
637 /*
638 * Duplicate a file descriptor.
639 */
640 int
641 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
642 {
643 proc_t *p;
644 int error;
645
646 p = curproc;
647
648 while ((error = fd_alloc(p, minfd, newp)) != 0) {
649 if (error != ENOSPC) {
650 return error;
651 }
652 fd_tryexpand(p);
653 }
654
655 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
656 fd_affix(p, fp, *newp);
657 return 0;
658 }
659
660 /*
661 * dup2 operation.
662 */
663 int
664 fd_dup2(file_t *fp, unsigned new)
665 {
666 filedesc_t *fdp;
667 fdfile_t *ff;
668
669 fdp = curlwp->l_fd;
670
671 /*
672 * Ensure there are enough slots in the descriptor table,
673 * and allocate an fdfile_t up front in case we need it.
674 */
675 while (new >= fdp->fd_nfiles) {
676 fd_tryexpand(curproc);
677 }
678 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
679
680 /*
681 * If there is already a file open, close it. If the file is
682 * half open, wait for it to be constructed before closing it.
683 * XXX Potential for deadlock here?
684 */
685 mutex_enter(&fdp->fd_lock);
686 while (fd_isused(fdp, new)) {
687 mutex_exit(&fdp->fd_lock);
688 if (fd_getfile(new) != NULL) {
689 (void)fd_close(new);
690 } else {
691 /* XXX Crummy, but unlikely to happen. */
692 kpause("dup2", false, 1, NULL);
693 }
694 mutex_enter(&fdp->fd_lock);
695 }
696 if (fdp->fd_ofiles[new] == NULL) {
697 KASSERT(new >= NDFDFILE);
698 fdp->fd_ofiles[new] = ff;
699 ff = NULL;
700 }
701 fd_used(fdp, new);
702 mutex_exit(&fdp->fd_lock);
703
704 /* Slot is now allocated. Insert copy of the file. */
705 fd_affix(curproc, fp, new);
706 if (ff != NULL) {
707 pool_cache_put(fdfile_cache, ff);
708 }
709 return 0;
710 }
711
712 /*
713 * Drop reference to a file structure.
714 */
715 int
716 closef(file_t *fp)
717 {
718 struct flock lf;
719 int error;
720
721 /*
722 * Drop reference. If referenced elsewhere it's still open
723 * and we have nothing more to do.
724 */
725 mutex_enter(&fp->f_lock);
726 KASSERT(fp->f_count > 0);
727 if (--fp->f_count > 0) {
728 mutex_exit(&fp->f_lock);
729 return 0;
730 }
731 KASSERT(fp->f_count == 0);
732 mutex_exit(&fp->f_lock);
733
734 /* We held the last reference - release locks, close and free. */
735 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
736 lf.l_whence = SEEK_SET;
737 lf.l_start = 0;
738 lf.l_len = 0;
739 lf.l_type = F_UNLCK;
740 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
741 }
742 if (fp->f_ops != NULL) {
743 error = (*fp->f_ops->fo_close)(fp);
744 } else {
745 error = 0;
746 }
747 ffree(fp);
748
749 return error;
750 }
751
752 /*
753 * Allocate a file descriptor for the process.
754 */
755 int
756 fd_alloc(proc_t *p, int want, int *result)
757 {
758 filedesc_t *fdp;
759 int i, lim, last, error;
760 u_int off, new;
761 fdfile_t *ff;
762
763 KASSERT(p == curproc || p == &proc0);
764
765 fdp = p->p_fd;
766 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
767 KASSERT(ff->ff_refcnt == 0);
768 KASSERT(ff->ff_file == NULL);
769
770 /*
771 * Search for a free descriptor starting at the higher
772 * of want or fd_freefile.
773 */
774 mutex_enter(&fdp->fd_lock);
775 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
776 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
777 last = min(fdp->fd_nfiles, lim);
778 for (;;) {
779 if ((i = want) < fdp->fd_freefile)
780 i = fdp->fd_freefile;
781 off = i >> NDENTRYSHIFT;
782 new = fd_next_zero(fdp, fdp->fd_himap, off,
783 (last + NDENTRIES - 1) >> NDENTRYSHIFT);
784 if (new == -1)
785 break;
786 i = fd_next_zero(fdp, &fdp->fd_lomap[new],
787 new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
788 if (i == -1) {
789 /*
790 * Free file descriptor in this block was
791 * below want, try again with higher want.
792 */
793 want = (new + 1) << NDENTRYSHIFT;
794 continue;
795 }
796 i += (new << NDENTRYSHIFT);
797 if (i >= last) {
798 break;
799 }
800 if (fdp->fd_ofiles[i] == NULL) {
801 KASSERT(i >= NDFDFILE);
802 fdp->fd_ofiles[i] = ff;
803 } else {
804 pool_cache_put(fdfile_cache, ff);
805 }
806 KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
807 fd_used(fdp, i);
808 if (want <= fdp->fd_freefile) {
809 fdp->fd_freefile = i;
810 }
811 *result = i;
812 mutex_exit(&fdp->fd_lock);
813 KASSERT(i >= NDFDFILE ||
814 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
815 return 0;
816 }
817
818 /* No space in current array. Let the caller expand and retry. */
819 error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
820 mutex_exit(&fdp->fd_lock);
821 pool_cache_put(fdfile_cache, ff);
822 return error;
823 }
824
825 /*
826 * Expand a process' descriptor table.
827 */
828 void
829 fd_tryexpand(proc_t *p)
830 {
831 filedesc_t *fdp;
832 int i, numfiles, oldnfiles;
833 fdfile_t **newofile;
834 uint32_t *newhimap, *newlomap;
835
836 KASSERT(p == curproc || p == &proc0);
837
838 fdp = p->p_fd;
839 newhimap = NULL;
840 newlomap = NULL;
841 oldnfiles = fdp->fd_nfiles;
842
843 if (oldnfiles < NDEXTENT)
844 numfiles = NDEXTENT;
845 else
846 numfiles = 2 * oldnfiles;
847
848 newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
849 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
850 newhimap = malloc(NDHISLOTS(numfiles) *
851 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
852 newlomap = malloc(NDLOSLOTS(numfiles) *
853 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
854 }
855
856 mutex_enter(&fdp->fd_lock);
857 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
858 if (fdp->fd_nfiles != oldnfiles) {
859 /* fdp changed; caller must retry */
860 mutex_exit(&fdp->fd_lock);
861 free(newofile, M_FILEDESC);
862 if (newhimap != NULL)
863 free(newhimap, M_FILEDESC);
864 if (newlomap != NULL)
865 free(newlomap, M_FILEDESC);
866 return;
867 }
868
869 /* Copy the existing ofile array and zero the new portion. */
870 i = sizeof(fdfile_t *) * fdp->fd_nfiles;
871 memcpy(newofile, fdp->fd_ofiles, i);
872 memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);
873
874 /*
875 * Link old ofiles array into list to be discarded. We defer
876 * freeing until process exit if the descriptor table is visible
877 * to other threads.
878 */
879 if (oldnfiles > NDFILE) {
880 if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
881 *(void **)fdp->fd_ofiles = fdp->fd_discard;
882 fdp->fd_discard = fdp->fd_ofiles;
883 } else {
884 free(fdp->fd_ofiles, M_FILEDESC);
885 }
886 }
887
888 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
889 i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
890 memcpy(newhimap, fdp->fd_himap, i);
891 memset((uint8_t *)newhimap + i, 0,
892 NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
893
894 i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
895 memcpy(newlomap, fdp->fd_lomap, i);
896 memset((uint8_t *)newlomap + i, 0,
897 NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
898
899 if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
900 free(fdp->fd_himap, M_FILEDESC);
901 free(fdp->fd_lomap, M_FILEDESC);
902 }
903 fdp->fd_himap = newhimap;
904 fdp->fd_lomap = newlomap;
905 }
906
907 /*
908 * All other modifications must become globally visible before
909 * the change to fd_nfiles. See fd_getfile().
910 */
911 fdp->fd_ofiles = newofile;
912 membar_producer();
913 fdp->fd_nfiles = numfiles;
914 mutex_exit(&fdp->fd_lock);
915
916 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
917 }
918
919 /*
920 * Create a new open file structure and allocate a file descriptor
921 * for the current process.
922 */
923 int
924 fd_allocfile(file_t **resultfp, int *resultfd)
925 {
926 file_t *fp;
927 proc_t *p;
928 int error;
929
930 p = curproc;
931
932 while ((error = fd_alloc(p, 0, resultfd)) != 0) {
933 if (error != ENOSPC) {
934 return error;
935 }
936 fd_tryexpand(p);
937 }
938
939 fp = pool_cache_get(file_cache, PR_WAITOK);
940 KASSERT(fp->f_count == 0);
941 fp->f_cred = kauth_cred_get();
942 kauth_cred_hold(fp->f_cred);
943
944 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
945 fd_abort(p, fp, *resultfd);
946 tablefull("file", "increase kern.maxfiles or MAXFILES");
947 return ENFILE;
948 }
949
950 fp->f_advice = 0;
951 fp->f_msgcount = 0;
952 fp->f_offset = 0;
953 fp->f_iflags = 0;
954 *resultfp = fp;
955
956 return 0;
957 }
958
959 /*
960 * Successful creation of a new descriptor: make visible to the process.
961 */
962 void
963 fd_affix(proc_t *p, file_t *fp, unsigned fd)
964 {
965 fdfile_t *ff;
966 filedesc_t *fdp;
967
968 KASSERT(p == curproc || p == &proc0);
969
970 /* Add a reference to the file structure. */
971 mutex_enter(&fp->f_lock);
972 fp->f_count++;
973 mutex_exit(&fp->f_lock);
974
975 /*
976 * Insert the new file into the descriptor slot.
977 *
978 * The memory barriers provided by lock activity in this routine
979 * ensure that any updates to the file structure become globally
980 * visible before the file becomes visible to other LWPs in the
981 * current process.
982 */
983 fdp = p->p_fd;
984 ff = fdp->fd_ofiles[fd];
985
986 KASSERT(ff != NULL);
987 KASSERT(ff->ff_file == NULL);
988 KASSERT(ff->ff_allocated);
989 KASSERT(fd_isused(fdp, fd));
990 KASSERT(fd >= NDFDFILE ||
991 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
992
993 /* No need to lock in order to make file initially visible. */
994 ff->ff_file = fp;
995 }
996
997 /*
998 * Abort creation of a new descriptor: free descriptor slot and file.
999 */
1000 void
1001 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1002 {
1003 filedesc_t *fdp;
1004 fdfile_t *ff;
1005
1006 KASSERT(p == curproc || p == &proc0);
1007
1008 fdp = p->p_fd;
1009 ff = fdp->fd_ofiles[fd];
1010
1011 KASSERT(fd >= NDFDFILE ||
1012 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1013
1014 mutex_enter(&fdp->fd_lock);
1015 KASSERT(fd_isused(fdp, fd));
1016 fd_unused(fdp, fd);
1017 mutex_exit(&fdp->fd_lock);
1018
1019 if (fp != NULL) {
1020 ffree(fp);
1021 }
1022 }
1023
1024 /*
1025 * Free a file descriptor.
1026 */
1027 void
1028 ffree(file_t *fp)
1029 {
1030
1031 KASSERT(fp->f_count == 0);
1032
1033 atomic_dec_uint(&nfiles);
1034 kauth_cred_free(fp->f_cred);
1035 pool_cache_put(file_cache, fp);
1036 }
1037
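/*
 * Pool cache constructor for file_t: zero the structure, initialize
 * the per-file lock and insert the file on the global filehead list.
 */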
1038 static int
1039 file_ctor(void *arg, void *obj, int flags)
1040 {
1041 file_t *fp = obj;
1042
1043 memset(fp, 0, sizeof(*fp));
1044 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1045
1046 mutex_enter(&filelist_lock);
1047 LIST_INSERT_HEAD(&filehead, fp, f_list);
1048 mutex_exit(&filelist_lock);
1049
1050 return 0;
1051 }
1052
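/*
 * Pool cache destructor for file_t: remove the file from the global
 * list and destroy the per-file lock.
 */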
1053 static void
1054 file_dtor(void *arg, void *obj)
1055 {
1056 file_t *fp = obj;
1057
1058 mutex_enter(&filelist_lock);
1059 LIST_REMOVE(fp, f_list);
1060 mutex_exit(&filelist_lock);
1061
1062 mutex_destroy(&fp->f_lock);
1063 }
1064
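/*
 * Pool cache constructor for fdfile_t: initialize the lock and the
 * condition variable used to wait out concurrent closes.
 */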
1065 static int
1066 fdfile_ctor(void *arg, void *obj, int flags)
1067 {
1068 fdfile_t *ff = obj;
1069
1070 memset(ff, 0, sizeof(*ff));
1071 mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
1072 cv_init(&ff->ff_closing, "fdclose");
1073
1074 return 0;
1075 }
1076
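/*
 * Pool cache destructor for fdfile_t.
 */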
1077 static void
1078 fdfile_dtor(void *arg, void *obj)
1079 {
1080 fdfile_t *ff = obj;
1081
1082 mutex_destroy(&ff->ff_lock);
1083 cv_destroy(&ff->ff_closing);
1084 }
1085
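/*
 * Allocate a bare file_t that is not attached to any descriptor table
 * and does not appear on the global file list.  Release it with
 * fputdummy().
 */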
1086 file_t *
1087 fgetdummy(void)
1088 {
1089 file_t *fp;
1090
1091 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1092 if (fp != NULL) {
1093 memset(fp, 0, sizeof(*fp));
1094 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1095 }
1096 return fp;
1097 }
1098
1099 void
1100 fputdummy(file_t *fp)
1101 {
1102
1103 mutex_destroy(&fp->f_lock);
1104 kmem_free(fp, sizeof(*fp));
1105 }
1106
1107 /*
1108 * Create an initial filedesc structure.
1109 */
1110 filedesc_t *
1111 fd_init(filedesc_t *fdp)
1112 {
1113 unsigned fd;
1114
1115 if (fdp == NULL) {
1116 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1117 } else {
1118 filedesc_ctor(NULL, fdp, PR_WAITOK);
1119 }
1120
1121 fdp->fd_refcnt = 1;
1122 fdp->fd_ofiles = fdp->fd_dfiles;
1123 fdp->fd_nfiles = NDFILE;
1124 fdp->fd_himap = fdp->fd_dhimap;
1125 fdp->fd_lomap = fdp->fd_dlomap;
1126 KASSERT(fdp->fd_lastfile == -1);
1127 KASSERT(fdp->fd_lastkqfile == -1);
1128 KASSERT(fdp->fd_knhash == NULL);
1129
1130 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1131 offsetof(filedesc_t, fd_startzero));
1132 for (fd = 0; fd < NDFDFILE; fd++) {
1133 fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1134 }
1135
1136 return fdp;
1137 }
1138
1139 /*
1140 * Initialize a file descriptor table.
1141 */
1142 static int
1143 filedesc_ctor(void *arg, void *obj, int flag)
1144 {
1145 filedesc_t *fdp = obj;
1146 int i;
1147
1148 memset(fdp, 0, sizeof(*fdp));
1149 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1150 fdp->fd_lastfile = -1;
1151 fdp->fd_lastkqfile = -1;
1152
1153 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1154 for (i = 0; i < NDFDFILE; i++) {
1155 fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1156 }
1157
1158 return 0;
1159 }
1160
1161 static void
1162 filedesc_dtor(void *arg, void *obj)
1163 {
1164 filedesc_t *fdp = obj;
1165 int i;
1166
1167 for (i = 0; i < NDFDFILE; i++) {
1168 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1169 }
1170
1171 mutex_destroy(&fdp->fd_lock);
1172 }
1173
1174 /*
1175 * Make p2 share p1's filedesc structure.
1176 */
1177 void
1178 fd_share(struct proc *p2)
1179 {
1180 filedesc_t *fdp;
1181
1182 fdp = curlwp->l_fd;
1183 p2->p_fd = fdp;
1184 atomic_inc_uint(&fdp->fd_refcnt);
1185 }
1186
1187 /*
1188 * Copy a filedesc structure.
1189 */
1190 filedesc_t *
1191 fd_copy(void)
1192 {
1193 filedesc_t *newfdp, *fdp;
1194 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1195 int i, nused, numfiles, lastfile, j, newlast;
1196 file_t *fp;
1197
1198 fdp = curproc->p_fd;
1199 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1200 newfdp->fd_refcnt = 1;
1201
1202 KASSERT(newfdp->fd_knhash == NULL);
1203 KASSERT(newfdp->fd_knhashmask == 0);
1204 KASSERT(newfdp->fd_discard == NULL);
1205
1206 for (;;) {
1207 numfiles = fdp->fd_nfiles;
1208 lastfile = fdp->fd_lastfile;
1209
1210 /*
1211 * If the number of open files fits in the internal arrays
1212 * of the open file structure, use them, otherwise allocate
1213 * additional memory for the number of descriptors currently
1214 * in use.
1215 */
1216 if (lastfile < NDFILE) {
1217 i = NDFILE;
1218 newfdp->fd_ofiles = newfdp->fd_dfiles;
1219 } else {
1220 /*
1221 * Compute the smallest multiple of NDEXTENT needed
1222 * for the file descriptors currently in use,
1223 * allowing the table to shrink.
1224 */
1225 i = numfiles;
1226 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1227 i /= 2;
1228 }
1229 newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *),
1230 M_FILEDESC, M_WAITOK);
1231 KASSERT(i >= NDFILE);
1232 }
1233 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1234 newfdp->fd_himap = newfdp->fd_dhimap;
1235 newfdp->fd_lomap = newfdp->fd_dlomap;
1236 } else {
1237 newfdp->fd_himap = malloc(NDHISLOTS(i) *
1238 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1239 newfdp->fd_lomap = malloc(NDLOSLOTS(i) *
1240 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1241 }
1242
1243 /*
1244 * Allocate and string together fdfile structures.
1245 * We abuse fdfile_t::ff_file here, but it will be
1246 * cleared before this routine returns.
1247 */
1248 nused = fdp->fd_nused;
1249 fflist = NULL;
1250 for (j = nused; j != 0; j--) {
1251 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1252 ff->ff_file = (void *)fflist;
1253 fflist = ff;
1254 }
1255
1256 mutex_enter(&fdp->fd_lock);
1257 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1258 lastfile == fdp->fd_lastfile) {
1259 break;
1260 }
1261 mutex_exit(&fdp->fd_lock);
1262 if (i >= NDFILE) {
1263 free(newfdp->fd_ofiles, M_FILEDESC);
1264 }
1265 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1266 free(newfdp->fd_himap, M_FILEDESC);
1267 free(newfdp->fd_lomap, M_FILEDESC);
1268 }
1269 while (fflist != NULL) {
1270 ff = fflist;
1271 fflist = (void *)ff->ff_file;
1272 ff->ff_file = NULL;
1273 pool_cache_put(fdfile_cache, ff);
1274 }
1275 }
1276
1277 newfdp->fd_nfiles = i;
1278 newfdp->fd_freefile = fdp->fd_freefile;
1279 newfdp->fd_exclose = fdp->fd_exclose;
1280
1281 /*
1282 * Clear the entries that will not be copied over.
1283 * Avoid calling memset with 0 size.
1284 */
1285 if (lastfile < (i-1)) {
1286 memset(newfdp->fd_ofiles + lastfile + 1, 0,
1287 (i - lastfile - 1) * sizeof(file_t **));
1288 }
1289 if (i < NDENTRIES * NDENTRIES) {
1290 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1291 }
1292 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1293 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1294
1295 ffp = fdp->fd_ofiles;
1296 nffp = newfdp->fd_ofiles;
1297 j = imax(lastfile, (NDFDFILE - 1));
1298 newlast = -1;
1299 KASSERT(j < fdp->fd_nfiles);
1300 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1301 ff = *ffp;
1302 /* Install built-in fdfiles even if unused here. */
1303 if (i < NDFDFILE) {
1304 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1305 } else {
1306 ff2 = NULL;
1307 }
1308 /* Determine if descriptor is active in parent. */
1309 if (ff == NULL || !fd_isused(fdp, i)) {
1310 KASSERT(ff != NULL || i >= NDFDFILE);
1311 continue;
1312 }
1313 mutex_enter(&ff->ff_lock);
1314 fp = ff->ff_file;
1315 if (fp == NULL) {
1316 /* Descriptor is half-open: free slot. */
1317 fd_zap(newfdp, i);
1318 mutex_exit(&ff->ff_lock);
1319 continue;
1320 }
1321 if (fp->f_type == DTYPE_KQUEUE) {
1322 /* kqueue descriptors cannot be copied. */
1323 fd_zap(newfdp, i);
1324 mutex_exit(&ff->ff_lock);
1325 continue;
1326 }
1327 /* It's active: add a reference to the file. */
1328 mutex_enter(&fp->f_lock);
1329 fp->f_count++;
1330 mutex_exit(&fp->f_lock);
1331 /* Consume one fdfile_t to represent it. */
1332 if (i >= NDFDFILE) {
1333 ff2 = fflist;
1334 fflist = (void *)ff2->ff_file;
1335 }
1336 ff2->ff_file = fp;
1337 ff2->ff_exclose = ff->ff_exclose;
1338 ff2->ff_allocated = true;
1339 mutex_exit(&ff->ff_lock);
1340 if (i > newlast) {
1341 newlast = i;
1342 }
1343 }
1344 mutex_exit(&fdp->fd_lock);
1345
1346 /* Discard unused fdfile_t structures. */
1347 while (__predict_false(fflist != NULL)) {
1348 ff = fflist;
1349 fflist = (void *)ff->ff_file;
1350 ff->ff_file = NULL;
1351 pool_cache_put(fdfile_cache, ff);
1352 nused--;
1353 }
1354 KASSERT(nused >= 0);
1355 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1356
1357 newfdp->fd_nused = nused;
1358 newfdp->fd_lastfile = newlast;
1359
1360 return (newfdp);
1361 }
1362
1363 /*
1364 * Release a filedesc structure.
1365 */
1366 void
1367 fd_free(void)
1368 {
1369 filedesc_t *fdp;
1370 fdfile_t *ff;
1371 file_t *fp;
1372 int fd, lastfd;
1373 void *discard;
1374
1375 fdp = curlwp->l_fd;
1376
1377 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1378
1379 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1380 return;
1381
1382 /*
1383 * Close any files that the process holds open.
1384 */
1385 for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1386 ff = fdp->fd_ofiles[fd];
1387 KASSERT(fd >= NDFDFILE ||
1388 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1389 if ((ff = fdp->fd_ofiles[fd]) == NULL)
1390 continue;
1391 if ((fp = ff->ff_file) != NULL) {
1392 /*
1393 * Must use fd_close() here as kqueue holds
1394 * long term references to descriptors.
1395 */
1396 ff->ff_refcnt++;
1397 fd_close(fd);
1398 }
1399 KASSERT(ff->ff_refcnt == 0);
1400 KASSERT(ff->ff_file == NULL);
1401 KASSERT(!ff->ff_exclose);
1402 KASSERT(!ff->ff_allocated);
1403 if (fd >= NDFDFILE) {
1404 pool_cache_put(fdfile_cache, ff);
1405 }
1406 }
1407
1408 /*
1409 * Clean out the descriptor table for the next user and return
1410 * to the cache.
1411 */
1412 while ((discard = fdp->fd_discard) != NULL) {
1413 KASSERT(discard != fdp->fd_ofiles);
1414 fdp->fd_discard = *(void **)discard;
1415 free(discard, M_FILEDESC);
1416 }
1417 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1418 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1419 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1420 free(fdp->fd_himap, M_FILEDESC);
1421 free(fdp->fd_lomap, M_FILEDESC);
1422 }
1423 if (fdp->fd_nfiles > NDFILE) {
1424 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1425 free(fdp->fd_ofiles, M_FILEDESC);
1426 }
1427 if (fdp->fd_knhash != NULL) {
1428 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1429 fdp->fd_knhash = NULL;
1430 fdp->fd_knhashmask = 0;
1431 } else {
1432 KASSERT(fdp->fd_knhashmask == 0);
1433 }
1434 fdp->fd_lastkqfile = -1;
1435 pool_cache_put(filedesc_cache, fdp);
1436 }
1437
1438 /*
1439 * File Descriptor pseudo-device driver (/dev/fd/).
1440 *
1441 * Opening minor device N dup()s the file (if any) connected to file
1442 * descriptor N belonging to the calling process. Note that this driver
1443 * consists of only the ``open()'' routine, because all subsequent
1444 * references to this file will be direct to the other driver.
1445 */
1446 static int
1447 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1448 {
1449
1450 /*
1451 * XXX Kludge: set dupfd to contain the value of the
1452 * file descriptor being sought for duplication. The error
1453 * return ensures that the vnode for this device will be released
1454 * by vn_open. Open will detect this special error and take the
1455 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1456 * will simply report the error.
1457 */
1458 l->l_dupfd = minor(dev); /* XXX */
1459 return EDUPFD;
1460 }
1461
1462 /*
1463 * Duplicate the specified descriptor to a free descriptor.
1464 */
1465 int
1466 fd_dupopen(int old, int *new, int mode, int error)
1467 {
1468 filedesc_t *fdp;
1469 fdfile_t *ff;
1470 file_t *fp;
1471
1472 if ((fp = fd_getfile(old)) == NULL) {
1473 return EBADF;
1474 }
1475 fdp = curlwp->l_fd;
1476 ff = fdp->fd_ofiles[old];
1477
1478 /*
1479 * There are two cases of interest here.
1480 *
1481 * For EDUPFD simply dup (dfd) to file descriptor
1482 * (indx) and return.
1483 *
1484 * For EMOVEFD steal away the file structure from (dfd) and
1485 * store it in (indx). (dfd) is effectively closed by
1486 * this operation.
1487 *
1488 * Any other error code is just returned.
1489 */
1490 switch (error) {
1491 case EDUPFD:
1492 /*
1493 * Check that the mode the file is being opened for is a
1494 * subset of the mode of the existing descriptor.
1495 */
1496 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1497 error = EACCES;
1498 break;
1499 }
1500
1501 /* Copy it. */
1502 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1503 break;
1504
1505 case EMOVEFD:
1506 /* Copy it. */
1507 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1508 if (error != 0) {
1509 break;
1510 }
1511
1512 /* Steal away the file pointer from 'old'. */
1513 (void)fd_close(old);
1514 return 0;
1515 }
1516
1517 fd_putfile(old);
1518 return error;
1519 }
1520
1521 /*
1522 * Close open files on exec.
1523 */
1524 void
1525 fd_closeexec(void)
1526 {
1527 proc_t *p;
1528 filedesc_t *fdp;
1529 fdfile_t *ff;
1530 lwp_t *l;
1531 int fd;
1532
1533 l = curlwp;
1534 p = l->l_proc;
1535 fdp = p->p_fd;
1536
1537 cwdunshare(p);
1538
1539 if (p->p_cwdi->cwdi_edir) {
1540 vrele(p->p_cwdi->cwdi_edir);
1541 }
1542
1543 if (fdp->fd_refcnt > 1) {
1544 fdp = fd_copy();
1545 fd_free();
1546 p->p_fd = fdp;
1547 l->l_fd = fdp;
1548 }
1549 if (!fdp->fd_exclose) {
1550 return;
1551 }
1552 fdp->fd_exclose = false;
1553
1554 for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1555 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1556 KASSERT(fd >= NDFDFILE);
1557 continue;
1558 }
1559 KASSERT(fd >= NDFDFILE ||
1560 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1561 if (ff->ff_file == NULL)
1562 continue;
1563 if (ff->ff_exclose) {
1564 /*
1565 * We need a reference to close the file.
1566 * No other threads can see the fdfile_t at
1567 * this point, so don't bother locking.
1568 */
1569 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1570 ff->ff_refcnt++;
1571 fd_close(fd);
1572 }
1573 }
1574 }
1575
1576 /*
1577 * It is unsafe for set[ug]id processes to be started with file
1578 * descriptors 0..2 closed, as these descriptors are given implicit
1579 * significance in the Standard C library. fdcheckstd() will create a
1580 * descriptor referencing /dev/null for each of stdin, stdout, and
1581 * stderr that is not already open.
1582 */
1583 #define CHECK_UPTO 3
1584 int
1585 fd_checkstd(void)
1586 {
1587 struct proc *p;
1588 struct nameidata nd;
1589 filedesc_t *fdp;
1590 file_t *fp;
1591 struct proc *pp;
1592 int fd, i, error, flags = FREAD|FWRITE;
1593 char closed[CHECK_UPTO * 3 + 1], which[3 + 1];
1594
1595 p = curproc;
1596 closed[0] = '\0';
1597 if ((fdp = p->p_fd) == NULL)
1598 return (0);
1599 for (i = 0; i < CHECK_UPTO; i++) {
1600 KASSERT(i >= NDFDFILE ||
1601 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
1602 if (fdp->fd_ofiles[i]->ff_file != NULL)
1603 continue;
1604 snprintf(which, sizeof(which), ",%d", i);
1605 strlcat(closed, which, sizeof(closed));
1606 if ((error = fd_allocfile(&fp, &fd)) != 0)
1607 return (error);
1608 KASSERT(fd < CHECK_UPTO);
1609 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
1610 if ((error = vn_open(&nd, flags, 0)) != 0) {
1611 fd_abort(p, fp, fd);
1612 return (error);
1613 }
1614 fp->f_data = nd.ni_vp;
1615 fp->f_flag = flags;
1616 fp->f_ops = &vnops;
1617 fp->f_type = DTYPE_VNODE;
1618 VOP_UNLOCK(nd.ni_vp, 0);
1619 fd_affix(p, fp, fd);
1620 }
1621 if (closed[0] != '\0') {
1622 mutex_enter(proc_lock);
1623 pp = p->p_pptr;
1624 mutex_enter(pp->p_lock);
1625 log(LOG_WARNING, "set{u,g}id pid %d (%s) "
1626 "was invoked by uid %d ppid %d (%s) "
1627 "with fd %s closed\n",
1628 p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
1629 pp->p_pid, pp->p_comm, &closed[1]);
1630 mutex_exit(pp->p_lock);
1631 mutex_exit(proc_lock);
1632 }
1633 return (0);
1634 }
1635 #undef CHECK_UPTO
1636
1637 /*
1638 * Set descriptor owner. If the owner is a process, 'pgid' is set
1639 * to the (positive) process ID. If the owner is a process group,
1640 * 'pgid' is set to the negated process group ID (-pg_id).
1641 */
1642 int
1643 fsetown(pid_t *pgid, u_long cmd, const void *data)
1644 {
1645 int id = *(const int *)data;
1646 int error;
1647
1648 switch (cmd) {
1649 case TIOCSPGRP:
1650 if (id < 0)
1651 return (EINVAL);
1652 id = -id;
1653 break;
1654 default:
1655 break;
1656 }
1657
1658 if (id > 0 && !pfind(id))
1659 return (ESRCH);
1660 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1661 return (error);
1662
1663 *pgid = id;
1664 return (0);
1665 }
1666
1667 /*
1668 * Return descriptor owner information. If the value is positive,
1669 * it is a process ID. If it is negative, it is a process group ID
1670 * and needs the sign removed before use.
1671 */
1672 int
1673 fgetown(pid_t pgid, u_long cmd, void *data)
1674 {
1675
1676 switch (cmd) {
1677 case TIOCGPGRP:
1678 *(int *)data = -pgid;
1679 break;
1680 default:
1681 *(int *)data = pgid;
1682 break;
1683 }
1684 return (0);
1685 }
1686
1687 /*
1688 * Send signal to descriptor owner, either process or process group.
1689 */
1690 void
1691 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1692 {
1693 struct proc *p1;
1694 struct pgrp *pgrp;
1695 ksiginfo_t ksi;
1696
1697 KASSERT(!cpu_intr_p());
1698
1699 KSI_INIT(&ksi);
1700 ksi.ksi_signo = signo;
1701 ksi.ksi_code = code;
1702 ksi.ksi_band = band;
1703
1704 mutex_enter(proc_lock);
1705 if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1706 kpsignal(p1, &ksi, fdescdata);
1707 else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1708 kpgsignal(pgrp, &ksi, fdescdata, 0);
1709 mutex_exit(proc_lock);
1710 }
1711
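/*
 * Fill in a newly allocated file with the given flags, fileops and
 * per-file data, make it visible at descriptor 'fd' and return
 * EMOVEFD so that the open path hands this descriptor back to the
 * caller.  Typically used by device open routines that clone a new
 * file for every open.
 */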
1712 int
1713 fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
1714 void *data)
1715 {
1716
1717 fp->f_flag = flag;
1718 fp->f_type = DTYPE_MISC;
1719 fp->f_ops = fops;
1720 fp->f_data = data;
1721 curlwp->l_dupfd = fd;
1722 fd_affix(curproc, fp, fd);
1723
1724 return EMOVEFD;
1725 }
1726
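/*
 * fnullop_*() are no-op fileops methods: fnullop_fcntl() accepts only
 * F_SETFL, fnullop_poll() reports no events and fnullop_kqfilter()
 * succeeds without attaching anything.
 */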
1727 int
1728 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1729 {
1730
1731 if (cmd == F_SETFL)
1732 return 0;
1733
1734 return EOPNOTSUPP;
1735 }
1736
1737 int
1738 fnullop_poll(file_t *fp, int which)
1739 {
1740
1741 return 0;
1742 }
1743
1744 int
1745 fnullop_kqfilter(file_t *fp, struct knote *kn)
1746 {
1747
1748 return 0;
1749 }
1750
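/*
 * fbadop_*() are fileops stubs for operations that a file type does
 * not support; they fail with EOPNOTSUPP.
 */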
1751 int
1752 fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
1753 kauth_cred_t cred, int flags)
1754 {
1755
1756 return EOPNOTSUPP;
1757 }
1758
1759 int
1760 fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
1761 kauth_cred_t cred, int flags)
1762 {
1763
1764 return EOPNOTSUPP;
1765 }
1766
1767 int
1768 fbadop_ioctl(file_t *fp, u_long com, void *data)
1769 {
1770
1771 return EOPNOTSUPP;
1772 }
1773
1774 int
1775 fbadop_stat(file_t *fp, struct stat *sb)
1776 {
1777
1778 return EOPNOTSUPP;
1779 }
1780
1781 int
1782 fbadop_close(file_t *fp)
1783 {
1784
1785 return EOPNOTSUPP;
1786 }
1787