kern_descrip.c revision 1.173 1 /* $NetBSD: kern_descrip.c,v 1.173 2008/03/21 21:53:35 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the NetBSD
18 * Foundation, Inc. and its contributors.
19 * 4. Neither the name of The NetBSD Foundation nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
27 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 /*
37 * Copyright (c) 1982, 1986, 1989, 1991, 1993
38 * The Regents of the University of California. All rights reserved.
39 * (c) UNIX System Laboratories, Inc.
40 * All or some portions of this file are derived from material licensed
41 * to the University of California by American Telephone and Telegraph
42 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
43 * the permission of UNIX System Laboratories, Inc.
44 *
45 * Redistribution and use in source and binary forms, with or without
46 * modification, are permitted provided that the following conditions
47 * are met:
48 * 1. Redistributions of source code must retain the above copyright
49 * notice, this list of conditions and the following disclaimer.
50 * 2. Redistributions in binary form must reproduce the above copyright
51 * notice, this list of conditions and the following disclaimer in the
52 * documentation and/or other materials provided with the distribution.
53 * 3. Neither the name of the University nor the names of its contributors
54 * may be used to endorse or promote products derived from this software
55 * without specific prior written permission.
56 *
57 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67 * SUCH DAMAGE.
68 *
69 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
70 */
71
72 /*
73 * File descriptor management.
74 */
75
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.173 2008/03/21 21:53:35 ad Exp $");
78
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/filedesc.h>
82 #include <sys/kernel.h>
83 #include <sys/vnode.h>
84 #include <sys/proc.h>
85 #include <sys/file.h>
86 #include <sys/namei.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/stat.h>
90 #include <sys/ioctl.h>
91 #include <sys/fcntl.h>
92 #include <sys/malloc.h>
93 #include <sys/pool.h>
94 #include <sys/syslog.h>
95 #include <sys/unistd.h>
96 #include <sys/resourcevar.h>
97 #include <sys/conf.h>
98 #include <sys/event.h>
99 #include <sys/kauth.h>
100 #include <sys/atomic.h>
101 #include <sys/mount.h>
102 #include <sys/syscallargs.h>
103
/* Pool-cache constructor/destructor hooks, defined below. */
static int	cwdi_ctor(void *, void *, int);
static void	cwdi_dtor(void *, void *);
static int	file_ctor(void *, void *, int);
static void	file_dtor(void *, void *);
static int	fdfile_ctor(void *, void *, int);
static void	fdfile_dtor(void *, void *);
static int	filedesc_ctor(void *, void *, int);
static void	filedesc_dtor(void *, void *);
static int	filedescopen(dev_t, int, int, lwp_t *);

kmutex_t	filelist_lock;	/* lock on filehead */
struct filelist	filehead;	/* head of list of open files */
u_int		nfiles;		/* actual number of open files */

/* Pool caches, initialized in fd_sys_init(). */
static pool_cache_t cwdi_cache;
static pool_cache_t filedesc_cache;
static pool_cache_t file_cache;
static pool_cache_t fdfile_cache;

MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");

/* Character device switch for /dev/fd (open only; everything else is a no-op). */
const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};

/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
133
134 /*
135 * Initialize the descriptor system.
136 */
137 void
138 fd_sys_init(void)
139 {
140
141 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
142
143 file_cache = pool_cache_init(sizeof(file_t), CACHE_LINE_SIZE, 0,
144 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
145 KASSERT(file_cache != NULL);
146
147 fdfile_cache = pool_cache_init(sizeof(fdfile_t), CACHE_LINE_SIZE, 0,
148 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
149 NULL);
150 KASSERT(fdfile_cache != NULL);
151
152 cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), CACHE_LINE_SIZE,
153 0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
154 KASSERT(cwdi_cache != NULL);
155
156 filedesc_cache = pool_cache_init(sizeof(filedesc_t), CACHE_LINE_SIZE,
157 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
158 NULL);
159 KASSERT(filedesc_cache != NULL);
160 }
161
162 static int
163 fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
164 {
165 int i, off, maxoff;
166 uint32_t sub;
167
168 KASSERT(mutex_owned(&fdp->fd_lock));
169
170 if (want > bits)
171 return -1;
172
173 off = want >> NDENTRYSHIFT;
174 i = want & NDENTRYMASK;
175 if (i) {
176 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
177 if (sub != ~0)
178 goto found;
179 off++;
180 }
181
182 maxoff = NDLOSLOTS(bits);
183 while (off < maxoff) {
184 if ((sub = bitmap[off]) != ~0)
185 goto found;
186 off++;
187 }
188
189 return (-1);
190
191 found:
192 return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
193 }
194
195 static int
196 fd_last_set(filedesc_t *fd, int last)
197 {
198 int off, i;
199 fdfile_t **ofiles = fd->fd_ofiles;
200 uint32_t *bitmap = fd->fd_lomap;
201
202 KASSERT(mutex_owned(&fd->fd_lock));
203
204 off = (last - 1) >> NDENTRYSHIFT;
205
206 while (off >= 0 && !bitmap[off])
207 off--;
208
209 if (off < 0)
210 return (-1);
211
212 i = ((off + 1) << NDENTRYSHIFT) - 1;
213 if (i >= last)
214 i = last - 1;
215
216 /* XXX should use bitmap */
217 /* XXXAD does not work for fd_copy() */
218 while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
219 i--;
220
221 return (i);
222 }
223
/*
 * Mark descriptor `fd' as in use: set the fdfile's allocated flag,
 * update the two-level allocation bitmaps and the fd_lastfile
 * high-water mark.  Caller holds fd_lock; the slot must currently
 * be free with an fdfile_t already installed.
 */
void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = 1;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	/* Low-map word now full: set the corresponding high-map bit. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	/*
	 * Descriptors below NDFDFILE are built into the filedesc
	 * (fd_dfdfile[]) and are not counted in fd_nused.
	 */
	if (fd >= NDFDFILE) {
		fdp->fd_nused++;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
256
/*
 * Mark descriptor `fd' as free again: clear the allocation bitmaps,
 * pull back fd_freefile and fd_lastfile, and decrement fd_nused for
 * dynamically allocated slots.  Inverse of fd_used().
 */
void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	/* Remember the lowest free descriptor for fd_alloc(). */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* Low-map word was full: clear the corresponding high-map bit. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = 0;

	/* If this was the highest open descriptor, find the new maximum. */
	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}

	/* Built-in slots (fd < NDFDFILE) are not counted in fd_nused. */
	if (fd >= NDFDFILE) {
		KASSERT(fdp->fd_nused > 0);
		fdp->fd_nused--;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
302
303 /*
304 * Custom version of fd_unused() for fd_copy(), where the descriptor
305 * table is not yet fully initialized.
306 */
307 static inline void
308 fd_zap(filedesc_t *fdp, unsigned fd)
309 {
310 u_int off = fd >> NDENTRYSHIFT;
311
312 if (fd < fdp->fd_freefile) {
313 fdp->fd_freefile = fd;
314 }
315
316 if (fdp->fd_lomap[off] == ~0) {
317 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
318 (1 << (off & NDENTRYMASK))) != 0);
319 fdp->fd_himap[off >> NDENTRYSHIFT] &=
320 ~(1 << (off & NDENTRYMASK));
321 }
322 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
323 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
324 }
325
326 bool
327 fd_isused(filedesc_t *fdp, unsigned fd)
328 {
329 u_int off = fd >> NDENTRYSHIFT;
330
331 KASSERT(fd < fdp->fd_nfiles);
332
333 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
334 }
335
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 *
 * Runs lockless against the current LWP's own table; correctness
 * relies on the memory barriers noted below pairing with those in
 * fd_tryexpand() and fd_close().  Returns NULL if the descriptor
 * is out of range, not open, or being closed.
 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.  Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}
386
/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 *
 * The common case is a lock-free decrement via compare-and-swap; if
 * another thread has marked the descriptor FR_CLOSING, we fall
 * through to fd_close() to help drain it.
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			/* CAS succeeded: reference dropped. */
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
		/* Lost a benign race; retry with the observed value. */
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
434
435 /*
436 * Convenience wrapper around fd_getfile() that returns reference
437 * to a vnode.
438 */
439 int
440 fd_getvnode(unsigned fd, file_t **fpp)
441 {
442 vnode_t *vp;
443 file_t *fp;
444
445 fp = fd_getfile(fd);
446 if (__predict_false(fp == NULL)) {
447 return EBADF;
448 }
449 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
450 fd_putfile(fd);
451 return EINVAL;
452 }
453 vp = fp->f_data;
454 if (__predict_false(vp->v_type == VBAD)) {
455 /* XXX Is this case really necessary? */
456 fd_putfile(fd);
457 return EBADF;
458 }
459 *fpp = fp;
460 return 0;
461 }
462
463 /*
464 * Convenience wrapper around fd_getfile() that returns reference
465 * to a socket.
466 */
467 int
468 fd_getsock(unsigned fd, struct socket **sop)
469 {
470 file_t *fp;
471
472 fp = fd_getfile(fd);
473 if (__predict_false(fp == NULL)) {
474 return EBADF;
475 }
476 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
477 fd_putfile(fd);
478 return ENOTSOCK;
479 }
480 *sop = fp->f_data;
481 return 0;
482 }
483
484 /*
485 * Look up the file structure corresponding to a file descriptor
486 * and return it with a reference held on the file, not the
487 * descriptor.
488 *
489 * This is heavyweight and only used when accessing descriptors
490 * from a foreign process. The caller must ensure that `p' does
491 * not exit or fork across this call.
492 *
493 * To release the file (not descriptor) reference, use closef().
494 */
495 file_t *
496 fd_getfile2(proc_t *p, unsigned fd)
497 {
498 filedesc_t *fdp;
499 fdfile_t *ff;
500 file_t *fp;
501
502 fdp = p->p_fd;
503 mutex_enter(&fdp->fd_lock);
504 if (fd > fdp->fd_nfiles) {
505 mutex_exit(&fdp->fd_lock);
506 return NULL;
507 }
508 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
509 mutex_exit(&fdp->fd_lock);
510 return NULL;
511 }
512 mutex_enter(&ff->ff_lock);
513 if ((fp = ff->ff_file) == NULL) {
514 mutex_exit(&ff->ff_lock);
515 mutex_exit(&fdp->fd_lock);
516 return NULL;
517 }
518 mutex_enter(&fp->f_lock);
519 fp->f_count++;
520 mutex_exit(&fp->f_lock);
521 mutex_exit(&ff->ff_lock);
522 mutex_exit(&fdp->fd_lock);
523
524 return fp;
525 }
526
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = 0;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	/* Make the cleared ff_file visible before the count drops. */
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 *
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}


	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
649
650 /*
651 * Duplicate a file descriptor.
652 */
653 int
654 fd_dup(file_t *fp, int minfd, int *newp, int exclose)
655 {
656 proc_t *p;
657 int error;
658
659 p = curproc;
660
661 while ((error = fd_alloc(p, minfd, newp)) != 0) {
662 if (error != ENOSPC) {
663 return error;
664 }
665 fd_tryexpand(p);
666 }
667
668 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
669 fd_affix(p, fp, *newp);
670 return 0;
671 }
672
/*
 * dup2 operation: install `fp' at the specific descriptor number
 * `new', closing whatever was open there first.
 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		/* fd_close()/fd_getfile() take their own locks. */
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	/* Install the pre-allocated fdfile_t if the slot has none. */
	if (fdp->fd_ofiles[new] == NULL) {
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	/* Release the spare fdfile_t if it was not needed. */
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
724
725 /*
726 * Drop reference to a file structure.
727 */
728 int
729 closef(file_t *fp)
730 {
731 struct flock lf;
732 int error;
733
734 /*
735 * Drop reference. If referenced elsewhere it's still open
736 * and we have nothing more to do.
737 */
738 mutex_enter(&fp->f_lock);
739 KASSERT(fp->f_count > 0);
740 if (--fp->f_count > 0) {
741 mutex_exit(&fp->f_lock);
742 return 0;
743 }
744 KASSERT(fp->f_count == 0);
745 mutex_exit(&fp->f_lock);
746
747 /* We held the last reference - release locks, close and free. */
748 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
749 lf.l_whence = SEEK_SET;
750 lf.l_start = 0;
751 lf.l_len = 0;
752 lf.l_type = F_UNLCK;
753 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
754 }
755 if (fp->f_ops != NULL) {
756 error = (*fp->f_ops->fo_close)(fp);
757 } else {
758 error = 0;
759 }
760 ffree(fp);
761
762 return error;
763 }
764
/*
 * Allocate a file descriptor for the process, numbered at least
 * `want'.  Uses the two-level bitmap (fd_himap selects a word of
 * fd_lomap; fd_lomap bits mark individual descriptors) to find a
 * free slot.  Returns 0 with the slot in *result, EMFILE if the
 * limit is reached, or ENOSPC if the caller should expand the
 * table via fd_tryexpand() and retry.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	/* Pre-allocate an fdfile_t in case the chosen slot has none. */
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		off = i >> NDENTRYSHIFT;
		/* Find a high-map word with at least one free low-map bit. */
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		/* Then find the free bit within that low-map word. */
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		/* Install the pre-allocated fdfile_t if the slot is bare. */
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
837
/*
 * Expand a process' descriptor table.  Allocates the new arrays
 * without the lock held, then swaps them in; if the table changed
 * size while we were allocating, the new arrays are discarded and
 * the caller must retry.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	/* Double the table size, starting from NDEXTENT. */
	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	/* Allocate new arrays before taking the lock (may sleep). */
	newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		newhimap = malloc(NDHISLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
		newlomap = malloc(NDLOSLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		free(newofile, M_FILEDESC);
		if (newhimap != NULL)
			free(newhimap, M_FILEDESC);
		if (newlomap != NULL)
			free(newlomap, M_FILEDESC);
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			/* Chain onto fd_discard via the array's first slot. */
			*(void **)fdp->fd_ofiles = fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles;
		} else {
			free(fdp->fd_ofiles, M_FILEDESC);
		}
	}

	/* Grow the bitmaps if the number of high-map slots increased. */
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* The built-in bitmaps (table <= NDFILE) are never freed. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			free(fdp->fd_himap, M_FILEDESC);
			free(fdp->fd_lomap, M_FILEDESC);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
931
932 /*
933 * Create a new open file structure and allocate a file descriptor
934 * for the current process.
935 */
936 int
937 fd_allocfile(file_t **resultfp, int *resultfd)
938 {
939 file_t *fp;
940 proc_t *p;
941 int error;
942
943 p = curproc;
944
945 while ((error = fd_alloc(p, 0, resultfd)) != 0) {
946 if (error != ENOSPC) {
947 return error;
948 }
949 fd_tryexpand(p);
950 }
951
952 fp = pool_cache_get(file_cache, PR_WAITOK);
953 KASSERT(fp->f_count == 0);
954 fp->f_cred = kauth_cred_get();
955 kauth_cred_hold(fp->f_cred);
956
957 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
958 fd_abort(p, fp, *resultfd);
959 tablefull("file", "increase kern.maxfiles or MAXFILES");
960 return ENFILE;
961 }
962
963 fp->f_advice = 0;
964 fp->f_msgcount = 0;
965 fp->f_offset = 0;
966 fp->f_iflags = 0;
967 *resultfp = fp;
968
969 return 0;
970 }
971
/*
 * Successful creation of a new descriptor: make visible to the process.
 * Adds a file reference and stores `fp' into the slot previously
 * allocated by fd_alloc()/fd_allocfile().
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}
1009
1010 /*
1011 * Abort creation of a new descriptor: free descriptor slot and file.
1012 */
1013 void
1014 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1015 {
1016 filedesc_t *fdp;
1017 fdfile_t *ff;
1018
1019 KASSERT(p == curproc || p == &proc0);
1020
1021 fdp = p->p_fd;
1022 ff = fdp->fd_ofiles[fd];
1023
1024 KASSERT(fd >= NDFDFILE ||
1025 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1026
1027 mutex_enter(&fdp->fd_lock);
1028 KASSERT(fd_isused(fdp, fd));
1029 fd_unused(fdp, fd);
1030 mutex_exit(&fdp->fd_lock);
1031
1032 if (fp != NULL) {
1033 ffree(fp);
1034 }
1035 }
1036
1037 /*
1038 * Free a file descriptor.
1039 */
1040 void
1041 ffree(file_t *fp)
1042 {
1043
1044 KASSERT(fp->f_count == 0);
1045
1046 atomic_dec_uint(&nfiles);
1047 kauth_cred_free(fp->f_cred);
1048 pool_cache_put(file_cache, fp);
1049 }
1050
1051 /*
1052 * Create an initial cwdinfo structure, using the same current and root
1053 * directories as curproc.
1054 */
1055 struct cwdinfo *
1056 cwdinit(void)
1057 {
1058 struct cwdinfo *cwdi;
1059 struct cwdinfo *copy;
1060
1061 cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
1062 copy = curproc->p_cwdi;
1063
1064 rw_enter(©->cwdi_lock, RW_READER);
1065 cwdi->cwdi_cdir = copy->cwdi_cdir;
1066 if (cwdi->cwdi_cdir)
1067 VREF(cwdi->cwdi_cdir);
1068 cwdi->cwdi_rdir = copy->cwdi_rdir;
1069 if (cwdi->cwdi_rdir)
1070 VREF(cwdi->cwdi_rdir);
1071 cwdi->cwdi_edir = copy->cwdi_edir;
1072 if (cwdi->cwdi_edir)
1073 VREF(cwdi->cwdi_edir);
1074 cwdi->cwdi_cmask = copy->cwdi_cmask;
1075 cwdi->cwdi_refcnt = 1;
1076 rw_exit(©->cwdi_lock);
1077
1078 return (cwdi);
1079 }
1080
1081 static int
1082 cwdi_ctor(void *arg, void *obj, int flags)
1083 {
1084 struct cwdinfo *cwdi = obj;
1085
1086 rw_init(&cwdi->cwdi_lock);
1087
1088 return 0;
1089 }
1090
1091 static void
1092 cwdi_dtor(void *arg, void *obj)
1093 {
1094 struct cwdinfo *cwdi = obj;
1095
1096 rw_destroy(&cwdi->cwdi_lock);
1097 }
1098
1099 static int
1100 file_ctor(void *arg, void *obj, int flags)
1101 {
1102 file_t *fp = obj;
1103
1104 memset(fp, 0, sizeof(*fp));
1105 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1106
1107 mutex_enter(&filelist_lock);
1108 LIST_INSERT_HEAD(&filehead, fp, f_list);
1109 mutex_exit(&filelist_lock);
1110
1111 return 0;
1112 }
1113
1114 static void
1115 file_dtor(void *arg, void *obj)
1116 {
1117 file_t *fp = obj;
1118
1119 mutex_enter(&filelist_lock);
1120 LIST_REMOVE(fp, f_list);
1121 mutex_exit(&filelist_lock);
1122
1123 mutex_destroy(&fp->f_lock);
1124 }
1125
1126 static int
1127 fdfile_ctor(void *arg, void *obj, int flags)
1128 {
1129 fdfile_t *ff = obj;
1130
1131 memset(ff, 0, sizeof(*ff));
1132 mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
1133 cv_init(&ff->ff_closing, "fdclose");
1134
1135 return 0;
1136 }
1137
1138 static void
1139 fdfile_dtor(void *arg, void *obj)
1140 {
1141 fdfile_t *ff = obj;
1142
1143 mutex_destroy(&ff->ff_lock);
1144 cv_destroy(&ff->ff_closing);
1145 }
1146
1147 file_t *
1148 fgetdummy(void)
1149 {
1150 file_t *fp;
1151
1152 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1153 if (fp != NULL) {
1154 memset(fp, 0, sizeof(*fp));
1155 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1156 }
1157 return fp;
1158 }
1159
1160 void
1161 fputdummy(file_t *fp)
1162 {
1163
1164 mutex_destroy(&fp->f_lock);
1165 kmem_free(fp, sizeof(*fp));
1166 }
1167
1168 /*
1169 * Make p2 share p1's cwdinfo.
1170 */
1171 void
1172 cwdshare(struct proc *p2)
1173 {
1174 struct cwdinfo *cwdi;
1175
1176 cwdi = curproc->p_cwdi;
1177
1178 atomic_inc_uint(&cwdi->cwdi_refcnt);
1179 p2->p_cwdi = cwdi;
1180 }
1181
1182 /*
1183 * Release a cwdinfo structure.
1184 */
1185 void
1186 cwdfree(struct cwdinfo *cwdi)
1187 {
1188
1189 if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1190 return;
1191
1192 vrele(cwdi->cwdi_cdir);
1193 if (cwdi->cwdi_rdir)
1194 vrele(cwdi->cwdi_rdir);
1195 if (cwdi->cwdi_edir)
1196 vrele(cwdi->cwdi_edir);
1197 pool_cache_put(cwdi_cache, cwdi);
1198 }
1199
/*
 * Create an initial filedesc structure.
 *
 * If 'fdp' is NULL, a new structure is taken from the cache (already
 * constructed by filedesc_ctor()); otherwise the caller-supplied
 * memory is constructed in place.  Returns the table with a single
 * reference, pointing at its embedded descriptor arrays and bitmaps.
 */
filedesc_t *
fd_init(filedesc_t *fdp)
{
	unsigned fd;

	if (fdp == NULL) {
		fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
	} else {
		filedesc_ctor(NULL, fdp, PR_WAITOK);
	}

	fdp->fd_refcnt = 1;
	/* Start out using the arrays embedded in the filedesc_t. */
	fdp->fd_ofiles = fdp->fd_dfiles;
	fdp->fd_nfiles = NDFILE;
	fdp->fd_himap = fdp->fd_dhimap;
	fdp->fd_lomap = fdp->fd_dlomap;
	KASSERT(fdp->fd_lastfile == -1);
	KASSERT(fdp->fd_lastkqfile == -1);
	KASSERT(fdp->fd_knhash == NULL);

	/*
	 * Zero everything from the fd_startzero marker to the end of
	 * the structure; fields before the marker keep their values.
	 */
	memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
	    offsetof(filedesc_t, fd_startzero));
	/* Point the first NDFDFILE slots at the built-in fdfile_t's. */
	for (fd = 0; fd < NDFDFILE; fd++) {
		fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
	}

	return fdp;
}
1231
/*
 * Initialize a file descriptor table: pool cache constructor for
 * filedesc_t.  Zeroes the structure, sets up the table lock, and
 * constructs the built-in fdfile_t's embedded in fd_dfdfile[].
 */
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
	filedesc_t *fdp = obj;
	int i;

	memset(fdp, 0, sizeof(*fdp));
	mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
	fdp->fd_lastfile = -1;
	fdp->fd_lastkqfile = -1;

	/* The embedded buffers must be large enough to hold an fdfile_t. */
	KASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
	for (i = 0; i < NDFDFILE; i++) {
		fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
	}

	return 0;
}
1253
/*
 * Pool cache destructor for filedesc_t: tear down the built-in
 * fdfile_t's and the table lock.
 */
static void
filedesc_dtor(void *arg, void *obj)
{
	filedesc_t *fdp = obj;
	int i;

	for (i = 0; i < NDFDFILE; i++) {
		fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
	}

	mutex_destroy(&fdp->fd_lock);
}
1266
1267 /*
1268 * Make p2 share p1's filedesc structure.
1269 */
1270 void
1271 fd_share(struct proc *p2)
1272 {
1273 filedesc_t *fdp;
1274
1275 fdp = curlwp->l_fd;
1276 p2->p_fd = fdp;
1277 atomic_inc_uint(&fdp->fd_refcnt);
1278 }
1279
1280 /*
1281 * Copy a filedesc structure.
1282 */
1283 filedesc_t *
1284 fd_copy(void)
1285 {
1286 filedesc_t *newfdp, *fdp;
1287 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1288 int i, nused, numfiles, lastfile, j, newlast;
1289 file_t *fp;
1290
1291 fdp = curproc->p_fd;
1292 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1293 newfdp->fd_refcnt = 1;
1294
1295 KASSERT(newfdp->fd_knhash == NULL);
1296 KASSERT(newfdp->fd_knhashmask == 0);
1297 KASSERT(newfdp->fd_discard == NULL);
1298
1299 for (;;) {
1300 numfiles = fdp->fd_nfiles;
1301 lastfile = fdp->fd_lastfile;
1302
1303 /*
1304 * If the number of open files fits in the internal arrays
1305 * of the open file structure, use them, otherwise allocate
1306 * additional memory for the number of descriptors currently
1307 * in use.
1308 */
1309 if (lastfile < NDFILE) {
1310 i = NDFILE;
1311 newfdp->fd_ofiles = newfdp->fd_dfiles;
1312 } else {
1313 /*
1314 * Compute the smallest multiple of NDEXTENT needed
1315 * for the file descriptors currently in use,
1316 * allowing the table to shrink.
1317 */
1318 i = numfiles;
1319 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1320 i /= 2;
1321 }
1322 newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *),
1323 M_FILEDESC, M_WAITOK);
1324 KASSERT(i >= NDFILE);
1325 }
1326 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1327 newfdp->fd_himap = newfdp->fd_dhimap;
1328 newfdp->fd_lomap = newfdp->fd_dlomap;
1329 } else {
1330 newfdp->fd_himap = malloc(NDHISLOTS(i) *
1331 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1332 newfdp->fd_lomap = malloc(NDLOSLOTS(i) *
1333 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1334 }
1335
1336 /*
1337 * Allocate and string together fdfile structures.
1338 * We abuse fdfile_t::ff_file here, but it will be
1339 * cleared before this routine returns.
1340 */
1341 nused = fdp->fd_nused;
1342 fflist = NULL;
1343 for (j = nused; j != 0; j--) {
1344 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1345 ff->ff_file = (void *)fflist;
1346 fflist = ff;
1347 }
1348
1349 mutex_enter(&fdp->fd_lock);
1350 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1351 lastfile == fdp->fd_lastfile) {
1352 break;
1353 }
1354 mutex_exit(&fdp->fd_lock);
1355 if (i >= NDFILE) {
1356 free(newfdp->fd_ofiles, M_FILEDESC);
1357 }
1358 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1359 free(newfdp->fd_himap, M_FILEDESC);
1360 free(newfdp->fd_lomap, M_FILEDESC);
1361 }
1362 while (fflist != NULL) {
1363 ff = fflist;
1364 fflist = (void *)ff->ff_file;
1365 ff->ff_file = NULL;
1366 pool_cache_put(fdfile_cache, ff);
1367 }
1368 }
1369
1370 newfdp->fd_nfiles = i;
1371 newfdp->fd_freefile = fdp->fd_freefile;
1372 newfdp->fd_exclose = fdp->fd_exclose;
1373
1374 /*
1375 * Clear the entries that will not be copied over.
1376 * Avoid calling memset with 0 size.
1377 */
1378 if (lastfile < (i-1)) {
1379 memset(newfdp->fd_ofiles + lastfile + 1, 0,
1380 (i - lastfile - 1) * sizeof(file_t **));
1381 }
1382 if (i < NDENTRIES * NDENTRIES) {
1383 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1384 }
1385 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1386 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1387
1388 ffp = fdp->fd_ofiles;
1389 nffp = newfdp->fd_ofiles;
1390 j = imax(lastfile, (NDFDFILE - 1));
1391 newlast = -1;
1392 KASSERT(j < fdp->fd_nfiles);
1393 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1394 ff = *ffp;
1395 /* Install built-in fdfiles even if unused here. */
1396 if (i < NDFDFILE) {
1397 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1398 } else {
1399 ff2 = NULL;
1400 }
1401 /* Determine if descriptor is active in parent. */
1402 if (ff == NULL || !fd_isused(fdp, i)) {
1403 KASSERT(ff != NULL || i >= NDFDFILE);
1404 continue;
1405 }
1406 mutex_enter(&ff->ff_lock);
1407 fp = ff->ff_file;
1408 if (fp == NULL) {
1409 /* Descriptor is half-open: free slot. */
1410 fd_zap(newfdp, i);
1411 mutex_exit(&ff->ff_lock);
1412 continue;
1413 }
1414 if (fp->f_type == DTYPE_KQUEUE) {
1415 /* kqueue descriptors cannot be copied. */
1416 fd_zap(newfdp, i);
1417 mutex_exit(&ff->ff_lock);
1418 continue;
1419 }
1420 /* It's active: add a reference to the file. */
1421 mutex_enter(&fp->f_lock);
1422 fp->f_count++;
1423 mutex_exit(&fp->f_lock);
1424 /* Consume one fdfile_t to represent it. */
1425 if (i >= NDFDFILE) {
1426 ff2 = fflist;
1427 fflist = (void *)ff2->ff_file;
1428 }
1429 ff2->ff_file = fp;
1430 ff2->ff_exclose = ff->ff_exclose;
1431 ff2->ff_allocated = 1;
1432 mutex_exit(&ff->ff_lock);
1433 if (i > newlast) {
1434 newlast = i;
1435 }
1436 }
1437 mutex_exit(&fdp->fd_lock);
1438
1439 /* Discard unused fdfile_t structures. */
1440 while (__predict_false(fflist != NULL)) {
1441 ff = fflist;
1442 fflist = (void *)ff->ff_file;
1443 ff->ff_file = NULL;
1444 pool_cache_put(fdfile_cache, ff);
1445 nused--;
1446 }
1447 KASSERT(nused >= 0);
1448 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1449
1450 newfdp->fd_nused = nused;
1451 newfdp->fd_lastfile = newlast;
1452
1453 return (newfdp);
1454 }
1455
1456 /*
1457 * Release a filedesc structure.
1458 */
1459 void
1460 fd_free(void)
1461 {
1462 filedesc_t *fdp;
1463 fdfile_t *ff;
1464 file_t *fp;
1465 int fd, lastfd;
1466 void *discard;
1467
1468 fdp = curlwp->l_fd;
1469
1470 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1471
1472 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1473 return;
1474
1475 /*
1476 * Close any files that the process holds open.
1477 */
1478 for (fd = 0, lastfd = fdp->fd_lastfile; fd <= lastfd; fd++) {
1479 ff = fdp->fd_ofiles[fd];
1480 KASSERT(fd >= NDFDFILE ||
1481 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1482 if ((ff = fdp->fd_ofiles[fd]) == NULL)
1483 continue;
1484 if ((fp = ff->ff_file) != NULL) {
1485 /*
1486 * Must use fd_close() here as kqueue holds
1487 * long term references to descriptors.
1488 */
1489 ff->ff_refcnt++;
1490 fd_close(fd);
1491 }
1492 KASSERT(ff->ff_refcnt == 0);
1493 KASSERT(ff->ff_file == NULL);
1494 KASSERT(!ff->ff_exclose);
1495 KASSERT(!ff->ff_allocated);
1496 if (fd >= NDFDFILE) {
1497 pool_cache_put(fdfile_cache, ff);
1498 }
1499 }
1500
1501 /*
1502 * Clean out the descriptor table for the next user and return
1503 * to the cache.
1504 */
1505 while ((discard = fdp->fd_discard) != NULL) {
1506 KASSERT(discard != fdp->fd_ofiles);
1507 fdp->fd_discard = *(void **)discard;
1508 free(discard, M_FILEDESC);
1509 }
1510 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1511 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1512 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1513 free(fdp->fd_himap, M_FILEDESC);
1514 free(fdp->fd_lomap, M_FILEDESC);
1515 }
1516 if (fdp->fd_nfiles > NDFILE) {
1517 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1518 free(fdp->fd_ofiles, M_FILEDESC);
1519 }
1520 if (fdp->fd_knhash != NULL) {
1521 hashdone(fdp->fd_knhash, M_KEVENT);
1522 fdp->fd_knhash = NULL;
1523 fdp->fd_knhashmask = 0;
1524 } else {
1525 KASSERT(fdp->fd_knhashmask == 0);
1526 }
1527 fdp->fd_lastkqfile = -1;
1528 pool_cache_put(filedesc_cache, fdp);
1529 }
1530
/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 */
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
{

	/*
	 * XXX Kludge: set dupfd to contain the value of the file
	 * descriptor being sought for duplication.  The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen below.  Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	l->l_dupfd = minor(dev);	/* XXX */
	return EDUPFD;
}
1554
1555 /*
1556 * Duplicate the specified descriptor to a free descriptor.
1557 */
1558 int
1559 fd_dupopen(int old, int *new, int mode, int error)
1560 {
1561 filedesc_t *fdp;
1562 fdfile_t *ff;
1563 file_t *fp;
1564
1565 if ((fp = fd_getfile(old)) == NULL) {
1566 return EBADF;
1567 }
1568 fdp = curlwp->l_fd;
1569 ff = fdp->fd_ofiles[old];
1570
1571 /*
1572 * There are two cases of interest here.
1573 *
1574 * For EDUPFD simply dup (dfd) to file descriptor
1575 * (indx) and return.
1576 *
1577 * For EMOVEFD steal away the file structure from (dfd) and
1578 * store it in (indx). (dfd) is effectively closed by
1579 * this operation.
1580 *
1581 * Any other error code is just returned.
1582 */
1583 switch (error) {
1584 case EDUPFD:
1585 /*
1586 * Check that the mode the file is being opened for is a
1587 * subset of the mode of the existing descriptor.
1588 */
1589 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1590 error = EACCES;
1591 break;
1592 }
1593
1594 /* Copy it. */
1595 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1596 break;
1597
1598 case EMOVEFD:
1599 /* Copy it. */
1600 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1601 if (error != 0) {
1602 break;
1603 }
1604
1605 /* Steal away the file pointer from 'old'. */
1606 (void)fd_close(old);
1607 return 0;
1608 }
1609
1610 fd_putfile(old);
1611 return error;
1612 }
1613
/*
 * Close open files on exec.
 *
 * Also gives the new image private copies of the cwdinfo and filedesc
 * structures if they were shared with another process, then closes
 * every descriptor marked close-on-exec.
 */
void
fd_closeexec(void)
{
	struct cwdinfo *cwdi;
	proc_t *p;
	filedesc_t *fdp;
	fdfile_t *ff;
	lwp_t *l;
	int fd;

	l = curlwp;
	p = l->l_proc;
	fdp = p->p_fd;
	cwdi = p->p_cwdi;

	/* Unshare the cwdinfo if another process also references it. */
	if (cwdi->cwdi_refcnt > 1) {
		cwdi = cwdinit();
		cwdfree(p->p_cwdi);
		p->p_cwdi = cwdi;
	}
	/*
	 * Release the emulation root directory, if any.
	 * NOTE(review): cwdi_edir is not reset to NULL after the
	 * vrele() -- this appears to rely on the exec path installing
	 * a fresh value; confirm no path leaves a dangling pointer
	 * that cwdfree() would vrele() a second time.
	 */
	if (p->p_cwdi->cwdi_edir) {
		vrele(p->p_cwdi->cwdi_edir);
	}

	/* Unshare the descriptor table if it is shared. */
	if (fdp->fd_refcnt > 1) {
		fdp = fd_copy();
		fd_free();
		p->p_fd = fdp;
		l->l_fd = fdp;
	}
	/* If no descriptor is marked close-on-exec, nothing to do. */
	if (!fdp->fd_exclose) {
		return;
	}
	fdp->fd_exclose = 0;

	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
		if ((ff = fdp->fd_ofiles[fd]) == NULL) {
			/* Only slots beyond the built-ins can be NULL. */
			KASSERT(fd >= NDFDFILE);
			continue;
		}
		KASSERT(fd >= NDFDFILE ||
		    ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
		if (ff->ff_file == NULL)
			continue;
		if (ff->ff_exclose) {
			/*
			 * We need a reference to close the file.
			 * No other threads can see the fdfile_t at
			 * this point, so don't bother locking.
			 */
			KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
			ff->ff_refcnt++;
			fd_close(fd);
		}
	}
}
1673
/*
 * It is unsafe for set[ug]id processes to be started with file
 * descriptors 0..2 closed, as these descriptors are given implicit
 * significance in the Standard C library.  fdcheckstd() will create a
 * descriptor referencing /dev/null for each of stdin, stdout, and
 * stderr that is not already open.
 */
#define CHECK_UPTO 3
int
fd_checkstd(void)
{
	struct proc *p;
	struct nameidata nd;
	filedesc_t *fdp;
	file_t *fp;
	struct proc *pp;
	int fd, i, error, flags = FREAD|FWRITE;
	/* 'closed' collects ",N" entries for the warning message. */
	char closed[CHECK_UPTO * 3 + 1], which[3 + 1];

	p = curproc;
	closed[0] = '\0';
	if ((fdp = p->p_fd) == NULL)
		return (0);
	for (i = 0; i < CHECK_UPTO; i++) {
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		if (fdp->fd_ofiles[i]->ff_file != NULL)
			continue;
		/* Record which descriptor was found closed. */
		snprintf(which, sizeof(which), ",%d", i);
		strlcat(closed, which, sizeof(closed));
		/* Open /dev/null on a free descriptor below CHECK_UPTO. */
		if ((error = fd_allocfile(&fp, &fd)) != 0)
			return (error);
		KASSERT(fd < CHECK_UPTO);
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
		if ((error = vn_open(&nd, flags, 0)) != 0) {
			/* Undo the descriptor allocation on failure. */
			fd_abort(p, fp, fd);
			return (error);
		}
		fp->f_data = nd.ni_vp;
		fp->f_flag = flags;
		fp->f_ops = &vnops;
		fp->f_type = DTYPE_VNODE;
		VOP_UNLOCK(nd.ni_vp, 0);
		fd_affix(p, fp, fd);
	}
	if (closed[0] != '\0') {
		/* Log a warning identifying the invoking parent. */
		mutex_enter(&proclist_lock);
		pp = p->p_pptr;
		mutex_enter(&pp->p_mutex);
		log(LOG_WARNING, "set{u,g}id pid %d (%s) "
		    "was invoked by uid %d ppid %d (%s) "
		    "with fd %s closed\n",
		    p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
		    pp->p_pid, pp->p_comm, &closed[1]);
		mutex_exit(&pp->p_mutex);
		mutex_exit(&proclist_lock);
	}
	return (0);
}
#undef CHECK_UPTO
1734
1735 /*
1736 * Sets descriptor owner. If the owner is a process, 'pgid'
1737 * is set to positive value, process ID. If the owner is process group,
1738 * 'pgid' is set to -pg_id.
1739 */
1740 int
1741 fsetown(pid_t *pgid, int cmd, const void *data)
1742 {
1743 int id = *(const int *)data;
1744 int error;
1745
1746 switch (cmd) {
1747 case TIOCSPGRP:
1748 if (id < 0)
1749 return (EINVAL);
1750 id = -id;
1751 break;
1752 default:
1753 break;
1754 }
1755
1756 if (id > 0 && !pfind(id))
1757 return (ESRCH);
1758 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1759 return (error);
1760
1761 *pgid = id;
1762 return (0);
1763 }
1764
/*
 * Return descriptor owner information.  If the value is positive,
 * it's process ID.  If it's negative, it's process group ID and
 * needs the sign removed before use.
 */
int
fgetown(pid_t pgid, int cmd, void *data)
{

	/* TIOCGPGRP wants the group ID with the sign flipped back. */
	if (cmd == TIOCGPGRP)
		*(int *)data = -pgid;
	else
		*(int *)data = pgid;
	return (0);
}
1784
1785 /*
1786 * Send signal to descriptor owner, either process or process group.
1787 */
1788 void
1789 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1790 {
1791 struct proc *p1;
1792 struct pgrp *pgrp;
1793 ksiginfo_t ksi;
1794
1795 KSI_INIT(&ksi);
1796 ksi.ksi_signo = signo;
1797 ksi.ksi_code = code;
1798 ksi.ksi_band = band;
1799
1800 /*
1801 * Since we may be called from an interrupt context, we must use
1802 * the proclist_mutex.
1803 */
1804 mutex_enter(&proclist_mutex);
1805 if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1806 kpsignal(p1, &ksi, fdescdata);
1807 else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1808 kpgsignal(pgrp, &ksi, fdescdata, 0);
1809 mutex_exit(&proclist_mutex);
1810 }
1811
/*
 * Set up a file for a "cloning" device open: install the flags, type,
 * ops and private data, affix the file to descriptor 'fd', and return
 * EMOVEFD so the open code moves the descriptor into place (see
 * fd_dupopen()).
 */
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
    void *data)
{

	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}
1826
1827 int
1828 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1829 {
1830
1831 if (cmd == F_SETFL)
1832 return 0;
1833
1834 return EOPNOTSUPP;
1835 }
1836
/*
 * Default poll routine: always returns 0 (no conditions pending).
 */
int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}
1843
/*
 * Default kqfilter routine: returns success (0) without doing
 * anything with the knote.
 */
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return 0;
}
1850
/*
 * Stub read routine for files that do not support reading:
 * always fails with EOPNOTSUPP.
 */
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1858
/*
 * Stub write routine for files that do not support writing:
 * always fails with EOPNOTSUPP.
 */
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1866
/*
 * Stub ioctl routine for files that do not support ioctl:
 * always fails with EOPNOTSUPP.
 */
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}
1873
/*
 * Stub stat routine for files that do not support stat:
 * always fails with EOPNOTSUPP.
 */
int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}
1880
/*
 * Stub close routine for files that do not support close:
 * always fails with EOPNOTSUPP.
 */
int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
1887