kern_descrip.c revision 1.177 1 /* $NetBSD: kern_descrip.c,v 1.177 2008/04/24 18:39:23 ad Exp $ */
2
3 /*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the NetBSD
18 * Foundation, Inc. and its contributors.
19 * 4. Neither the name of The NetBSD Foundation nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
27 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 */
35
36 /*
37 * Copyright (c) 1982, 1986, 1989, 1991, 1993
38 * The Regents of the University of California. All rights reserved.
39 * (c) UNIX System Laboratories, Inc.
40 * All or some portions of this file are derived from material licensed
41 * to the University of California by American Telephone and Telegraph
42 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
43 * the permission of UNIX System Laboratories, Inc.
44 *
45 * Redistribution and use in source and binary forms, with or without
46 * modification, are permitted provided that the following conditions
47 * are met:
48 * 1. Redistributions of source code must retain the above copyright
49 * notice, this list of conditions and the following disclaimer.
50 * 2. Redistributions in binary form must reproduce the above copyright
51 * notice, this list of conditions and the following disclaimer in the
52 * documentation and/or other materials provided with the distribution.
53 * 3. Neither the name of the University nor the names of its contributors
54 * may be used to endorse or promote products derived from this software
55 * without specific prior written permission.
56 *
57 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67 * SUCH DAMAGE.
68 *
69 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
70 */
71
72 /*
73 * File descriptor management.
74 */
75
76 #include <sys/cdefs.h>
77 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.177 2008/04/24 18:39:23 ad Exp $");
78
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/filedesc.h>
82 #include <sys/kernel.h>
83 #include <sys/vnode.h>
84 #include <sys/proc.h>
85 #include <sys/file.h>
86 #include <sys/namei.h>
87 #include <sys/socket.h>
88 #include <sys/socketvar.h>
89 #include <sys/stat.h>
90 #include <sys/ioctl.h>
91 #include <sys/fcntl.h>
92 #include <sys/malloc.h>
93 #include <sys/pool.h>
94 #include <sys/syslog.h>
95 #include <sys/unistd.h>
96 #include <sys/resourcevar.h>
97 #include <sys/conf.h>
98 #include <sys/event.h>
99 #include <sys/kauth.h>
100 #include <sys/atomic.h>
101 #include <sys/mount.h>
102 #include <sys/syscallargs.h>
103 #include <sys/cpu.h>
104
105 static int cwdi_ctor(void *, void *, int);
106 static void cwdi_dtor(void *, void *);
107 static int file_ctor(void *, void *, int);
108 static void file_dtor(void *, void *);
109 static int fdfile_ctor(void *, void *, int);
110 static void fdfile_dtor(void *, void *);
111 static int filedesc_ctor(void *, void *, int);
112 static void filedesc_dtor(void *, void *);
113 static int filedescopen(dev_t, int, int, lwp_t *);
114
115 kmutex_t filelist_lock; /* lock on filehead */
116 struct filelist filehead; /* head of list of open files */
117 u_int nfiles; /* actual number of open files */
118
119 static pool_cache_t cwdi_cache;
120 static pool_cache_t filedesc_cache;
121 static pool_cache_t file_cache;
122 static pool_cache_t fdfile_cache;
123
124 MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
125
126 const struct cdevsw filedesc_cdevsw = {
127 filedescopen, noclose, noread, nowrite, noioctl,
128 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
129 };
130
131 /* For ease of reading. */
132 __strong_alias(fd_putvnode,fd_putfile)
133 __strong_alias(fd_putsock,fd_putfile)
134
/*
 * Initialize the descriptor system: the global open-file list lock and
 * the pool caches backing file_t, fdfile_t, cwdinfo and filedesc_t
 * allocations.  Called once during boot, before descriptors are used.
 */
void
fd_sys_init(void)
{

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	/* All caches align objects to coherency_unit (cache line size). */
	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	/* fdfile_t is allocated frequently; PR_LARGECACHE enlarges the
	 * per-CPU cache groups for it. */
	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
	    NULL);
	KASSERT(fdfile_cache != NULL);

	cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
	    0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
	KASSERT(cwdi_cache != NULL);

	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);
}
162
/*
 * Find the first clear bit at or after bit `want' in a descriptor
 * allocation bitmap covering `bits' bits.  Returns the bit number,
 * or -1 if no clear bit exists in range.  Called with fd_lock held.
 */
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	if (want > bits)
		return -1;

	/* Word index holding `want', and bit position within that word. */
	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		/*
		 * Partial first word: treat the bits below `want' as
		 * already set so they are skipped by the search.
		 */
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	/* Scan whole words for one that is not completely full. */
	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return (-1);

 found:
	/* ffs() is 1-based, hence the -1 to yield a 0-based bit index. */
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
195
/*
 * Find the highest allocated descriptor number below `last'.
 * Uses the low-level bitmap to skip entirely-empty words, then scans
 * individual slots.  Returns -1 if no descriptor is allocated.
 * Called with fd_lock held.
 */
static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ofiles = fd->fd_ofiles;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	off = (last - 1) >> NDENTRYSHIFT;

	/* Walk down to the highest bitmap word with any bit set. */
	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return (-1);

	/* Start at the top bit of that word, clamped to last - 1. */
	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* XXX should use bitmap */
	/* XXXAD does not work for fd_copy() */
	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
		i--;

	return (i);
}
224
/*
 * Mark descriptor `fd' as allocated: set its bit in the two-level
 * bitmap, flag the fdfile_t, and update fd_lastfile/fd_nused.
 * Called with fd_lock held; the slot's fdfile_t must already be in
 * place with no file attached.
 */
void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = 1;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	if (fdp->fd_lomap[off] == ~0) {
		/* Low-level word is now full: propagate into the high map. */
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	if (fd >= NDFDFILE) {
		/* Only dynamically-created slots are counted in fd_nused. */
		fdp->fd_nused++;
	} else {
		/* Low descriptors use the fdfile_t embedded in filedesc_t. */
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
257
/*
 * Mark descriptor `fd' as free: clear the bitmap bits, update
 * fd_freefile/fd_lastfile and the fd_nused count.  The file must
 * already have been detached from the slot (ff_file == NULL).
 */
void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	/* Track the lowest free descriptor for fd_alloc(). */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* Word was full: clear its summary bit in the high-level map. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = 0;

	/* If this was the highest descriptor, rescan for the new top. */
	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}

	if (fd >= NDFDFILE) {
		KASSERT(fdp->fd_nused > 0);
		fdp->fd_nused--;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
303
/*
 * Custom version of fd_unused() for fd_copy(), where the descriptor
 * table is not yet fully initialized.  Clears only the allocation
 * bitmaps and fd_freefile; it does not touch the fdfile_t or the
 * fd_lastfile/fd_nused accounting.
 */
static inline void
fd_zap(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/* Word was full: clear its summary bit in the high-level map. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
}
326
327 bool
328 fd_isused(filedesc_t *fdp, unsigned fd)
329 {
330 u_int off = fd >> NDENTRYSHIFT;
331
332 KASSERT(fd < fdp->fd_nfiles);
333
334 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
335 }
336
/*
 * Look up the file structure corresponding to a file descriptor
 * and return the file, holding a reference on the descriptor.
 * Returns NULL if the descriptor is out of range, unallocated, or
 * in the process of being closed.  Lock-free fast path; release
 * the reference with fd_putfile().
 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.  Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	/* fd_putfile() also wakes any thread draining references. */
	fd_putfile(fd);
	return NULL;
}
387
/*
 * Release a reference to a file descriptor acquired with fd_getfile().
 * If another thread is waiting in fd_close() for references to drain,
 * the last reference joins the close.
 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			/* CAS succeeded: reference dropped cleanly. */
			return;
		}
		if (__predict_false((v & FR_CLOSING) != 0)) {
			/* Close in progress; stop retrying the CAS. */
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
435
436 /*
437 * Convenience wrapper around fd_getfile() that returns reference
438 * to a vnode.
439 */
440 int
441 fd_getvnode(unsigned fd, file_t **fpp)
442 {
443 vnode_t *vp;
444 file_t *fp;
445
446 fp = fd_getfile(fd);
447 if (__predict_false(fp == NULL)) {
448 return EBADF;
449 }
450 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
451 fd_putfile(fd);
452 return EINVAL;
453 }
454 vp = fp->f_data;
455 if (__predict_false(vp->v_type == VBAD)) {
456 /* XXX Is this case really necessary? */
457 fd_putfile(fd);
458 return EBADF;
459 }
460 *fpp = fp;
461 return 0;
462 }
463
464 /*
465 * Convenience wrapper around fd_getfile() that returns reference
466 * to a socket.
467 */
468 int
469 fd_getsock(unsigned fd, struct socket **sop)
470 {
471 file_t *fp;
472
473 fp = fd_getfile(fd);
474 if (__predict_false(fp == NULL)) {
475 return EBADF;
476 }
477 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
478 fd_putfile(fd);
479 return ENOTSOCK;
480 }
481 *sop = fp->f_data;
482 return 0;
483 }
484
485 /*
486 * Look up the file structure corresponding to a file descriptor
487 * and return it with a reference held on the file, not the
488 * descriptor.
489 *
490 * This is heavyweight and only used when accessing descriptors
491 * from a foreign process. The caller must ensure that `p' does
492 * not exit or fork across this call.
493 *
494 * To release the file (not descriptor) reference, use closef().
495 */
496 file_t *
497 fd_getfile2(proc_t *p, unsigned fd)
498 {
499 filedesc_t *fdp;
500 fdfile_t *ff;
501 file_t *fp;
502
503 fdp = p->p_fd;
504 mutex_enter(&fdp->fd_lock);
505 if (fd > fdp->fd_nfiles) {
506 mutex_exit(&fdp->fd_lock);
507 return NULL;
508 }
509 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
510 mutex_exit(&fdp->fd_lock);
511 return NULL;
512 }
513 mutex_enter(&ff->ff_lock);
514 if ((fp = ff->ff_file) == NULL) {
515 mutex_exit(&ff->ff_lock);
516 mutex_exit(&fdp->fd_lock);
517 return NULL;
518 }
519 mutex_enter(&fp->f_lock);
520 fp->f_count++;
521 mutex_exit(&fp->f_lock);
522 mutex_exit(&ff->ff_lock);
523 mutex_exit(&fdp->fd_lock);
524
525 return fp;
526 }
527
/*
 * Internal form of close.  Must be called with a reference to the
 * descriptor, and will drop the reference.  When all descriptor
 * references are dropped, releases the descriptor slot and a single
 * reference to the file structure.
 *
 * Returns EBADF if another thread already started closing the
 * descriptor; otherwise returns the result of closef().
 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = 0;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			/* Drop ff_lock across knote_fdclose(). */
			mutex_exit(&ff->ff_lock);
			knote_fdclose(fd);
			mutex_enter(&ff->ff_lock);
		}
		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}

	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
650
651 /*
652 * Duplicate a file descriptor.
653 */
654 int
655 fd_dup(file_t *fp, int minfd, int *newp, int exclose)
656 {
657 proc_t *p;
658 int error;
659
660 p = curproc;
661
662 while ((error = fd_alloc(p, minfd, newp)) != 0) {
663 if (error != ENOSPC) {
664 return error;
665 }
666 fd_tryexpand(p);
667 }
668
669 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
670 fd_affix(p, fp, *newp);
671 return 0;
672 }
673
/*
 * dup2 operation: install `fp' at descriptor number `new', closing
 * any file already open there.  The caller supplies the file; the
 * target slot is forced into existence if needed.
 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		/* fd_getfile()/fd_close() must run without fd_lock. */
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	if (fdp->fd_ofiles[new] == NULL) {
		/* Slot never populated: install the preallocated fdfile_t. */
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	if (ff != NULL) {
		/* Preallocated fdfile_t was not needed; return it. */
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
725
/*
 * Drop a reference to a file structure.  When the final reference is
 * dropped: release any BSD-style (flock) lock held via this file,
 * invoke the file ops close routine, and free the file.
 * Returns the error from the close routine, or 0.
 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		/* F_FLOCK: release only BSD-style locks, not POSIX ones. */
		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
	} else {
		/* File was never fully constructed; nothing to close. */
		error = 0;
	}
	ffree(fp);

	return error;
}
765
/*
 * Allocate a file descriptor for the process.  Searches the two-level
 * bitmap for a free slot at or above `want' (or fd_freefile if that
 * is higher).  On success stores the descriptor in *result and returns
 * 0.  Returns ENOSPC when the table must be expanded (caller should
 * call fd_tryexpand() and retry), or EMFILE at the resource limit.
 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	/* Preallocate an fdfile_t; we may not need it (see below). */
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		/* High-level map narrows the search to a 32-slot block. */
		off = i >> NDENTRYSHIFT;
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		if (fdp->fd_ofiles[i] == NULL) {
			/* Slot has no fdfile_t yet: use the preallocation. */
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
838
/*
 * Expand a process' descriptor table.  Allocates a larger fd_ofiles
 * array (and, when needed, larger bitmaps), copies the old contents,
 * and publishes the new array before the new fd_nfiles so unlocked
 * readers in fd_getfile() never see a size larger than the array.
 * If the table changed size while we were allocating, does nothing;
 * the caller is expected to retry.
 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	/* Double the table, starting from NDEXTENT. */
	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	/* Allocate before taking fd_lock; malloc may sleep. */
	newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		newhimap = malloc(NDHISLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
		newlomap = malloc(NDLOSLOTS(numfiles) *
		    sizeof(uint32_t), M_FILEDESC, M_WAITOK);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		free(newofile, M_FILEDESC);
		if (newhimap != NULL)
			free(newhimap, M_FILEDESC);
		if (newlomap != NULL)
			free(newlomap, M_FILEDESC);
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.  The first slot of the old array is reused
	 * as the list link pointer.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			*(void **)fdp->fd_ofiles = fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles;
		} else {
			free(fdp->fd_ofiles, M_FILEDESC);
		}
	}

	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		/* Copy both bitmaps and zero their new tails. */
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		/* Embedded (in-filedesc_t) bitmaps are never freed. */
		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			free(fdp->fd_himap, M_FILEDESC);
			free(fdp->fd_lomap, M_FILEDESC);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
932
/*
 * Create a new open file structure and allocate a file descriptor
 * for the current process.  On success *resultfp and *resultfd are
 * set; the caller must finish with fd_affix() or fd_abort().
 * Returns ENFILE when the system-wide file limit is reached, or the
 * error from fd_alloc().
 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	file_t *fp;
	proc_t *p;
	int error;

	p = curproc;

	/* ENOSPC means the table is full but expandable: grow and retry. */
	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	KASSERT(fp->f_count == 0);
	fp->f_cred = kauth_cred_get();
	kauth_cred_hold(fp->f_cred);

	/* Enforce the global open-file limit; fd_abort() -> ffree()
	 * undoes the nfiles increment. */
	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
		fd_abort(p, fp, *resultfd);
		tablefull("file", "increase kern.maxfiles or MAXFILES");
		return ENFILE;
	}

	/* Reset per-open state; the rest persists from the pool ctor. */
	fp->f_advice = 0;
	fp->f_msgcount = 0;
	fp->f_offset = 0;
	fp->f_iflags = 0;
	*resultfp = fp;

	return 0;
}
972
/*
 * Successful creation of a new descriptor: make visible to the process.
 * Pairs with fd_allocfile(); takes one file reference and installs the
 * file into the preallocated descriptor slot.
 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}
1010
/*
 * Abort creation of a new descriptor: free descriptor slot and file.
 * Pairs with fd_allocfile(); `fp' may be NULL if the caller has
 * already disposed of the file.
 */
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* Release the descriptor slot back to the free pool. */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fd_isused(fdp, fd));
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	if (fp != NULL) {
		ffree(fp);
	}
}
1037
/*
 * Free a file structure that has no remaining references: drop the
 * global open-file count, release the credential reference taken at
 * allocation time, and return the file to its pool cache.
 */
void
ffree(file_t *fp)
{

	KASSERT(fp->f_count == 0);

	atomic_dec_uint(&nfiles);
	kauth_cred_free(fp->f_cred);
	pool_cache_put(file_cache, fp);
}
1051
/*
 * Create an initial cwdinfo structure, using the same current and root
 * directories as curproc.  Each directory vnode that is copied gains a
 * reference.  The new structure starts with a reference count of 1.
 */
struct cwdinfo *
cwdinit(void)
{
	struct cwdinfo *cwdi;
	struct cwdinfo *copy;

	cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
	copy = curproc->p_cwdi;

	/* Hold the source's lock as reader while snapshotting it. */
	rw_enter(&copy->cwdi_lock, RW_READER);
	cwdi->cwdi_cdir = copy->cwdi_cdir;
	if (cwdi->cwdi_cdir)
		VREF(cwdi->cwdi_cdir);
	cwdi->cwdi_rdir = copy->cwdi_rdir;
	if (cwdi->cwdi_rdir)
		VREF(cwdi->cwdi_rdir);
	cwdi->cwdi_edir = copy->cwdi_edir;
	if (cwdi->cwdi_edir)
		VREF(cwdi->cwdi_edir);
	cwdi->cwdi_cmask = copy->cwdi_cmask;
	cwdi->cwdi_refcnt = 1;
	rw_exit(&copy->cwdi_lock);

	return (cwdi);
}
1081
/*
 * Pool cache constructor for cwdinfo: initialize the embedded rwlock.
 * The remaining fields are set up by cwdinit().
 */
static int
cwdi_ctor(void *arg, void *obj, int flags)
{
	struct cwdinfo *cwdi = obj;

	rw_init(&cwdi->cwdi_lock);

	return 0;
}
1091
/*
 * Pool cache destructor for cwdinfo: tear down the embedded rwlock.
 */
static void
cwdi_dtor(void *arg, void *obj)
{
	struct cwdinfo *cwdi = obj;

	rw_destroy(&cwdi->cwdi_lock);
}
1099
/*
 * Pool cache constructor for file_t: zero the object, initialize its
 * lock, and link it onto the global list of open files.  The list
 * entry persists for as long as the object stays in the cache.
 */
static int
file_ctor(void *arg, void *obj, int flags)
{
	file_t *fp = obj;

	memset(fp, 0, sizeof(*fp));
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);

	mutex_enter(&filelist_lock);
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	mutex_exit(&filelist_lock);

	return 0;
}
1114
/*
 * Pool cache destructor for file_t: unlink from the global file list
 * and destroy the lock.  Mirrors file_ctor().
 */
static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	mutex_destroy(&fp->f_lock);
}
1126
/*
 * Pool cache constructor for fdfile_t: zero the object and initialize
 * its lock and the condvar used to drain references at close time.
 * Also called directly for the fdfile_t storage embedded in filedesc_t.
 */
static int
fdfile_ctor(void *arg, void *obj, int flags)
{
	fdfile_t *ff = obj;

	memset(ff, 0, sizeof(*ff));
	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ff->ff_closing, "fdclose");

	return 0;
}
1138
/*
 * Pool cache destructor for fdfile_t: mirrors fdfile_ctor().
 */
static void
fdfile_dtor(void *arg, void *obj)
{
	fdfile_t *ff = obj;

	mutex_destroy(&ff->ff_lock);
	cv_destroy(&ff->ff_closing);
}
1147
1148 file_t *
1149 fgetdummy(void)
1150 {
1151 file_t *fp;
1152
1153 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1154 if (fp != NULL) {
1155 memset(fp, 0, sizeof(*fp));
1156 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1157 }
1158 return fp;
1159 }
1160
/*
 * Release a file_t obtained from fgetdummy(): destroy the lock and
 * return the memory to kmem.
 */
void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}
1168
/*
 * Make p2 share curproc's cwdinfo: take an extra reference and point
 * p2 at the same structure.
 */
void
cwdshare(struct proc *p2)
{
	struct cwdinfo *cwdi;

	cwdi = curproc->p_cwdi;

	atomic_inc_uint(&cwdi->cwdi_refcnt);
	p2->p_cwdi = cwdi;
}
1182
1183 /*
1184 * Release a cwdinfo structure.
1185 */
1186 void
1187 cwdfree(struct cwdinfo *cwdi)
1188 {
1189
1190 if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1191 return;
1192
1193 vrele(cwdi->cwdi_cdir);
1194 if (cwdi->cwdi_rdir)
1195 vrele(cwdi->cwdi_rdir);
1196 if (cwdi->cwdi_edir)
1197 vrele(cwdi->cwdi_edir);
1198 pool_cache_put(cwdi_cache, cwdi);
1199 }
1200
1201 /*
1202 * Create an initial filedesc structure.
1203 */
1204 filedesc_t *
1205 fd_init(filedesc_t *fdp)
1206 {
1207 unsigned fd;
1208
1209 if (fdp == NULL) {
1210 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1211 } else {
1212 filedesc_ctor(NULL, fdp, PR_WAITOK);
1213 }
1214
1215 fdp->fd_refcnt = 1;
1216 fdp->fd_ofiles = fdp->fd_dfiles;
1217 fdp->fd_nfiles = NDFILE;
1218 fdp->fd_himap = fdp->fd_dhimap;
1219 fdp->fd_lomap = fdp->fd_dlomap;
1220 KASSERT(fdp->fd_lastfile == -1);
1221 KASSERT(fdp->fd_lastkqfile == -1);
1222 KASSERT(fdp->fd_knhash == NULL);
1223
1224 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1225 offsetof(filedesc_t, fd_startzero));
1226 for (fd = 0; fd < NDFDFILE; fd++) {
1227 fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1228 }
1229
1230 return fdp;
1231 }
1232
1233 /*
1234 * Initialize a file descriptor table.
1235 */
1236 static int
1237 filedesc_ctor(void *arg, void *obj, int flag)
1238 {
1239 filedesc_t *fdp = obj;
1240 int i;
1241
1242 memset(fdp, 0, sizeof(*fdp));
1243 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1244 fdp->fd_lastfile = -1;
1245 fdp->fd_lastkqfile = -1;
1246
1247 KASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1248 for (i = 0; i < NDFDFILE; i++) {
1249 fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1250 }
1251
1252 return 0;
1253 }
1254
1255 static void
1256 filedesc_dtor(void *arg, void *obj)
1257 {
1258 filedesc_t *fdp = obj;
1259 int i;
1260
1261 for (i = 0; i < NDFDFILE; i++) {
1262 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1263 }
1264
1265 mutex_destroy(&fdp->fd_lock);
1266 }
1267
1268 /*
1269 * Make p2 share p1's filedesc structure.
1270 */
1271 void
1272 fd_share(struct proc *p2)
1273 {
1274 filedesc_t *fdp;
1275
1276 fdp = curlwp->l_fd;
1277 p2->p_fd = fdp;
1278 atomic_inc_uint(&fdp->fd_refcnt);
1279 }
1280
1281 /*
1282 * Copy a filedesc structure.
1283 */
1284 filedesc_t *
1285 fd_copy(void)
1286 {
1287 filedesc_t *newfdp, *fdp;
1288 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1289 int i, nused, numfiles, lastfile, j, newlast;
1290 file_t *fp;
1291
1292 fdp = curproc->p_fd;
1293 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1294 newfdp->fd_refcnt = 1;
1295
1296 KASSERT(newfdp->fd_knhash == NULL);
1297 KASSERT(newfdp->fd_knhashmask == 0);
1298 KASSERT(newfdp->fd_discard == NULL);
1299
1300 for (;;) {
1301 numfiles = fdp->fd_nfiles;
1302 lastfile = fdp->fd_lastfile;
1303
1304 /*
1305 * If the number of open files fits in the internal arrays
1306 * of the open file structure, use them, otherwise allocate
1307 * additional memory for the number of descriptors currently
1308 * in use.
1309 */
1310 if (lastfile < NDFILE) {
1311 i = NDFILE;
1312 newfdp->fd_ofiles = newfdp->fd_dfiles;
1313 } else {
1314 /*
1315 * Compute the smallest multiple of NDEXTENT needed
1316 * for the file descriptors currently in use,
1317 * allowing the table to shrink.
1318 */
1319 i = numfiles;
1320 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1321 i /= 2;
1322 }
1323 newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *),
1324 M_FILEDESC, M_WAITOK);
1325 KASSERT(i >= NDFILE);
1326 }
1327 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1328 newfdp->fd_himap = newfdp->fd_dhimap;
1329 newfdp->fd_lomap = newfdp->fd_dlomap;
1330 } else {
1331 newfdp->fd_himap = malloc(NDHISLOTS(i) *
1332 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1333 newfdp->fd_lomap = malloc(NDLOSLOTS(i) *
1334 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1335 }
1336
1337 /*
1338 * Allocate and string together fdfile structures.
1339 * We abuse fdfile_t::ff_file here, but it will be
1340 * cleared before this routine returns.
1341 */
1342 nused = fdp->fd_nused;
1343 fflist = NULL;
1344 for (j = nused; j != 0; j--) {
1345 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1346 ff->ff_file = (void *)fflist;
1347 fflist = ff;
1348 }
1349
1350 mutex_enter(&fdp->fd_lock);
1351 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1352 lastfile == fdp->fd_lastfile) {
1353 break;
1354 }
1355 mutex_exit(&fdp->fd_lock);
1356 if (i >= NDFILE) {
1357 free(newfdp->fd_ofiles, M_FILEDESC);
1358 }
1359 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1360 free(newfdp->fd_himap, M_FILEDESC);
1361 free(newfdp->fd_lomap, M_FILEDESC);
1362 }
1363 while (fflist != NULL) {
1364 ff = fflist;
1365 fflist = (void *)ff->ff_file;
1366 ff->ff_file = NULL;
1367 pool_cache_put(fdfile_cache, ff);
1368 }
1369 }
1370
1371 newfdp->fd_nfiles = i;
1372 newfdp->fd_freefile = fdp->fd_freefile;
1373 newfdp->fd_exclose = fdp->fd_exclose;
1374
1375 /*
1376 * Clear the entries that will not be copied over.
1377 * Avoid calling memset with 0 size.
1378 */
1379 if (lastfile < (i-1)) {
1380 memset(newfdp->fd_ofiles + lastfile + 1, 0,
1381 (i - lastfile - 1) * sizeof(file_t **));
1382 }
1383 if (i < NDENTRIES * NDENTRIES) {
1384 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1385 }
1386 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1387 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1388
1389 ffp = fdp->fd_ofiles;
1390 nffp = newfdp->fd_ofiles;
1391 j = imax(lastfile, (NDFDFILE - 1));
1392 newlast = -1;
1393 KASSERT(j < fdp->fd_nfiles);
1394 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1395 ff = *ffp;
1396 /* Install built-in fdfiles even if unused here. */
1397 if (i < NDFDFILE) {
1398 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1399 } else {
1400 ff2 = NULL;
1401 }
1402 /* Determine if descriptor is active in parent. */
1403 if (ff == NULL || !fd_isused(fdp, i)) {
1404 KASSERT(ff != NULL || i >= NDFDFILE);
1405 continue;
1406 }
1407 mutex_enter(&ff->ff_lock);
1408 fp = ff->ff_file;
1409 if (fp == NULL) {
1410 /* Descriptor is half-open: free slot. */
1411 fd_zap(newfdp, i);
1412 mutex_exit(&ff->ff_lock);
1413 continue;
1414 }
1415 if (fp->f_type == DTYPE_KQUEUE) {
1416 /* kqueue descriptors cannot be copied. */
1417 fd_zap(newfdp, i);
1418 mutex_exit(&ff->ff_lock);
1419 continue;
1420 }
1421 /* It's active: add a reference to the file. */
1422 mutex_enter(&fp->f_lock);
1423 fp->f_count++;
1424 mutex_exit(&fp->f_lock);
1425 /* Consume one fdfile_t to represent it. */
1426 if (i >= NDFDFILE) {
1427 ff2 = fflist;
1428 fflist = (void *)ff2->ff_file;
1429 }
1430 ff2->ff_file = fp;
1431 ff2->ff_exclose = ff->ff_exclose;
1432 ff2->ff_allocated = 1;
1433 mutex_exit(&ff->ff_lock);
1434 if (i > newlast) {
1435 newlast = i;
1436 }
1437 }
1438 mutex_exit(&fdp->fd_lock);
1439
1440 /* Discard unused fdfile_t structures. */
1441 while (__predict_false(fflist != NULL)) {
1442 ff = fflist;
1443 fflist = (void *)ff->ff_file;
1444 ff->ff_file = NULL;
1445 pool_cache_put(fdfile_cache, ff);
1446 nused--;
1447 }
1448 KASSERT(nused >= 0);
1449 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1450
1451 newfdp->fd_nused = nused;
1452 newfdp->fd_lastfile = newlast;
1453
1454 return (newfdp);
1455 }
1456
1457 /*
1458 * Release a filedesc structure.
1459 */
1460 void
1461 fd_free(void)
1462 {
1463 filedesc_t *fdp;
1464 fdfile_t *ff;
1465 file_t *fp;
1466 int fd, lastfd;
1467 void *discard;
1468
1469 fdp = curlwp->l_fd;
1470
1471 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1472
1473 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1474 return;
1475
1476 /*
1477 * Close any files that the process holds open.
1478 */
1479 for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1480 ff = fdp->fd_ofiles[fd];
1481 KASSERT(fd >= NDFDFILE ||
1482 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1483 if ((ff = fdp->fd_ofiles[fd]) == NULL)
1484 continue;
1485 if ((fp = ff->ff_file) != NULL) {
1486 /*
1487 * Must use fd_close() here as kqueue holds
1488 * long term references to descriptors.
1489 */
1490 ff->ff_refcnt++;
1491 fd_close(fd);
1492 }
1493 KASSERT(ff->ff_refcnt == 0);
1494 KASSERT(ff->ff_file == NULL);
1495 KASSERT(!ff->ff_exclose);
1496 KASSERT(!ff->ff_allocated);
1497 if (fd >= NDFDFILE) {
1498 pool_cache_put(fdfile_cache, ff);
1499 }
1500 }
1501
1502 /*
1503 * Clean out the descriptor table for the next user and return
1504 * to the cache.
1505 */
1506 while ((discard = fdp->fd_discard) != NULL) {
1507 KASSERT(discard != fdp->fd_ofiles);
1508 fdp->fd_discard = *(void **)discard;
1509 free(discard, M_FILEDESC);
1510 }
1511 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1512 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1513 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1514 free(fdp->fd_himap, M_FILEDESC);
1515 free(fdp->fd_lomap, M_FILEDESC);
1516 }
1517 if (fdp->fd_nfiles > NDFILE) {
1518 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1519 free(fdp->fd_ofiles, M_FILEDESC);
1520 }
1521 if (fdp->fd_knhash != NULL) {
1522 hashdone(fdp->fd_knhash, M_KEVENT);
1523 fdp->fd_knhash = NULL;
1524 fdp->fd_knhashmask = 0;
1525 } else {
1526 KASSERT(fdp->fd_knhashmask == 0);
1527 }
1528 fdp->fd_lastkqfile = -1;
1529 pool_cache_put(filedesc_cache, fdp);
1530 }
1531
1532 /*
1533 * File Descriptor pseudo-device driver (/dev/fd/).
1534 *
1535 * Opening minor device N dup()s the file (if any) connected to file
1536 * descriptor N belonging to the calling process. Note that this driver
1537 * consists of only the ``open()'' routine, because all subsequent
1538 * references to this file will be direct to the other driver.
1539 */
1540 static int
1541 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1542 {
1543
1544 /*
1545 * XXX Kludge: set dupfd to contain the value of the
1546 * the file descriptor being sought for duplication. The error
1547 * return ensures that the vnode for this device will be released
1548 * by vn_open. Open will detect this special error and take the
1549 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1550 * will simply report the error.
1551 */
1552 l->l_dupfd = minor(dev); /* XXX */
1553 return EDUPFD;
1554 }
1555
1556 /*
1557 * Duplicate the specified descriptor to a free descriptor.
1558 */
1559 int
1560 fd_dupopen(int old, int *new, int mode, int error)
1561 {
1562 filedesc_t *fdp;
1563 fdfile_t *ff;
1564 file_t *fp;
1565
1566 if ((fp = fd_getfile(old)) == NULL) {
1567 return EBADF;
1568 }
1569 fdp = curlwp->l_fd;
1570 ff = fdp->fd_ofiles[old];
1571
1572 /*
1573 * There are two cases of interest here.
1574 *
1575 * For EDUPFD simply dup (dfd) to file descriptor
1576 * (indx) and return.
1577 *
1578 * For EMOVEFD steal away the file structure from (dfd) and
1579 * store it in (indx). (dfd) is effectively closed by
1580 * this operation.
1581 *
1582 * Any other error code is just returned.
1583 */
1584 switch (error) {
1585 case EDUPFD:
1586 /*
1587 * Check that the mode the file is being opened for is a
1588 * subset of the mode of the existing descriptor.
1589 */
1590 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1591 error = EACCES;
1592 break;
1593 }
1594
1595 /* Copy it. */
1596 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1597 break;
1598
1599 case EMOVEFD:
1600 /* Copy it. */
1601 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1602 if (error != 0) {
1603 break;
1604 }
1605
1606 /* Steal away the file pointer from 'old'. */
1607 (void)fd_close(old);
1608 return 0;
1609 }
1610
1611 fd_putfile(old);
1612 return error;
1613 }
1614
1615 /*
1616 * Close open files on exec.
1617 */
1618 void
1619 fd_closeexec(void)
1620 {
1621 struct cwdinfo *cwdi;
1622 proc_t *p;
1623 filedesc_t *fdp;
1624 fdfile_t *ff;
1625 lwp_t *l;
1626 int fd;
1627
1628 l = curlwp;
1629 p = l->l_proc;
1630 fdp = p->p_fd;
1631 cwdi = p->p_cwdi;
1632
1633 if (cwdi->cwdi_refcnt > 1) {
1634 cwdi = cwdinit();
1635 cwdfree(p->p_cwdi);
1636 p->p_cwdi = cwdi;
1637 }
1638 if (p->p_cwdi->cwdi_edir) {
1639 vrele(p->p_cwdi->cwdi_edir);
1640 }
1641
1642 if (fdp->fd_refcnt > 1) {
1643 fdp = fd_copy();
1644 fd_free();
1645 p->p_fd = fdp;
1646 l->l_fd = fdp;
1647 }
1648 if (!fdp->fd_exclose) {
1649 return;
1650 }
1651 fdp->fd_exclose = 0;
1652
1653 for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1654 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1655 KASSERT(fd >= NDFDFILE);
1656 continue;
1657 }
1658 KASSERT(fd >= NDFDFILE ||
1659 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1660 if (ff->ff_file == NULL)
1661 continue;
1662 if (ff->ff_exclose) {
1663 /*
1664 * We need a reference to close the file.
1665 * No other threads can see the fdfile_t at
1666 * this point, so don't bother locking.
1667 */
1668 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1669 ff->ff_refcnt++;
1670 fd_close(fd);
1671 }
1672 }
1673 }
1674
1675 /*
1676 * It is unsafe for set[ug]id processes to be started with file
1677 * descriptors 0..2 closed, as these descriptors are given implicit
1678 * significance in the Standard C library. fdcheckstd() will create a
1679 * descriptor referencing /dev/null for each of stdin, stdout, and
1680 * stderr that is not already open.
1681 */
#define CHECK_UPTO 3
int
fd_checkstd(void)
{
	struct proc *p;
	struct nameidata nd;
	filedesc_t *fdp;
	file_t *fp;
	struct proc *pp;
	int fd, i, error, flags = FREAD|FWRITE;
	/* 'closed' accumulates a ",N" entry per missing descriptor. */
	char closed[CHECK_UPTO * 3 + 1], which[3 + 1];

	p = curproc;
	closed[0] = '\0';
	if ((fdp = p->p_fd) == NULL)
		return (0);
	for (i = 0; i < CHECK_UPTO; i++) {
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		if (fdp->fd_ofiles[i]->ff_file != NULL)
			continue;
		/* Remember this descriptor for the warning log below. */
		snprintf(which, sizeof(which), ",%d", i);
		strlcat(closed, which, sizeof(closed));
		/* Open /dev/null on the lowest free descriptor. */
		if ((error = fd_allocfile(&fp, &fd)) != 0)
			return (error);
		KASSERT(fd < CHECK_UPTO);
		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
		if ((error = vn_open(&nd, flags, 0)) != 0) {
			/* Undo fd_allocfile() on failure. */
			fd_abort(p, fp, fd);
			return (error);
		}
		fp->f_data = nd.ni_vp;
		fp->f_flag = flags;
		fp->f_ops = &vnops;
		fp->f_type = DTYPE_VNODE;
		VOP_UNLOCK(nd.ni_vp, 0);
		fd_affix(p, fp, fd);
	}
	if (closed[0] != '\0') {
		/* Warn about the suspicious invocation; skip leading ','. */
		mutex_enter(proc_lock);
		pp = p->p_pptr;
		mutex_enter(pp->p_lock);
		log(LOG_WARNING, "set{u,g}id pid %d (%s) "
		    "was invoked by uid %d ppid %d (%s) "
		    "with fd %s closed\n",
		    p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
		    pp->p_pid, pp->p_comm, &closed[1]);
		mutex_exit(pp->p_lock);
		mutex_exit(proc_lock);
	}
	return (0);
}
#undef CHECK_UPTO
1735
1736 /*
1737 * Sets descriptor owner. If the owner is a process, 'pgid'
1738 * is set to positive value, process ID. If the owner is process group,
1739 * 'pgid' is set to -pg_id.
1740 */
1741 int
1742 fsetown(pid_t *pgid, int cmd, const void *data)
1743 {
1744 int id = *(const int *)data;
1745 int error;
1746
1747 switch (cmd) {
1748 case TIOCSPGRP:
1749 if (id < 0)
1750 return (EINVAL);
1751 id = -id;
1752 break;
1753 default:
1754 break;
1755 }
1756
1757 if (id > 0 && !pfind(id))
1758 return (ESRCH);
1759 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1760 return (error);
1761
1762 *pgid = id;
1763 return (0);
1764 }
1765
1766 /*
1767 * Return descriptor owner information. If the value is positive,
1768 * it's process ID. If it's negative, it's process group ID and
1769 * needs the sign removed before use.
1770 */
1771 int
1772 fgetown(pid_t pgid, int cmd, void *data)
1773 {
1774
1775 switch (cmd) {
1776 case TIOCGPGRP:
1777 *(int *)data = -pgid;
1778 break;
1779 default:
1780 *(int *)data = pgid;
1781 break;
1782 }
1783 return (0);
1784 }
1785
1786 /*
1787 * Send signal to descriptor owner, either process or process group.
1788 */
1789 void
1790 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1791 {
1792 struct proc *p1;
1793 struct pgrp *pgrp;
1794 ksiginfo_t ksi;
1795
1796 KASSERT(!cpu_intr_p());
1797
1798 KSI_INIT(&ksi);
1799 ksi.ksi_signo = signo;
1800 ksi.ksi_code = code;
1801 ksi.ksi_band = band;
1802
1803 mutex_enter(proc_lock);
1804 if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1805 kpsignal(p1, &ksi, fdescdata);
1806 else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1807 kpgsignal(pgrp, &ksi, fdescdata, 0);
1808 mutex_exit(proc_lock);
1809 }
1810
/*
 * Finish opening a "cloning" device: fill in the pre-allocated file,
 * install it at descriptor 'fd' and return EMOVEFD so the open path
 * hands that descriptor back to the caller (see fd_dupopen()).
 */
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
	 void *data)
{

	/* Initialize the file before fd_affix() makes it visible. */
	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}
1825
1826 int
1827 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1828 {
1829
1830 if (cmd == F_SETFL)
1831 return 0;
1832
1833 return EOPNOTSUPP;
1834 }
1835
/*
 * Default poll method: always returns 0 (no events).
 */
int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}
1842
/*
 * Default kqfilter method: always returns 0 without touching the
 * knote.  NOTE(review): 0 signals success here — confirm callers
 * expect an attach-as-no-op rather than a rejection.
 */
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return 0;
}
1849
/*
 * Stub read method for files that do not support reading: always
 * fails with EOPNOTSUPP.
 */
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1857
/*
 * Stub write method for files that do not support writing: always
 * fails with EOPNOTSUPP.
 */
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
	     kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1865
/*
 * Stub ioctl method: always fails with EOPNOTSUPP.
 */
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}
1872
/*
 * Stub stat method: always fails with EOPNOTSUPP.
 */
int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}
1879
/*
 * Stub close method: always fails with EOPNOTSUPP.
 */
int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
1886