1 /* $NetBSD: kern_descrip.c,v 1.179.2.1 2008/05/10 23:49:03 wrstuden Exp $ */
2
3 /*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. Neither the name of the University nor the names of its contributors
47 * may be used to endorse or promote products derived from this software
48 * without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * SUCH DAMAGE.
61 *
62 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
63 */
64
65 /*
66 * File descriptor management.
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.179.2.1 2008/05/10 23:49:03 wrstuden Exp $");
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/filedesc.h>
75 #include <sys/kernel.h>
76 #include <sys/vnode.h>
77 #include <sys/proc.h>
78 #include <sys/file.h>
79 #include <sys/namei.h>
80 #include <sys/socket.h>
81 #include <sys/socketvar.h>
82 #include <sys/stat.h>
83 #include <sys/ioctl.h>
84 #include <sys/fcntl.h>
85 #include <sys/malloc.h>
86 #include <sys/pool.h>
87 #include <sys/syslog.h>
88 #include <sys/unistd.h>
89 #include <sys/resourcevar.h>
90 #include <sys/conf.h>
91 #include <sys/event.h>
92 #include <sys/kauth.h>
93 #include <sys/atomic.h>
94 #include <sys/mount.h>
95 #include <sys/sa.h>
96 #include <sys/syscallargs.h>
97 #include <sys/cpu.h>
98
99 static int cwdi_ctor(void *, void *, int);
100 static void cwdi_dtor(void *, void *);
101 static int file_ctor(void *, void *, int);
102 static void file_dtor(void *, void *);
103 static int fdfile_ctor(void *, void *, int);
104 static void fdfile_dtor(void *, void *);
105 static int filedesc_ctor(void *, void *, int);
106 static void filedesc_dtor(void *, void *);
107 static int filedescopen(dev_t, int, int, lwp_t *);
108
109 kmutex_t filelist_lock; /* lock on filehead */
110 struct filelist filehead; /* head of list of open files */
111 u_int nfiles; /* actual number of open files */
112
113 static pool_cache_t cwdi_cache;
114 static pool_cache_t filedesc_cache;
115 static pool_cache_t file_cache;
116 static pool_cache_t fdfile_cache;
117
118 MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
119
120 const struct cdevsw filedesc_cdevsw = {
121 filedescopen, noclose, noread, nowrite, noioctl,
122 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
123 };
124
125 /* For ease of reading. */
126 __strong_alias(fd_putvnode,fd_putfile)
127 __strong_alias(fd_putsock,fd_putfile)
128
129 /*
130 * Initialize the descriptor system.
131 */
132 void
133 fd_sys_init(void)
134 {
135
136 mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
137
138 file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
139 0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
140 KASSERT(file_cache != NULL);
141
142 fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
143 PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
144 NULL);
145 KASSERT(fdfile_cache != NULL);
146
147 cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
148 0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
149 KASSERT(cwdi_cache != NULL);
150
151 filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
152 0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
153 NULL);
154 KASSERT(filedesc_cache != NULL);
155 }
156
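/*
 * Find the first zero bit at or above 'want' in the given bitmap,
 * considering at most 'bits' bits.  Returns the bit number, or -1
 * if there is no free bit.
 */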
157 static int
158 fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
159 {
160 int i, off, maxoff;
161 uint32_t sub;
162
163 KASSERT(mutex_owned(&fdp->fd_lock));
164
165 if (want > bits)
166 return -1;
167
168 off = want >> NDENTRYSHIFT;
169 i = want & NDENTRYMASK;
170 if (i) {
171 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
172 if (sub != ~0)
173 goto found;
174 off++;
175 }
176
177 maxoff = NDLOSLOTS(bits);
178 while (off < maxoff) {
179 if ((sub = bitmap[off]) != ~0)
180 goto found;
181 off++;
182 }
183
184 return (-1);
185
186 found:
187 return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
188 }
189
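/*
 * Find the highest numbered descriptor below 'last' that is still
 * allocated, for use in maintaining fd_lastfile.  Returns -1 if no
 * descriptor below 'last' is in use.
 */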
190 static int
191 fd_last_set(filedesc_t *fd, int last)
192 {
193 int off, i;
194 fdfile_t **ofiles = fd->fd_ofiles;
195 uint32_t *bitmap = fd->fd_lomap;
196
197 KASSERT(mutex_owned(&fd->fd_lock));
198
199 off = (last - 1) >> NDENTRYSHIFT;
200
201 while (off >= 0 && !bitmap[off])
202 off--;
203
204 if (off < 0)
205 return (-1);
206
207 i = ((off + 1) << NDENTRYSHIFT) - 1;
208 if (i >= last)
209 i = last - 1;
210
211 /* XXX should use bitmap */
212 /* XXXAD does not work for fd_copy() */
213 while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
214 i--;
215
216 return (i);
217 }
218
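/*
 * Mark a descriptor as allocated: set the in-use bit in the bitmaps,
 * flag the fdfile_t, and update fd_lastfile and fd_nused.
 */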
219 void
220 fd_used(filedesc_t *fdp, unsigned fd)
221 {
222 u_int off = fd >> NDENTRYSHIFT;
223 fdfile_t *ff;
224
225 ff = fdp->fd_ofiles[fd];
226
227 KASSERT(mutex_owned(&fdp->fd_lock));
228 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
229 KASSERT(ff != NULL);
230 KASSERT(ff->ff_file == NULL);
231 KASSERT(!ff->ff_allocated);
232
233 ff->ff_allocated = 1;
234 fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
235 if (fdp->fd_lomap[off] == ~0) {
236 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
237 (1 << (off & NDENTRYMASK))) == 0);
238 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
239 }
240
241 if ((int)fd > fdp->fd_lastfile) {
242 fdp->fd_lastfile = fd;
243 }
244
245 if (fd >= NDFDFILE) {
246 fdp->fd_nused++;
247 } else {
248 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
249 }
250 }
251
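/*
 * Mark a descriptor as free: clear the in-use bit in the bitmaps,
 * clear the fdfile_t's allocated flag, and update fd_freefile,
 * fd_lastfile and fd_nused.
 */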
252 void
253 fd_unused(filedesc_t *fdp, unsigned fd)
254 {
255 u_int off = fd >> NDENTRYSHIFT;
256 fdfile_t *ff;
257
258 ff = fdp->fd_ofiles[fd];
259
260 /*
261 * Don't assert the lock is held here, as we may be copying
262 * the table during exec() and it is not needed there.
263 * procfs and sysctl are locked out by proc::p_reflock.
264 *
265 * KASSERT(mutex_owned(&fdp->fd_lock));
266 */
267 KASSERT(ff != NULL);
268 KASSERT(ff->ff_file == NULL);
269 KASSERT(ff->ff_allocated);
270
271 if (fd < fdp->fd_freefile) {
272 fdp->fd_freefile = fd;
273 }
274
275 if (fdp->fd_lomap[off] == ~0) {
276 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
277 (1 << (off & NDENTRYMASK))) != 0);
278 fdp->fd_himap[off >> NDENTRYSHIFT] &=
279 ~(1 << (off & NDENTRYMASK));
280 }
281 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
282 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
283 ff->ff_allocated = 0;
284
285 KASSERT(fd <= fdp->fd_lastfile);
286 if (fd == fdp->fd_lastfile) {
287 fdp->fd_lastfile = fd_last_set(fdp, fd);
288 }
289
290 if (fd >= NDFDFILE) {
291 KASSERT(fdp->fd_nused > 0);
292 fdp->fd_nused--;
293 } else {
294 KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
295 }
296 }
297
298 /*
299 * Custom version of fd_unused() for fd_copy(), where the descriptor
300 * table is not yet fully initialized.
301 */
302 static inline void
303 fd_zap(filedesc_t *fdp, unsigned fd)
304 {
305 u_int off = fd >> NDENTRYSHIFT;
306
307 if (fd < fdp->fd_freefile) {
308 fdp->fd_freefile = fd;
309 }
310
311 if (fdp->fd_lomap[off] == ~0) {
312 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
313 (1 << (off & NDENTRYMASK))) != 0);
314 fdp->fd_himap[off >> NDENTRYSHIFT] &=
315 ~(1 << (off & NDENTRYMASK));
316 }
317 KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
318 fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
319 }
320
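/*
 * Return true if the given descriptor number is currently allocated
 * in the descriptor table.
 */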
321 bool
322 fd_isused(filedesc_t *fdp, unsigned fd)
323 {
324 u_int off = fd >> NDENTRYSHIFT;
325
326 KASSERT(fd < fdp->fd_nfiles);
327
328 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
329 }
330
331 /*
332 * Look up the file structure corresponding to a file descriptor
333 * and return the file, holding a reference on the descriptor.
334 */
335 inline file_t *
336 fd_getfile(unsigned fd)
337 {
338 filedesc_t *fdp;
339 fdfile_t *ff;
340 file_t *fp;
341
342 fdp = curlwp->l_fd;
343
344 /*
345 * Look up the fdfile structure representing this descriptor.
346 * Ensure that we see fd_nfiles before fd_ofiles since we
347 * are doing this unlocked. See fd_tryexpand().
348 */
349 if (__predict_false(fd >= fdp->fd_nfiles)) {
350 return NULL;
351 }
352 membar_consumer();
353 ff = fdp->fd_ofiles[fd];
354 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
355 if (__predict_false(ff == NULL)) {
356 return NULL;
357 }
358
359 /*
360 * Now get a reference to the descriptor. Issue a memory
361 * barrier to ensure that we acquire the file pointer _after_
362 * adding a reference. If no memory barrier, we could fetch
363 * a stale pointer.
364 */
365 atomic_inc_uint(&ff->ff_refcnt);
366 #ifndef __HAVE_ATOMIC_AS_MEMBAR
367 membar_enter();
368 #endif
369
370 /*
371 * If the file is not open or is being closed then put the
372 * reference back.
373 */
374 fp = ff->ff_file;
375 if (__predict_true(fp != NULL)) {
376 return fp;
377 }
378 fd_putfile(fd);
379 return NULL;
380 }
381
382 /*
383 * Release a reference to a file descriptor acquired with fd_getfile().
384 */
385 void
386 fd_putfile(unsigned fd)
387 {
388 filedesc_t *fdp;
389 fdfile_t *ff;
390 u_int u, v;
391
392 fdp = curlwp->l_fd;
393 ff = fdp->fd_ofiles[fd];
394
395 KASSERT(fd < fdp->fd_nfiles);
396 KASSERT(ff != NULL);
397 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
398 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
399
400 /*
401 * Ensure that any use of the file is complete and globally
402 * visible before dropping the final reference. If no membar,
403 * the current CPU could still access memory associated with
404 * the file after it has been freed or recycled by another
405 * CPU.
406 */
407 #ifndef __HAVE_ATOMIC_AS_MEMBAR
408 membar_exit();
409 #endif
410
411 /*
412 * Be optimistic and start out with the assumption that no other
413 * threads are trying to close the descriptor. If the CAS fails,
414 * we lost a race and/or it's being closed.
415 */
416 for (u = ff->ff_refcnt & FR_MASK;; u = v) {
417 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
418 if (__predict_true(u == v)) {
419 return;
420 }
421 if (__predict_false((v & FR_CLOSING) != 0)) {
422 break;
423 }
424 }
425
426 /* Another thread is waiting to close the file: join it. */
427 (void)fd_close(fd);
428 }
429
430 /*
431 * Convenience wrapper around fd_getfile() that returns reference
432 * to a vnode.
433 */
434 int
435 fd_getvnode(unsigned fd, file_t **fpp)
436 {
437 vnode_t *vp;
438 file_t *fp;
439
440 fp = fd_getfile(fd);
441 if (__predict_false(fp == NULL)) {
442 return EBADF;
443 }
444 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
445 fd_putfile(fd);
446 return EINVAL;
447 }
448 vp = fp->f_data;
449 if (__predict_false(vp->v_type == VBAD)) {
450 /* XXX Is this case really necessary? */
451 fd_putfile(fd);
452 return EBADF;
453 }
454 *fpp = fp;
455 return 0;
456 }
457
458 /*
459 * Convenience wrapper around fd_getfile() that returns reference
460 * to a socket.
461 */
462 int
463 fd_getsock(unsigned fd, struct socket **sop)
464 {
465 file_t *fp;
466
467 fp = fd_getfile(fd);
468 if (__predict_false(fp == NULL)) {
469 return EBADF;
470 }
471 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
472 fd_putfile(fd);
473 return ENOTSOCK;
474 }
475 *sop = fp->f_data;
476 return 0;
477 }
478
479 /*
480 * Look up the file structure corresponding to a file descriptor
481 * and return it with a reference held on the file, not the
482 * descriptor.
483 *
484 * This is heavyweight and only used when accessing descriptors
485 * from a foreign process. The caller must ensure that `p' does
486 * not exit or fork across this call.
487 *
488 * To release the file (not descriptor) reference, use closef().
489 */
490 file_t *
491 fd_getfile2(proc_t *p, unsigned fd)
492 {
493 filedesc_t *fdp;
494 fdfile_t *ff;
495 file_t *fp;
496
497 fdp = p->p_fd;
498 mutex_enter(&fdp->fd_lock);
 499 	if (fd >= fdp->fd_nfiles) {
500 mutex_exit(&fdp->fd_lock);
501 return NULL;
502 }
503 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
504 mutex_exit(&fdp->fd_lock);
505 return NULL;
506 }
507 mutex_enter(&ff->ff_lock);
508 if ((fp = ff->ff_file) == NULL) {
509 mutex_exit(&ff->ff_lock);
510 mutex_exit(&fdp->fd_lock);
511 return NULL;
512 }
513 mutex_enter(&fp->f_lock);
514 fp->f_count++;
515 mutex_exit(&fp->f_lock);
516 mutex_exit(&ff->ff_lock);
517 mutex_exit(&fdp->fd_lock);
518
519 return fp;
520 }
521
522 /*
523 * Internal form of close. Must be called with a reference to the
524 * descriptor, and will drop the reference. When all descriptor
525 * references are dropped, releases the descriptor slot and a single
526 * reference to the file structure.
527 */
528 int
529 fd_close(unsigned fd)
530 {
531 struct flock lf;
532 filedesc_t *fdp;
533 fdfile_t *ff;
534 file_t *fp;
535 proc_t *p;
536 lwp_t *l;
537
538 l = curlwp;
539 p = l->l_proc;
540 fdp = l->l_fd;
541 ff = fdp->fd_ofiles[fd];
542
543 KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
544
545 mutex_enter(&ff->ff_lock);
546 KASSERT((ff->ff_refcnt & FR_MASK) > 0);
547 if (ff->ff_file == NULL) {
548 /*
549 * Another user of the file is already closing, and is
550 * waiting for other users of the file to drain. Release
551 * our reference, and wake up the closer.
552 */
553 atomic_dec_uint(&ff->ff_refcnt);
554 cv_broadcast(&ff->ff_closing);
555 mutex_exit(&ff->ff_lock);
556
557 /*
558 * An application error, so pretend that the descriptor
559 * was already closed. We can't safely wait for it to
560 * be closed without potentially deadlocking.
561 */
562 return (EBADF);
563 }
564 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
565
566 /*
567 * There may be multiple users of this file within the process.
568 * Notify existing and new users that the file is closing. This
569 * will prevent them from adding additional uses to this file
570 * while we are closing it.
571 */
572 fp = ff->ff_file;
573 ff->ff_file = NULL;
574 ff->ff_exclose = 0;
575
576 /*
577 * We expect the caller to hold a descriptor reference - drop it.
578 * The reference count may increase beyond zero at this point due
579 * to an erroneous descriptor reference by an application, but
580 * fd_getfile() will notice that the file is being closed and drop
581 * the reference again.
582 */
583 #ifndef __HAVE_ATOMIC_AS_MEMBAR
584 membar_producer();
585 #endif
586 if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
587 /*
588 * Wait for other references to drain. This is typically
589 * an application error - the descriptor is being closed
590 * while still in use.
591 *
592 */
593 atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
594 /*
595 * Remove any knotes attached to the file. A knote
596 * attached to the descriptor can hold references on it.
597 */
598 if (!SLIST_EMPTY(&ff->ff_knlist)) {
599 mutex_exit(&ff->ff_lock);
600 knote_fdclose(fd);
601 mutex_enter(&ff->ff_lock);
602 }
603 /*
604 * We need to see the count drop to zero at least once,
605 * in order to ensure that all pre-existing references
606 * have been drained. New references past this point are
607 * of no interest.
608 */
609 while ((ff->ff_refcnt & FR_MASK) != 0) {
610 cv_wait(&ff->ff_closing, &ff->ff_lock);
611 }
612 atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
613 } else {
614 /* If no references, there must be no knotes. */
615 KASSERT(SLIST_EMPTY(&ff->ff_knlist));
616 }
617 mutex_exit(&ff->ff_lock);
618
619 /*
620 * POSIX record locking dictates that any close releases ALL
621 * locks owned by this process. This is handled by setting
622 * a flag in the unlock to free ONLY locks obeying POSIX
623 * semantics, and not to free BSD-style file locks.
624 * If the descriptor was in a message, POSIX-style locks
625 * aren't passed with the descriptor.
626 */
627 if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
628 lf.l_whence = SEEK_SET;
629 lf.l_start = 0;
630 lf.l_len = 0;
631 lf.l_type = F_UNLCK;
632 (void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
633 }
634
635
636 /* Free descriptor slot. */
637 mutex_enter(&fdp->fd_lock);
638 fd_unused(fdp, fd);
639 mutex_exit(&fdp->fd_lock);
640
641 /* Now drop reference to the file itself. */
642 return closef(fp);
643 }
644
645 /*
646 * Duplicate a file descriptor.
647 */
648 int
649 fd_dup(file_t *fp, int minfd, int *newp, int exclose)
650 {
651 proc_t *p;
652 int error;
653
654 p = curproc;
655
656 while ((error = fd_alloc(p, minfd, newp)) != 0) {
657 if (error != ENOSPC) {
658 return error;
659 }
660 fd_tryexpand(p);
661 }
662
663 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
664 fd_affix(p, fp, *newp);
665 return 0;
666 }
667
668 /*
669 * dup2 operation.
670 */
671 int
672 fd_dup2(file_t *fp, unsigned new)
673 {
674 filedesc_t *fdp;
675 fdfile_t *ff;
676
677 fdp = curlwp->l_fd;
678
679 /*
680 * Ensure there are enough slots in the descriptor table,
681 * and allocate an fdfile_t up front in case we need it.
682 */
683 while (new >= fdp->fd_nfiles) {
684 fd_tryexpand(curproc);
685 }
686 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
687
688 /*
689 * If there is already a file open, close it. If the file is
690 * half open, wait for it to be constructed before closing it.
691 * XXX Potential for deadlock here?
692 */
693 mutex_enter(&fdp->fd_lock);
694 while (fd_isused(fdp, new)) {
695 mutex_exit(&fdp->fd_lock);
696 if (fd_getfile(new) != NULL) {
697 (void)fd_close(new);
698 } else {
699 /* XXX Crummy, but unlikely to happen. */
700 kpause("dup2", false, 1, NULL);
701 }
702 mutex_enter(&fdp->fd_lock);
703 }
704 if (fdp->fd_ofiles[new] == NULL) {
705 KASSERT(new >= NDFDFILE);
706 fdp->fd_ofiles[new] = ff;
707 ff = NULL;
708 }
709 fd_used(fdp, new);
710 mutex_exit(&fdp->fd_lock);
711
712 /* Slot is now allocated. Insert copy of the file. */
713 fd_affix(curproc, fp, new);
714 if (ff != NULL) {
715 pool_cache_put(fdfile_cache, ff);
716 }
717 return 0;
718 }
719
720 /*
721 * Drop reference to a file structure.
722 */
723 int
724 closef(file_t *fp)
725 {
726 struct flock lf;
727 int error;
728
729 /*
730 * Drop reference. If referenced elsewhere it's still open
731 * and we have nothing more to do.
732 */
733 mutex_enter(&fp->f_lock);
734 KASSERT(fp->f_count > 0);
735 if (--fp->f_count > 0) {
736 mutex_exit(&fp->f_lock);
737 return 0;
738 }
739 KASSERT(fp->f_count == 0);
740 mutex_exit(&fp->f_lock);
741
742 /* We held the last reference - release locks, close and free. */
743 if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
744 lf.l_whence = SEEK_SET;
745 lf.l_start = 0;
746 lf.l_len = 0;
747 lf.l_type = F_UNLCK;
748 (void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
749 }
750 if (fp->f_ops != NULL) {
751 error = (*fp->f_ops->fo_close)(fp);
752 } else {
753 error = 0;
754 }
755 ffree(fp);
756
757 return error;
758 }
759
760 /*
761 * Allocate a file descriptor for the process.
762 */
763 int
764 fd_alloc(proc_t *p, int want, int *result)
765 {
766 filedesc_t *fdp;
767 int i, lim, last, error;
768 u_int off, new;
769 fdfile_t *ff;
770
771 KASSERT(p == curproc || p == &proc0);
772
773 fdp = p->p_fd;
774 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
775 KASSERT(ff->ff_refcnt == 0);
776 KASSERT(ff->ff_file == NULL);
777
778 /*
779 * Search for a free descriptor starting at the higher
780 * of want or fd_freefile.
781 */
782 mutex_enter(&fdp->fd_lock);
783 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
784 lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
785 last = min(fdp->fd_nfiles, lim);
786 for (;;) {
787 if ((i = want) < fdp->fd_freefile)
788 i = fdp->fd_freefile;
789 off = i >> NDENTRYSHIFT;
790 new = fd_next_zero(fdp, fdp->fd_himap, off,
791 (last + NDENTRIES - 1) >> NDENTRYSHIFT);
792 if (new == -1)
793 break;
794 i = fd_next_zero(fdp, &fdp->fd_lomap[new],
795 new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
796 if (i == -1) {
797 /*
798 * Free file descriptor in this block was
799 * below want, try again with higher want.
800 */
801 want = (new + 1) << NDENTRYSHIFT;
802 continue;
803 }
804 i += (new << NDENTRYSHIFT);
805 if (i >= last) {
806 break;
807 }
808 if (fdp->fd_ofiles[i] == NULL) {
809 KASSERT(i >= NDFDFILE);
810 fdp->fd_ofiles[i] = ff;
811 } else {
812 pool_cache_put(fdfile_cache, ff);
813 }
814 KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
815 fd_used(fdp, i);
816 if (want <= fdp->fd_freefile) {
817 fdp->fd_freefile = i;
818 }
819 *result = i;
820 mutex_exit(&fdp->fd_lock);
821 KASSERT(i >= NDFDFILE ||
822 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
823 return 0;
824 }
825
826 /* No space in current array. Let the caller expand and retry. */
827 error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
828 mutex_exit(&fdp->fd_lock);
829 pool_cache_put(fdfile_cache, ff);
830 return error;
831 }
832
833 /*
834 * Expand a process' descriptor table.
835 */
836 void
837 fd_tryexpand(proc_t *p)
838 {
839 filedesc_t *fdp;
840 int i, numfiles, oldnfiles;
841 fdfile_t **newofile;
842 uint32_t *newhimap, *newlomap;
843
844 KASSERT(p == curproc || p == &proc0);
845
846 fdp = p->p_fd;
847 newhimap = NULL;
848 newlomap = NULL;
849 oldnfiles = fdp->fd_nfiles;
850
851 if (oldnfiles < NDEXTENT)
852 numfiles = NDEXTENT;
853 else
854 numfiles = 2 * oldnfiles;
855
856 newofile = malloc(numfiles * sizeof(fdfile_t *), M_FILEDESC, M_WAITOK);
857 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
858 newhimap = malloc(NDHISLOTS(numfiles) *
859 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
860 newlomap = malloc(NDLOSLOTS(numfiles) *
861 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
862 }
863
864 mutex_enter(&fdp->fd_lock);
865 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
866 if (fdp->fd_nfiles != oldnfiles) {
867 /* fdp changed; caller must retry */
868 mutex_exit(&fdp->fd_lock);
869 free(newofile, M_FILEDESC);
870 if (newhimap != NULL)
871 free(newhimap, M_FILEDESC);
872 if (newlomap != NULL)
873 free(newlomap, M_FILEDESC);
874 return;
875 }
876
877 /* Copy the existing ofile array and zero the new portion. */
878 i = sizeof(fdfile_t *) * fdp->fd_nfiles;
879 memcpy(newofile, fdp->fd_ofiles, i);
880 memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);
881
882 /*
883 * Link old ofiles array into list to be discarded. We defer
 884 	 * freeing until process exit if the descriptor table is visible
885 * to other threads.
886 */
887 if (oldnfiles > NDFILE) {
888 if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
889 *(void **)fdp->fd_ofiles = fdp->fd_discard;
890 fdp->fd_discard = fdp->fd_ofiles;
891 } else {
892 free(fdp->fd_ofiles, M_FILEDESC);
893 }
894 }
895
896 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
897 i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
898 memcpy(newhimap, fdp->fd_himap, i);
899 memset((uint8_t *)newhimap + i, 0,
900 NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
901
902 i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
903 memcpy(newlomap, fdp->fd_lomap, i);
904 memset((uint8_t *)newlomap + i, 0,
905 NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
906
907 if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
908 free(fdp->fd_himap, M_FILEDESC);
909 free(fdp->fd_lomap, M_FILEDESC);
910 }
911 fdp->fd_himap = newhimap;
912 fdp->fd_lomap = newlomap;
913 }
914
915 /*
916 * All other modifications must become globally visible before
917 * the change to fd_nfiles. See fd_getfile().
918 */
919 fdp->fd_ofiles = newofile;
920 membar_producer();
921 fdp->fd_nfiles = numfiles;
922 mutex_exit(&fdp->fd_lock);
923
924 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
925 }
926
927 /*
928 * Create a new open file structure and allocate a file descriptor
929 * for the current process.
930 */
931 int
932 fd_allocfile(file_t **resultfp, int *resultfd)
933 {
934 file_t *fp;
935 proc_t *p;
936 int error;
937
938 p = curproc;
939
940 while ((error = fd_alloc(p, 0, resultfd)) != 0) {
941 if (error != ENOSPC) {
942 return error;
943 }
944 fd_tryexpand(p);
945 }
946
947 fp = pool_cache_get(file_cache, PR_WAITOK);
948 KASSERT(fp->f_count == 0);
949 fp->f_cred = kauth_cred_get();
950 kauth_cred_hold(fp->f_cred);
951
952 if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
953 fd_abort(p, fp, *resultfd);
954 tablefull("file", "increase kern.maxfiles or MAXFILES");
955 return ENFILE;
956 }
957
958 fp->f_advice = 0;
959 fp->f_msgcount = 0;
960 fp->f_offset = 0;
961 fp->f_iflags = 0;
962 *resultfp = fp;
963
964 return 0;
965 }
966
967 /*
968 * Successful creation of a new descriptor: make visible to the process.
969 */
970 void
971 fd_affix(proc_t *p, file_t *fp, unsigned fd)
972 {
973 fdfile_t *ff;
974 filedesc_t *fdp;
975
976 KASSERT(p == curproc || p == &proc0);
977
978 /* Add a reference to the file structure. */
979 mutex_enter(&fp->f_lock);
980 fp->f_count++;
981 mutex_exit(&fp->f_lock);
982
983 /*
984 * Insert the new file into the descriptor slot.
985 *
986 * The memory barriers provided by lock activity in this routine
987 * ensure that any updates to the file structure become globally
988 * visible before the file becomes visible to other LWPs in the
989 * current process.
990 */
991 fdp = p->p_fd;
992 ff = fdp->fd_ofiles[fd];
993
994 KASSERT(ff != NULL);
995 KASSERT(ff->ff_file == NULL);
996 KASSERT(ff->ff_allocated);
997 KASSERT(fd_isused(fdp, fd));
998 KASSERT(fd >= NDFDFILE ||
999 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1000
1001 /* No need to lock in order to make file initially visible. */
1002 ff->ff_file = fp;
1003 }
1004
1005 /*
1006 * Abort creation of a new descriptor: free descriptor slot and file.
1007 */
1008 void
1009 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1010 {
1011 filedesc_t *fdp;
1012 fdfile_t *ff;
1013
1014 KASSERT(p == curproc || p == &proc0);
1015
1016 fdp = p->p_fd;
1017 ff = fdp->fd_ofiles[fd];
1018
1019 KASSERT(fd >= NDFDFILE ||
1020 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1021
1022 mutex_enter(&fdp->fd_lock);
1023 KASSERT(fd_isused(fdp, fd));
1024 fd_unused(fdp, fd);
1025 mutex_exit(&fdp->fd_lock);
1026
1027 if (fp != NULL) {
1028 ffree(fp);
1029 }
1030 }
1031
1032 /*
1033 * Free a file descriptor.
1034 */
1035 void
1036 ffree(file_t *fp)
1037 {
1038
1039 KASSERT(fp->f_count == 0);
1040
1041 atomic_dec_uint(&nfiles);
1042 kauth_cred_free(fp->f_cred);
1043 pool_cache_put(file_cache, fp);
1044 }
1045
1046 /*
1047 * Create an initial cwdinfo structure, using the same current and root
1048 * directories as curproc.
1049 */
1050 struct cwdinfo *
1051 cwdinit(void)
1052 {
1053 struct cwdinfo *cwdi;
1054 struct cwdinfo *copy;
1055
1056 cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
1057 copy = curproc->p_cwdi;
1058
 1059 	rw_enter(&copy->cwdi_lock, RW_READER);
1060 cwdi->cwdi_cdir = copy->cwdi_cdir;
1061 if (cwdi->cwdi_cdir)
1062 VREF(cwdi->cwdi_cdir);
1063 cwdi->cwdi_rdir = copy->cwdi_rdir;
1064 if (cwdi->cwdi_rdir)
1065 VREF(cwdi->cwdi_rdir);
1066 cwdi->cwdi_edir = copy->cwdi_edir;
1067 if (cwdi->cwdi_edir)
1068 VREF(cwdi->cwdi_edir);
1069 cwdi->cwdi_cmask = copy->cwdi_cmask;
1070 cwdi->cwdi_refcnt = 1;
 1071 	rw_exit(&copy->cwdi_lock);
1072
1073 return (cwdi);
1074 }
1075
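/*
 * Pool cache constructor and destructor for cwdinfo structures:
 * initialize and destroy the embedded lock.
 */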
1076 static int
1077 cwdi_ctor(void *arg, void *obj, int flags)
1078 {
1079 struct cwdinfo *cwdi = obj;
1080
1081 rw_init(&cwdi->cwdi_lock);
1082
1083 return 0;
1084 }
1085
1086 static void
1087 cwdi_dtor(void *arg, void *obj)
1088 {
1089 struct cwdinfo *cwdi = obj;
1090
1091 rw_destroy(&cwdi->cwdi_lock);
1092 }
1093
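/*
 * Pool cache constructor and destructor for file structures: zero the
 * object, set up its lock and link it onto the global file list, and
 * undo the same on the way out.
 */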
1094 static int
1095 file_ctor(void *arg, void *obj, int flags)
1096 {
1097 file_t *fp = obj;
1098
1099 memset(fp, 0, sizeof(*fp));
1100 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1101
1102 mutex_enter(&filelist_lock);
1103 LIST_INSERT_HEAD(&filehead, fp, f_list);
1104 mutex_exit(&filelist_lock);
1105
1106 return 0;
1107 }
1108
1109 static void
1110 file_dtor(void *arg, void *obj)
1111 {
1112 file_t *fp = obj;
1113
1114 mutex_enter(&filelist_lock);
1115 LIST_REMOVE(fp, f_list);
1116 mutex_exit(&filelist_lock);
1117
1118 mutex_destroy(&fp->f_lock);
1119 }
1120
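/*
 * Pool cache constructor and destructor for fdfile structures:
 * initialize and destroy the lock and the "closing" condition
 * variable.
 */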
1121 static int
1122 fdfile_ctor(void *arg, void *obj, int flags)
1123 {
1124 fdfile_t *ff = obj;
1125
1126 memset(ff, 0, sizeof(*ff));
1127 mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
1128 cv_init(&ff->ff_closing, "fdclose");
1129
1130 return 0;
1131 }
1132
1133 static void
1134 fdfile_dtor(void *arg, void *obj)
1135 {
1136 fdfile_t *ff = obj;
1137
1138 mutex_destroy(&ff->ff_lock);
1139 cv_destroy(&ff->ff_closing);
1140 }
1141
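/*
 * Allocate and free a dummy file structure.  The dummy is never
 * installed in a descriptor table, is not linked onto the global
 * file list and is not counted against the open file limit.
 */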
1142 file_t *
1143 fgetdummy(void)
1144 {
1145 file_t *fp;
1146
1147 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1148 if (fp != NULL) {
1149 memset(fp, 0, sizeof(*fp));
1150 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1151 }
1152 return fp;
1153 }
1154
1155 void
1156 fputdummy(file_t *fp)
1157 {
1158
1159 mutex_destroy(&fp->f_lock);
1160 kmem_free(fp, sizeof(*fp));
1161 }
1162
1163 /*
1164 * Make p2 share p1's cwdinfo.
1165 */
1166 void
1167 cwdshare(struct proc *p2)
1168 {
1169 struct cwdinfo *cwdi;
1170
1171 cwdi = curproc->p_cwdi;
1172
1173 atomic_inc_uint(&cwdi->cwdi_refcnt);
1174 p2->p_cwdi = cwdi;
1175 }
1176
1177 /*
1178 * Release a cwdinfo structure.
1179 */
1180 void
1181 cwdfree(struct cwdinfo *cwdi)
1182 {
1183
1184 if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1185 return;
1186
1187 vrele(cwdi->cwdi_cdir);
1188 if (cwdi->cwdi_rdir)
1189 vrele(cwdi->cwdi_rdir);
1190 if (cwdi->cwdi_edir)
1191 vrele(cwdi->cwdi_edir);
1192 pool_cache_put(cwdi_cache, cwdi);
1193 }
1194
1195 /*
1196 * Create an initial filedesc structure.
1197 */
1198 filedesc_t *
1199 fd_init(filedesc_t *fdp)
1200 {
1201 unsigned fd;
1202
1203 if (fdp == NULL) {
1204 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1205 } else {
1206 filedesc_ctor(NULL, fdp, PR_WAITOK);
1207 }
1208
1209 fdp->fd_refcnt = 1;
1210 fdp->fd_ofiles = fdp->fd_dfiles;
1211 fdp->fd_nfiles = NDFILE;
1212 fdp->fd_himap = fdp->fd_dhimap;
1213 fdp->fd_lomap = fdp->fd_dlomap;
1214 KASSERT(fdp->fd_lastfile == -1);
1215 KASSERT(fdp->fd_lastkqfile == -1);
1216 KASSERT(fdp->fd_knhash == NULL);
1217
1218 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1219 offsetof(filedesc_t, fd_startzero));
1220 for (fd = 0; fd < NDFDFILE; fd++) {
1221 fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1222 }
1223
1224 return fdp;
1225 }
1226
1227 /*
1228 * Initialize a file descriptor table.
1229 */
1230 static int
1231 filedesc_ctor(void *arg, void *obj, int flag)
1232 {
1233 filedesc_t *fdp = obj;
1234 int i;
1235
1236 memset(fdp, 0, sizeof(*fdp));
1237 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1238 fdp->fd_lastfile = -1;
1239 fdp->fd_lastkqfile = -1;
1240
1241 KASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1242 for (i = 0; i < NDFDFILE; i++) {
1243 fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1244 }
1245
1246 return 0;
1247 }
1248
1249 static void
1250 filedesc_dtor(void *arg, void *obj)
1251 {
1252 filedesc_t *fdp = obj;
1253 int i;
1254
1255 for (i = 0; i < NDFDFILE; i++) {
1256 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1257 }
1258
1259 mutex_destroy(&fdp->fd_lock);
1260 }
1261
1262 /*
1263 * Make p2 share p1's filedesc structure.
1264 */
1265 void
1266 fd_share(struct proc *p2)
1267 {
1268 filedesc_t *fdp;
1269
1270 fdp = curlwp->l_fd;
1271 p2->p_fd = fdp;
1272 atomic_inc_uint(&fdp->fd_refcnt);
1273 }
1274
1275 /*
1276 * Copy a filedesc structure.
1277 */
1278 filedesc_t *
1279 fd_copy(void)
1280 {
1281 filedesc_t *newfdp, *fdp;
1282 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1283 int i, nused, numfiles, lastfile, j, newlast;
1284 file_t *fp;
1285
1286 fdp = curproc->p_fd;
1287 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1288 newfdp->fd_refcnt = 1;
1289
1290 KASSERT(newfdp->fd_knhash == NULL);
1291 KASSERT(newfdp->fd_knhashmask == 0);
1292 KASSERT(newfdp->fd_discard == NULL);
1293
1294 for (;;) {
1295 numfiles = fdp->fd_nfiles;
1296 lastfile = fdp->fd_lastfile;
1297
1298 /*
1299 * If the number of open files fits in the internal arrays
1300 * of the open file structure, use them, otherwise allocate
1301 * additional memory for the number of descriptors currently
1302 * in use.
1303 */
1304 if (lastfile < NDFILE) {
1305 i = NDFILE;
1306 newfdp->fd_ofiles = newfdp->fd_dfiles;
1307 } else {
1308 /*
1309 * Compute the smallest multiple of NDEXTENT needed
1310 * for the file descriptors currently in use,
1311 * allowing the table to shrink.
1312 */
1313 i = numfiles;
1314 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1315 i /= 2;
1316 }
1317 newfdp->fd_ofiles = malloc(i * sizeof(fdfile_t *),
1318 M_FILEDESC, M_WAITOK);
1319 KASSERT(i >= NDFILE);
1320 }
1321 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1322 newfdp->fd_himap = newfdp->fd_dhimap;
1323 newfdp->fd_lomap = newfdp->fd_dlomap;
1324 } else {
1325 newfdp->fd_himap = malloc(NDHISLOTS(i) *
1326 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1327 newfdp->fd_lomap = malloc(NDLOSLOTS(i) *
1328 sizeof(uint32_t), M_FILEDESC, M_WAITOK);
1329 }
1330
1331 /*
1332 * Allocate and string together fdfile structures.
1333 * We abuse fdfile_t::ff_file here, but it will be
1334 * cleared before this routine returns.
1335 */
1336 nused = fdp->fd_nused;
1337 fflist = NULL;
1338 for (j = nused; j != 0; j--) {
1339 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1340 ff->ff_file = (void *)fflist;
1341 fflist = ff;
1342 }
1343
1344 mutex_enter(&fdp->fd_lock);
1345 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1346 lastfile == fdp->fd_lastfile) {
1347 break;
1348 }
1349 mutex_exit(&fdp->fd_lock);
1350 if (i >= NDFILE) {
1351 free(newfdp->fd_ofiles, M_FILEDESC);
1352 }
1353 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1354 free(newfdp->fd_himap, M_FILEDESC);
1355 free(newfdp->fd_lomap, M_FILEDESC);
1356 }
1357 while (fflist != NULL) {
1358 ff = fflist;
1359 fflist = (void *)ff->ff_file;
1360 ff->ff_file = NULL;
1361 pool_cache_put(fdfile_cache, ff);
1362 }
1363 }
1364
1365 newfdp->fd_nfiles = i;
1366 newfdp->fd_freefile = fdp->fd_freefile;
1367 newfdp->fd_exclose = fdp->fd_exclose;
1368
1369 /*
1370 * Clear the entries that will not be copied over.
1371 * Avoid calling memset with 0 size.
1372 */
1373 if (lastfile < (i-1)) {
1374 memset(newfdp->fd_ofiles + lastfile + 1, 0,
1375 (i - lastfile - 1) * sizeof(file_t **));
1376 }
1377 if (i < NDENTRIES * NDENTRIES) {
1378 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1379 }
1380 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1381 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1382
1383 ffp = fdp->fd_ofiles;
1384 nffp = newfdp->fd_ofiles;
1385 j = imax(lastfile, (NDFDFILE - 1));
1386 newlast = -1;
1387 KASSERT(j < fdp->fd_nfiles);
1388 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1389 ff = *ffp;
1390 /* Install built-in fdfiles even if unused here. */
1391 if (i < NDFDFILE) {
1392 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1393 } else {
1394 ff2 = NULL;
1395 }
1396 /* Determine if descriptor is active in parent. */
1397 if (ff == NULL || !fd_isused(fdp, i)) {
1398 KASSERT(ff != NULL || i >= NDFDFILE);
1399 continue;
1400 }
1401 mutex_enter(&ff->ff_lock);
1402 fp = ff->ff_file;
1403 if (fp == NULL) {
1404 /* Descriptor is half-open: free slot. */
1405 fd_zap(newfdp, i);
1406 mutex_exit(&ff->ff_lock);
1407 continue;
1408 }
1409 if (fp->f_type == DTYPE_KQUEUE) {
1410 /* kqueue descriptors cannot be copied. */
1411 fd_zap(newfdp, i);
1412 mutex_exit(&ff->ff_lock);
1413 continue;
1414 }
1415 /* It's active: add a reference to the file. */
1416 mutex_enter(&fp->f_lock);
1417 fp->f_count++;
1418 mutex_exit(&fp->f_lock);
1419 /* Consume one fdfile_t to represent it. */
1420 if (i >= NDFDFILE) {
1421 ff2 = fflist;
1422 fflist = (void *)ff2->ff_file;
1423 }
1424 ff2->ff_file = fp;
1425 ff2->ff_exclose = ff->ff_exclose;
1426 ff2->ff_allocated = 1;
1427 mutex_exit(&ff->ff_lock);
1428 if (i > newlast) {
1429 newlast = i;
1430 }
1431 }
1432 mutex_exit(&fdp->fd_lock);
1433
1434 /* Discard unused fdfile_t structures. */
1435 while (__predict_false(fflist != NULL)) {
1436 ff = fflist;
1437 fflist = (void *)ff->ff_file;
1438 ff->ff_file = NULL;
1439 pool_cache_put(fdfile_cache, ff);
1440 nused--;
1441 }
1442 KASSERT(nused >= 0);
1443 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1444
1445 newfdp->fd_nused = nused;
1446 newfdp->fd_lastfile = newlast;
1447
1448 return (newfdp);
1449 }
1450
1451 /*
1452 * Release a filedesc structure.
1453 */
1454 void
1455 fd_free(void)
1456 {
1457 filedesc_t *fdp;
1458 fdfile_t *ff;
1459 file_t *fp;
1460 int fd, lastfd;
1461 void *discard;
1462
1463 fdp = curlwp->l_fd;
1464
1465 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1466
1467 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1468 return;
1469
1470 /*
1471 * Close any files that the process holds open.
1472 */
1473 for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1474 ff = fdp->fd_ofiles[fd];
1475 KASSERT(fd >= NDFDFILE ||
1476 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1477 if ((ff = fdp->fd_ofiles[fd]) == NULL)
1478 continue;
1479 if ((fp = ff->ff_file) != NULL) {
1480 /*
1481 * Must use fd_close() here as kqueue holds
1482 * long term references to descriptors.
1483 */
1484 ff->ff_refcnt++;
1485 fd_close(fd);
1486 }
1487 KASSERT(ff->ff_refcnt == 0);
1488 KASSERT(ff->ff_file == NULL);
1489 KASSERT(!ff->ff_exclose);
1490 KASSERT(!ff->ff_allocated);
1491 if (fd >= NDFDFILE) {
1492 pool_cache_put(fdfile_cache, ff);
1493 }
1494 }
1495
1496 /*
1497 * Clean out the descriptor table for the next user and return
1498 * to the cache.
1499 */
1500 while ((discard = fdp->fd_discard) != NULL) {
1501 KASSERT(discard != fdp->fd_ofiles);
1502 fdp->fd_discard = *(void **)discard;
1503 free(discard, M_FILEDESC);
1504 }
1505 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1506 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1507 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1508 free(fdp->fd_himap, M_FILEDESC);
1509 free(fdp->fd_lomap, M_FILEDESC);
1510 }
1511 if (fdp->fd_nfiles > NDFILE) {
1512 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1513 free(fdp->fd_ofiles, M_FILEDESC);
1514 }
1515 if (fdp->fd_knhash != NULL) {
1516 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1517 fdp->fd_knhash = NULL;
1518 fdp->fd_knhashmask = 0;
1519 } else {
1520 KASSERT(fdp->fd_knhashmask == 0);
1521 }
1522 fdp->fd_lastkqfile = -1;
1523 pool_cache_put(filedesc_cache, fdp);
1524 }
1525
1526 /*
1527 * File Descriptor pseudo-device driver (/dev/fd/).
1528 *
1529 * Opening minor device N dup()s the file (if any) connected to file
1530 * descriptor N belonging to the calling process. Note that this driver
1531 * consists of only the ``open()'' routine, because all subsequent
1532 * references to this file will be direct to the other driver.
1533 */
1534 static int
1535 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1536 {
1537
1538 /*
 1539 	 * XXX Kludge: set dupfd to contain the value of the file
 1540 	 * descriptor being sought for duplication.  The error
1541 * return ensures that the vnode for this device will be released
1542 * by vn_open. Open will detect this special error and take the
1543 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1544 * will simply report the error.
1545 */
1546 l->l_dupfd = minor(dev); /* XXX */
1547 return EDUPFD;
1548 }
1549
1550 /*
1551 * Duplicate the specified descriptor to a free descriptor.
1552 */
1553 int
1554 fd_dupopen(int old, int *new, int mode, int error)
1555 {
1556 filedesc_t *fdp;
1557 fdfile_t *ff;
1558 file_t *fp;
1559
1560 if ((fp = fd_getfile(old)) == NULL) {
1561 return EBADF;
1562 }
1563 fdp = curlwp->l_fd;
1564 ff = fdp->fd_ofiles[old];
1565
1566 /*
1567 * There are two cases of interest here.
1568 *
1569 * For EDUPFD simply dup (dfd) to file descriptor
1570 * (indx) and return.
1571 *
1572 * For EMOVEFD steal away the file structure from (dfd) and
1573 * store it in (indx). (dfd) is effectively closed by
1574 * this operation.
1575 *
1576 * Any other error code is just returned.
1577 */
1578 switch (error) {
1579 case EDUPFD:
1580 /*
1581 * Check that the mode the file is being opened for is a
1582 * subset of the mode of the existing descriptor.
1583 */
1584 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1585 error = EACCES;
1586 break;
1587 }
1588
1589 /* Copy it. */
1590 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1591 break;
1592
1593 case EMOVEFD:
1594 /* Copy it. */
1595 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1596 if (error != 0) {
1597 break;
1598 }
1599
1600 /* Steal away the file pointer from 'old'. */
1601 (void)fd_close(old);
1602 return 0;
1603 }
1604
1605 fd_putfile(old);
1606 return error;
1607 }
1608
1609 /*
1610 * Close open files on exec.
1611 */
1612 void
1613 fd_closeexec(void)
1614 {
1615 struct cwdinfo *cwdi;
1616 proc_t *p;
1617 filedesc_t *fdp;
1618 fdfile_t *ff;
1619 lwp_t *l;
1620 int fd;
1621
1622 l = curlwp;
1623 p = l->l_proc;
1624 fdp = p->p_fd;
1625 cwdi = p->p_cwdi;
1626
1627 if (cwdi->cwdi_refcnt > 1) {
1628 cwdi = cwdinit();
1629 cwdfree(p->p_cwdi);
1630 p->p_cwdi = cwdi;
1631 }
1632 if (p->p_cwdi->cwdi_edir) {
1633 vrele(p->p_cwdi->cwdi_edir);
1634 }
1635
1636 if (fdp->fd_refcnt > 1) {
1637 fdp = fd_copy();
1638 fd_free();
1639 p->p_fd = fdp;
1640 l->l_fd = fdp;
1641 }
1642 if (!fdp->fd_exclose) {
1643 return;
1644 }
1645 fdp->fd_exclose = 0;
1646
1647 for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1648 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1649 KASSERT(fd >= NDFDFILE);
1650 continue;
1651 }
1652 KASSERT(fd >= NDFDFILE ||
1653 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1654 if (ff->ff_file == NULL)
1655 continue;
1656 if (ff->ff_exclose) {
1657 /*
1658 * We need a reference to close the file.
1659 * No other threads can see the fdfile_t at
1660 * this point, so don't bother locking.
1661 */
1662 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1663 ff->ff_refcnt++;
1664 fd_close(fd);
1665 }
1666 }
1667 }
1668
1669 /*
1670 * It is unsafe for set[ug]id processes to be started with file
1671 * descriptors 0..2 closed, as these descriptors are given implicit
 1672  * significance in the Standard C library.  fd_checkstd() will create a
1673 * descriptor referencing /dev/null for each of stdin, stdout, and
1674 * stderr that is not already open.
1675 */
1676 #define CHECK_UPTO 3
1677 int
1678 fd_checkstd(void)
1679 {
1680 struct proc *p;
1681 struct nameidata nd;
1682 filedesc_t *fdp;
1683 file_t *fp;
1684 struct proc *pp;
1685 int fd, i, error, flags = FREAD|FWRITE;
1686 char closed[CHECK_UPTO * 3 + 1], which[3 + 1];
1687
1688 p = curproc;
1689 closed[0] = '\0';
1690 if ((fdp = p->p_fd) == NULL)
1691 return (0);
1692 for (i = 0; i < CHECK_UPTO; i++) {
1693 KASSERT(i >= NDFDFILE ||
1694 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
1695 if (fdp->fd_ofiles[i]->ff_file != NULL)
1696 continue;
1697 snprintf(which, sizeof(which), ",%d", i);
1698 strlcat(closed, which, sizeof(closed));
1699 if ((error = fd_allocfile(&fp, &fd)) != 0)
1700 return (error);
1701 KASSERT(fd < CHECK_UPTO);
1702 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
1703 if ((error = vn_open(&nd, flags, 0)) != 0) {
1704 fd_abort(p, fp, fd);
1705 return (error);
1706 }
1707 fp->f_data = nd.ni_vp;
1708 fp->f_flag = flags;
1709 fp->f_ops = &vnops;
1710 fp->f_type = DTYPE_VNODE;
1711 VOP_UNLOCK(nd.ni_vp, 0);
1712 fd_affix(p, fp, fd);
1713 }
1714 if (closed[0] != '\0') {
1715 mutex_enter(proc_lock);
1716 pp = p->p_pptr;
1717 mutex_enter(pp->p_lock);
1718 log(LOG_WARNING, "set{u,g}id pid %d (%s) "
1719 "was invoked by uid %d ppid %d (%s) "
1720 "with fd %s closed\n",
1721 p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
1722 pp->p_pid, pp->p_comm, &closed[1]);
1723 mutex_exit(pp->p_lock);
1724 mutex_exit(proc_lock);
1725 }
1726 return (0);
1727 }
1728 #undef CHECK_UPTO
1729
1730 /*
1731 * Sets descriptor owner. If the owner is a process, 'pgid'
1732 * is set to positive value, process ID. If the owner is process group,
1733 * 'pgid' is set to -pg_id.
1734 */
1735 int
1736 fsetown(pid_t *pgid, int cmd, const void *data)
1737 {
1738 int id = *(const int *)data;
1739 int error;
1740
1741 switch (cmd) {
1742 case TIOCSPGRP:
1743 if (id < 0)
1744 return (EINVAL);
1745 id = -id;
1746 break;
1747 default:
1748 break;
1749 }
1750
1751 if (id > 0 && !pfind(id))
1752 return (ESRCH);
1753 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1754 return (error);
1755
1756 *pgid = id;
1757 return (0);
1758 }
1759
1760 /*
1761 * Return descriptor owner information. If the value is positive,
1762 * it's process ID. If it's negative, it's process group ID and
1763 * needs the sign removed before use.
1764 */
1765 int
1766 fgetown(pid_t pgid, int cmd, void *data)
1767 {
1768
1769 switch (cmd) {
1770 case TIOCGPGRP:
1771 *(int *)data = -pgid;
1772 break;
1773 default:
1774 *(int *)data = pgid;
1775 break;
1776 }
1777 return (0);
1778 }
1779
1780 /*
1781 * Send signal to descriptor owner, either process or process group.
1782 */
1783 void
1784 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1785 {
1786 struct proc *p1;
1787 struct pgrp *pgrp;
1788 ksiginfo_t ksi;
1789
1790 KASSERT(!cpu_intr_p());
1791
1792 KSI_INIT(&ksi);
1793 ksi.ksi_signo = signo;
1794 ksi.ksi_code = code;
1795 ksi.ksi_band = band;
1796
1797 mutex_enter(proc_lock);
1798 if (pgid > 0 && (p1 = p_find(pgid, PFIND_LOCKED)))
1799 kpsignal(p1, &ksi, fdescdata);
1800 else if (pgid < 0 && (pgrp = pg_find(-pgid, PFIND_LOCKED)))
1801 kpgsignal(pgrp, &ksi, fdescdata, 0);
1802 mutex_exit(proc_lock);
1803 }
1804
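/*
 * Hand a new file and descriptor over to a "cloning" open: install
 * the supplied flags, operations and private data in the file,
 * record the descriptor in l_dupfd and return EMOVEFD so that the
 * open path completes the operation (see fd_dupopen()).
 */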
1805 int
1806 fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
1807 void *data)
1808 {
1809
1810 fp->f_flag = flag;
1811 fp->f_type = DTYPE_MISC;
1812 fp->f_ops = fops;
1813 fp->f_data = data;
1814 curlwp->l_dupfd = fd;
1815 fd_affix(curproc, fp, fd);
1816
1817 return EMOVEFD;
1818 }
1819
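/*
 * Default "null" file operations for files that need no specific
 * handling: fcntl accepts only F_SETFL, poll reports no events
 * ready, and kqfilter accepts the knote without doing anything.
 */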
1820 int
1821 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1822 {
1823
1824 if (cmd == F_SETFL)
1825 return 0;
1826
1827 return EOPNOTSUPP;
1828 }
1829
1830 int
1831 fnullop_poll(file_t *fp, int which)
1832 {
1833
1834 return 0;
1835 }
1836
1837 int
1838 fnullop_kqfilter(file_t *fp, struct knote *kn)
1839 {
1840
1841 return 0;
1842 }
1843
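/*
 * "Bad" file operations, for use where an operation does not make
 * sense for the file type: each fails with EOPNOTSUPP.
 */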
1844 int
1845 fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
1846 kauth_cred_t cred, int flags)
1847 {
1848
1849 return EOPNOTSUPP;
1850 }
1851
1852 int
1853 fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
1854 kauth_cred_t cred, int flags)
1855 {
1856
1857 return EOPNOTSUPP;
1858 }
1859
1860 int
1861 fbadop_ioctl(file_t *fp, u_long com, void *data)
1862 {
1863
1864 return EOPNOTSUPP;
1865 }
1866
1867 int
1868 fbadop_stat(file_t *fp, struct stat *sb)
1869 {
1870
1871 return EOPNOTSUPP;
1872 }
1873
1874 int
1875 fbadop_close(file_t *fp)
1876 {
1877
1878 return EOPNOTSUPP;
1879 }
1880