/*	$NetBSD: vfs_vnops.c,v 1.239 2023/04/22 13:52:46 riastradh Exp $	*/

/*-
 * Copyright (c) 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.14 (Berkeley) 6/15/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.239 2023/04/22 13:52:46 riastradh Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/fstrans.h>
#include <sys/atomic.h>
#include <sys/filedesc.h>
#include <sys/wapbl.h>
#include <sys/mman.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/fifofs/fifo.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#ifdef UNION
#include <fs/union/union.h>
#endif

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);

#include <sys/verified_exec.h>

static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
    kauth_cred_t cred, int flags);
static int vn_closefile(file_t *fp);
static int vn_poll(file_t *fp, int events);
static int vn_fcntl(file_t *fp, u_int com, void *data);
static int vn_statfile(file_t *fp, struct stat *sb);
static int vn_ioctl(file_t *fp, u_long com, void *data);
static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int vn_seek(struct file *, off_t, int, off_t *, int);
static int vn_advlock(struct file *, void *, int, struct flock *, int);

const struct fileops vnops = {
        .fo_name = "vn",
        .fo_read = vn_read,
        .fo_write = vn_write,
        .fo_ioctl = vn_ioctl,
        .fo_fcntl = vn_fcntl,
        .fo_poll = vn_poll,
        .fo_stat = vn_statfile,
        .fo_close = vn_closefile,
        .fo_kqfilter = vn_kqfilter,
        .fo_restart = fnullop_restart,
        .fo_mmap = vn_mmap,
        .fo_seek = vn_seek,
        .fo_advlock = vn_advlock,
};
/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * at_dvp is the directory for openat(), if any.
 * pb is the path.
 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
 * fmode is the open flags, converted from O_* to F*
 * cmode is the creation file permissions.
 *
 * XXX shouldn't cmode be mode_t?
 *
 * On success produces either a locked vnode in *ret_vp, or NULL in
 * *ret_vp and a file descriptor number in *ret_fd.
 *
 * The caller may pass NULL for ret_fd (and ret_domove), in which case
 * EOPNOTSUPP will be produced in the cases that would otherwise return
 * a file descriptor.
 *
 * Note that callers that want no-follow behavior should pass
 * O_NOFOLLOW in fmode. Neither FOLLOW nor NOFOLLOW in nmode is
 * honored.
 */
int
vn_open(struct vnode *at_dvp, struct pathbuf *pb,
    int nmode, int fmode, int cmode,
    struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
{
        struct nameidata nd;
        struct vnode *vp = NULL;
        struct lwp *l = curlwp;
        kauth_cred_t cred = l->l_cred;
        struct vattr va;
        int error;
        const char *pathstring;

        KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);

        KASSERT(ret_vp != NULL);
        KASSERT((ret_domove == NULL) == (ret_fd == NULL));

        if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
                return EINVAL;

        NDINIT(&nd, LOOKUP, nmode, pb);
        if (at_dvp != NULL)
                NDAT(&nd, at_dvp);

        nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;

        if (fmode & O_CREAT) {
                nd.ni_cnd.cn_nameiop = CREATE;
                nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
                if ((fmode & O_EXCL) == 0 &&
                    ((fmode & O_NOFOLLOW) == 0))
                        nd.ni_cnd.cn_flags |= FOLLOW;
                if ((fmode & O_EXCL) == 0)
                        nd.ni_cnd.cn_flags |= NONEXCLHACK;
        } else {
                nd.ni_cnd.cn_nameiop = LOOKUP;
                nd.ni_cnd.cn_flags |= LOCKLEAF;
                if ((fmode & O_NOFOLLOW) == 0)
                        nd.ni_cnd.cn_flags |= FOLLOW;
        }

        pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
        if (pathstring == NULL) {
                return ENOMEM;
        }

        /*
         * When this "interface" was exposed to do_open() it used
         * to initialize l_dupfd to -newfd-1 (thus passing in the
         * new file handle number to use)... but nothing in the
         * kernel uses that value. So just send 0.
         */
        l->l_dupfd = 0;

        error = namei(&nd);
        if (error)
                goto out;

        vp = nd.ni_vp;

#if NVERIEXEC > 0
        error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
        if (error) {
                /* We have to release the locks ourselves */
                /*
                 * 20210604 dholland passing NONEXCLHACK means we can
                 * get ni_dvp == NULL back if ni_vp exists, and we should
                 * treat that like the non-O_CREAT case.
                 */
                if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
                        if (vp == NULL) {
                                vput(nd.ni_dvp);
                        } else {
                                VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
                                if (nd.ni_dvp == nd.ni_vp)
                                        vrele(nd.ni_dvp);
                                else
                                        vput(nd.ni_dvp);
                                nd.ni_dvp = NULL;
                                vput(vp);
                                vp = NULL;
                        }
                } else {
                        vput(vp);
                        vp = NULL;
                }
                goto out;
        }
#endif /* NVERIEXEC > 0 */

        /*
         * 20210604 dholland ditto
         */
        if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
                if (nd.ni_vp == NULL) {
                        vattr_null(&va);
                        va.va_type = VREG;
                        va.va_mode = cmode;
                        if (fmode & O_EXCL)
                                va.va_vaflags |= VA_EXCLUSIVE;
                        error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
                            &nd.ni_cnd, &va);
                        if (error) {
                                vput(nd.ni_dvp);
                                goto out;
                        }
                        fmode &= ~O_TRUNC;
                        vp = nd.ni_vp;
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                        vput(nd.ni_dvp);
                } else {
                        VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
                        if (nd.ni_dvp == nd.ni_vp)
                                vrele(nd.ni_dvp);
                        else
                                vput(nd.ni_dvp);
                        nd.ni_dvp = NULL;
                        vp = nd.ni_vp;
                        if (fmode & O_EXCL) {
                                error = EEXIST;
                                goto bad;
                        }
                        fmode &= ~O_CREAT;
                }
        } else if ((fmode & O_CREAT) != 0) {
                /*
                 * 20210606 dholland passing NONEXCLHACK means this
                 * case exists; it is the same as the following one
                 * but also needs to do things in the second (exists)
                 * half of the following block. (Besides handle
                 * ni_dvp, anyway.)
                 */
                vp = nd.ni_vp;
                KASSERT((fmode & O_EXCL) == 0);
                fmode &= ~O_CREAT;
        } else {
                vp = nd.ni_vp;
        }
        if (vp->v_type == VSOCK) {
                error = EOPNOTSUPP;
                goto bad;
        }
        if (nd.ni_vp->v_type == VLNK) {
                error = EFTYPE;
                goto bad;
        }

        if ((fmode & O_CREAT) == 0) {
                error = vn_openchk(vp, cred, fmode);
                if (error != 0)
                        goto bad;
        }

        if (fmode & O_TRUNC) {
                vattr_null(&va);
                va.va_size = 0;
                error = VOP_SETATTR(vp, &va, cred);
                if (error != 0)
                        goto bad;
        }
        if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
                goto bad;
        if (fmode & FWRITE) {
                mutex_enter(vp->v_interlock);
                vp->v_writecount++;
                mutex_exit(vp->v_interlock);
        }

bad:
        if (error) {
                vput(vp);
                vp = NULL;
        }
out:
        pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);

        switch (error) {
        case EDUPFD:
        case EMOVEFD:
                /* if the caller isn't prepared to handle fds, fail for them */
                if (ret_fd == NULL) {
                        error = EOPNOTSUPP;
                        break;
                }
                *ret_vp = NULL;
                *ret_domove = error == EMOVEFD;
                *ret_fd = l->l_dupfd;
                error = 0;
                break;
        case 0:
                KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
                *ret_vp = vp;
                break;
        }
        l->l_dupfd = 0;
        return error;
}
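
/*
 * Illustrative sketch (not used by the kernel): a typical in-kernel
 * caller of vn_open() builds a pathbuf, opens for read, and later
 * closes with vn_close().  The path below is a hypothetical example
 * and error handling is abbreviated.
 *
 *	struct pathbuf *pb;
 *	struct vnode *vp;
 *	int error;
 *
 *	pb = pathbuf_create("/etc/example.conf");
 *	if (pb == NULL)
 *		return ENOMEM;
 *	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
 *	pathbuf_destroy(pb);
 *	if (error)
 *		return error;
 *	...			(use vp; it is returned locked)
 *	VOP_UNLOCK(vp);		(vn_close() expects it unlocked)
 *	error = vn_close(vp, FREAD, kauth_cred_get());
 */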

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(struct vnode *vp)
{

        /*
         * If the vnode is in use as a process's text,
         * we can't allow writing.
         */
        if (vp->v_iflag & VI_TEXT)
                return ETXTBSY;
        return 0;
}

int
vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
{
        int permbits = 0;
        int error;

        if (vp->v_type == VNON || vp->v_type == VBAD)
                return ENXIO;

        if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
                return ENOTDIR;

        if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
                return EFTYPE;

        if ((fflags & FREAD) != 0) {
                permbits = VREAD;
        }
        if ((fflags & FEXEC) != 0) {
                permbits |= VEXEC;
        }
        if ((fflags & (FWRITE | O_TRUNC)) != 0) {
                permbits |= VWRITE;
                if (vp->v_type == VDIR) {
                        error = EISDIR;
                        goto bad;
                }
                error = vn_writechk(vp);
                if (error != 0)
                        goto bad;
        }
        error = VOP_ACCESS(vp, permbits, cred);
bad:
        return error;
}

/*
 * Mark a vnode as having executable mappings.
 */
void
vn_markexec(struct vnode *vp)
{

        if ((vp->v_iflag & VI_EXECMAP) != 0) {
                /* Safe unlocked, as long as caller holds a reference. */
                return;
        }

        rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
        mutex_enter(vp->v_interlock);
        if ((vp->v_iflag & VI_EXECMAP) == 0) {
                cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
                vp->v_iflag |= VI_EXECMAP;
        }
        mutex_exit(vp->v_interlock);
        rw_exit(vp->v_uobj.vmobjlock);
}

/*
 * Mark a vnode as being the text of a process.
 * Fail if the vnode is currently writable.
 */
int
vn_marktext(struct vnode *vp)
{

        if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
                /* Safe unlocked, as long as caller holds a reference. */
                return 0;
        }

        rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
        mutex_enter(vp->v_interlock);
        if (vp->v_writecount != 0) {
                KASSERT((vp->v_iflag & VI_TEXT) == 0);
                mutex_exit(vp->v_interlock);
                rw_exit(vp->v_uobj.vmobjlock);
                return ETXTBSY;
        }
        if ((vp->v_iflag & VI_EXECMAP) == 0) {
                cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
        }
        vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
        mutex_exit(vp->v_interlock);
        rw_exit(vp->v_uobj.vmobjlock);
        return 0;
}

/*
 * Vnode close call
 *
 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
 */
int
vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
{
        int error;

        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        if (flags & FWRITE) {
                mutex_enter(vp->v_interlock);
                KASSERT(vp->v_writecount > 0);
                vp->v_writecount--;
                mutex_exit(vp->v_interlock);
        }
        error = VOP_CLOSE(vp, flags, cred);
        vput(vp);
        return error;
}

static int
enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
{
        struct lwp *l = curlwp;
        off_t testoff;

        if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
                return 0;

        KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
        if (ioflag & IO_APPEND)
                testoff = vp->v_size;
        else
                testoff = uio->uio_offset;

        if (testoff + uio->uio_resid >
            l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                mutex_enter(&proc_lock);
                psignal(l->l_proc, SIGXFSZ);
                mutex_exit(&proc_lock);
                return EFBIG;
        }

        return 0;
}

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
    enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
    struct lwp *l)
{
        struct uio auio;
        struct iovec aiov;
        int error;

        if ((ioflg & IO_NODELOCKED) == 0) {
                if (rw == UIO_READ) {
                        vn_lock(vp, LK_SHARED | LK_RETRY);
                } else /* UIO_WRITE */ {
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                }
        }
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        aiov.iov_base = base;
        aiov.iov_len = len;
        auio.uio_resid = len;
        auio.uio_offset = offset;
        auio.uio_rw = rw;
        if (segflg == UIO_SYSSPACE) {
                UIO_SETUP_SYSSPACE(&auio);
        } else {
                auio.uio_vmspace = l->l_proc->p_vmspace;
        }

        if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
                goto out;

        if (rw == UIO_READ) {
                error = VOP_READ(vp, &auio, ioflg, cred);
        } else {
                error = VOP_WRITE(vp, &auio, ioflg, cred);
        }

        if (aresid)
                *aresid = auio.uio_resid;
        else
                if (auio.uio_resid && error == 0)
                        error = EIO;

out:
        if ((ioflg & IO_NODELOCKED) == 0) {
                VOP_UNLOCK(vp);
        }
        return error;
}
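
/*
 * Illustrative sketch: reading the first bytes of a file into a
 * kernel buffer with vn_rdwr(), assuming the caller holds a
 * reference to vp but not its lock (so IO_NODELOCKED is not set):
 *
 *	char buf[512];
 *	size_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0,
 *	    UIO_SYSSPACE, 0, kauth_cred_get(), &resid, curlwp);
 *	if (error == 0)
 *		...	(sizeof(buf) - resid bytes were read)
 */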

int
vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
    struct lwp *l, off_t **cookies, int *ncookies)
{
        struct vnode *vp = fp->f_vnode;
        struct iovec aiov;
        struct uio auio;
        int error, eofflag;

        /* Limit the size on any kernel buffers used by VOP_READDIR */
        count = uimin(MAXBSIZE, count);

unionread:
        if (vp->v_type != VDIR)
                return EINVAL;
        aiov.iov_base = bf;
        aiov.iov_len = count;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_READ;
        if (segflg == UIO_SYSSPACE) {
                UIO_SETUP_SYSSPACE(&auio);
        } else {
                KASSERT(l == curlwp);
                auio.uio_vmspace = l->l_proc->p_vmspace;
        }
        auio.uio_resid = count;
        vn_lock(vp, LK_SHARED | LK_RETRY);
        mutex_enter(&fp->f_lock);
        auio.uio_offset = fp->f_offset;
        mutex_exit(&fp->f_lock);
        error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
            ncookies);
        mutex_enter(&fp->f_lock);
        fp->f_offset = auio.uio_offset;
        mutex_exit(&fp->f_lock);
        VOP_UNLOCK(vp);
        if (error)
                return error;

        if (count == auio.uio_resid && vn_union_readdir_hook) {
                struct vnode *ovp = vp;

                error = (*vn_union_readdir_hook)(&vp, fp, l);
                if (error)
                        return error;
                if (vp != ovp)
                        goto unionread;
        }

        if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
            (vp->v_mount->mnt_flag & MNT_UNION)) {
                struct vnode *tvp = vp;
                vp = vp->v_mount->mnt_vnodecovered;
                vref(vp);
                mutex_enter(&fp->f_lock);
                fp->f_vnode = vp;
                fp->f_offset = 0;
                mutex_exit(&fp->f_lock);
                vrele(tvp);
                goto unionread;
        }
        *done = count - auio.uio_resid;
        return error;
}

/*
 * File table vnode read routine.
 */
static int
vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
        struct vnode *vp = fp->f_vnode;
        int error, ioflag, fflag;
        size_t count;

        ioflag = IO_ADV_ENCODE(fp->f_advice);
        fflag = fp->f_flag;
        if (fflag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
                ioflag |= IO_SYNC;
        if (fflag & FALTIO)
                ioflag |= IO_ALTSEMANTICS;
        if (fflag & FDIRECT)
                ioflag |= IO_DIRECT;
        if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        else
                vn_lock(vp, LK_SHARED | LK_RETRY);
        if (__predict_false(vp->v_type == VDIR) &&
            offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
                mutex_enter(&fp->f_lock);
        uio->uio_offset = *offset;
        if (__predict_false(vp->v_type == VDIR) &&
            offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) == 0)
                mutex_exit(&fp->f_lock);
        count = uio->uio_resid;
        error = VOP_READ(vp, uio, ioflag, cred);
        if (flags & FOF_UPDATE_OFFSET)
                *offset += count - uio->uio_resid;
        VOP_UNLOCK(vp);
        return error;
}

/*
 * File table vnode write routine.
 */
static int
vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
    int flags)
{
        struct vnode *vp = fp->f_vnode;
        int error, ioflag, fflag;
        size_t count;

        ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
        fflag = fp->f_flag;
        if (vp->v_type == VREG && (fflag & O_APPEND))
                ioflag |= IO_APPEND;
        if (fflag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if (fflag & FFSYNC ||
            (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
                ioflag |= IO_SYNC;
        else if (fflag & FDSYNC)
                ioflag |= IO_DSYNC;
        if (fflag & FALTIO)
                ioflag |= IO_ALTSEMANTICS;
        if (fflag & FDIRECT)
                ioflag |= IO_DIRECT;
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        uio->uio_offset = *offset;
        count = uio->uio_resid;

        if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
                goto out;

        error = VOP_WRITE(vp, uio, ioflag, cred);

        if (flags & FOF_UPDATE_OFFSET) {
                if (ioflag & IO_APPEND) {
                        /*
                         * SUSv3 describes the behaviour for count = 0 as
                         * follows: "Before any action ... is taken, and if
                         * nbyte is zero and the file is a regular file, the
                         * write() function ... in the absence of errors ...
                         * shall return zero and have no other results."
                         */
                        if (count)
                                *offset = uio->uio_offset;
                } else
                        *offset += count - uio->uio_resid;
        }

out:
        VOP_UNLOCK(vp);
        return error;
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(file_t *fp, struct stat *sb)
{
        struct vnode *vp = fp->f_vnode;
        int error;

        vn_lock(vp, LK_SHARED | LK_RETRY);
        error = vn_stat(vp, sb);
        VOP_UNLOCK(vp);
        return error;
}

int
vn_stat(struct vnode *vp, struct stat *sb)
{
        struct vattr va;
        int error;
        mode_t mode;

        memset(&va, 0, sizeof(va));
        error = VOP_GETATTR(vp, &va, kauth_cred_get());
        if (error)
                return error;
        /*
         * Copy from vattr table
         */
        memset(sb, 0, sizeof(*sb));
        sb->st_dev = va.va_fsid;
        sb->st_ino = va.va_fileid;
        mode = va.va_mode;
        switch (vp->v_type) {
        case VREG:
                mode |= S_IFREG;
                break;
        case VDIR:
                mode |= S_IFDIR;
                break;
        case VBLK:
                mode |= S_IFBLK;
                break;
        case VCHR:
                mode |= S_IFCHR;
                break;
        case VLNK:
                mode |= S_IFLNK;
                break;
        case VSOCK:
                mode |= S_IFSOCK;
                break;
        case VFIFO:
                mode |= S_IFIFO;
                break;
        default:
                return EBADF;
        }
        sb->st_mode = mode;
        sb->st_nlink = va.va_nlink;
        sb->st_uid = va.va_uid;
        sb->st_gid = va.va_gid;
        sb->st_rdev = va.va_rdev;
        sb->st_size = va.va_size;
        sb->st_atimespec = va.va_atime;
        sb->st_mtimespec = va.va_mtime;
        sb->st_ctimespec = va.va_ctime;
        sb->st_birthtimespec = va.va_birthtime;
        sb->st_blksize = va.va_blocksize;
        sb->st_flags = va.va_flags;
        sb->st_gen = 0;
        sb->st_blocks = va.va_bytes / S_BLKSIZE;
        return 0;
}
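
/*
 * Illustrative sketch: vn_stat() calls VOP_GETATTR() and therefore
 * needs the vnode locked; a caller holding only a reference would
 * bracket it with vn_lock()/VOP_UNLOCK(), as vn_statfile() does:
 *
 *	struct stat sb;
 *	int error;
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	error = vn_stat(vp, &sb);
 *	VOP_UNLOCK(vp);
 */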

/*
 * File table vnode fcntl routine.
 */
static int
vn_fcntl(file_t *fp, u_int com, void *data)
{
        struct vnode *vp = fp->f_vnode;
        int error;

        error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
        return error;
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(file_t *fp, u_long com, void *data)
{
        struct vnode *vp = fp->f_vnode, *ovp;
        struct vattr vattr;
        int error;

        switch (vp->v_type) {

        case VREG:
        case VDIR:
                if (com == FIONREAD) {
                        vn_lock(vp, LK_SHARED | LK_RETRY);
                        error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
                        if (error == 0) {
                                if (vp->v_type == VDIR)
                                        mutex_enter(&fp->f_lock);
                                *(int *)data = vattr.va_size - fp->f_offset;
                                if (vp->v_type == VDIR)
                                        mutex_exit(&fp->f_lock);
                        }
                        VOP_UNLOCK(vp);
                        if (error)
                                return error;
                        return 0;
                }
                if ((com == FIONWRITE) || (com == FIONSPACE)) {
                        /*
                         * Files don't have send queues, so there never
                         * are any bytes in them, nor is there any
                         * open space in them.
                         */
                        *(int *)data = 0;
                        return 0;
                }
                if (com == FIOGETBMAP) {
                        daddr_t *block;

                        if (*(daddr_t *)data < 0)
                                return EINVAL;
                        block = (daddr_t *)data;
                        vn_lock(vp, LK_SHARED | LK_RETRY);
                        error = VOP_BMAP(vp, *block, NULL, block, NULL);
                        VOP_UNLOCK(vp);
                        return error;
                }
                if (com == OFIOGETBMAP) {
                        daddr_t ibn, obn;

                        if (*(int32_t *)data < 0)
                                return EINVAL;
                        ibn = (daddr_t)*(int32_t *)data;
                        vn_lock(vp, LK_SHARED | LK_RETRY);
                        error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
                        VOP_UNLOCK(vp);
                        *(int32_t *)data = (int32_t)obn;
                        return error;
                }
                if (com == FIONBIO || com == FIOASYNC)  /* XXX */
                        return 0;                       /* XXX */
                /* FALLTHROUGH */
        case VFIFO:
        case VCHR:
        case VBLK:
                error = VOP_IOCTL(vp, com, data, fp->f_flag,
                    kauth_cred_get());
                if (error == 0 && com == TIOCSCTTY) {
                        vref(vp);
                        mutex_enter(&proc_lock);
                        ovp = curproc->p_session->s_ttyvp;
                        curproc->p_session->s_ttyvp = vp;
                        mutex_exit(&proc_lock);
                        if (ovp != NULL)
                                vrele(ovp);
                }
                return error;

        default:
                return EPASSTHROUGH;
        }
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(file_t *fp, int events)
{

        return VOP_POLL(fp->f_vnode, events);
}

/*
 * File table vnode kqfilter routine.
 */
int
vn_kqfilter(file_t *fp, struct knote *kn)
{

        return VOP_KQFILTER(fp->f_vnode, kn);
}

static int
vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
        struct uvm_object *uobj;
        struct vnode *vp;
        struct vattr va;
        struct lwp *l;
        vm_prot_t maxprot;
        off_t off;
        int error, flags;
        bool needwritemap;

        l = curlwp;

        off = *offp;
        flags = *flagsp;
        maxprot = VM_PROT_EXECUTE;

        KASSERT(size > 0);

        vp = fp->f_vnode;
        if (vp->v_type != VREG && vp->v_type != VCHR &&
            vp->v_type != VBLK) {
                /* only REG/CHR/BLK support mmap */
                return ENODEV;
        }
        if (vp->v_type != VCHR && off < 0) {
                return EINVAL;
        }
#if SIZE_MAX > UINT32_MAX       /* XXX -Wtype-limits */
        if (vp->v_type != VCHR && size > __type_max(off_t)) {
                return EOVERFLOW;
        }
#endif
        if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
                /* no offset wrapping */
                return EOVERFLOW;
        }

        /* special case: catch SunOS style /dev/zero */
        if (vp->v_type == VCHR &&
            (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
                *uobjp = NULL;
                *maxprotp = VM_PROT_ALL;
                return 0;
        }

        /*
         * Old programs may not select a specific sharing type, so
         * default to an appropriate one.
         *
         * XXX: how does MAP_ANON fit in the picture?
         */
        if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
                struct proc *p = l->l_proc;
                printf("WARNING: defaulted mmap() share type to "
                    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
                    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
                    p->p_comm);
#endif
                if (vp->v_type == VCHR)
                        flags |= MAP_SHARED;    /* for a device */
                else
                        flags |= MAP_PRIVATE;   /* for a file */
        }

        /*
         * MAP_PRIVATE device mappings don't make sense (and aren't
         * supported anyway). However, some programs rely on this,
         * so just change it to MAP_SHARED.
         */
        if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
                flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
        }

        /*
         * now check protection
         */

        /* check read access */
        if (fp->f_flag & FREAD)
                maxprot |= VM_PROT_READ;
        else if (prot & PROT_READ) {
                return EACCES;
        }

        /* check write access, shared case first */
        if (flags & MAP_SHARED) {
                /*
                 * if the file is writable, only add PROT_WRITE to
                 * maxprot if the file is not immutable, append-only.
                 * otherwise, if we have asked for PROT_WRITE, return
                 * EPERM.
                 */
                if (fp->f_flag & FWRITE) {
                        vn_lock(vp, LK_SHARED | LK_RETRY);
                        error = VOP_GETATTR(vp, &va, l->l_cred);
                        VOP_UNLOCK(vp);
                        if (error) {
                                return error;
                        }
                        if ((va.va_flags &
                            (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
                                maxprot |= VM_PROT_WRITE;
                        else if (prot & PROT_WRITE) {
                                return EPERM;
                        }
                } else if (prot & PROT_WRITE) {
                        return EACCES;
                }
        } else {
                /* MAP_PRIVATE mappings can always be written to (COW) */
                maxprot |= VM_PROT_WRITE;
        }

        /*
         * Don't allow mmap for EXEC if the file system
         * is mounted NOEXEC.
         */
        if ((prot & PROT_EXEC) != 0 &&
            (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
                return EACCES;
        }

        if (vp->v_type != VCHR) {
                error = VOP_MMAP(vp, prot, curlwp->l_cred);
                if (error) {
                        return error;
                }
                vref(vp);
                uobj = &vp->v_uobj;

                /*
                 * If the vnode is being mapped with PROT_EXEC,
                 * then mark it as text.
                 */
                if (prot & PROT_EXEC) {
                        vn_markexec(vp);
                }
        } else {
                int i = maxprot;

                /*
                 * XXX Some devices don't like to be mapped with
                 * XXX PROT_EXEC or PROT_WRITE, but we don't really
                 * XXX have a better way of handling this, right now
                 */
                do {
                        uobj = udv_attach(vp->v_rdev,
                            (flags & MAP_SHARED) ? i :
                            (i & ~VM_PROT_WRITE), off, size);
                        i--;
                } while ((uobj == NULL) && (i > 0));
                if (uobj == NULL) {
                        return EINVAL;
                }
                *advicep = UVM_ADV_RANDOM;
        }

        /*
         * Set vnode flags to indicate the new kinds of mapping.
         * We take the vnode lock in exclusive mode here to serialize
         * with direct I/O.
         *
         * Safe to check for these flag values without a lock, as
         * long as a reference to the vnode is held.
         */
        needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
            (flags & MAP_SHARED) != 0 &&
            (maxprot & VM_PROT_WRITE) != 0;
        if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                vp->v_vflag |= VV_MAPPED;
                if (needwritemap) {
                        rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
                        mutex_enter(vp->v_interlock);
                        vp->v_iflag |= VI_WRMAP;
                        mutex_exit(vp->v_interlock);
                        rw_exit(vp->v_uobj.vmobjlock);
                }
                VOP_UNLOCK(vp);
        }

#if NVERIEXEC > 0

        /*
         * Check if the file can be executed indirectly.
         *
         * XXX: This gives false warnings about "Incorrect access type"
         * XXX: if the mapping is not executable. Harmless, but will be
         * XXX: fixed as part of other changes.
         */
        if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
            NULL)) {

                /*
                 * Don't allow executable mappings if we can't
                 * indirectly execute the file.
                 */
                if (prot & VM_PROT_EXECUTE) {
                        return EPERM;
                }

                /*
                 * Strip the executable bit from 'maxprot' to make sure
                 * it can't be made executable later.
                 */
                maxprot &= ~VM_PROT_EXECUTE;
        }
#endif /* NVERIEXEC > 0 */

        *uobjp = uobj;
        *maxprotp = maxprot;
        *flagsp = flags;

        return 0;
}

static int
vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
        const off_t OFF_MIN = __type_min(off_t);
        const off_t OFF_MAX = __type_max(off_t);
        kauth_cred_t cred = fp->f_cred;
        off_t oldoff, newoff;
        struct vnode *vp = fp->f_vnode;
        struct vattr vattr;
        int error;

        if (vp->v_type == VFIFO)
                return ESPIPE;

        if (flags & FOF_UPDATE_OFFSET)
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        else
                vn_lock(vp, LK_SHARED | LK_RETRY);

        /* Compute the old and new offsets. */
        if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
                mutex_enter(&fp->f_lock);
        oldoff = fp->f_offset;
        if (vp->v_type == VDIR && (flags & FOF_UPDATE_OFFSET) == 0)
                mutex_exit(&fp->f_lock);
        switch (whence) {
        case SEEK_CUR:
                if (delta > 0) {
                        if (oldoff > 0 && delta > OFF_MAX - oldoff) {
                                newoff = OFF_MAX;
                                break;
                        }
                } else {
                        if (oldoff < 0 && delta < OFF_MIN - oldoff) {
                                newoff = OFF_MIN;
                                break;
                        }
                }
                newoff = oldoff + delta;
                break;
        case SEEK_END:
                error = VOP_GETATTR(vp, &vattr, cred);
                if (error)
                        goto out;
                if (vattr.va_size > OFF_MAX ||
                    delta > OFF_MAX - (off_t)vattr.va_size) {
                        newoff = OFF_MAX;
                        break;
                }
                newoff = delta + vattr.va_size;
                break;
        case SEEK_SET:
                newoff = delta;
                break;
        default:
                error = EINVAL;
                goto out;
        }

        /* Pass the proposed change to the file system to audit. */
        error = VOP_SEEK(vp, oldoff, newoff, cred);
        if (error)
                goto out;

        /* Success! */
        if (newoffp)
                *newoffp = newoff;
        if (flags & FOF_UPDATE_OFFSET)
                fp->f_offset = newoff;
        error = 0;

out:    VOP_UNLOCK(vp);
        return error;
}

static int
vn_advlock(struct file *fp, void *id, int op, struct flock *fl,
    int flags)
{
        struct vnode *const vp = fp->f_vnode;

        if (fl->l_whence == SEEK_CUR) {
                vn_lock(vp, LK_SHARED | LK_RETRY);
                fl->l_start += fp->f_offset;
                VOP_UNLOCK(vp);
        }

        return VOP_ADVLOCK(vp, id, op, fl, flags);
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
vn_lock(struct vnode *vp, int flags)
{
        struct lwp *l;
        int error;

        KASSERT(vrefcnt(vp) > 0);
        KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
            LK_UPGRADE|LK_DOWNGRADE)) == 0);
        KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));

#ifdef DIAGNOSTIC
        if (wapbl_vphaswapbl(vp))
                WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
#endif

        /* Get a more useful report for lockstat. */
        l = curlwp;
        KASSERT(l->l_rwcallsite == 0);
        l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);

        error = VOP_LOCK(vp, flags);

        l->l_rwcallsite = 0;

        switch (flags & (LK_RETRY | LK_NOWAIT)) {
        case 0:
                KASSERT(error == 0 || error == ENOENT);
                break;
        case LK_RETRY:
                KASSERT(error == 0);
                break;
        case LK_NOWAIT:
                KASSERT(error == 0 || error == EBUSY || error == ENOENT);
                break;
        case LK_RETRY | LK_NOWAIT:
                KASSERT(error == 0 || error == EBUSY);
                break;
        }

        return error;
}
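
/*
 * Illustrative sketch: the common idiom is to pass LK_RETRY so that
 * even a dying vnode comes back locked and the call cannot fail:
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	error = VOP_GETATTR(vp, &va, cred);
 *	VOP_UNLOCK(vp);
 *
 * Without LK_RETRY the return value must be checked, since ENOENT
 * is returned for a vnode that has been reclaimed:
 *
 *	if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0)
 *		return error;
 */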

/*
 * File table vnode close routine.
 */
static int
vn_closefile(file_t *fp)
{

        return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing a "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t *buflen, void *bf, struct lwp *l)
{
        struct uio auio;
        struct iovec aiov;
        int error;

        aiov.iov_len = *buflen;
        aiov.iov_base = bf;

        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_READ;
        auio.uio_offset = 0;
        auio.uio_resid = *buflen;
        UIO_SETUP_SYSSPACE(&auio);

        if ((ioflg & IO_NODELOCKED) == 0)
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

        error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
            NOCRED);

        if ((ioflg & IO_NODELOCKED) == 0)
                VOP_UNLOCK(vp);

        if (error == 0)
                *buflen = *buflen - auio.uio_resid;

        return error;
}

/*
 * XXX Failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, size_t buflen, const void *bf, struct lwp *l)
{
        struct uio auio;
        struct iovec aiov;
        int error;

        aiov.iov_len = buflen;
        aiov.iov_base = __UNCONST(bf);  /* XXXUNCONST kills const */

        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_WRITE;
        auio.uio_offset = 0;
        auio.uio_resid = buflen;
        UIO_SETUP_SYSSPACE(&auio);

        if ((ioflg & IO_NODELOCKED) == 0) {
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        }

        error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);

        if ((ioflg & IO_NODELOCKED) == 0) {
                VOP_UNLOCK(vp);
        }

        return error;
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct lwp *l)
{
        int error;

        if ((ioflg & IO_NODELOCKED) == 0) {
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        }

        error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
        if (error == EOPNOTSUPP)
                error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
                    NOCRED);

        if ((ioflg & IO_NODELOCKED) == 0) {
                VOP_UNLOCK(vp);
        }

        return error;
}
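
/*
 * Illustrative sketch: round-tripping a small value through the
 * system extended-attribute namespace on a referenced, unlocked
 * vnode.  The attribute name and value here are hypothetical.
 *
 *	char val[16] = "hello";
 *	size_t len = sizeof(val);
 *	int error;
 *
 *	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "example", sizeof("hello"), val, curlwp);
 *	if (error == 0)
 *		error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *		    "example", &len, val, curlwp);
 */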

int
vn_fifo_bypass(void *v)
{
        struct vop_generic_args *ap = v;

        return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
}

/*
 * Open block device by device number
 */
int
vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
{
        int error;

        if ((error = bdevvp(dev, vpp)) != 0)
                return error;

        vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
        if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
                vput(*vpp);
                return error;
        }
        mutex_enter((*vpp)->v_interlock);
        (*vpp)->v_writecount++;
        mutex_exit((*vpp)->v_interlock);
        VOP_UNLOCK(*vpp);

        return 0;
}

/*
 * Lookup the provided name in the filesystem. If the file exists,
 * is a valid block device, and isn't being used by anyone else,
 * set *vpp to the file's vnode.
 */
int
vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
{
        struct vnode *vp;
        dev_t dev;
        enum vtype vt;
        int error;

        error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
        if (error != 0)
                return error;

        dev = vp->v_rdev;
        vt = vp->v_type;

        VOP_UNLOCK(vp);
        (void) vn_close(vp, FREAD | FWRITE, l->l_cred);

        if (vt != VBLK)
                return ENOTBLK;

        return vn_bdev_open(dev, vpp, l);
}
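
/*
 * Illustrative sketch: opening a block device by path, roughly as a
 * swap or dump configuration path might.  The device name below is a
 * placeholder.
 *
 *	struct pathbuf *pb;
 *	struct vnode *vp;
 *	int error;
 *
 *	pb = pathbuf_create("/dev/wd0b");
 *	if (pb == NULL)
 *		return ENOMEM;
 *	error = vn_bdev_openpath(pb, &vp, curlwp);
 *	pathbuf_destroy(pb);
 *	if (error == 0)
 *		...	(vp references an open, unlocked block device)
 */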

static long
vn_knote_to_interest(const struct knote *kn)
{
        switch (kn->kn_filter) {
        case EVFILT_READ:
                /*
                 * Writing to the file or changing its attributes can
                 * set the file size, which impacts the readability
                 * filter.
                 *
                 * (No need to set NOTE_EXTEND here; it's only ever
                 * sent with other hints; see vnode_if.c.)
                 */
                return NOTE_WRITE | NOTE_ATTRIB;

        case EVFILT_VNODE:
                return kn->kn_sfflags;

        case EVFILT_WRITE:
        default:
                return 0;
        }
}

void
vn_knote_attach(struct vnode *vp, struct knote *kn)
{
        struct vnode_klist *vk = vp->v_klist;
        long interest = 0;

        /*
         * In the case of layered / stacked file systems, knotes
         * should only ever be associated with the base vnode.
         */
        KASSERT(kn->kn_hook == vp);
        KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

        /*
         * We maintain a bitmask of the kevents that there is interest in,
         * to minimize the impact of having watchers.  It's silly to have
         * to traverse vn_klist every time a read or write happens simply
         * because there is someone interested in knowing when the file
         * is deleted, for example.
         */

        mutex_enter(vp->v_interlock);
        SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
        SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
                interest |= vn_knote_to_interest(kn);
        }
        vk->vk_interest = interest;
        mutex_exit(vp->v_interlock);
}

void
vn_knote_detach(struct vnode *vp, struct knote *kn)
{
        struct vnode_klist *vk = vp->v_klist;
        long interest = 0;

        /* See above. */
        KASSERT(kn->kn_hook == vp);
        KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);

        /*
         * We special case removing the head of the list, because:
         *
         * 1. It's extremely likely that we're detaching the only
         *    knote.
         *
         * 2. We're already traversing the whole list, so we don't
         *    want to use the generic SLIST_REMOVE() which would
         *    traverse it *again*.
         */

        mutex_enter(vp->v_interlock);
        if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
                SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
                SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
                        interest |= vn_knote_to_interest(kn);
                }
                vk->vk_interest = interest;
        } else {
                struct knote *thiskn, *nextkn, *prevkn = NULL;

                SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
                        if (thiskn == kn) {
                                KASSERT(kn != NULL);
                                KASSERT(prevkn != NULL);
                                SLIST_REMOVE_AFTER(prevkn, kn_selnext);
                                kn = NULL;
                        } else {
                                interest |= vn_knote_to_interest(thiskn);
                                prevkn = thiskn;
                        }
                }
                vk->vk_interest = interest;
        }
        mutex_exit(vp->v_interlock);
}