/*	$NetBSD: ulfs_readwrite.c,v 1.4.2.4 2017/12/03 11:39:22 jdolecek Exp $	*/
/*  from NetBSD: ufs_readwrite.c,v 1.120 2015/04/12 22:48:38 riastradh Exp  */

/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ulfs_readwrite.c,v 1.4.2.4 2017/12/03 11:39:22 jdolecek Exp $");

#ifdef LFS_READWRITE
#define	FS			struct lfs
#define	I_FS			i_lfs
#define	READ			lfs_read
#define	READ_S			"lfs_read"
#define	WRITE			lfs_write
#define	WRITE_S			"lfs_write"
#define	BUFRD			lfs_bufrd
#define	BUFWR			lfs_bufwr
#define	fs_sb_getbsize(fs)	lfs_sb_getbsize(fs)
#define	fs_bmask		lfs_bmask
#else
#define	FS			struct fs
#define	I_FS			i_fs
#define	READ			ffs_read
#define	READ_S			"ffs_read"
#define	WRITE			ffs_write
#define	WRITE_S			"ffs_write"
#define	BUFRD			ffs_bufrd
#define	BUFWR			ffs_bufwr
#define	fs_sb_getbsize(fs)	(fs)->fs_bsize
#endif
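
/*
 * The code below is written against the macros above, so the same
 * source compiles as either the LFS flavor (LFS_READWRITE defined)
 * or the FFS flavor of the read/write vnode operations.
 */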

static int	ulfs_post_read_update(struct vnode *, int, int);
static int	ulfs_post_write_update(struct vnode *, struct uio *, int,
		    kauth_cred_t, off_t, int, int, int);

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
int
READ(void *v)
{
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	FS *fs;
	vsize_t bytelen;
	int error, ioflag, advice;

	vp = ap->a_vp;
	ip = VTOI(vp);
	fs = ip->I_FS;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	error = 0;

	KASSERT(uio->uio_rw == UIO_READ);
	KASSERT(vp->v_type == VREG || vp->v_type == VDIR);

	/* XXX Eliminate me by refusing directory reads from userland.  */
	if (vp->v_type == VDIR)
		return BUFRD(vp, uio, ioflag, ap->a_cred);
#ifdef LFS_READWRITE
	/* XXX Eliminate me by using ufs_bufio in lfs.  */
	if (vp->v_type == VREG && ip->i_number == LFS_IFILE_INUM)
		return BUFRD(vp, uio, ioflag, ap->a_cred);
#endif
	if ((u_int64_t)uio->uio_offset > fs->um_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

#ifndef LFS_READWRITE
	if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
		return ffs_snapshot_read(vp, uio, ioflag);
#endif /* !LFS_READWRITE */

	if (uio->uio_offset >= ip->i_size)
		goto out;

	KASSERT(vp->v_type == VREG);
	advice = IO_ADV_DECODE(ap->a_ioflag);
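	/*
	 * Copy the data out through the UBC mapping of the file's
	 * pages, at most one filesystem block per iteration.
	 */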
	while (uio->uio_resid > 0) {
		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag);
		}
		bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid);
		if (bytelen == 0)
			break;
		error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
		    UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
		if (error)
			break;
	}

 out:
	error = ulfs_post_read_update(vp, ap->a_ioflag, error);
	return (error);
}

/*
 * UFS op for reading via the buffer cache
 */
int
BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
	struct inode *ip;
	FS *fs;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	KASSERT(VOP_ISLOCKED(vp));
	KASSERT(vp->v_type == VDIR || vp->v_type == VLNK ||
	    vp->v_type == VREG);
	KASSERT(uio->uio_rw == UIO_READ);

	ip = VTOI(vp);
	fs = ip->I_FS;
	error = 0;

	KASSERT(vp->v_type != VLNK || ip->i_size >= fs->um_maxsymlinklen);
	KASSERT(vp->v_type != VLNK || fs->um_maxsymlinklen != 0 ||
	    DIP(ip, blocks) == 0);
	KASSERT(vp->v_type != VREG || vp == fs->lfs_ivnode);
	KASSERT(vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM);

	if (uio->uio_offset > fs->um_maxfilesize)
		return EFBIG;
	if (uio->uio_resid == 0)
		return 0;

#ifndef LFS_READWRITE
	KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL)));
#endif

	if (uio->uio_offset >= ip->i_size)
		goto out;

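	/*
	 * Read one block at a time through the buffer cache, asking
	 * breadn() to read ahead the next logical block whenever more
	 * of the file remains beyond the current one.
	 */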
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = ip->i_size - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lfs_lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = lfs_blksize(fs, ip, lbn);
		blkoffset = lfs_blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid),
		    bytesinfile);

		if (lfs_lblktosize(fs, nextlbn) >= ip->i_size)
			error = bread(vp, lbn, size, 0, &bp);
		else {
			int nextsize = lfs_blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, 0);
	}
	if (bp != NULL)
		brelse(bp, 0);

 out:
	error = ulfs_post_read_update(vp, ioflag, error);
	return (error);
}

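/*
 * Common post-read processing: mark the inode for an access-time
 * update (unless mounted with MNT_NOATIME) and, for IO_SYNC reads,
 * write the inode update out now.
 */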
static int
ulfs_post_read_update(struct vnode *vp, int ioflag, int oerror)
{
	struct inode *ip = VTOI(vp);
	int error = oerror;

	if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
		ip->i_state |= IN_ACCESS;
		if ((ioflag & IO_SYNC) == IO_SYNC) {
			error = lfs_update(vp, NULL, NULL, UPDATE_WAIT);
		}
	}

	/* Read error overrides any inode update error.  */
	if (oerror)
		error = oerror;
	return error;
}

/*
 * Vnode op for writing.
 */
int
WRITE(void *v)
{
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		kauth_cred_t a_cred;
	} */ *ap = v;
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	FS *fs;
	kauth_cred_t cred;
	off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
	int blkoffset, error, flags, ioflag, resid;
	int aflag;
	int extended = 0;
	vsize_t bytelen;
	bool async;

	cred = ap->a_cred;
	ioflag = ap->a_ioflag;
	uio = ap->a_uio;
	vp = ap->a_vp;
	ip = VTOI(vp);

	KASSERT(vp->v_size == ip->i_size);
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERT(vp->v_type == VREG);

	if (ioflag & IO_APPEND)
		uio->uio_offset = ip->i_size;
	if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
		return (EPERM);

	fs = ip->I_FS;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->um_maxfilesize)
		return (EFBIG);
#ifdef LFS_READWRITE
	/* Disallow writes to the Ifile, even if noschg flag is removed */
	/* XXX can this go away when the Ifile is no longer in the namespace? */
	if (vp == fs->lfs_ivnode)
		return (EPERM);
#endif
	if (uio->uio_resid == 0)
		return (0);

	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	async = vp->v_mount->mnt_flag & MNT_ASYNC;
	origoff = uio->uio_offset;
	resid = uio->uio_resid;
	osize = ip->i_size;
	error = 0;

	KASSERT(vp->v_type == VREG);

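	/*
	 * LFS never writes synchronously through this path; make sure
	 * there is enough free segment space for the data about to be
	 * written, and let lfs_check() apply the usual write throttling
	 * before we dirty any pages.
	 */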
#ifdef LFS_READWRITE
	async = true;
	lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid));
	lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* LFS_READWRITE */

	preallocoff = round_page(lfs_blkroundup(fs, MAX(osize, uio->uio_offset)));
	aflag = ioflag & IO_SYNC ? B_SYNC : 0;
	nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
	endallocoff = nsize - lfs_blkoff(fs, nsize);
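
	/*
	 * [preallocoff, endallocoff) now brackets the part of the write
	 * that covers only whole blocks past the old end of file; those
	 * blocks will be completely overwritten, so they can be
	 * allocated without reading or zeroing their old contents.
	 */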

	/*
	 * if we're increasing the file size, deal with expanding
	 * the fragment if there is one.
	 */

	if (nsize > osize && lfs_lblkno(fs, osize) < ULFS_NDADDR &&
	    lfs_lblkno(fs, osize) != lfs_lblkno(fs, nsize) &&
	    lfs_blkroundup(fs, osize) != osize) {
		off_t eob;

		eob = lfs_blkroundup(fs, osize);
		uvm_vnp_setwritesize(vp, eob);
		error = ulfs_balloc_range(vp, osize, eob - osize, cred, aflag);
		if (error)
			goto out;
		if (flags & B_SYNC) {
			mutex_enter(vp->v_interlock);
			VOP_PUTPAGES(vp, trunc_page(osize & lfs_sb_getbmask(fs)),
			    round_page(eob),
			    PGO_CLEANIT | PGO_SYNCIO);
		}
	}

	while (uio->uio_resid > 0) {
		int ubc_flags = UBC_WRITE;
		bool overwrite; /* if we're overwriting a whole block */
		off_t newoff;

		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag);
		}

		oldoff = uio->uio_offset;
		blkoffset = lfs_blkoff(fs, uio->uio_offset);
		bytelen = MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid);
		if (bytelen == 0) {
			break;
		}


		/*
		 * if we're filling in a hole, allocate the blocks now and
		 * initialize the pages first.  if we're extending the file,
		 * we can safely allocate blocks without initializing pages
		 * since the new blocks will be inaccessible until the write
		 * is complete.
		 */
		overwrite = uio->uio_offset >= preallocoff &&
		    uio->uio_offset < endallocoff;
		if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
		    lfs_blkoff(fs, uio->uio_offset) == 0 &&
		    (uio->uio_offset & PAGE_MASK) == 0) {
			vsize_t len;

			len = trunc_page(bytelen);
			len -= lfs_blkoff(fs, len);
			if (len > 0) {
				overwrite = true;
				bytelen = len;
			}
		}

		newoff = oldoff + bytelen;
		if (vp->v_size < newoff) {
			uvm_vnp_setwritesize(vp, newoff);
		}

		if (!overwrite) {
			error = ulfs_balloc_range(vp, uio->uio_offset, bytelen,
			    cred, aflag);
			if (error)
				break;
		} else {
			genfs_node_wrlock(vp);
			error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
			    aflag, cred);
			genfs_node_unlock(vp);
			if (error)
				break;
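			/*
			 * The blocks will be overwritten in full, so
			 * have ubc_uiomove() fault in fresh, busy pages
			 * rather than reading the old contents.
			 */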
			ubc_flags |= UBC_FAULTBUSY;
		}

		/*
		 * copy the data.
		 */

		error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
		    IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp));

		/*
		 * update UVM's notion of the size now that we've
		 * copied the data into the vnode's pages.
		 *
		 * we should update the size even when uiomove failed.
		 */

		if (vp->v_size < newoff) {
			uvm_vnp_setsize(vp, newoff);
			extended = 1;
		}

		if (error)
			break;

		/*
		 * flush what we just wrote if necessary.
		 * XXXUBC simplistic async flushing.
		 */

#ifndef LFS_READWRITE
		if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
			mutex_enter(vp->v_interlock);
			error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
			    (uio->uio_offset >> 16) << 16,
			    PGO_CLEANIT | PGO_LAZY);
			if (error)
				break;
		}
#else
		__USE(async);
#endif
	}
	if (error == 0 && ioflag & IO_SYNC) {
		mutex_enter(vp->v_interlock);
		error = VOP_PUTPAGES(vp, trunc_page(origoff & lfs_sb_getbmask(fs)),
		    round_page(lfs_blkroundup(fs, uio->uio_offset)),
		    PGO_CLEANIT | PGO_SYNCIO);
	}

 out:
	error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
	    extended, error);

	return (error);
}

/*
 * UFS op for writing via the buffer cache
 */
int
BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
	struct inode *ip;
	FS *fs;
	int flags;
	struct buf *bp;
	off_t osize;
	int resid, xfersize, size, blkoffset;
	daddr_t lbn;
	int extended = 0;
	int error;
#ifdef LFS_READWRITE
	bool need_unreserve = false;
#endif

	KASSERT(ISSET(ioflag, IO_NODELOCKED));
	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
	KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC));
	KASSERT(uio->uio_rw == UIO_WRITE);

	ip = VTOI(vp);
	fs = ip->I_FS;

	KASSERT(vp->v_size == ip->i_size);

	if (uio->uio_offset < 0 ||
	    uio->uio_resid > fs->um_maxfilesize ||
	    uio->uio_offset > (fs->um_maxfilesize - uio->uio_resid))
		return EFBIG;
#ifdef LFS_READWRITE
	KASSERT(vp != fs->lfs_ivnode);
#endif
	if (uio->uio_resid == 0)
		return 0;

	flags = ioflag & IO_SYNC ? B_SYNC : 0;
	resid = uio->uio_resid;
	osize = ip->i_size;
	error = 0;

	KASSERT(vp->v_type != VREG);

#ifdef LFS_READWRITE
	lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid));
	lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* LFS_READWRITE */

	/* XXX Should never have pages cached here.  */
	KASSERT(vp->v_uobj.uo_npages == 0);
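	/*
	 * Write one block per iteration through the buffer cache:
	 * allocate the block, copy the caller's data into it, and push
	 * the buffer out according to the sync/async rules below.
	 */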
	while (uio->uio_resid > 0) {
		lbn = lfs_lblkno(fs, uio->uio_offset);
		blkoffset = lfs_blkoff(fs, uio->uio_offset);
		xfersize = MIN(fs_sb_getbsize(fs) - blkoffset, uio->uio_resid);
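		/*
		 * If only part of the block will be written, B_CLRBUF
		 * tells lfs_balloc() to supply the rest of the block's
		 * contents, reading or zeroing as appropriate.
		 */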
		if (fs_sb_getbsize(fs) > xfersize)
			flags |= B_CLRBUF;
		else
			flags &= ~B_CLRBUF;

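		/*
		 * LFS: reserve segment space for the data block plus
		 * the worst-case chain of indirect blocks before
		 * allocating; the reservation is dropped once the
		 * buffer has been handed to VOP_BWRITE() below.
		 */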
#ifdef LFS_READWRITE
		error = lfs_reserve(fs, vp, NULL,
		    lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs)));
		if (error)
			break;
		need_unreserve = true;
#endif
		error = lfs_balloc(vp, uio->uio_offset, xfersize, cred, flags,
		    &bp);

		if (error)
			break;
		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_ASSIGN(ip, size, ip->i_size);
			uvm_vnp_setsize(vp, ip->i_size);
			extended = 1;
		}
		size = lfs_blksize(fs, ip, lbn) - bp->b_resid;
		if (xfersize > size)
			xfersize = size;

		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);

		/*
		 * if we didn't clear the block and the uiomove failed,
		 * the buf will now contain part of some other file,
		 * so we need to invalidate it.
		 */
		if (error && (flags & B_CLRBUF) == 0) {
			brelse(bp, BC_INVAL);
			break;
		}
#ifdef LFS_READWRITE
		(void)VOP_BWRITE(bp->b_vp, bp);
		lfs_reserve(fs, vp, NULL,
		    -lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs)));
		need_unreserve = false;
#else
		if (ioflag & IO_SYNC)
			(void)bwrite(bp);
		else if (xfersize + blkoffset == fs->fs_bsize)
			bawrite(bp);
		else
			bdwrite(bp);
#endif
		if (error || xfersize == 0)
			break;
	}
#ifdef LFS_READWRITE
	if (need_unreserve) {
		lfs_reserve(fs, vp, NULL,
		    -lfs_btofsb(fs, (ULFS_NIADDR + 1) << lfs_sb_getbshift(fs)));
	}
#endif

	error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
	    extended, error);

	return (error);
}

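/*
 * Common post-write processing: mark the inode for time updates,
 * clear setuid/setgid if an unprivileged user wrote data, post a
 * kevent, and either truncate back to the original size on error or
 * push the inode to disk for IO_SYNC writes.
 */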
static int
ulfs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag,
    kauth_cred_t cred, off_t osize, int resid, int extended, int oerror)
{
	struct inode *ip = VTOI(vp);
	int error = oerror;

	/* Trigger ctime and mtime updates, and atime if MNT_RELATIME.  */
	ip->i_state |= IN_CHANGE | IN_UPDATE;
	if (vp->v_mount->mnt_flag & MNT_RELATIME)
		ip->i_state |= IN_ACCESS;

	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && cred) {
		if (ip->i_mode & ISUID) {
			if (kauth_authorize_vnode(cred,
			    KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
				ip->i_mode &= ~ISUID;
				DIP_ASSIGN(ip, mode, ip->i_mode);
			}
		}

		if (ip->i_mode & ISGID) {
			if (kauth_authorize_vnode(cred,
			    KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
				ip->i_mode &= ~ISGID;
				DIP_ASSIGN(ip, mode, ip->i_mode);
			}
		}
	}

	/* If we successfully wrote anything, notify kevent listeners.  */
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));

	/*
	 * Update the size on disk: truncate back to original size on
	 * error, or reflect the new size on success.
	 */
	if (error) {
		(void) lfs_truncate(vp, osize, ioflag & IO_SYNC, cred);
		uio->uio_offset -= resid - uio->uio_resid;
		uio->uio_resid = resid;
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) {
		error = lfs_update(vp, NULL, NULL, UPDATE_WAIT);
	} else {
		/* nothing */
	}

	/* Make sure the vnode uvm size matches the inode file size.  */
	KASSERT(vp->v_size == ip->i_size);

	/* Write error overrides any inode update error.  */
	if (oerror)
		error = oerror;
	return error;
}