ulfs_readwrite.c revision 1.13 1 /* $NetBSD: ulfs_readwrite.c,v 1.13 2015/03/28 17:23:42 maxv Exp $ */
2 /* from NetBSD: ufs_readwrite.c,v 1.105 2013/01/22 09:39:18 dholland Exp */
3
4 /*-
5 * Copyright (c) 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
33 */
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(1, "$NetBSD: ulfs_readwrite.c,v 1.13 2015/03/28 17:23:42 maxv Exp $");
37
/*
 * Name mapping for the shared read/write code.
 *
 * The functions below are written against the generic names FS, I_FS,
 * READ, WRITE, BUFRD and BUFWR.  Which concrete identifiers they expand
 * to is selected by whether LFS_READWRITE is defined when this file is
 * compiled.  READ_S and WRITE_S are the entry-point names as strings.
 */
#ifndef LFS_READWRITE

/* File system type and the inode member that points at it. */
#define	FS		struct fs
#define	I_FS		i_fs

/* Entry points. */
#define	READ		ffs_read
#define	WRITE		ffs_write
#define	BUFRD		ffs_bufrd
#define	BUFWR		ffs_bufwr

/* Entry-point names as strings. */
#define	READ_S		"ffs_read"
#define	WRITE_S		"ffs_write"

#else /* LFS_READWRITE */

/* File system type and the inode member that points at it. */
#define	FS		struct lfs
#define	I_FS		i_lfs

/* Entry points. */
#define	READ		lfs_read
#define	WRITE		lfs_write
#define	BUFRD		lfs_bufrd
#define	BUFWR		lfs_bufwr

/* Entry-point names as strings. */
#define	READ_S		"lfs_read"
#define	WRITE_S		"lfs_write"

/* Block size and block mask are spelled with an lfs_ prefix here. */
#define	fs_bsize	lfs_bsize
#define	fs_bmask	lfs_bmask

#endif /* LFS_READWRITE */
59
60 static int ulfs_post_read_update(struct vnode *, int, int);
61 static int ulfs_post_write_update(struct vnode *, struct uio *, int,
62 kauth_cred_t, off_t, int, int, int);
63
64 /*
65 * Vnode op for reading.
66 */
67 /* ARGSUSED */
68 int
69 READ(void *v)
70 {
71 struct vop_read_args /* {
72 struct vnode *a_vp;
73 struct uio *a_uio;
74 int a_ioflag;
75 kauth_cred_t a_cred;
76 } */ *ap = v;
77 struct vnode *vp;
78 struct inode *ip;
79 struct uio *uio;
80 FS *fs;
81 vsize_t bytelen;
82 int error, ioflag, advice;
83
84 vp = ap->a_vp;
85 ip = VTOI(vp);
86 fs = ip->I_FS;
87 uio = ap->a_uio;
88 ioflag = ap->a_ioflag;
89 error = 0;
90
91 KASSERT(uio->uio_rw == UIO_READ);
92 KASSERT(vp->v_type == VREG || vp->v_type == VDIR);
93
94 /* XXX Eliminate me by refusing directory reads from userland. */
95 if (vp->v_type == VDIR)
96 return BUFRD(vp, uio, ioflag, ap->a_cred);
97 #ifdef LFS_READWRITE
98 /* XXX Eliminate me by using ufs_bufio in lfs. */
99 if (vp->v_type == VREG && ip->i_number == LFS_IFILE_INUM)
100 return BUFRD(vp, uio, ioflag, ap->a_cred);
101 #endif
102 if ((u_int64_t)uio->uio_offset > fs->um_maxfilesize)
103 return (EFBIG);
104 if (uio->uio_resid == 0)
105 return (0);
106
107 #ifndef LFS_READWRITE
108 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
109 return ffs_snapshot_read(vp, uio, ioflag);
110 #endif /* !LFS_READWRITE */
111
112 fstrans_start(vp->v_mount, FSTRANS_SHARED);
113
114 if (uio->uio_offset >= ip->i_size)
115 goto out;
116
117 KASSERT(vp->v_type == VREG);
118 advice = IO_ADV_DECODE(ap->a_ioflag);
119 while (uio->uio_resid > 0) {
120 if (ioflag & IO_DIRECT) {
121 genfs_directio(vp, uio, ioflag);
122 }
123 bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid);
124 if (bytelen == 0)
125 break;
126 error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
127 UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
128 if (error)
129 break;
130 }
131
132 out:
133 error = ulfs_post_read_update(vp, ap->a_ioflag, error);
134 fstrans_done(vp->v_mount);
135 return (error);
136 }
137
138 /*
139 * UFS op for reading via the buffer cache
140 */
141 int
142 BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
143 {
144 struct inode *ip;
145 FS *fs;
146 struct buf *bp;
147 daddr_t lbn, nextlbn;
148 off_t bytesinfile;
149 long size, xfersize, blkoffset;
150 int error;
151
152 KASSERT(VOP_ISLOCKED(vp));
153 KASSERT(vp->v_type == VDIR || vp->v_type == VLNK ||
154 vp->v_type == VREG);
155 KASSERT(uio->uio_rw == UIO_READ);
156
157 ip = VTOI(vp);
158 fs = ip->I_FS;
159 error = 0;
160
161 KASSERT(vp->v_type != VLNK || ip->i_size < fs->um_maxsymlinklen);
162 KASSERT(vp->v_type != VLNK || fs->um_maxsymlinklen != 0 ||
163 DIP(ip, blocks) == 0);
164 KASSERT(vp->v_type != VREG || vp == fs->lfs_ivnode);
165 KASSERT(vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM);
166
167 if (uio->uio_offset > fs->um_maxfilesize)
168 return EFBIG;
169 if (uio->uio_resid == 0)
170 return 0;
171
172 #ifndef LFS_READWRITE
173 KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL)));
174 #endif
175
176 fstrans_start(vp->v_mount, FSTRANS_SHARED);
177
178 if (uio->uio_offset >= ip->i_size)
179 goto out;
180
181 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
182 bytesinfile = ip->i_size - uio->uio_offset;
183 if (bytesinfile <= 0)
184 break;
185 lbn = lfs_lblkno(fs, uio->uio_offset);
186 nextlbn = lbn + 1;
187 size = lfs_blksize(fs, ip, lbn);
188 blkoffset = lfs_blkoff(fs, uio->uio_offset);
189 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
190 bytesinfile);
191
192 if (lfs_lblktosize(fs, nextlbn) >= ip->i_size)
193 error = bread(vp, lbn, size, NOCRED, 0, &bp);
194 else {
195 int nextsize = lfs_blksize(fs, ip, nextlbn);
196 error = breadn(vp, lbn,
197 size, &nextlbn, &nextsize, 1, 0, &bp);
198 }
199 if (error)
200 break;
201
202 /*
203 * We should only get non-zero b_resid when an I/O error
204 * has occurred, which should cause us to break above.
205 * However, if the short read did not cause an error,
206 * then we want to ensure that we do not uiomove bad
207 * or uninitialized data.
208 */
209 size -= bp->b_resid;
210 if (size < xfersize) {
211 if (size == 0)
212 break;
213 xfersize = size;
214 }
215 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
216 if (error)
217 break;
218 brelse(bp, 0);
219 }
220 if (bp != NULL)
221 brelse(bp, 0);
222
223 out:
224 error = ulfs_post_read_update(vp, ioflag, error);
225 fstrans_done(vp->v_mount);
226 return (error);
227 }
228
229 static int
230 ulfs_post_read_update(struct vnode *vp, int ioflag, int error)
231 {
232 struct inode *ip = VTOI(vp);
233
234 if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
235 ip->i_flag |= IN_ACCESS;
236 if ((ioflag & IO_SYNC) == IO_SYNC) {
237 error = lfs_update(vp, NULL, NULL, UPDATE_WAIT);
238 }
239 }
240
241 return error;
242 }
243
244 /*
245 * Vnode op for writing.
246 */
247 int
248 WRITE(void *v)
249 {
250 struct vop_write_args /* {
251 struct vnode *a_vp;
252 struct uio *a_uio;
253 int a_ioflag;
254 kauth_cred_t a_cred;
255 } */ *ap = v;
256 struct vnode *vp;
257 struct uio *uio;
258 struct inode *ip;
259 FS *fs;
260 kauth_cred_t cred;
261 off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
262 int blkoffset, error, flags, ioflag, resid;
263 int aflag;
264 int extended=0;
265 vsize_t bytelen;
266 bool async;
267
268 cred = ap->a_cred;
269 ioflag = ap->a_ioflag;
270 uio = ap->a_uio;
271 vp = ap->a_vp;
272 ip = VTOI(vp);
273
274 KASSERT(vp->v_size == ip->i_size);
275 KASSERT(uio->uio_rw == UIO_WRITE);
276 KASSERT(vp->v_type == VREG);
277
278 if (ioflag & IO_APPEND)
279 uio->uio_offset = ip->i_size;
280 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
281 return (EPERM);
282
283 fs = ip->I_FS;
284 if (uio->uio_offset < 0 ||
285 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->um_maxfilesize)
286 return (EFBIG);
287 #ifdef LFS_READWRITE
288 /* Disallow writes to the Ifile, even if noschg flag is removed */
289 /* XXX can this go away when the Ifile is no longer in the namespace? */
290 if (vp == fs->lfs_ivnode)
291 return (EPERM);
292 #endif
293 if (uio->uio_resid == 0)
294 return (0);
295
296 fstrans_start(vp->v_mount, FSTRANS_SHARED);
297
298 flags = ioflag & IO_SYNC ? B_SYNC : 0;
299 async = vp->v_mount->mnt_flag & MNT_ASYNC;
300 origoff = uio->uio_offset;
301 resid = uio->uio_resid;
302 osize = ip->i_size;
303 error = 0;
304
305 KASSERT(vp->v_type == VREG);
306
307 #ifdef LFS_READWRITE
308 async = true;
309 lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid));
310 lfs_check(vp, LFS_UNUSED_LBN, 0);
311 #endif /* !LFS_READWRITE */
312
313 preallocoff = round_page(lfs_blkroundup(fs, MAX(osize, uio->uio_offset)));
314 aflag = ioflag & IO_SYNC ? B_SYNC : 0;
315 nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
316 endallocoff = nsize - lfs_blkoff(fs, nsize);
317
318 /*
319 * if we're increasing the file size, deal with expanding
320 * the fragment if there is one.
321 */
322
323 if (nsize > osize && lfs_lblkno(fs, osize) < ULFS_NDADDR &&
324 lfs_lblkno(fs, osize) != lfs_lblkno(fs, nsize) &&
325 lfs_blkroundup(fs, osize) != osize) {
326 off_t eob;
327
328 eob = lfs_blkroundup(fs, osize);
329 uvm_vnp_setwritesize(vp, eob);
330 error = ulfs_balloc_range(vp, osize, eob - osize, cred, aflag);
331 if (error)
332 goto out;
333 if (flags & B_SYNC) {
334 mutex_enter(vp->v_interlock);
335 VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
336 round_page(eob),
337 PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
338 }
339 }
340
341 while (uio->uio_resid > 0) {
342 int ubc_flags = UBC_WRITE;
343 bool overwrite; /* if we're overwrite a whole block */
344 off_t newoff;
345
346 if (ioflag & IO_DIRECT) {
347 genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
348 }
349
350 oldoff = uio->uio_offset;
351 blkoffset = lfs_blkoff(fs, uio->uio_offset);
352 bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
353 if (bytelen == 0) {
354 break;
355 }
356
357 /*
358 * if we're filling in a hole, allocate the blocks now and
359 * initialize the pages first. if we're extending the file,
360 * we can safely allocate blocks without initializing pages
361 * since the new blocks will be inaccessible until the write
362 * is complete.
363 */
364 overwrite = uio->uio_offset >= preallocoff &&
365 uio->uio_offset < endallocoff;
366 if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
367 lfs_blkoff(fs, uio->uio_offset) == 0 &&
368 (uio->uio_offset & PAGE_MASK) == 0) {
369 vsize_t len;
370
371 len = trunc_page(bytelen);
372 len -= lfs_blkoff(fs, len);
373 if (len > 0) {
374 overwrite = true;
375 bytelen = len;
376 }
377 }
378
379 newoff = oldoff + bytelen;
380 if (vp->v_size < newoff) {
381 uvm_vnp_setwritesize(vp, newoff);
382 }
383
384 if (!overwrite) {
385 error = ulfs_balloc_range(vp, uio->uio_offset, bytelen,
386 cred, aflag);
387 if (error)
388 break;
389 } else {
390 genfs_node_wrlock(vp);
391 error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
392 aflag, cred);
393 genfs_node_unlock(vp);
394 if (error)
395 break;
396 ubc_flags |= UBC_FAULTBUSY;
397 }
398
399 /*
400 * copy the data.
401 */
402
403 error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
404 IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp));
405
406 /*
407 * update UVM's notion of the size now that we've
408 * copied the data into the vnode's pages.
409 *
410 * we should update the size even when uiomove failed.
411 */
412
413 if (vp->v_size < newoff) {
414 uvm_vnp_setsize(vp, newoff);
415 extended = 1;
416 }
417
418 if (error)
419 break;
420
421 /*
422 * flush what we just wrote if necessary.
423 * XXXUBC simplistic async flushing.
424 */
425
426 #ifndef LFS_READWRITE
427 if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
428 mutex_enter(vp->v_interlock);
429 error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
430 (uio->uio_offset >> 16) << 16,
431 PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY);
432 if (error)
433 break;
434 }
435 #else
436 __USE(async);
437 #endif
438 }
439 if (error == 0 && ioflag & IO_SYNC) {
440 mutex_enter(vp->v_interlock);
441 error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
442 round_page(lfs_blkroundup(fs, uio->uio_offset)),
443 PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
444 }
445
446 out:
447 error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
448 extended, error);
449 fstrans_done(vp->v_mount);
450
451 return (error);
452 }
453
454 /*
455 * UFS op for writing via the buffer cache
456 */
457 int
458 BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
459 {
460 struct inode *ip;
461 FS *fs;
462 int flags;
463 struct buf *bp;
464 off_t osize, origoff;
465 int resid, xfersize, size, blkoffset;
466 daddr_t lbn;
467 int extended=0;
468 int error;
469 #ifdef LFS_READWRITE
470 bool need_unreserve = false;
471 #endif
472
473 KASSERT(ISSET(ioflag, IO_NODELOCKED));
474 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
475 KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
476 KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC));
477 KASSERT(uio->uio_rw == UIO_WRITE);
478
479 ip = VTOI(vp);
480 fs = ip->I_FS;
481
482 KASSERT(vp->v_size == ip->i_size);
483
484 if (uio->uio_offset < 0 ||
485 uio->uio_resid > fs->um_maxfilesize ||
486 uio->uio_offset > (fs->um_maxfilesize - uio->uio_resid))
487 return EFBIG;
488 #ifdef LFS_READWRITE
489 KASSERT(vp != fs->lfs_ivnode);
490 #endif
491 if (uio->uio_resid == 0)
492 return 0;
493
494 fstrans_start(vp->v_mount, FSTRANS_SHARED);
495
496 flags = ioflag & IO_SYNC ? B_SYNC : 0;
497 origoff = uio->uio_offset;
498 resid = uio->uio_resid;
499 osize = ip->i_size;
500 error = 0;
501
502 KASSERT(vp->v_type != VREG);
503
504 #ifdef LFS_READWRITE
505 lfs_availwait(fs, lfs_btofsb(fs, uio->uio_resid));
506 lfs_check(vp, LFS_UNUSED_LBN, 0);
507 #endif /* !LFS_READWRITE */
508
509 /* XXX Should never have pages cached here. */
510 mutex_enter(vp->v_interlock);
511 VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
512 PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED);
513 while (uio->uio_resid > 0) {
514 lbn = lfs_lblkno(fs, uio->uio_offset);
515 blkoffset = lfs_blkoff(fs, uio->uio_offset);
516 xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
517 if (fs->fs_bsize > xfersize)
518 flags |= B_CLRBUF;
519 else
520 flags &= ~B_CLRBUF;
521
522 #ifdef LFS_READWRITE
523 error = lfs_reserve(fs, vp, NULL,
524 lfs_btofsb(fs, (ULFS_NIADDR + 1) << fs->lfs_bshift));
525 if (error)
526 break;
527 need_unreserve = true;
528 #endif
529 error = lfs_balloc(vp, uio->uio_offset, xfersize, cred, flags,
530 &bp);
531
532 if (error)
533 break;
534 if (uio->uio_offset + xfersize > ip->i_size) {
535 ip->i_size = uio->uio_offset + xfersize;
536 DIP_ASSIGN(ip, size, ip->i_size);
537 uvm_vnp_setsize(vp, ip->i_size);
538 extended = 1;
539 }
540 size = lfs_blksize(fs, ip, lbn) - bp->b_resid;
541 if (xfersize > size)
542 xfersize = size;
543
544 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
545
546 /*
547 * if we didn't clear the block and the uiomove failed,
548 * the buf will now contain part of some other file,
549 * so we need to invalidate it.
550 */
551 if (error && (flags & B_CLRBUF) == 0) {
552 brelse(bp, BC_INVAL);
553 break;
554 }
555 #ifdef LFS_READWRITE
556 (void)VOP_BWRITE(bp->b_vp, bp);
557 lfs_reserve(fs, vp, NULL,
558 -lfs_btofsb(fs, (ULFS_NIADDR + 1) << fs->lfs_bshift));
559 need_unreserve = false;
560 #else
561 if (ioflag & IO_SYNC)
562 (void)bwrite(bp);
563 else if (xfersize + blkoffset == fs->fs_bsize)
564 bawrite(bp);
565 else
566 bdwrite(bp);
567 #endif
568 if (error || xfersize == 0)
569 break;
570 }
571 #ifdef LFS_READWRITE
572 if (need_unreserve) {
573 lfs_reserve(fs, vp, NULL,
574 -lfs_btofsb(fs, (ULFS_NIADDR + 1) << fs->lfs_bshift));
575 }
576 #endif
577
578 error = ulfs_post_write_update(vp, uio, ioflag, cred, osize, resid,
579 extended, error);
580 fstrans_done(vp->v_mount);
581
582 return (error);
583 }
584
585 static int
586 ulfs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag,
587 kauth_cred_t cred, off_t osize, int resid, int extended, int error)
588 {
589 struct inode *ip = VTOI(vp);
590
591 /* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */
592 ip->i_flag |= IN_CHANGE | IN_UPDATE;
593 if (vp->v_mount->mnt_flag & MNT_RELATIME)
594 ip->i_flag |= IN_ACCESS;
595
596 /*
597 * If we successfully wrote any data and we are not the superuser,
598 * we clear the setuid and setgid bits as a precaution against
599 * tampering.
600 */
601 if (resid > uio->uio_resid && cred) {
602 if (ip->i_mode & ISUID) {
603 if (kauth_authorize_vnode(cred,
604 KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
605 ip->i_mode &= ~ISUID;
606 DIP_ASSIGN(ip, mode, ip->i_mode);
607 }
608 }
609
610 if (ip->i_mode & ISGID) {
611 if (kauth_authorize_vnode(cred,
612 KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
613 ip->i_mode &= ~ISGID;
614 DIP_ASSIGN(ip, mode, ip->i_mode);
615 }
616 }
617 }
618
619 /* If we successfully wrote anything, notify kevent listeners. */
620 if (resid > uio->uio_resid)
621 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
622
623 /*
624 * Update the size on disk: truncate back to original size on
625 * error, or reflect the new size on success.
626 */
627 if (error) {
628 (void) lfs_truncate(vp, osize, ioflag & IO_SYNC, cred);
629 uio->uio_offset -= resid - uio->uio_resid;
630 uio->uio_resid = resid;
631 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) {
632 error = lfs_update(vp, NULL, NULL, UPDATE_WAIT);
633 } else {
634 /* nothing */
635 }
636
637 /* Make sure the vnode uvm size matches the inode file size. */
638 KASSERT(vp->v_size == ip->i_size);
639
640 return error;
641 }
642