lfs_balloc.c revision 1.57 1 /* $NetBSD: lfs_balloc.c,v 1.57 2005/11/02 12:39:14 yamt Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38 /*
39 * Copyright (c) 1989, 1991, 1993
40 * The Regents of the University of California. All rights reserved.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)lfs_balloc.c 8.4 (Berkeley) 5/8/95
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.57 2005/11/02 12:39:14 yamt Exp $");
71
72 #if defined(_KERNEL_OPT)
73 #include "opt_quota.h"
74 #endif
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/buf.h>
79 #include <sys/proc.h>
80 #include <sys/vnode.h>
81 #include <sys/mount.h>
82 #include <sys/resourcevar.h>
83 #include <sys/tree.h>
84 #include <sys/trace.h>
85
86 #include <miscfs/specfs/specdev.h>
87
88 #include <ufs/ufs/quota.h>
89 #include <ufs/ufs/inode.h>
90 #include <ufs/ufs/ufsmount.h>
91 #include <ufs/ufs/ufs_extern.h>
92
93 #include <ufs/lfs/lfs.h>
94 #include <ufs/lfs/lfs_extern.h>
95
96 #include <uvm/uvm.h>
97
98 int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, struct ucred *);
99
100 u_int64_t locked_fakequeue_count;
101
102 /*
103 * Allocate a block, and to inode and filesystem block accounting for it
104 * and for any indirect blocks the may need to be created in order for
105 * this block to be created.
106 *
107 * Blocks which have never been accounted for (i.e., which "do not exist")
108 * have disk address 0, which is translated by ufs_bmap to the special value
109 * UNASSIGNED == -1, as in the historical UFS.
110 *
111 * Blocks which have been accounted for but which have not yet been written
112 * to disk are given the new special disk address UNWRITTEN == -2, so that
113 * they can be differentiated from completely new blocks.
114 */
115 /* VOP_BWRITE NIADDR+2 times */
116 int
117 lfs_balloc(struct vnode *vp, off_t startoffset, int iosize, struct ucred *cred,
118 int flags, struct buf **bpp)
119 {
120 int offset;
121 daddr_t daddr, idaddr;
122 struct buf *ibp, *bp;
123 struct inode *ip;
124 struct lfs *fs;
125 struct indir indirs[NIADDR+2], *idp;
126 daddr_t lbn, lastblock;
127 int bb, bcount;
128 int error, frags, i, nsize, osize, num;
129
130 ip = VTOI(vp);
131 fs = ip->i_lfs;
132 offset = blkoff(fs, startoffset);
133 KASSERT(iosize <= fs->lfs_bsize);
134 lbn = lblkno(fs, startoffset);
135 /* (void)lfs_check(vp, lbn, 0); */
136
137 ASSERT_MAYBE_SEGLOCK(fs);
138
139 /*
140 * Three cases: it's a block beyond the end of file, it's a block in
141 * the file that may or may not have been assigned a disk address or
142 * we're writing an entire block.
143 *
144 * Note, if the daddr is UNWRITTEN, the block already exists in
145 * the cache (it was read or written earlier). If so, make sure
146 * we don't count it as a new block or zero out its contents. If
147 * it did not, make sure we allocate any necessary indirect
148 * blocks.
149 *
150 * If we are writing a block beyond the end of the file, we need to
151 * check if the old last block was a fragment. If it was, we need
152 * to rewrite it.
153 */
154
155 if (bpp)
156 *bpp = NULL;
157
158 /* Check for block beyond end of file and fragment extension needed. */
159 lastblock = lblkno(fs, ip->i_size);
160 if (lastblock < NDADDR && lastblock < lbn) {
161 osize = blksize(fs, ip, lastblock);
162 if (osize < fs->lfs_bsize && osize > 0) {
163 if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
164 lastblock,
165 (bpp ? &bp : NULL), cred)))
166 return (error);
167 ip->i_ffs1_size = ip->i_size =
168 (lastblock + 1) * fs->lfs_bsize;
169 uvm_vnp_setsize(vp, ip->i_size);
170 ip->i_flag |= IN_CHANGE | IN_UPDATE;
171 if (bpp)
172 (void) VOP_BWRITE(bp);
173 }
174 }
175
176 /*
177 * If the block we are writing is a direct block, it's the last
178 * block in the file, and offset + iosize is less than a full
179 * block, we can write one or more fragments. There are two cases:
180 * the block is brand new and we should allocate it the correct
181 * size or it already exists and contains some fragments and
182 * may need to extend it.
183 */
184 if (lbn < NDADDR && lblkno(fs, ip->i_size) <= lbn) {
185 osize = blksize(fs, ip, lbn);
186 nsize = fragroundup(fs, offset + iosize);
187 if (lblktosize(fs, lbn) >= ip->i_size) {
188 /* Brand new block or fragment */
189 frags = numfrags(fs, nsize);
190 bb = fragstofsb(fs, frags);
191 if (!ISSPACE(fs, bb, cred))
192 return ENOSPC;
193 if (bpp) {
194 *bpp = bp = getblk(vp, lbn, nsize, 0, 0);
195 bp->b_blkno = UNWRITTEN;
196 if (flags & B_CLRBUF)
197 clrbuf(bp);
198 }
199 ip->i_lfs_effnblks += bb;
200 simple_lock(&fs->lfs_interlock);
201 fs->lfs_bfree -= bb;
202 simple_unlock(&fs->lfs_interlock);
203 ip->i_ffs1_db[lbn] = UNWRITTEN;
204 } else {
205 if (nsize <= osize) {
206 /* No need to extend */
207 if (bpp && (error = bread(vp, lbn, osize, NOCRED, &bp)))
208 return error;
209 } else {
210 /* Extend existing block */
211 if ((error =
212 lfs_fragextend(vp, osize, nsize, lbn,
213 (bpp ? &bp : NULL), cred)))
214 return error;
215 }
216 if (bpp)
217 *bpp = bp;
218 }
219 return 0;
220 }
221
222 error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL, NULL);
223 if (error)
224 return (error);
225
226 daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
227 KASSERT(daddr <= LFS_MAX_DADDR);
228
229 /*
230 * Do byte accounting all at once, so we can gracefully fail *before*
231 * we start assigning blocks.
232 */
233 bb = VFSTOUFS(vp->v_mount)->um_seqinc;
234 bcount = 0;
235 if (daddr == UNASSIGNED) {
236 bcount = bb;
237 }
238 for (i = 1; i < num; ++i) {
239 if (!indirs[i].in_exists) {
240 bcount += bb;
241 }
242 }
243 if (ISSPACE(fs, bcount, cred)) {
244 simple_lock(&fs->lfs_interlock);
245 fs->lfs_bfree -= bcount;
246 simple_unlock(&fs->lfs_interlock);
247 ip->i_lfs_effnblks += bcount;
248 } else {
249 return ENOSPC;
250 }
251
252 if (daddr == UNASSIGNED) {
253 if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) {
254 ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
255 }
256
257 /*
258 * Create new indirect blocks if necessary
259 */
260 if (num > 1) {
261 idaddr = ip->i_ffs1_ib[indirs[0].in_off];
262 for (i = 1; i < num; ++i) {
263 ibp = getblk(vp, indirs[i].in_lbn,
264 fs->lfs_bsize, 0,0);
265 if (!indirs[i].in_exists) {
266 clrbuf(ibp);
267 ibp->b_blkno = UNWRITTEN;
268 } else if (!(ibp->b_flags & (B_DELWRI | B_DONE))) {
269 ibp->b_blkno = fsbtodb(fs, idaddr);
270 ibp->b_flags |= B_READ;
271 VOP_STRATEGY(vp, ibp);
272 biowait(ibp);
273 }
274 /*
275 * This block exists, but the next one may not.
276 * If that is the case mark it UNWRITTEN to keep
277 * the accounting straight.
278 */
279 /* XXX ondisk32 */
280 if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0)
281 ((int32_t *)ibp->b_data)[indirs[i].in_off] =
282 UNWRITTEN;
283 /* XXX ondisk32 */
284 idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off];
285 #ifdef DEBUG
286 if (vp == fs->lfs_ivnode) {
287 LFS_ENTER_LOG("balloc", __FILE__,
288 __LINE__, indirs[i].in_lbn,
289 ibp->b_flags, curproc->p_pid);
290 }
291 #endif
292 if ((error = VOP_BWRITE(ibp)))
293 return error;
294 }
295 }
296 }
297
298
299 /*
300 * Get the existing block from the cache, if requested.
301 */
302 frags = fsbtofrags(fs, bb);
303 if (bpp)
304 *bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0);
305
306 /*
307 * Do accounting on blocks that represent pages.
308 */
309 if (!bpp)
310 lfs_register_block(vp, lbn);
311
312 /*
313 * The block we are writing may be a brand new block
314 * in which case we need to do accounting.
315 *
316 * We can tell a truly new block because ufs_bmaparray will say
317 * it is UNASSIGNED. Once we allocate it we will assign it the
318 * disk address UNWRITTEN.
319 */
320 if (daddr == UNASSIGNED) {
321 if (bpp) {
322 if (flags & B_CLRBUF)
323 clrbuf(bp);
324
325 /* Note the new address */
326 bp->b_blkno = UNWRITTEN;
327 }
328
329 switch (num) {
330 case 0:
331 ip->i_ffs1_db[lbn] = UNWRITTEN;
332 break;
333 case 1:
334 ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
335 break;
336 default:
337 idp = &indirs[num - 1];
338 if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED,
339 &ibp))
340 panic("lfs_balloc: bread bno %lld",
341 (long long)idp->in_lbn);
342 /* XXX ondisk32 */
343 ((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN;
344 #ifdef DEBUG
345 if (vp == fs->lfs_ivnode) {
346 LFS_ENTER_LOG("balloc", __FILE__,
347 __LINE__, idp->in_lbn,
348 ibp->b_flags, curproc->p_pid);
349 }
350 #endif
351 VOP_BWRITE(ibp);
352 }
353 } else if (bpp && !(bp->b_flags & (B_DONE|B_DELWRI))) {
354 /*
355 * Not a brand new block, also not in the cache;
356 * read it in from disk.
357 */
358 if (iosize == fs->lfs_bsize)
359 /* Optimization: I/O is unnecessary. */
360 bp->b_blkno = daddr;
361 else {
362 /*
363 * We need to read the block to preserve the
364 * existing bytes.
365 */
366 bp->b_blkno = daddr;
367 bp->b_flags |= B_READ;
368 VOP_STRATEGY(vp, bp);
369 return (biowait(bp));
370 }
371 }
372
373 return (0);
374 }
375
376 /* VOP_BWRITE 1 time */
377 int
378 lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf **bpp, struct ucred *cred)
379 {
380 struct inode *ip;
381 struct lfs *fs;
382 long bb;
383 int error;
384 extern long locked_queue_bytes;
385 size_t obufsize;
386
387 ip = VTOI(vp);
388 fs = ip->i_lfs;
389 bb = (long)fragstofsb(fs, numfrags(fs, nsize - osize));
390 error = 0;
391
392 ASSERT_DUNNO_SEGLOCK(fs);
393
394 /*
395 * Get the seglock so we don't enlarge blocks while a segment
396 * is being written. If we're called with bpp==NULL, though,
397 * we are only pretending to change a buffer, so we don't have to
398 * lock.
399 */
400 top:
401 if (bpp) {
402 lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
403 LFS_DEBUG_COUNTLOCKED("frag");
404 }
405
406 if (!ISSPACE(fs, bb, cred)) {
407 error = ENOSPC;
408 goto out;
409 }
410
411 /*
412 * If we are not asked to actually return the block, all we need
413 * to do is allocate space for it. UBC will handle dirtying the
414 * appropriate things and making sure it all goes to disk.
415 * Don't bother to read in that case.
416 */
417 if (bpp && (error = bread(vp, lbn, osize, NOCRED, bpp))) {
418 brelse(*bpp);
419 goto out;
420 }
421 #ifdef QUOTA
422 if ((error = chkdq(ip, bb, cred, 0))) {
423 if (bpp)
424 brelse(*bpp);
425 goto out;
426 }
427 #endif
428 /*
429 * Adjust accounting for lfs_avail. If there's not enough room,
430 * we will have to wait for the cleaner, which we can't do while
431 * holding a block busy or while holding the seglock. In that case,
432 * release both and start over after waiting.
433 */
434
435 if (bpp && ((*bpp)->b_flags & B_DELWRI)) {
436 if (!lfs_fits(fs, bb)) {
437 if (bpp)
438 brelse(*bpp);
439 #ifdef QUOTA
440 chkdq(ip, -bb, cred, 0);
441 #endif
442 lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
443 lfs_availwait(fs, bb);
444 goto top;
445 }
446 fs->lfs_avail -= bb;
447 }
448
449 simple_lock(&fs->lfs_interlock);
450 fs->lfs_bfree -= bb;
451 simple_unlock(&fs->lfs_interlock);
452 ip->i_lfs_effnblks += bb;
453 ip->i_flag |= IN_CHANGE | IN_UPDATE;
454
455 if (bpp) {
456 obufsize = (*bpp)->b_bufsize;
457 allocbuf(*bpp, nsize, 1);
458
459 /* Adjust locked-list accounting */
460 if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED) {
461 simple_lock(&lfs_subsys_lock);
462 locked_queue_bytes += (*bpp)->b_bufsize - obufsize;
463 simple_unlock(&lfs_subsys_lock);
464 }
465
466 bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize));
467 }
468
469 out:
470 if (bpp) {
471 lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
472 }
473 return (error);
474 }
475
476 static __inline int
477 lge(struct lbnentry *a, struct lbnentry *b)
478 {
479 return a->lbn - b->lbn;
480 }
481
482 SPLAY_PROTOTYPE(lfs_splay, lbnentry, entry, lge);
483
484 SPLAY_GENERATE(lfs_splay, lbnentry, entry, lge);
485
486 /*
487 * Record this lbn as being "write pending". We used to have this information
488 * on the buffer headers, but since pages don't have buffer headers we
489 * record it here instead.
490 */
491 void
492 lfs_register_block(struct vnode *vp, daddr_t lbn)
493 {
494 struct lfs *fs;
495 struct inode *ip;
496 struct lbnentry *lbp;
497
498 ip = VTOI(vp);
499
500 /* Don't count metadata */
501 if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
502 return;
503
504 fs = ip->i_lfs;
505
506 ASSERT_NO_SEGLOCK(fs);
507
508 /* If no space, wait for the cleaner */
509 lfs_availwait(fs, btofsb(fs, 1 << fs->lfs_bshift));
510
511 lbp = (struct lbnentry *)pool_get(&lfs_lbnentry_pool, PR_WAITOK);
512 lbp->lbn = lbn;
513 if (SPLAY_INSERT(lfs_splay, &ip->i_lfs_lbtree, lbp) != NULL) {
514 /* Already there */
515 pool_put(&lfs_lbnentry_pool, lbp);
516 return;
517 }
518
519 ++ip->i_lfs_nbtree;
520 simple_lock(&fs->lfs_interlock);
521 fs->lfs_favail += btofsb(fs, (1 << fs->lfs_bshift));
522 fs->lfs_pages += fs->lfs_bsize >> PAGE_SHIFT;
523 simple_lock(&lfs_subsys_lock);
524 ++locked_fakequeue_count;
525 lfs_subsys_pages += fs->lfs_bsize >> PAGE_SHIFT;
526 simple_unlock(&lfs_subsys_lock);
527 simple_unlock(&fs->lfs_interlock);
528 }
529
530 static void
531 lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp)
532 {
533 ASSERT_MAYBE_SEGLOCK(fs);
534
535 --ip->i_lfs_nbtree;
536 SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp);
537 pool_put(&lfs_lbnentry_pool, lbp);
538 simple_lock(&fs->lfs_interlock);
539 if (fs->lfs_favail > btofsb(fs, (1 << fs->lfs_bshift)))
540 fs->lfs_favail -= btofsb(fs, (1 << fs->lfs_bshift));
541 fs->lfs_pages -= fs->lfs_bsize >> PAGE_SHIFT;
542 simple_lock(&lfs_subsys_lock);
543 if (locked_fakequeue_count > 0)
544 --locked_fakequeue_count;
545 lfs_subsys_pages -= fs->lfs_bsize >> PAGE_SHIFT;
546 simple_unlock(&lfs_subsys_lock);
547 simple_unlock(&fs->lfs_interlock);
548 }
549
550 void
551 lfs_deregister_block(struct vnode *vp, daddr_t lbn)
552 {
553 struct lfs *fs;
554 struct inode *ip;
555 struct lbnentry *lbp;
556 struct lbnentry tmp;
557
558 ip = VTOI(vp);
559
560 /* Don't count metadata */
561 if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
562 return;
563
564 fs = ip->i_lfs;
565 tmp.lbn = lbn;
566 lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp);
567 if (lbp == NULL)
568 return;
569
570 lfs_do_deregister(fs, ip, lbp);
571 }
572
573 void
574 lfs_deregister_all(struct vnode *vp)
575 {
576 struct lbnentry *lbp, *nlbp;
577 struct lfs_splay *hd;
578 struct lfs *fs;
579 struct inode *ip;
580
581 ip = VTOI(vp);
582 fs = ip->i_lfs;
583 hd = &ip->i_lfs_lbtree;
584
585 for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) {
586 nlbp = SPLAY_NEXT(lfs_splay, hd, lbp);
587 lfs_do_deregister(fs, ip, lbp);
588 }
589 }
590