/*	$NetBSD: lfs_balloc.c,v 1.59.10.1 2006/03/08 01:39:12 elad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_balloc.c	8.4 (Berkeley) 5/8/95
 */
68
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.59.10.1 2006/03/08 01:39:12 elad Exp $");

#if defined(_KERNEL_OPT)
#include "opt_quota.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/tree.h>
#include <sys/trace.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

#include <uvm/uvm.h>
97
/* Forward declaration; lfs_fragextend() is defined later in this file. */
int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, kauth_cred_t);

/*
 * Count of blocks registered as "write pending" via lfs_register_block()
 * below; adjusted under lfs_subsys_lock.
 */
u_int64_t locked_fakequeue_count;
101
/*
 * Allocate a block, and do inode and filesystem block accounting for it
 * and for any indirect blocks that may need to be created in order for
 * this block to be created.
 *
 * Blocks which have never been accounted for (i.e., which "do not exist")
 * have disk address 0, which is translated by ufs_bmap to the special value
 * UNASSIGNED == -1, as in the historical UFS.
 *
 * Blocks which have been accounted for but which have not yet been written
 * to disk are given the new special disk address UNWRITTEN == -2, so that
 * they can be differentiated from completely new blocks.
 *
 * If bpp != NULL, on success *bpp holds a buffer for the requested block;
 * if bpp == NULL, only the accounting is performed (and the lbn is
 * registered as write-pending; see lfs_register_block()).  Returns 0 on
 * success or an errno (e.g. ENOSPC) on failure.
 */
/* VOP_BWRITE NIADDR+2 times */
int
lfs_balloc(struct vnode *vp, off_t startoffset, int iosize, kauth_cred_t cred,
    int flags, struct buf **bpp)
{
	int offset;
	daddr_t daddr, idaddr;
	struct buf *ibp, *bp;
	struct inode *ip;
	struct lfs *fs;
	struct indir indirs[NIADDR+2], *idp;
	daddr_t lbn, lastblock;
	int bb, bcount;		/* bb: fsb count for one block's worth */
	int error, frags, i, nsize, osize, num;

	ip = VTOI(vp);
	fs = ip->i_lfs;
	offset = blkoff(fs, startoffset);
	KASSERT(iosize <= fs->lfs_bsize);
	lbn = lblkno(fs, startoffset);
	/* (void)lfs_check(vp, lbn, 0); */

	ASSERT_MAYBE_SEGLOCK(fs);

	/*
	 * Three cases: it's a block beyond the end of file, it's a block in
	 * the file that may or may not have been assigned a disk address or
	 * we're writing an entire block.
	 *
	 * Note, if the daddr is UNWRITTEN, the block already exists in
	 * the cache (it was read or written earlier).  If so, make sure
	 * we don't count it as a new block or zero out its contents.  If
	 * it did not, make sure we allocate any necessary indirect
	 * blocks.
	 *
	 * If we are writing a block beyond the end of the file, we need to
	 * check if the old last block was a fragment.  If it was, we need
	 * to rewrite it.
	 */

	if (bpp)
		*bpp = NULL;

	/* Check for block beyond end of file and fragment extension needed. */
	lastblock = lblkno(fs, ip->i_size);
	if (lastblock < NDADDR && lastblock < lbn) {
		osize = blksize(fs, ip, lastblock);
		if (osize < fs->lfs_bsize && osize > 0) {
			/* Old last block was a fragment: grow it to a
			 * full block before writing past it. */
			if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
						    lastblock,
						    (bpp ? &bp : NULL), cred)))
				return (error);
			ip->i_ffs1_size = ip->i_size =
			    (lastblock + 1) * fs->lfs_bsize;
			uvm_vnp_setsize(vp, ip->i_size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			/* lfs_fragextend() set bp only when bpp != NULL. */
			if (bpp)
				(void) VOP_BWRITE(bp);
		}
	}

	/*
	 * If the block we are writing is a direct block, it's the last
	 * block in the file, and offset + iosize is less than a full
	 * block, we can write one or more fragments.  There are two cases:
	 * the block is brand new and we should allocate it the correct
	 * size or it already exists and contains some fragments and
	 * may need to extend it.
	 */
	if (lbn < NDADDR && lblkno(fs, ip->i_size) <= lbn) {
		osize = blksize(fs, ip, lbn);
		nsize = fragroundup(fs, offset + iosize);
		if (lblktosize(fs, lbn) >= ip->i_size) {
			/* Brand new block or fragment */
			frags = numfrags(fs, nsize);
			bb = fragstofsb(fs, frags);
			if (!ISSPACE(fs, bb, cred))
				return ENOSPC;
			if (bpp) {
				*bpp = bp = getblk(vp, lbn, nsize, 0, 0);
				bp->b_blkno = UNWRITTEN;
				if (flags & B_CLRBUF)
					clrbuf(bp);
			}
			/* Charge the new fragment(s) to the inode and fs. */
			ip->i_lfs_effnblks += bb;
			simple_lock(&fs->lfs_interlock);
			fs->lfs_bfree -= bb;
			simple_unlock(&fs->lfs_interlock);
			ip->i_ffs1_db[lbn] = UNWRITTEN;
		} else {
			if (nsize <= osize) {
				/* No need to extend */
				if (bpp && (error = bread(vp, lbn, osize, NOCRED, &bp)))
					return error;
			} else {
				/* Extend existing block */
				if ((error =
				     lfs_fragextend(vp, osize, nsize, lbn,
						    (bpp ? &bp : NULL), cred)))
					return error;
			}
			if (bpp)
				*bpp = bp;
		}
		return 0;
	}

	/* Full-sized block: look up its (possibly indirect) mapping. */
	error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL, NULL);
	if (error)
		return (error);

	/* Sign-extend the on-disk 32-bit address. */
	daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
	KASSERT(daddr <= LFS_MAX_DADDR);

	/*
	 * Do byte accounting all at once, so we can gracefully fail *before*
	 * we start assigning blocks.
	 */
	bb = VFSTOUFS(vp->v_mount)->um_seqinc;
	bcount = 0;
	if (daddr == UNASSIGNED) {
		bcount = bb;	/* the data block itself is new */
	}
	for (i = 1; i < num; ++i) {
		if (!indirs[i].in_exists) {
			bcount += bb;	/* a new indirect block */
		}
	}
	if (ISSPACE(fs, bcount, cred)) {
		simple_lock(&fs->lfs_interlock);
		fs->lfs_bfree -= bcount;
		simple_unlock(&fs->lfs_interlock);
		ip->i_lfs_effnblks += bcount;
	} else {
		return ENOSPC;
	}

	if (daddr == UNASSIGNED) {
		/* Mark the first-level indirect pointer allocated. */
		if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) {
			ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
		}

		/*
		 * Create new indirect blocks if necessary
		 */
		if (num > 1) {
			idaddr = ip->i_ffs1_ib[indirs[0].in_off];
			for (i = 1; i < num; ++i) {
				ibp = getblk(vp, indirs[i].in_lbn,
				    fs->lfs_bsize, 0,0);
				if (!indirs[i].in_exists) {
					/* Brand new indirect block. */
					clrbuf(ibp);
					ibp->b_blkno = UNWRITTEN;
				} else if (!(ibp->b_flags & (B_DELWRI | B_DONE))) {
					/* Exists on disk but not cached:
					 * read it synchronously. */
					ibp->b_blkno = fsbtodb(fs, idaddr);
					ibp->b_flags |= B_READ;
					VOP_STRATEGY(vp, ibp);
					biowait(ibp);
				}
				/*
				 * This block exists, but the next one may not.
				 * If that is the case mark it UNWRITTEN to keep
				 * the accounting straight.
				 */
				/* XXX ondisk32 */
				if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0)
					((int32_t *)ibp->b_data)[indirs[i].in_off] =
						UNWRITTEN;
				/* XXX ondisk32 */
				idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off];
#ifdef DEBUG
				if (vp == fs->lfs_ivnode) {
					LFS_ENTER_LOG("balloc", __FILE__,
					    __LINE__, indirs[i].in_lbn,
					    ibp->b_flags, curproc->p_pid);
				}
#endif
				if ((error = VOP_BWRITE(ibp)))
					return error;
			}
		}
	}


	/*
	 * Get the existing block from the cache, if requested.
	 */
	frags = fsbtofrags(fs, bb);
	if (bpp)
		*bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0);

	/*
	 * Do accounting on blocks that represent pages.
	 */
	if (!bpp)
		lfs_register_block(vp, lbn);

	/*
	 * The block we are writing may be a brand new block
	 * in which case we need to do accounting.
	 *
	 * We can tell a truly new block because ufs_bmaparray will say
	 * it is UNASSIGNED.  Once we allocate it we will assign it the
	 * disk address UNWRITTEN.
	 */
	if (daddr == UNASSIGNED) {
		if (bpp) {
			if (flags & B_CLRBUF)
				clrbuf(bp);

			/* Note the new address */
			bp->b_blkno = UNWRITTEN;
		}

		/* Record UNWRITTEN in whichever pointer maps this lbn. */
		switch (num) {
		    case 0:
			ip->i_ffs1_db[lbn] = UNWRITTEN;
			break;
		    case 1:
			ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
			break;
		    default:
			idp = &indirs[num - 1];
			if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED,
				  &ibp))
				panic("lfs_balloc: bread bno %lld",
				      (long long)idp->in_lbn);
			/* XXX ondisk32 */
			((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN;
#ifdef DEBUG
			if (vp == fs->lfs_ivnode) {
				LFS_ENTER_LOG("balloc", __FILE__,
				    __LINE__, idp->in_lbn,
				    ibp->b_flags, curproc->p_pid);
			}
#endif
			VOP_BWRITE(ibp);
		}
	} else if (bpp && !(bp->b_flags & (B_DONE|B_DELWRI))) {
		/*
		 * Not a brand new block, also not in the cache;
		 * read it in from disk.
		 */
		if (iosize == fs->lfs_bsize)
			/* Optimization: I/O is unnecessary. */
			bp->b_blkno = daddr;
		else {
			/*
			 * We need to read the block to preserve the
			 * existing bytes.
			 */
			bp->b_blkno = daddr;
			bp->b_flags |= B_READ;
			VOP_STRATEGY(vp, bp);
			return (biowait(bp));
		}
	}

	return (0);
}
375
/*
 * Extend the fragment at logical block lbn from osize to nsize bytes,
 * doing the necessary free-space, quota, and locked-queue accounting.
 * If bpp != NULL the fragment's buffer is read, enlarged with allocbuf(),
 * zero-filled over the new bytes, and returned busy in *bpp; if bpp ==
 * NULL only the accounting is performed.  Returns 0 or an errno.
 */
/* VOP_BWRITE 1 time */
int
lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf **bpp,
	       kauth_cred_t cred)
{
	struct inode *ip;
	struct lfs *fs;
	long bb;		/* fsb count of the size increase */
	int error;
	extern long locked_queue_bytes;
	size_t obufsize;

	ip = VTOI(vp);
	fs = ip->i_lfs;
	bb = (long)fragstofsb(fs, numfrags(fs, nsize - osize));
	error = 0;

	ASSERT_DUNNO_SEGLOCK(fs);

	/*
	 * Get the seglock so we don't enlarge blocks while a segment
	 * is being written.  If we're called with bpp==NULL, though,
	 * we are only pretending to change a buffer, so we don't have to
	 * lock.
	 */
    top:
	if (bpp) {
		lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
		LFS_DEBUG_COUNTLOCKED("frag");
	}

	if (!ISSPACE(fs, bb, cred)) {
		error = ENOSPC;
		goto out;
	}

	/*
	 * If we are not asked to actually return the block, all we need
	 * to do is allocate space for it.  UBC will handle dirtying the
	 * appropriate things and making sure it all goes to disk.
	 * Don't bother to read in that case.
	 */
	if (bpp && (error = bread(vp, lbn, osize, NOCRED, bpp))) {
		brelse(*bpp);
		goto out;
	}
#ifdef QUOTA
	if ((error = chkdq(ip, bb, cred, 0))) {
		if (bpp)
			brelse(*bpp);
		goto out;
	}
#endif
	/*
	 * Adjust accounting for lfs_avail.  If there's not enough room,
	 * we will have to wait for the cleaner, which we can't do while
	 * holding a block busy or while holding the seglock.  In that case,
	 * release both and start over after waiting.
	 */

	if (bpp && ((*bpp)->b_flags & B_DELWRI)) {
		if (!lfs_fits(fs, bb)) {
			/* Drop buffer, quota charge and fraglock, then
			 * wait for the cleaner and retry from the top. */
			if (bpp)
				brelse(*bpp);
#ifdef QUOTA
			chkdq(ip, -bb, cred, 0);
#endif
			lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
			lfs_availwait(fs, bb);
			goto top;
		}
		fs->lfs_avail -= bb;
	}

	/* Commit the accounting. */
	simple_lock(&fs->lfs_interlock);
	fs->lfs_bfree -= bb;
	simple_unlock(&fs->lfs_interlock);
	ip->i_lfs_effnblks += bb;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

	if (bpp) {
		obufsize = (*bpp)->b_bufsize;
		allocbuf(*bpp, nsize, 1);

		/* Adjust locked-list accounting */
		if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED) {
			simple_lock(&lfs_subsys_lock);
			locked_queue_bytes += (*bpp)->b_bufsize - obufsize;
			simple_unlock(&lfs_subsys_lock);
		}

		/* Zero the newly added bytes. */
		bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize));
	}

    out:
	if (bpp) {
		lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
	}
	return (error);
}
476
477 static inline int
478 lge(struct lbnentry *a, struct lbnentry *b)
479 {
480 return a->lbn - b->lbn;
481 }
482
483 SPLAY_PROTOTYPE(lfs_splay, lbnentry, entry, lge);
484
485 SPLAY_GENERATE(lfs_splay, lbnentry, entry, lge);
486
487 /*
488 * Record this lbn as being "write pending". We used to have this information
489 * on the buffer headers, but since pages don't have buffer headers we
490 * record it here instead.
491 */
492 void
493 lfs_register_block(struct vnode *vp, daddr_t lbn)
494 {
495 struct lfs *fs;
496 struct inode *ip;
497 struct lbnentry *lbp;
498
499 ip = VTOI(vp);
500
501 /* Don't count metadata */
502 if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
503 return;
504
505 fs = ip->i_lfs;
506
507 ASSERT_NO_SEGLOCK(fs);
508
509 /* If no space, wait for the cleaner */
510 lfs_availwait(fs, btofsb(fs, 1 << fs->lfs_bshift));
511
512 lbp = (struct lbnentry *)pool_get(&lfs_lbnentry_pool, PR_WAITOK);
513 lbp->lbn = lbn;
514 if (SPLAY_INSERT(lfs_splay, &ip->i_lfs_lbtree, lbp) != NULL) {
515 /* Already there */
516 pool_put(&lfs_lbnentry_pool, lbp);
517 return;
518 }
519
520 ++ip->i_lfs_nbtree;
521 simple_lock(&fs->lfs_interlock);
522 fs->lfs_favail += btofsb(fs, (1 << fs->lfs_bshift));
523 fs->lfs_pages += fs->lfs_bsize >> PAGE_SHIFT;
524 simple_lock(&lfs_subsys_lock);
525 ++locked_fakequeue_count;
526 lfs_subsys_pages += fs->lfs_bsize >> PAGE_SHIFT;
527 simple_unlock(&lfs_subsys_lock);
528 simple_unlock(&fs->lfs_interlock);
529 }
530
531 static void
532 lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp)
533 {
534 ASSERT_MAYBE_SEGLOCK(fs);
535
536 --ip->i_lfs_nbtree;
537 SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp);
538 pool_put(&lfs_lbnentry_pool, lbp);
539 simple_lock(&fs->lfs_interlock);
540 if (fs->lfs_favail > btofsb(fs, (1 << fs->lfs_bshift)))
541 fs->lfs_favail -= btofsb(fs, (1 << fs->lfs_bshift));
542 fs->lfs_pages -= fs->lfs_bsize >> PAGE_SHIFT;
543 simple_lock(&lfs_subsys_lock);
544 if (locked_fakequeue_count > 0)
545 --locked_fakequeue_count;
546 lfs_subsys_pages -= fs->lfs_bsize >> PAGE_SHIFT;
547 simple_unlock(&lfs_subsys_lock);
548 simple_unlock(&fs->lfs_interlock);
549 }
550
551 void
552 lfs_deregister_block(struct vnode *vp, daddr_t lbn)
553 {
554 struct lfs *fs;
555 struct inode *ip;
556 struct lbnentry *lbp;
557 struct lbnentry tmp;
558
559 ip = VTOI(vp);
560
561 /* Don't count metadata */
562 if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
563 return;
564
565 fs = ip->i_lfs;
566 tmp.lbn = lbn;
567 lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp);
568 if (lbp == NULL)
569 return;
570
571 lfs_do_deregister(fs, ip, lbp);
572 }
573
574 void
575 lfs_deregister_all(struct vnode *vp)
576 {
577 struct lbnentry *lbp, *nlbp;
578 struct lfs_splay *hd;
579 struct lfs *fs;
580 struct inode *ip;
581
582 ip = VTOI(vp);
583 fs = ip->i_lfs;
584 hd = &ip->i_lfs_lbtree;
585
586 for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) {
587 nlbp = SPLAY_NEXT(lfs_splay, hd, lbp);
588 lfs_do_deregister(fs, ip, lbp);
589 }
590 }
591