ffs_alloc.c revision 1.35.4.2 1 /* $NetBSD: ffs_alloc.c,v 1.35.4.2 2001/11/25 19:23:27 he Exp $ */
2
3 /*
4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
36 */
37
38 #if defined(_KERNEL) && !defined(_LKM)
39 #include "opt_ffs.h"
40 #include "opt_quota.h"
41 #endif
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/buf.h>
46 #include <sys/proc.h>
47 #include <sys/vnode.h>
48 #include <sys/mount.h>
49 #include <sys/kernel.h>
50 #include <sys/syslog.h>
51
52 #include <vm/vm.h>
53
54 #include <uvm/uvm_extern.h>
55
56 #include <ufs/ufs/quota.h>
57 #include <ufs/ufs/ufsmount.h>
58 #include <ufs/ufs/inode.h>
59 #include <ufs/ufs/ufs_extern.h>
60 #include <ufs/ufs/ufs_bswap.h>
61
62 #include <ufs/ffs/fs.h>
63 #include <ufs/ffs/ffs_extern.h>
64
65 static ufs_daddr_t ffs_alloccg __P((struct inode *, int, ufs_daddr_t, int));
66 static ufs_daddr_t ffs_alloccgblk __P((struct inode *, struct buf *,
67 ufs_daddr_t));
68 static ufs_daddr_t ffs_clusteralloc __P((struct inode *, int, ufs_daddr_t, int));
69 static ino_t ffs_dirpref __P((struct fs *, ino_t));
70 static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int));
71 static void ffs_fserr __P((struct fs *, u_int, char *));
72 static u_long ffs_hashalloc
73 __P((struct inode *, int, long, int,
74 ufs_daddr_t (*)(struct inode *, int, ufs_daddr_t, int)));
75 static ufs_daddr_t ffs_nodealloccg __P((struct inode *, int, ufs_daddr_t, int));
76 static ufs_daddr_t ffs_mapsearch __P((struct fs *, struct cg *,
77 ufs_daddr_t, int));
78 #if defined(DIAGNOSTIC) || defined(DEBUG)
79 static int ffs_checkblk __P((struct inode *, ufs_daddr_t, long size));
80 #endif
81
82 /* if 1, changes in optimalization strategy are logged */
83 int ffs_log_changeopt = 0;
84
85 /* in ffs_tables.c */
86 extern int inside[], around[];
87 extern u_char *fragtbl[];
88
89 /*
90 * Allocate a block in the file system.
91 *
92 * The size of the requested block is given, which must be some
93 * multiple of fs_fsize and <= fs_bsize.
94 * A preference may be optionally specified. If a preference is given
95 * the following hierarchy is used to allocate a block:
96 * 1) allocate the requested block.
97 * 2) allocate a rotationally optimal block in the same cylinder.
98 * 3) allocate a block in the same cylinder group.
99 * 4) quadradically rehash into other cylinder groups, until an
100 * available block is located.
101 * If no block preference is given the following heirarchy is used
102 * to allocate a block:
103 * 1) allocate a block in the cylinder group that contains the
104 * inode for the file.
105 * 2) quadradically rehash into other cylinder groups, until an
106 * available block is located.
107 */
108 int
109 ffs_alloc(ip, lbn, bpref, size, cred, bnp)
110 struct inode *ip;
111 ufs_daddr_t lbn, bpref;
112 int size;
113 struct ucred *cred;
114 ufs_daddr_t *bnp;
115 {
116 struct fs *fs;
117 ufs_daddr_t bno;
118 int cg;
119 #ifdef QUOTA
120 int error;
121 #endif
122
123 *bnp = 0;
124 fs = ip->i_fs;
125 #ifdef DIAGNOSTIC
126 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
127 printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
128 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
129 panic("ffs_alloc: bad size");
130 }
131 if (cred == NOCRED)
132 panic("ffs_alloc: missing credential\n");
133 #endif /* DIAGNOSTIC */
134 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
135 goto nospace;
136 if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
137 goto nospace;
138 #ifdef QUOTA
139 if ((error = chkdq(ip, (long)btodb(size), cred, 0)) != 0)
140 return (error);
141 #endif
142 if (bpref >= fs->fs_size)
143 bpref = 0;
144 if (bpref == 0)
145 cg = ino_to_cg(fs, ip->i_number);
146 else
147 cg = dtog(fs, bpref);
148 bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size,
149 ffs_alloccg);
150 if (bno > 0) {
151 ip->i_ffs_blocks += btodb(size);
152 ip->i_flag |= IN_CHANGE | IN_UPDATE;
153 *bnp = bno;
154 return (0);
155 }
156 #ifdef QUOTA
157 /*
158 * Restore user's disk quota because allocation failed.
159 */
160 (void) chkdq(ip, (long)-btodb(size), cred, FORCE);
161 #endif
162 nospace:
163 ffs_fserr(fs, cred->cr_uid, "file system full");
164 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
165 return (ENOSPC);
166 }
167
168 /*
169 * Reallocate a fragment to a bigger size
170 *
171 * The number and size of the old block is given, and a preference
172 * and new size is also specified. The allocator attempts to extend
173 * the original block. Failing that, the regular block allocator is
174 * invoked to get an appropriate block.
175 */
176 int
177 ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
178 struct inode *ip;
179 ufs_daddr_t lbprev;
180 ufs_daddr_t bpref;
181 int osize, nsize;
182 struct ucred *cred;
183 struct buf **bpp;
184 {
185 struct fs *fs;
186 struct buf *bp;
187 int cg, request, error;
188 ufs_daddr_t bprev, bno;
189
190 *bpp = 0;
191 fs = ip->i_fs;
192 #ifdef DIAGNOSTIC
193 if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
194 (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
195 printf(
196 "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
197 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
198 panic("ffs_realloccg: bad size");
199 }
200 if (cred == NOCRED)
201 panic("ffs_realloccg: missing credential\n");
202 #endif /* DIAGNOSTIC */
203 if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
204 goto nospace;
205 if ((bprev = ufs_rw32(ip->i_ffs_db[lbprev], UFS_FSNEEDSWAP(fs))) == 0) {
206 printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n",
207 ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt);
208 panic("ffs_realloccg: bad bprev");
209 }
210 /*
211 * Allocate the extra space in the buffer.
212 */
213 if ((error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) != 0) {
214 brelse(bp);
215 return (error);
216 }
217 #ifdef QUOTA
218 if ((error = chkdq(ip, (long)btodb(nsize - osize), cred, 0)) != 0) {
219 brelse(bp);
220 return (error);
221 }
222 #endif
223 /*
224 * Check for extension in the existing location.
225 */
226 cg = dtog(fs, bprev);
227 if ((bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) != 0) {
228 if (bp->b_blkno != fsbtodb(fs, bno))
229 panic("bad blockno");
230 ip->i_ffs_blocks += btodb(nsize - osize);
231 ip->i_flag |= IN_CHANGE | IN_UPDATE;
232 allocbuf(bp, nsize);
233 bp->b_flags |= B_DONE;
234 memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
235 *bpp = bp;
236 return (0);
237 }
238 /*
239 * Allocate a new disk location.
240 */
241 if (bpref >= fs->fs_size)
242 bpref = 0;
243 switch ((int)fs->fs_optim) {
244 case FS_OPTSPACE:
245 /*
246 * Allocate an exact sized fragment. Although this makes
247 * best use of space, we will waste time relocating it if
248 * the file continues to grow. If the fragmentation is
249 * less than half of the minimum free reserve, we choose
250 * to begin optimizing for time.
251 */
252 request = nsize;
253 if (fs->fs_minfree < 5 ||
254 fs->fs_cstotal.cs_nffree >
255 fs->fs_dsize * fs->fs_minfree / (2 * 100))
256 break;
257
258 if (ffs_log_changeopt) {
259 log(LOG_NOTICE,
260 "%s: optimization changed from SPACE to TIME\n",
261 fs->fs_fsmnt);
262 }
263
264 fs->fs_optim = FS_OPTTIME;
265 break;
266 case FS_OPTTIME:
267 /*
268 * At this point we have discovered a file that is trying to
269 * grow a small fragment to a larger fragment. To save time,
270 * we allocate a full sized block, then free the unused portion.
271 * If the file continues to grow, the `ffs_fragextend' call
272 * above will be able to grow it in place without further
273 * copying. If aberrant programs cause disk fragmentation to
274 * grow within 2% of the free reserve, we choose to begin
275 * optimizing for space.
276 */
277 request = fs->fs_bsize;
278 if (fs->fs_cstotal.cs_nffree <
279 fs->fs_dsize * (fs->fs_minfree - 2) / 100)
280 break;
281
282 if (ffs_log_changeopt) {
283 log(LOG_NOTICE,
284 "%s: optimization changed from TIME to SPACE\n",
285 fs->fs_fsmnt);
286 }
287
288 fs->fs_optim = FS_OPTSPACE;
289 break;
290 default:
291 printf("dev = 0x%x, optim = %d, fs = %s\n",
292 ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
293 panic("ffs_realloccg: bad optim");
294 /* NOTREACHED */
295 }
296 bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request,
297 ffs_alloccg);
298 if (bno > 0) {
299 bp->b_blkno = fsbtodb(fs, bno);
300 (void) uvm_vnp_uncache(ITOV(ip));
301 if (!DOINGSOFTDEP(ITOV(ip)))
302 ffs_blkfree(ip, bprev, (long)osize);
303 if (nsize < request)
304 ffs_blkfree(ip, bno + numfrags(fs, nsize),
305 (long)(request - nsize));
306 ip->i_ffs_blocks += btodb(nsize - osize);
307 ip->i_flag |= IN_CHANGE | IN_UPDATE;
308 allocbuf(bp, nsize);
309 bp->b_flags |= B_DONE;
310 memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
311 *bpp = bp;
312 return (0);
313 }
314 #ifdef QUOTA
315 /*
316 * Restore user's disk quota because allocation failed.
317 */
318 (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE);
319 #endif
320 brelse(bp);
321 nospace:
322 /*
323 * no space available
324 */
325 ffs_fserr(fs, cred->cr_uid, "file system full");
326 uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
327 return (ENOSPC);
328 }
329
330 /*
331 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
332 *
333 * The vnode and an array of buffer pointers for a range of sequential
334 * logical blocks to be made contiguous is given. The allocator attempts
335 * to find a range of sequential blocks starting as close as possible to
336 * an fs_rotdelay offset from the end of the allocation for the logical
337 * block immediately preceeding the current range. If successful, the
338 * physical block numbers in the buffer pointers and in the inode are
339 * changed to reflect the new allocation. If unsuccessful, the allocation
340 * is left unchanged. The success in doing the reallocation is returned.
341 * Note that the error return is not reflected back to the user. Rather
342 * the previous block allocation will be used.
343 */
344 #ifdef DEBUG
345 #include <sys/sysctl.h>
346 int prtrealloc = 0;
347 struct ctldebug debug15 = { "prtrealloc", &prtrealloc };
348 #endif
349
350 int doasyncfree = 1;
351 extern int doreallocblks;
352
353 int
354 ffs_reallocblks(v)
355 void *v;
356 {
357 struct vop_reallocblks_args /* {
358 struct vnode *a_vp;
359 struct cluster_save *a_buflist;
360 } */ *ap = v;
361 struct fs *fs;
362 struct inode *ip;
363 struct vnode *vp;
364 struct buf *sbp, *ebp;
365 ufs_daddr_t *bap, *sbap, *ebap = NULL;
366 struct cluster_save *buflist;
367 ufs_daddr_t start_lbn, end_lbn, soff, newblk, blkno;
368 struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
369 int i, len, start_lvl, end_lvl, pref, ssize;
370
371 vp = ap->a_vp;
372 ip = VTOI(vp);
373 fs = ip->i_fs;
374 if (fs->fs_contigsumsize <= 0)
375 return (ENOSPC);
376 buflist = ap->a_buflist;
377 len = buflist->bs_nchildren;
378 start_lbn = buflist->bs_children[0]->b_lblkno;
379 end_lbn = start_lbn + len - 1;
380 #ifdef DIAGNOSTIC
381 for (i = 0; i < len; i++)
382 if (!ffs_checkblk(ip,
383 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
384 panic("ffs_reallocblks: unallocated block 1");
385 for (i = 1; i < len; i++)
386 if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
387 panic("ffs_reallocblks: non-logical cluster");
388 blkno = buflist->bs_children[0]->b_blkno;
389 ssize = fsbtodb(fs, fs->fs_frag);
390 for (i = 1; i < len - 1; i++)
391 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
392 panic("ffs_reallocblks: non-physical cluster %d", i);
393 #endif
394 /*
395 * If the latest allocation is in a new cylinder group, assume that
396 * the filesystem has decided to move and do not force it back to
397 * the previous cylinder group.
398 */
399 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
400 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
401 return (ENOSPC);
402 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
403 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
404 return (ENOSPC);
405 /*
406 * Get the starting offset and block map for the first block.
407 */
408 if (start_lvl == 0) {
409 sbap = &ip->i_ffs_db[0];
410 soff = start_lbn;
411 } else {
412 idp = &start_ap[start_lvl - 1];
413 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
414 brelse(sbp);
415 return (ENOSPC);
416 }
417 sbap = (ufs_daddr_t *)sbp->b_data;
418 soff = idp->in_off;
419 }
420 /*
421 * Find the preferred location for the cluster.
422 */
423 pref = ffs_blkpref(ip, start_lbn, soff, sbap);
424 /*
425 * If the block range spans two block maps, get the second map.
426 */
427 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
428 ssize = len;
429 } else {
430 #ifdef DIAGNOSTIC
431 if (start_ap[start_lvl-1].in_lbn == idp->in_lbn)
432 panic("ffs_reallocblk: start == end");
433 #endif
434 ssize = len - (idp->in_off + 1);
435 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
436 goto fail;
437 ebap = (ufs_daddr_t *)ebp->b_data;
438 }
439 /*
440 * Search the block map looking for an allocation of the desired size.
441 */
442 if ((newblk = (ufs_daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref,
443 len, ffs_clusteralloc)) == 0)
444 goto fail;
445 /*
446 * We have found a new contiguous block.
447 *
448 * First we have to replace the old block pointers with the new
449 * block pointers in the inode and indirect blocks associated
450 * with the file.
451 */
452 #ifdef DEBUG
453 if (prtrealloc)
454 printf("realloc: ino %d, lbns %d-%d\n\told:", ip->i_number,
455 start_lbn, end_lbn);
456 #endif
457 blkno = newblk;
458 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
459 ufs_daddr_t ba;
460
461 if (i == ssize) {
462 bap = ebap;
463 soff = -i;
464 }
465 ba = ufs_rw32(*bap, UFS_FSNEEDSWAP(fs));
466 #ifdef DIAGNOSTIC
467 if (!ffs_checkblk(ip,
468 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
469 panic("ffs_reallocblks: unallocated block 2");
470 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != ba)
471 panic("ffs_reallocblks: alloc mismatch");
472 #endif
473 #ifdef DEBUG
474 if (prtrealloc)
475 printf(" %d,", ba);
476 #endif
477 if (DOINGSOFTDEP(vp)) {
478 if (sbap == &ip->i_ffs_db[0] && i < ssize)
479 softdep_setup_allocdirect(ip, start_lbn + i,
480 blkno, ba, fs->fs_bsize, fs->fs_bsize,
481 buflist->bs_children[i]);
482 else
483 softdep_setup_allocindir_page(ip, start_lbn + i,
484 i < ssize ? sbp : ebp, soff + i, blkno,
485 ba, buflist->bs_children[i]);
486 }
487 *bap++ = ufs_rw32(blkno, UFS_FSNEEDSWAP(fs));
488 }
489 /*
490 * Next we must write out the modified inode and indirect blocks.
491 * For strict correctness, the writes should be synchronous since
492 * the old block values may have been written to disk. In practise
493 * they are almost never written, but if we are concerned about
494 * strict correctness, the `doasyncfree' flag should be set to zero.
495 *
496 * The test on `doasyncfree' should be changed to test a flag
497 * that shows whether the associated buffers and inodes have
498 * been written. The flag should be set when the cluster is
499 * started and cleared whenever the buffer or inode is flushed.
500 * We can then check below to see if it is set, and do the
501 * synchronous write only when it has been cleared.
502 */
503 if (sbap != &ip->i_ffs_db[0]) {
504 if (doasyncfree)
505 bdwrite(sbp);
506 else
507 bwrite(sbp);
508 } else {
509 ip->i_flag |= IN_CHANGE | IN_UPDATE;
510 if (!doasyncfree)
511 VOP_UPDATE(vp, NULL, NULL, 1);
512 }
513 if (ssize < len) {
514 if (doasyncfree)
515 bdwrite(ebp);
516 else
517 bwrite(ebp);
518 }
519 /*
520 * Last, free the old blocks and assign the new blocks to the buffers.
521 */
522 #ifdef DEBUG
523 if (prtrealloc)
524 printf("\n\tnew:");
525 #endif
526 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
527 if (!DOINGSOFTDEP(vp))
528 ffs_blkfree(ip,
529 dbtofsb(fs, buflist->bs_children[i]->b_blkno),
530 fs->fs_bsize);
531 buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
532 #ifdef DEBUG
533 if (!ffs_checkblk(ip,
534 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
535 panic("ffs_reallocblks: unallocated block 3");
536 if (prtrealloc)
537 printf(" %d,", blkno);
538 #endif
539 }
540 #ifdef DEBUG
541 if (prtrealloc) {
542 prtrealloc--;
543 printf("\n");
544 }
545 #endif
546 return (0);
547
548 fail:
549 if (ssize < len)
550 brelse(ebp);
551 if (sbap != &ip->i_ffs_db[0])
552 brelse(sbp);
553 return (ENOSPC);
554 }
555
556 /*
557 * Allocate an inode in the file system.
558 *
559 * If allocating a directory, use ffs_dirpref to select the inode.
560 * If allocating in a directory, the following hierarchy is followed:
561 * 1) allocate the preferred inode.
562 * 2) allocate an inode in the same cylinder group.
563 * 3) quadradically rehash into other cylinder groups, until an
564 * available inode is located.
565 * If no inode preference is given the following heirarchy is used
566 * to allocate an inode:
567 * 1) allocate an inode in cylinder group 0.
568 * 2) quadradically rehash into other cylinder groups, until an
569 * available inode is located.
570 */
571 int
572 ffs_valloc(v)
573 void *v;
574 {
575 struct vop_valloc_args /* {
576 struct vnode *a_pvp;
577 int a_mode;
578 struct ucred *a_cred;
579 struct vnode **a_vpp;
580 } */ *ap = v;
581 struct vnode *pvp = ap->a_pvp;
582 struct inode *pip;
583 struct fs *fs;
584 struct inode *ip;
585 mode_t mode = ap->a_mode;
586 ino_t ino, ipref;
587 int cg, error;
588
589 *ap->a_vpp = NULL;
590 pip = VTOI(pvp);
591 fs = pip->i_fs;
592 if (fs->fs_cstotal.cs_nifree == 0)
593 goto noinodes;
594
595 ipref = pip->i_number;
596 if ((mode & IFMT) == IFDIR)
597 ipref = ffs_dirpref(fs, ipref);
598 if (ipref >= fs->fs_ncg * fs->fs_ipg)
599 ipref = 0;
600 cg = ino_to_cg(fs, ipref);
601 ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg);
602 if (ino == 0)
603 goto noinodes;
604 error = VFS_VGET(pvp->v_mount, ino, ap->a_vpp);
605 if (error) {
606 VOP_VFREE(pvp, ino, mode);
607 return (error);
608 }
609 ip = VTOI(*ap->a_vpp);
610 if (ip->i_ffs_mode) {
611 printf("mode = 0%o, inum = %d, fs = %s\n",
612 ip->i_ffs_mode, ip->i_number, fs->fs_fsmnt);
613 panic("ffs_valloc: dup alloc");
614 }
615 if (ip->i_ffs_blocks) { /* XXX */
616 printf("free inode %s/%d had %d blocks\n",
617 fs->fs_fsmnt, ino, ip->i_ffs_blocks);
618 ip->i_ffs_blocks = 0;
619 }
620 ip->i_ffs_flags = 0;
621 /*
622 * Set up a new generation number for this inode.
623 */
624 ip->i_ffs_gen++;
625 return (0);
626 noinodes:
627 ffs_fserr(fs, ap->a_cred->cr_uid, "out of inodes");
628 uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
629 return (ENOSPC);
630 }
631
632 /*
633 * Find a cylinder in which to place a directory.
634 *
635 * The policy implemented by this algorithm is to select from among
636 * those cylinder groups with above the average number of free inodes
637 * and a "reasonable" number of free blocks, the one with the smallest
638 * number of directories. If there are no cylinder groups with a
639 * reasonable number of free blocks, we select a CG with *any* free
640 * blocks or free frags.
641 *
642 * "Reasonable" here is arbitrarily defined as "at least 25% of the
643 * average amount of free space."
644 *
645 * This complex policy is intended to avoid pathological (linear
646 * search) allocation performance when a filesystem contains many
647 * small cylinder groups with few directory inodes and no free blocks;
648 * this was observed in practice with the old allocation policy (which
649 * ignored the distribution of free blocks). Under the old policy,
650 * when a new filesystem is populated with a number of files somewhat
651 * larger than the CG size, and then a second tree containing a large
652 * number of files and directories is created, mkdir() performance
653 * would degrade catastrophically, taking many seconds and involving
654 * thousands of disk reads to complete.
655 *
656 * XXX TODO: we currently ignore our "ipref" argument; we may want to
657 * add a heuristic to determine whether to place a directory in the
658 * same CG as its parent to reduce the amount of seeking required in
659 * the course of tree-walks.
660 */
661 static ino_t
662 ffs_dirpref(fs, ipref)
663 struct fs *fs;
664 ino_t ipref;
665 {
666 int cg, minndir, mincg, avgifree, bfreethresh;
667 int minndirf, mincgf;
668 struct csum *cs;
669
670 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
671 bfreethresh = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
672 bfreethresh >>= 2;
673 minndir = fs->fs_ipg;
674 minndirf = fs->fs_ipg;
675 mincg = 0;
676 mincgf = 0;
677 for (cg = 0; cg < fs->fs_ncg; cg++) {
678 cs = &fs->fs_cs(fs, cg);
679 if (cs->cs_nifree >= avgifree) {
680 if ((cs->cs_ndir < minndir) &&
681 (cs->cs_nbfree > bfreethresh)) {
682 mincg = cg;
683 minndir = cs->cs_ndir;
684 }
685 if ((cs->cs_ndir < minndirf) &&
686 ((cs->cs_nffree + cs->cs_nbfree) > 0)) {
687 mincgf = cg;
688 minndirf = cs->cs_ndir;
689 }
690 }
691 }
692 if (minndir == fs->fs_ipg)
693 mincg = mincgf;
694 return ((ino_t)(fs->fs_ipg * mincg));
695 }
696
697 /*
698 * Select the desired position for the next block in a file. The file is
699 * logically divided into sections. The first section is composed of the
700 * direct blocks. Each additional section contains fs_maxbpg blocks.
701 *
702 * If no blocks have been allocated in the first section, the policy is to
703 * request a block in the same cylinder group as the inode that describes
704 * the file. If no blocks have been allocated in any other section, the
705 * policy is to place the section in a cylinder group with a greater than
706 * average number of free blocks. An appropriate cylinder group is found
707 * by using a rotor that sweeps the cylinder groups. When a new group of
708 * blocks is needed, the sweep begins in the cylinder group following the
709 * cylinder group from which the previous allocation was made. The sweep
710 * continues until a cylinder group with greater than the average number
711 * of free blocks is found. If the allocation is for the first block in an
712 * indirect block, the information on the previous allocation is unavailable;
713 * here a best guess is made based upon the logical block number being
714 * allocated.
715 *
716 * If a section is already partially allocated, the policy is to
717 * contiguously allocate fs_maxcontig blocks. The end of one of these
718 * contiguous blocks and the beginning of the next is physically separated
719 * so that the disk head will be in transit between them for at least
720 * fs_rotdelay milliseconds. This is to allow time for the processor to
721 * schedule another I/O transfer.
722 */
723 ufs_daddr_t
724 ffs_blkpref(ip, lbn, indx, bap)
725 struct inode *ip;
726 ufs_daddr_t lbn;
727 int indx;
728 ufs_daddr_t *bap;
729 {
730 struct fs *fs;
731 int cg;
732 int avgbfree, startcg;
733 ufs_daddr_t nextblk;
734
735 fs = ip->i_fs;
736 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
737 if (lbn < NDADDR + NINDIR(fs)) {
738 cg = ino_to_cg(fs, ip->i_number);
739 return (fs->fs_fpg * cg + fs->fs_frag);
740 }
741 /*
742 * Find a cylinder with greater than average number of
743 * unused data blocks.
744 */
745 if (indx == 0 || bap[indx - 1] == 0)
746 startcg =
747 ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
748 else
749 startcg = dtog(fs,
750 ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
751 startcg %= fs->fs_ncg;
752 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
753 for (cg = startcg; cg < fs->fs_ncg; cg++)
754 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
755 fs->fs_cgrotor = cg;
756 return (fs->fs_fpg * cg + fs->fs_frag);
757 }
758 for (cg = 0; cg <= startcg; cg++)
759 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
760 fs->fs_cgrotor = cg;
761 return (fs->fs_fpg * cg + fs->fs_frag);
762 }
763 return (0);
764 }
765 /*
766 * One or more previous blocks have been laid out. If less
767 * than fs_maxcontig previous blocks are contiguous, the
768 * next block is requested contiguously, otherwise it is
769 * requested rotationally delayed by fs_rotdelay milliseconds.
770 */
771 nextblk = ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
772 if (indx < fs->fs_maxcontig ||
773 ufs_rw32(bap[indx - fs->fs_maxcontig], UFS_FSNEEDSWAP(fs)) +
774 blkstofrags(fs, fs->fs_maxcontig) != nextblk)
775 return (nextblk);
776 if (fs->fs_rotdelay != 0)
777 /*
778 * Here we convert ms of delay to frags as:
779 * (frags) = (ms) * (rev/sec) * (sect/rev) /
780 * ((sect/frag) * (ms/sec))
781 * then round up to the next block.
782 */
783 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
784 (NSPF(fs) * 1000), fs->fs_frag);
785 return (nextblk);
786 }
787
788 /*
789 * Implement the cylinder overflow algorithm.
790 *
791 * The policy implemented by this algorithm is:
792 * 1) allocate the block in its requested cylinder group.
793 * 2) quadradically rehash on the cylinder group number.
794 * 3) brute force search for a free block.
795 */
796 /*VARARGS5*/
797 static u_long
798 ffs_hashalloc(ip, cg, pref, size, allocator)
799 struct inode *ip;
800 int cg;
801 long pref;
802 int size; /* size for data blocks, mode for inodes */
803 ufs_daddr_t (*allocator) __P((struct inode *, int, ufs_daddr_t, int));
804 {
805 struct fs *fs;
806 long result;
807 int i, icg = cg;
808
809 fs = ip->i_fs;
810 /*
811 * 1: preferred cylinder group
812 */
813 result = (*allocator)(ip, cg, pref, size);
814 if (result)
815 return (result);
816 /*
817 * 2: quadratic rehash
818 */
819 for (i = 1; i < fs->fs_ncg; i *= 2) {
820 cg += i;
821 if (cg >= fs->fs_ncg)
822 cg -= fs->fs_ncg;
823 result = (*allocator)(ip, cg, 0, size);
824 if (result)
825 return (result);
826 }
827 /*
828 * 3: brute force search
829 * Note that we start at i == 2, since 0 was checked initially,
830 * and 1 is always checked in the quadratic rehash.
831 */
832 cg = (icg + 2) % fs->fs_ncg;
833 for (i = 2; i < fs->fs_ncg; i++) {
834 result = (*allocator)(ip, cg, 0, size);
835 if (result)
836 return (result);
837 cg++;
838 if (cg == fs->fs_ncg)
839 cg = 0;
840 }
841 return (0);
842 }
843
844 /*
845 * Determine whether a fragment can be extended.
846 *
847 * Check to see if the necessary fragments are available, and
848 * if they are, allocate them.
849 */
850 static ufs_daddr_t
851 ffs_fragextend(ip, cg, bprev, osize, nsize)
852 struct inode *ip;
853 int cg;
854 long bprev;
855 int osize, nsize;
856 {
857 struct fs *fs;
858 struct cg *cgp;
859 struct buf *bp;
860 long bno;
861 int frags, bbase;
862 int i, error;
863
864 fs = ip->i_fs;
865 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
866 return (0);
867 frags = numfrags(fs, nsize);
868 bbase = fragnum(fs, bprev);
869 if (bbase > fragnum(fs, (bprev + frags - 1))) {
870 /* cannot extend across a block boundary */
871 return (0);
872 }
873 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
874 (int)fs->fs_cgsize, NOCRED, &bp);
875 if (error) {
876 brelse(bp);
877 return (0);
878 }
879 cgp = (struct cg *)bp->b_data;
880 if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
881 brelse(bp);
882 return (0);
883 }
884 cgp->cg_time = ufs_rw32(time.tv_sec, UFS_FSNEEDSWAP(fs));
885 bno = dtogd(fs, bprev);
886 for (i = numfrags(fs, osize); i < frags; i++)
887 if (isclr(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)), bno + i)) {
888 brelse(bp);
889 return (0);
890 }
891 /*
892 * the current fragment can be extended
893 * deduct the count on fragment being extended into
894 * increase the count on the remaining fragment (if any)
895 * allocate the extended piece
896 */
897 for (i = frags; i < fs->fs_frag - bbase; i++)
898 if (isclr(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)), bno + i))
899 break;
900 ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
901 if (i != frags)
902 ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
903 for (i = numfrags(fs, osize); i < frags; i++) {
904 clrbit(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)), bno + i);
905 ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
906 fs->fs_cstotal.cs_nffree--;
907 fs->fs_cs(fs, cg).cs_nffree--;
908 }
909 fs->fs_fmod = 1;
910 if (DOINGSOFTDEP(ITOV(ip)))
911 softdep_setup_blkmapdep(bp, fs, bprev);
912 bdwrite(bp);
913 return (bprev);
914 }
915
916 /*
917 * Determine whether a block can be allocated.
918 *
919 * Check to see if a block of the appropriate size is available,
920 * and if it is, allocate it.
921 */
922 static ufs_daddr_t
923 ffs_alloccg(ip, cg, bpref, size)
924 struct inode *ip;
925 int cg;
926 ufs_daddr_t bpref;
927 int size;
928 {
929 struct cg *cgp;
930 struct buf *bp;
931 ufs_daddr_t bno, blkno;
932 int error, frags, allocsiz, i;
933 struct fs *fs = ip->i_fs;
934 #ifdef FFS_EI
935 const int needswap = UFS_FSNEEDSWAP(fs);
936 #endif
937
938 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
939 return (0);
940 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
941 (int)fs->fs_cgsize, NOCRED, &bp);
942 if (error) {
943 brelse(bp);
944 return (0);
945 }
946 cgp = (struct cg *)bp->b_data;
947 if (!cg_chkmagic(cgp, needswap) ||
948 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
949 brelse(bp);
950 return (0);
951 }
952 cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
953 if (size == fs->fs_bsize) {
954 bno = ffs_alloccgblk(ip, bp, bpref);
955 bdwrite(bp);
956 return (bno);
957 }
958 /*
959 * check to see if any fragments are already available
960 * allocsiz is the size which will be allocated, hacking
961 * it down to a smaller size if necessary
962 */
963 frags = numfrags(fs, size);
964 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
965 if (cgp->cg_frsum[allocsiz] != 0)
966 break;
967 if (allocsiz == fs->fs_frag) {
968 /*
969 * no fragments were available, so a block will be
970 * allocated, and hacked up
971 */
972 if (cgp->cg_cs.cs_nbfree == 0) {
973 brelse(bp);
974 return (0);
975 }
976 bno = ffs_alloccgblk(ip, bp, bpref);
977 bpref = dtogd(fs, bno);
978 for (i = frags; i < fs->fs_frag; i++)
979 setbit(cg_blksfree(cgp, needswap), bpref + i);
980 i = fs->fs_frag - frags;
981 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
982 fs->fs_cstotal.cs_nffree += i;
983 fs->fs_cs(fs, cg).cs_nffree += i;
984 fs->fs_fmod = 1;
985 ufs_add32(cgp->cg_frsum[i], 1, needswap);
986 bdwrite(bp);
987 return (bno);
988 }
989 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
990 #if 0
991 /*
992 * XXX fvdl mapsearch will panic, and never return -1
993 * also: returning NULL as ufs_daddr_t ?
994 */
995 if (bno < 0) {
996 brelse(bp);
997 return (0);
998 }
999 #endif
1000 for (i = 0; i < frags; i++)
1001 clrbit(cg_blksfree(cgp, needswap), bno + i);
1002 ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
1003 fs->fs_cstotal.cs_nffree -= frags;
1004 fs->fs_cs(fs, cg).cs_nffree -= frags;
1005 fs->fs_fmod = 1;
1006 ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
1007 if (frags != allocsiz)
1008 ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
1009 blkno = cg * fs->fs_fpg + bno;
1010 if (DOINGSOFTDEP(ITOV(ip)))
1011 softdep_setup_blkmapdep(bp, fs, blkno);
1012 bdwrite(bp);
1013 return blkno;
1014 }
1015
1016 /*
1017 * Allocate a block in a cylinder group.
1018 *
1019 * This algorithm implements the following policy:
1020 * 1) allocate the requested block.
1021 * 2) allocate a rotationally optimal block in the same cylinder.
1022 * 3) allocate the next available block on the block rotor for the
1023 * specified cylinder group.
1024 * Note that this routine only allocates fs_bsize blocks; these
1025 * blocks may be fragmented by the routine that allocates them.
1026 */
1027 static ufs_daddr_t
1028 ffs_alloccgblk(ip, bp, bpref)
1029 struct inode *ip;
1030 struct buf *bp;
1031 ufs_daddr_t bpref;
1032 {
1033 struct cg *cgp;
1034 ufs_daddr_t bno, blkno;
1035 int cylno, pos, delta;
1036 short *cylbp;
1037 int i;
1038 struct fs *fs = ip->i_fs;
1039 #ifdef FFS_EI
1040 const int needswap = UFS_FSNEEDSWAP(fs);
1041 #endif
1042
1043 cgp = (struct cg *)bp->b_data;
1044 if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
1045 bpref = ufs_rw32(cgp->cg_rotor, needswap);
1046 goto norot;
1047 }
1048 bpref = blknum(fs, bpref);
1049 bpref = dtogd(fs, bpref);
1050 /*
1051 * if the requested block is available, use it
1052 */
1053 if (ffs_isblock(fs, cg_blksfree(cgp, needswap),
1054 fragstoblks(fs, bpref))) {
1055 bno = bpref;
1056 goto gotit;
1057 }
1058 if (fs->fs_nrpos <= 1 || fs->fs_cpc == 0) {
1059 /*
1060 * Block layout information is not available.
1061 * Leaving bpref unchanged means we take the
1062 * next available free block following the one
1063 * we just allocated. Hopefully this will at
1064 * least hit a track cache on drives of unknown
1065 * geometry (e.g. SCSI).
1066 */
1067 goto norot;
1068 }
1069 /*
1070 * check for a block available on the same cylinder
1071 */
1072 cylno = cbtocylno(fs, bpref);
1073 if (cg_blktot(cgp, needswap)[cylno] == 0)
1074 goto norot;
1075 /*
1076 * check the summary information to see if a block is
1077 * available in the requested cylinder starting at the
1078 * requested rotational position and proceeding around.
1079 */
1080 cylbp = cg_blks(fs, cgp, cylno, needswap);
1081 pos = cbtorpos(fs, bpref);
1082 for (i = pos; i < fs->fs_nrpos; i++)
1083 if (ufs_rw16(cylbp[i], needswap) > 0)
1084 break;
1085 if (i == fs->fs_nrpos)
1086 for (i = 0; i < pos; i++)
1087 if (ufs_rw16(cylbp[i], needswap) > 0)
1088 break;
1089 if (ufs_rw16(cylbp[i], needswap) > 0) {
1090 /*
1091 * found a rotational position, now find the actual
1092 * block. A panic if none is actually there.
1093 */
1094 pos = cylno % fs->fs_cpc;
1095 bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
1096 if (fs_postbl(fs, pos)[i] == -1) {
1097 printf("pos = %d, i = %d, fs = %s\n",
1098 pos, i, fs->fs_fsmnt);
1099 panic("ffs_alloccgblk: cyl groups corrupted");
1100 }
1101 for (i = fs_postbl(fs, pos)[i];; ) {
1102 if (ffs_isblock(fs, cg_blksfree(cgp, needswap), bno + i)) {
1103 bno = blkstofrags(fs, (bno + i));
1104 goto gotit;
1105 }
1106 delta = fs_rotbl(fs)[i];
1107 if (delta <= 0 ||
1108 delta + i > fragstoblks(fs, fs->fs_fpg))
1109 break;
1110 i += delta;
1111 }
1112 printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
1113 panic("ffs_alloccgblk: can't find blk in cyl");
1114 }
1115 norot:
1116 /*
1117 * no blocks in the requested cylinder, so take next
1118 * available one in this cylinder group.
1119 */
1120 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
1121 if (bno < 0)
1122 return (0);
1123 cgp->cg_rotor = ufs_rw32(bno, needswap);
1124 gotit:
1125 blkno = fragstoblks(fs, bno);
1126 ffs_clrblock(fs, cg_blksfree(cgp, needswap), (long)blkno);
1127 ffs_clusteracct(fs, cgp, blkno, -1);
1128 ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
1129 fs->fs_cstotal.cs_nbfree--;
1130 fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
1131 cylno = cbtocylno(fs, bno);
1132 ufs_add16(cg_blks(fs, cgp, cylno, needswap)[cbtorpos(fs, bno)], -1,
1133 needswap);
1134 ufs_add32(cg_blktot(cgp, needswap)[cylno], -1, needswap);
1135 fs->fs_fmod = 1;
1136 blkno = ufs_rw32(cgp->cg_cgx, needswap) * fs->fs_fpg + bno;
1137 if (DOINGSOFTDEP(ITOV(ip)))
1138 softdep_setup_blkmapdep(bp, fs, blkno);
1139 return (blkno);
1140 }
1141
1142 /*
1143 * Determine whether a cluster can be allocated.
1144 *
1145 * We do not currently check for optimal rotational layout if there
1146 * are multiple choices in the same cylinder group. Instead we just
1147 * take the first one that we find following bpref.
1148 */
1149 static ufs_daddr_t
1150 ffs_clusteralloc(ip, cg, bpref, len)
1151 struct inode *ip;
1152 int cg;
1153 ufs_daddr_t bpref;
1154 int len;
1155 {
1156 struct fs *fs;
1157 struct cg *cgp;
1158 struct buf *bp;
1159 int i, got, run, bno, bit, map;
1160 u_char *mapp;
1161 int32_t *lp;
1162
1163 fs = ip->i_fs;
1164 if (fs->fs_maxcluster[cg] < len)
1165 return (0);
1166 if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
1167 NOCRED, &bp))
1168 goto fail;
1169 cgp = (struct cg *)bp->b_data;
1170 if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
1171 goto fail;
1172 /*
1173 * Check to see if a cluster of the needed size (or bigger) is
1174 * available in this cylinder group.
1175 */
1176 lp = &cg_clustersum(cgp, UFS_FSNEEDSWAP(fs))[len];
1177 for (i = len; i <= fs->fs_contigsumsize; i++)
1178 if (ufs_rw32(*lp++, UFS_FSNEEDSWAP(fs)) > 0)
1179 break;
1180 if (i > fs->fs_contigsumsize) {
1181 /*
1182 * This is the first time looking for a cluster in this
1183 * cylinder group. Update the cluster summary information
1184 * to reflect the true maximum sized cluster so that
1185 * future cluster allocation requests can avoid reading
1186 * the cylinder group map only to find no clusters.
1187 */
1188 lp = &cg_clustersum(cgp, UFS_FSNEEDSWAP(fs))[len - 1];
1189 for (i = len - 1; i > 0; i--)
1190 if (ufs_rw32(*lp--, UFS_FSNEEDSWAP(fs)) > 0)
1191 break;
1192 fs->fs_maxcluster[cg] = i;
1193 goto fail;
1194 }
1195 /*
1196 * Search the cluster map to find a big enough cluster.
1197 * We take the first one that we find, even if it is larger
1198 * than we need as we prefer to get one close to the previous
1199 * block allocation. We do not search before the current
1200 * preference point as we do not want to allocate a block
1201 * that is allocated before the previous one (as we will
1202 * then have to wait for another pass of the elevator
1203 * algorithm before it will be read). We prefer to fail and
1204 * be recalled to try an allocation in the next cylinder group.
1205 */
1206 if (dtog(fs, bpref) != cg)
1207 bpref = 0;
1208 else
1209 bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
1210 mapp = &cg_clustersfree(cgp, UFS_FSNEEDSWAP(fs))[bpref / NBBY];
1211 map = *mapp++;
1212 bit = 1 << (bpref % NBBY);
1213 for (run = 0, got = bpref;
1214 got < ufs_rw32(cgp->cg_nclusterblks, UFS_FSNEEDSWAP(fs)); got++) {
1215 if ((map & bit) == 0) {
1216 run = 0;
1217 } else {
1218 run++;
1219 if (run == len)
1220 break;
1221 }
1222 if ((got & (NBBY - 1)) != (NBBY - 1)) {
1223 bit <<= 1;
1224 } else {
1225 map = *mapp++;
1226 bit = 1;
1227 }
1228 }
1229 if (got == ufs_rw32(cgp->cg_nclusterblks, UFS_FSNEEDSWAP(fs)))
1230 goto fail;
1231 /*
1232 * Allocate the cluster that we have found.
1233 */
1234 #ifdef DIAGNOSTIC
1235 for (i = 1; i <= len; i++)
1236 if (!ffs_isblock(fs, cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)),
1237 got - run + i))
1238 panic("ffs_clusteralloc: map mismatch");
1239 #endif
1240 bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1);
1241 if (dtog(fs, bno) != cg)
1242 panic("ffs_clusteralloc: allocated out of group");
1243 len = blkstofrags(fs, len);
1244 for (i = 0; i < len; i += fs->fs_frag)
1245 if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i)
1246 panic("ffs_clusteralloc: lost block");
1247 bdwrite(bp);
1248 return (bno);
1249
1250 fail:
1251 brelse(bp);
1252 return (0);
1253 }
1254
1255 /*
1256 * Determine whether an inode can be allocated.
1257 *
1258 * Check to see if an inode is available, and if it is,
1259 * allocate it using the following policy:
1260 * 1) allocate the requested inode.
1261 * 2) allocate the next available inode after the requested
1262 * inode in the specified cylinder group.
1263 */
1264 static ufs_daddr_t
1265 ffs_nodealloccg(ip, cg, ipref, mode)
1266 struct inode *ip;
1267 int cg;
1268 ufs_daddr_t ipref;
1269 int mode;
1270 {
1271 struct cg *cgp;
1272 struct buf *bp;
1273 int error, start, len, loc, map, i;
1274 struct fs *fs = ip->i_fs;
1275 #ifdef FFS_EI
1276 const int needswap = UFS_FSNEEDSWAP(fs);
1277 #endif
1278
1279 if (fs->fs_cs(fs, cg).cs_nifree == 0)
1280 return (0);
1281 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
1282 (int)fs->fs_cgsize, NOCRED, &bp);
1283 if (error) {
1284 brelse(bp);
1285 return (0);
1286 }
1287 cgp = (struct cg *)bp->b_data;
1288 if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0) {
1289 brelse(bp);
1290 return (0);
1291 }
1292 cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
1293 if (ipref) {
1294 ipref %= fs->fs_ipg;
1295 if (isclr(cg_inosused(cgp, needswap), ipref))
1296 goto gotit;
1297 }
1298 start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
1299 len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
1300 NBBY);
1301 loc = skpc(0xff, len, &cg_inosused(cgp, needswap)[start]);
1302 if (loc == 0) {
1303 len = start + 1;
1304 start = 0;
1305 loc = skpc(0xff, len, &cg_inosused(cgp, needswap)[0]);
1306 if (loc == 0) {
1307 printf("cg = %d, irotor = %d, fs = %s\n",
1308 cg, ufs_rw32(cgp->cg_irotor, needswap),
1309 fs->fs_fsmnt);
1310 panic("ffs_nodealloccg: map corrupted");
1311 /* NOTREACHED */
1312 }
1313 }
1314 i = start + len - loc;
1315 map = cg_inosused(cgp, needswap)[i];
1316 ipref = i * NBBY;
1317 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
1318 if ((map & i) == 0) {
1319 cgp->cg_irotor = ufs_rw32(ipref, needswap);
1320 goto gotit;
1321 }
1322 }
1323 printf("fs = %s\n", fs->fs_fsmnt);
1324 panic("ffs_nodealloccg: block not in map");
1325 /* NOTREACHED */
1326 gotit:
1327 if (DOINGSOFTDEP(ITOV(ip)))
1328 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
1329 setbit(cg_inosused(cgp, needswap), ipref);
1330 ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
1331 fs->fs_cstotal.cs_nifree--;
1332 fs->fs_cs(fs, cg).cs_nifree--;
1333 fs->fs_fmod = 1;
1334 if ((mode & IFMT) == IFDIR) {
1335 ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
1336 fs->fs_cstotal.cs_ndir++;
1337 fs->fs_cs(fs, cg).cs_ndir++;
1338 }
1339 bdwrite(bp);
1340 return (cg * fs->fs_ipg + ipref);
1341 }
1342
1343 /*
1344 * Free a block or fragment.
1345 *
1346 * The specified block or fragment is placed back in the
1347 * free map. If a fragment is deallocated, a possible
1348 * block reassembly is checked.
1349 */
1350 void
1351 ffs_blkfree(ip, bno, size)
1352 struct inode *ip;
1353 ufs_daddr_t bno;
1354 long size;
1355 {
1356 struct cg *cgp;
1357 struct buf *bp;
1358 ufs_daddr_t blkno;
1359 int i, error, cg, blk, frags, bbase;
1360 struct fs *fs = ip->i_fs;
1361 const int needswap = UFS_FSNEEDSWAP(fs);
1362
1363 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
1364 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
1365 printf("dev = 0x%x, bno = %u bsize = %d, size = %ld, fs = %s\n",
1366 ip->i_dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
1367 panic("blkfree: bad size");
1368 }
1369 cg = dtog(fs, bno);
1370 if ((u_int)bno >= fs->fs_size) {
1371 printf("bad block %d, ino %d\n", bno, ip->i_number);
1372 ffs_fserr(fs, ip->i_ffs_uid, "bad block");
1373 return;
1374 }
1375 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
1376 (int)fs->fs_cgsize, NOCRED, &bp);
1377 if (error) {
1378 brelse(bp);
1379 return;
1380 }
1381 cgp = (struct cg *)bp->b_data;
1382 if (!cg_chkmagic(cgp, needswap)) {
1383 brelse(bp);
1384 return;
1385 }
1386 cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
1387 bno = dtogd(fs, bno);
1388 if (size == fs->fs_bsize) {
1389 blkno = fragstoblks(fs, bno);
1390 if (!ffs_isfreeblock(fs, cg_blksfree(cgp, needswap), blkno)) {
1391 printf("dev = 0x%x, block = %d, fs = %s\n",
1392 ip->i_dev, bno, fs->fs_fsmnt);
1393 panic("blkfree: freeing free block");
1394 }
1395 ffs_setblock(fs, cg_blksfree(cgp, needswap), blkno);
1396 ffs_clusteracct(fs, cgp, blkno, 1);
1397 ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1398 fs->fs_cstotal.cs_nbfree++;
1399 fs->fs_cs(fs, cg).cs_nbfree++;
1400 i = cbtocylno(fs, bno);
1401 ufs_add16(cg_blks(fs, cgp, i, needswap)[cbtorpos(fs, bno)], 1,
1402 needswap);
1403 ufs_add32(cg_blktot(cgp, needswap)[i], 1, needswap);
1404 } else {
1405 bbase = bno - fragnum(fs, bno);
1406 /*
1407 * decrement the counts associated with the old frags
1408 */
1409 blk = blkmap(fs, cg_blksfree(cgp, needswap), bbase);
1410 ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
1411 /*
1412 * deallocate the fragment
1413 */
1414 frags = numfrags(fs, size);
1415 for (i = 0; i < frags; i++) {
1416 if (isset(cg_blksfree(cgp, needswap), bno + i)) {
1417 printf("dev = 0x%x, block = %d, fs = %s\n",
1418 ip->i_dev, bno + i, fs->fs_fsmnt);
1419 panic("blkfree: freeing free frag");
1420 }
1421 setbit(cg_blksfree(cgp, needswap), bno + i);
1422 }
1423 ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1424 fs->fs_cstotal.cs_nffree += i;
1425 fs->fs_cs(fs, cg).cs_nffree += i;
1426 /*
1427 * add back in counts associated with the new frags
1428 */
1429 blk = blkmap(fs, cg_blksfree(cgp, needswap), bbase);
1430 ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
1431 /*
1432 * if a complete block has been reassembled, account for it
1433 */
1434 blkno = fragstoblks(fs, bbase);
1435 if (ffs_isblock(fs, cg_blksfree(cgp, needswap), blkno)) {
1436 ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
1437 fs->fs_cstotal.cs_nffree -= fs->fs_frag;
1438 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
1439 ffs_clusteracct(fs, cgp, blkno, 1);
1440 ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
1441 fs->fs_cstotal.cs_nbfree++;
1442 fs->fs_cs(fs, cg).cs_nbfree++;
1443 i = cbtocylno(fs, bbase);
1444 ufs_add16(cg_blks(fs, cgp, i, needswap)[cbtorpos(fs,
1445 bbase)], 1,
1446 needswap);
1447 ufs_add32(cg_blktot(cgp, needswap)[i], 1, needswap);
1448 }
1449 }
1450 fs->fs_fmod = 1;
1451 bdwrite(bp);
1452 }
1453
#if defined(DIAGNOSTIC) || defined(DEBUG)
/*
 * Report whether a block or fragment run is currently allocated.
 *
 * Returns non-zero when the `size' bytes at `bno' are allocated and
 * zero when they are free.  Zero is also returned when the cylinder
 * group cannot be read or has a bad magic number.  A bad size, an
 * out-of-range address, or a fragment run that is only partly free
 * causes a panic.
 */
static int
ffs_checkblk(ip, bno, size)
	struct inode *ip;
	ufs_daddr_t bno;
	long size;
{
	struct fs *fs = ip->i_fs;
	struct cg *cgp;
	struct buf *bp;
	int i, error, frags, nfree;

	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %d, size = %ld, fs = %s\n",
		    fs->fs_bsize, size, fs->fs_fsmnt);
		panic("checkblk: bad size");
	}
	if ((u_int)bno >= fs->fs_size)
		panic("checkblk: bad block %d", bno);

	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto unknown;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
		goto unknown;

	bno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		/* Whole block: non-zero when the block is free. */
		nfree = ffs_isblock(fs, cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)),
		    fragstoblks(fs, bno));
	} else {
		/* Fragments: count how many of them are marked free. */
		frags = numfrags(fs, size);
		nfree = 0;
		for (i = 0; i < frags; i++) {
			if (isset(cg_blksfree(cgp, UFS_FSNEEDSWAP(fs)),
			    bno + i))
				nfree++;
		}
		if (nfree != 0 && nfree != frags)
			panic("checkblk: partially free fragment");
	}
	brelse(bp);
	return (!nfree);

unknown:
	brelse(bp);
	return 0;
}
#endif /* DIAGNOSTIC || DEBUG */
1505
1506 /*
1507 * Free an inode.
1508 */
1509 int
1510 ffs_vfree(v)
1511 void *v;
1512 {
1513 struct vop_vfree_args /* {
1514 struct vnode *a_pvp;
1515 ino_t a_ino;
1516 int a_mode;
1517 } */ *ap = v;
1518
1519 if (DOINGSOFTDEP(ap->a_pvp)) {
1520 softdep_freefile(ap);
1521 return (0);
1522 }
1523 return (ffs_freefile(ap));
1524 }
1525
1526 /*
1527 * Do the actual free operation.
1528 * The specified inode is placed back in the free map.
1529 */
1530 int
1531 ffs_freefile(v)
1532 void *v;
1533 {
1534 struct vop_vfree_args /* {
1535 struct vnode *a_pvp;
1536 ino_t a_ino;
1537 int a_mode;
1538 } */ *ap = v;
1539 struct cg *cgp;
1540 struct inode *pip = VTOI(ap->a_pvp);
1541 struct fs *fs = pip->i_fs;
1542 ino_t ino = ap->a_ino;
1543 struct buf *bp;
1544 int error, cg;
1545 #ifdef FFS_EI
1546 const int needswap = UFS_FSNEEDSWAP(fs);
1547 #endif
1548
1549 if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
1550 panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n",
1551 pip->i_dev, ino, fs->fs_fsmnt);
1552 cg = ino_to_cg(fs, ino);
1553 error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
1554 (int)fs->fs_cgsize, NOCRED, &bp);
1555 if (error) {
1556 brelse(bp);
1557 return (error);
1558 }
1559 cgp = (struct cg *)bp->b_data;
1560 if (!cg_chkmagic(cgp, needswap)) {
1561 brelse(bp);
1562 return (0);
1563 }
1564 cgp->cg_time = ufs_rw32(time.tv_sec, needswap);
1565 ino %= fs->fs_ipg;
1566 if (isclr(cg_inosused(cgp, needswap), ino)) {
1567 printf("dev = 0x%x, ino = %d, fs = %s\n",
1568 pip->i_dev, ino, fs->fs_fsmnt);
1569 if (fs->fs_ronly == 0)
1570 panic("ifree: freeing free inode");
1571 }
1572 clrbit(cg_inosused(cgp, needswap), ino);
1573 if (ino < ufs_rw32(cgp->cg_irotor, needswap))
1574 cgp->cg_irotor = ufs_rw32(ino, needswap);
1575 ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
1576 fs->fs_cstotal.cs_nifree++;
1577 fs->fs_cs(fs, cg).cs_nifree++;
1578 if ((ap->a_mode & IFMT) == IFDIR) {
1579 ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
1580 fs->fs_cstotal.cs_ndir--;
1581 fs->fs_cs(fs, cg).cs_ndir--;
1582 }
1583 fs->fs_fmod = 1;
1584 bdwrite(bp);
1585 return (0);
1586 }
1587
1588 /*
1589 * Find a block of the specified size in the specified cylinder group.
1590 *
1591 * It is a panic if a request is made to find a block if none are
1592 * available.
1593 */
1594 static ufs_daddr_t
1595 ffs_mapsearch(fs, cgp, bpref, allocsiz)
1596 struct fs *fs;
1597 struct cg *cgp;
1598 ufs_daddr_t bpref;
1599 int allocsiz;
1600 {
1601 ufs_daddr_t bno;
1602 int start, len, loc, i;
1603 int blk, field, subfield, pos;
1604 int ostart, olen;
1605 #ifdef FFS_EI
1606 const int needswap = UFS_FSNEEDSWAP(fs);
1607 #endif
1608
1609 /*
1610 * find the fragment by searching through the free block
1611 * map for an appropriate bit pattern
1612 */
1613 if (bpref)
1614 start = dtogd(fs, bpref) / NBBY;
1615 else
1616 start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
1617 len = howmany(fs->fs_fpg, NBBY) - start;
1618 ostart = start;
1619 olen = len;
1620 loc = scanc((u_int)len,
1621 (const u_char *)&cg_blksfree(cgp, needswap)[start],
1622 (const u_char *)fragtbl[fs->fs_frag],
1623 (1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1624 if (loc == 0) {
1625 len = start + 1;
1626 start = 0;
1627 loc = scanc((u_int)len,
1628 (const u_char *)&cg_blksfree(cgp, needswap)[0],
1629 (const u_char *)fragtbl[fs->fs_frag],
1630 (1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
1631 if (loc == 0) {
1632 printf("start = %d, len = %d, fs = %s\n",
1633 ostart, olen, fs->fs_fsmnt);
1634 printf("offset=%d %ld\n",
1635 ufs_rw32(cgp->cg_freeoff, needswap),
1636 (long)cg_blksfree(cgp, needswap) - (long)cgp);
1637 panic("ffs_alloccg: map corrupted");
1638 /* NOTREACHED */
1639 }
1640 }
1641 bno = (start + len - loc) * NBBY;
1642 cgp->cg_frotor = ufs_rw32(bno, needswap);
1643 /*
1644 * found the byte in the map
1645 * sift through the bits to find the selected frag
1646 */
1647 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
1648 blk = blkmap(fs, cg_blksfree(cgp, needswap), bno);
1649 blk <<= 1;
1650 field = around[allocsiz];
1651 subfield = inside[allocsiz];
1652 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
1653 if ((blk & field) == subfield)
1654 return (bno + pos);
1655 field <<= 1;
1656 subfield <<= 1;
1657 }
1658 }
1659 printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
1660 panic("ffs_alloccg: block not in map");
1661 return (-1);
1662 }
1663
1664 /*
1665 * Update the cluster map because of an allocation or free.
1666 *
1667 * Cnt == 1 means free; cnt == -1 means allocating.
1668 */
1669 void
1670 ffs_clusteracct(fs, cgp, blkno, cnt)
1671 struct fs *fs;
1672 struct cg *cgp;
1673 ufs_daddr_t blkno;
1674 int cnt;
1675 {
1676 int32_t *sump;
1677 int32_t *lp;
1678 u_char *freemapp, *mapp;
1679 int i, start, end, forw, back, map, bit;
1680 #ifdef FFS_EI
1681 const int needswap = UFS_FSNEEDSWAP(fs);
1682 #endif
1683
1684 if (fs->fs_contigsumsize <= 0)
1685 return;
1686 freemapp = cg_clustersfree(cgp, needswap);
1687 sump = cg_clustersum(cgp, needswap);
1688 /*
1689 * Allocate or clear the actual block.
1690 */
1691 if (cnt > 0)
1692 setbit(freemapp, blkno);
1693 else
1694 clrbit(freemapp, blkno);
1695 /*
1696 * Find the size of the cluster going forward.
1697 */
1698 start = blkno + 1;
1699 end = start + fs->fs_contigsumsize;
1700 if (end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
1701 end = ufs_rw32(cgp->cg_nclusterblks, needswap);
1702 mapp = &freemapp[start / NBBY];
1703 map = *mapp++;
1704 bit = 1 << (start % NBBY);
1705 for (i = start; i < end; i++) {
1706 if ((map & bit) == 0)
1707 break;
1708 if ((i & (NBBY - 1)) != (NBBY - 1)) {
1709 bit <<= 1;
1710 } else {
1711 map = *mapp++;
1712 bit = 1;
1713 }
1714 }
1715 forw = i - start;
1716 /*
1717 * Find the size of the cluster going backward.
1718 */
1719 start = blkno - 1;
1720 end = start - fs->fs_contigsumsize;
1721 if (end < 0)
1722 end = -1;
1723 mapp = &freemapp[start / NBBY];
1724 map = *mapp--;
1725 bit = 1 << (start % NBBY);
1726 for (i = start; i > end; i--) {
1727 if ((map & bit) == 0)
1728 break;
1729 if ((i & (NBBY - 1)) != 0) {
1730 bit >>= 1;
1731 } else {
1732 map = *mapp--;
1733 bit = 1 << (NBBY - 1);
1734 }
1735 }
1736 back = start - i;
1737 /*
1738 * Account for old cluster and the possibly new forward and
1739 * back clusters.
1740 */
1741 i = back + forw + 1;
1742 if (i > fs->fs_contigsumsize)
1743 i = fs->fs_contigsumsize;
1744 ufs_add32(sump[i], cnt, needswap);
1745 if (back > 0)
1746 ufs_add32(sump[back], -cnt, needswap);
1747 if (forw > 0)
1748 ufs_add32(sump[forw], -cnt, needswap);
1749
1750 /*
1751 * Update cluster summary information.
1752 */
1753 lp = &sump[fs->fs_contigsumsize];
1754 for (i = fs->fs_contigsumsize; i > 0; i--)
1755 if (ufs_rw32(*lp--, needswap) > 0)
1756 break;
1757 fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
1758 }
1759
1760 /*
1761 * Fserr prints the name of a file system with an error diagnostic.
1762 *
1763 * The form of the error message is:
1764 * fs: error message
1765 */
1766 static void
1767 ffs_fserr(fs, uid, cp)
1768 struct fs *fs;
1769 u_int uid;
1770 char *cp;
1771 {
1772
1773 log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp);
1774 }
1775