/*	$NetBSD: ulfs_dirhash.c,v 1.8 2014/02/25 18:30:13 pooka Exp $	*/
/*  from NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp  */

/*
 * Copyright (c) 2001, 2002 Ian Dowse.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ulfs_dirhash.c,v 1.8 2014/02/25 18:30:13 pooka Exp $");

/*
 * This implements a hash-based lookup scheme for ULFS directories.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/types.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/atomic.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfs_dirhash.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_bswap.h>
#include <ufs/lfs/ulfs_extern.h>

#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
#define WRAPDECR(val, limit)	(((val) == 0) ? ((limit) - 1) : ((val) - 1))
#define OFSFMT(ip)		((ip)->i_lfs->um_maxsymlinklen <= 0)
#define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
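
/*
 * WRAPINCR/WRAPDECR step an index forward or backward with wrap-around at
 * `limit'; they are what make the open-addressed hash table below do
 * circular linear probing.  BLKFREE2IDX clamps a per-block free-space
 * count to the last dh_firstfree bucket, so all blocks with DH_NFSTATS or
 * more free DIRALIGN-sized chunks share a single summary slot.
 */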

static u_int ulfs_dirhashminblks = 5;
static u_int ulfs_dirhashmaxmem = 2 * 1024 * 1024;
static u_int ulfs_dirhashmem;
static u_int ulfs_dirhashcheck = 0;
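
/*
 * The knobs above are exported through the sysctl nodes created in
 * ulfsdirhash_sysctl_init() below, under vfs.ulfs.dirhash (minblocks,
 * maxmem and docheck are read/write; memused is read-only).
 */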

static int ulfsdirhash_hash(struct dirhash *dh, const char *name, int namelen);
static void ulfsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff,
	    int dirblksiz);
static void ulfsdirhash_delslot(struct dirhash *dh, int slot);
static int ulfsdirhash_findslot(struct dirhash *dh, const char *name,
	    int namelen, doff_t offset);
static doff_t ulfsdirhash_getprev(struct lfs_direct *dp, doff_t offset,
	    int dirblksiz);
static int ulfsdirhash_recycle(int wanted);

static pool_cache_t ulfsdirhashblk_cache;
static pool_cache_t ulfsdirhash_cache;

#define DIRHASHLIST_LOCK()	mutex_enter(&ulfsdirhash_lock)
#define DIRHASHLIST_UNLOCK()	mutex_exit(&ulfsdirhash_lock)
#define DIRHASH_LOCK(dh)	mutex_enter(&(dh)->dh_lock)
#define DIRHASH_UNLOCK(dh)	mutex_exit(&(dh)->dh_lock)
#define DIRHASH_BLKALLOC() \
	    pool_cache_get(ulfsdirhashblk_cache, PR_NOWAIT)
#define DIRHASH_BLKFREE(ptr) \
	    pool_cache_put(ulfsdirhashblk_cache, ptr)

/* Dirhash list; recently-used entries are near the tail. */
static TAILQ_HEAD(, dirhash) ulfsdirhash_list;

/* Protects: ulfsdirhash_list, `dh_list' field, ulfs_dirhashmem. */
static kmutex_t ulfsdirhash_lock;

static struct sysctllog *ulfsdirhash_sysctl_log;

/*
 * Locking order:
 *	ulfsdirhash_lock
 *	dh_lock
 *
 * The dh_lock mutex should be acquired either via the inode lock, or via
 * ulfsdirhash_lock. Only the owner of the inode may free the associated
 * dirhash, but anything can steal its memory and set dh_hash to NULL.
 */
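
/*
 * Rough life cycle, as seen from the directory code (a sketch, not an
 * exhaustive list of callers): ulfsdirhash_build() creates the table the
 * first time a large enough directory is searched, ulfsdirhash_lookup()
 * services name lookups, ulfsdirhash_add()/_remove()/_move()/_newblk()/
 * _dirtrunc() keep it in sync as the directory changes, and
 * ulfsdirhash_free() (or memory pressure via ulfsdirhash_recycle())
 * tears it down.
 */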

/*
 * Attempt to build up a hash table for the directory contents in
 * inode 'ip'. Returns 0 on success, or -1 if the operation failed.
 */
int
ulfsdirhash_build(struct inode *ip)
{
	struct dirhash *dh;
	struct buf *bp = NULL;
	struct lfs_direct *ep;
	struct vnode *vp;
	doff_t bmask, pos;
	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
	const int needswap = ULFS_IPNEEDSWAP(ip);
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	/* Check if we can/should use dirhash. */
	if (ip->i_dirhash == NULL) {
		if (ip->i_size < (ulfs_dirhashminblks * dirblksiz) || OFSFMT(ip))
			return (-1);
	} else {
		/* Hash exists, but sysctls could have changed. */
		if (ip->i_size < (ulfs_dirhashminblks * dirblksiz) ||
		    ulfs_dirhashmem > ulfs_dirhashmaxmem) {
			ulfsdirhash_free(ip);
			return (-1);
		}
		/* Check if hash exists and is intact (note: unlocked read). */
		if (ip->i_dirhash->dh_hash != NULL)
			return (0);
		/* Free the old, recycled hash and build a new one. */
		ulfsdirhash_free(ip);
	}

	/* Don't hash removed directories. */
	if (ip->i_nlink == 0)
		return (-1);

	vp = ip->i_vnode;
	/* Allocate 50% more entries than this dir size could ever need. */
	KASSERT(ip->i_size >= dirblksiz);
	nslots = ip->i_size / LFS_DIRECTSIZ(1);
	nslots = (nslots * 3 + 1) / 2;
	narrays = howmany(nslots, DH_NBLKOFF);
	nslots = narrays * DH_NBLKOFF;
	dirblocks = howmany(ip->i_size, dirblksiz);
	nblocks = (dirblocks * 3 + 1) / 2;

	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
	    nblocks * sizeof(*dh->dh_blkfree);

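	/*
	 * Reserve the memory up front: atomically charge the global usage
	 * counter, and if that pushes us over ulfs_dirhashmaxmem, back the
	 * charge out again and either give up or ask ulfsdirhash_recycle()
	 * to steal memory from older dirhashes.  A successful recycle
	 * returns with the list lock held, so drop it before retrying.
	 */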
	while (atomic_add_int_nv(&ulfs_dirhashmem, memreqd) >
	    ulfs_dirhashmaxmem) {
		atomic_add_int(&ulfs_dirhashmem, -memreqd);
		if (memreqd > ulfs_dirhashmaxmem / 2)
			return (-1);
		/* Try to free some space. */
		if (ulfsdirhash_recycle(memreqd) != 0)
			return (-1);
		else
			DIRHASHLIST_UNLOCK();
	}

	/*
	 * Use non-blocking mallocs so that we will revert to a linear
	 * lookup on failure rather than potentially blocking forever.
	 */
	dh = pool_cache_get(ulfsdirhash_cache, PR_NOWAIT);
	if (dh == NULL) {
		atomic_add_int(&ulfs_dirhashmem, -memreqd);
		return (-1);
	}
	memset(dh, 0, sizeof(*dh));
	mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE);
	DIRHASH_LOCK(dh);
	dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]);
	dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP);
	dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]);
	dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP);
	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
		goto fail;
	for (i = 0; i < narrays; i++) {
		if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL)
			goto fail;
		for (j = 0; j < DH_NBLKOFF; j++)
			dh->dh_hash[i][j] = DIRHASH_EMPTY;
	}

	/* Initialise the hash table and block statistics. */
	dh->dh_narrays = narrays;
	dh->dh_hlen = nslots;
	dh->dh_nblk = nblocks;
	dh->dh_dirblks = dirblocks;
	for (i = 0; i < dirblocks; i++)
		dh->dh_blkfree[i] = dirblksiz / DIRALIGN;
	for (i = 0; i < DH_NFSTATS; i++)
		dh->dh_firstfree[i] = -1;
	dh->dh_firstfree[DH_NFSTATS] = 0;
	dh->dh_seqopt = 0;
	dh->dh_seqoff = 0;
	dh->dh_score = DH_SCOREINIT;
	ip->i_dirhash = dh;

	bmask = VFSTOULFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
	pos = 0;
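	/*
	 * Walk the whole directory one f_iosize-sized buffer at a time
	 * (bmask selects the offset within the current buffer), and hash
	 * every allocated entry we find.
	 */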
	while (pos < ip->i_size) {
		if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
		    != 0) {
			preempt();
		}
		/* If necessary, get the next directory block. */
		if ((pos & bmask) == 0) {
			if (bp != NULL)
				brelse(bp, 0);
			if (ulfs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0)
				goto fail;
		}

		/* Add this entry to the hash. */
		ep = (struct lfs_direct *)((char *)bp->b_data + (pos & bmask));
		if (ep->d_reclen == 0 || ep->d_reclen >
		    dirblksiz - (pos & (dirblksiz - 1))) {
			/* Corrupted directory. */
			brelse(bp, 0);
			goto fail;
		}
		if (ep->d_ino != 0) {
			/* Add the entry (simplified ulfsdirhash_add). */
			slot = ulfsdirhash_hash(dh, ep->d_name, ep->d_namlen);
			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
				slot = WRAPINCR(slot, dh->dh_hlen);
			dh->dh_hused++;
			DH_ENTRY(dh, slot) = pos;
			ulfsdirhash_adjfree(dh, pos, -LFS_DIRSIZ(0, ep, needswap),
			    dirblksiz);
		}
		pos += ep->d_reclen;
	}

	if (bp != NULL)
		brelse(bp, 0);
	DIRHASHLIST_LOCK();
	TAILQ_INSERT_TAIL(&ulfsdirhash_list, dh, dh_list);
	dh->dh_onlist = 1;
	DIRHASH_UNLOCK(dh);
	DIRHASHLIST_UNLOCK();
	return (0);

fail:
	DIRHASH_UNLOCK(dh);
	if (dh->dh_hash != NULL) {
		for (i = 0; i < narrays; i++)
			if (dh->dh_hash[i] != NULL)
				DIRHASH_BLKFREE(dh->dh_hash[i]);
		kmem_free(dh->dh_hash, dh->dh_hashsz);
	}
	if (dh->dh_blkfree != NULL)
		kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
	mutex_destroy(&dh->dh_lock);
	pool_cache_put(ulfsdirhash_cache, dh);
	ip->i_dirhash = NULL;
	atomic_add_int(&ulfs_dirhashmem, -memreqd);
	return (-1);
}

/*
 * Free any hash table associated with inode 'ip'.
 */
void
ulfsdirhash_free(struct inode *ip)
{
	struct dirhash *dh;
	int i, mem;

	if ((dh = ip->i_dirhash) == NULL)
		return;

	if (dh->dh_onlist) {
		DIRHASHLIST_LOCK();
		if (dh->dh_onlist)
			TAILQ_REMOVE(&ulfsdirhash_list, dh, dh_list);
		DIRHASHLIST_UNLOCK();
	}

	/* The dirhash pointed to by 'dh' is exclusively ours now. */
	mem = sizeof(*dh);
	if (dh->dh_hash != NULL) {
		for (i = 0; i < dh->dh_narrays; i++)
			DIRHASH_BLKFREE(dh->dh_hash[i]);
		kmem_free(dh->dh_hash, dh->dh_hashsz);
		kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
		mem += dh->dh_hashsz;
		mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash);
		mem += dh->dh_nblk * sizeof(*dh->dh_blkfree);
	}
	mutex_destroy(&dh->dh_lock);
	pool_cache_put(ulfsdirhash_cache, dh);
	ip->i_dirhash = NULL;

	atomic_add_int(&ulfs_dirhashmem, -mem);
}

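/*
 * A sketch of how callers typically drive the lookup below (the real
 * caller is the ULFS directory lookup code; the surrounding names are
 * illustrative only):
 *
 *	error = ulfsdirhash_lookup(ip, name, namelen, &offset, &bp, &prevoff);
 *	if (error == 0)            -> entry found at `offset', buffer in `bp'
 *	else if (error == ENOENT)  -> the name is definitely absent
 *	else                       -> EJUSTRETURN: do a linear scan instead
 */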
/*
 * Find the offset of the specified name within the given inode.
 * Returns 0 on success, ENOENT if the entry does not exist, or
 * EJUSTRETURN if the caller should revert to a linear search.
 *
 * If successful, the directory offset is stored in *offp, and a
 * pointer to a struct buf containing the entry is stored in *bpp. If
 * prevoffp is non-NULL, the offset of the previous entry within
 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
 * is the first in a block, the start of the block is used).
 */
int
ulfsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp,
    struct buf **bpp, doff_t *prevoffp)
{
	struct dirhash *dh, *dh_next;
	struct lfs_direct *dp;
	struct vnode *vp;
	struct buf *bp;
	doff_t blkoff, bmask, offset, prevoff;
	int i, slot;
	const int needswap = ULFS_IPNEEDSWAP(ip);
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return (EJUSTRETURN);

	/*
	 * Move this dirhash towards the end of the list if it has a
	 * score higher than the next entry, and acquire the dh_lock.
	 * Optimise the case where it's already the last by performing
	 * an unlocked read of the TAILQ_NEXT pointer.
	 *
	 * In both cases, end up holding just dh_lock.
	 */
	if (TAILQ_NEXT(dh, dh_list) != NULL) {
		DIRHASHLIST_LOCK();
		DIRHASH_LOCK(dh);
		/*
		 * If the new score will be greater than that of the next
		 * entry, then move this entry past it. With both mutexes
		 * held, dh_next won't go away, but its dh_score could
		 * change; that's not important since it is just a hint.
		 */
		if (dh->dh_hash != NULL &&
		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
		    dh->dh_score >= dh_next->dh_score) {
			KASSERT(dh->dh_onlist);
			TAILQ_REMOVE(&ulfsdirhash_list, dh, dh_list);
			TAILQ_INSERT_AFTER(&ulfsdirhash_list, dh_next, dh,
			    dh_list);
		}
		DIRHASHLIST_UNLOCK();
	} else {
		/* Already the last, though that could change as we wait. */
		DIRHASH_LOCK(dh);
	}
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return (EJUSTRETURN);
	}

	/* Update the score. */
	if (dh->dh_score < DH_SCOREMAX)
		dh->dh_score++;

	vp = ip->i_vnode;
	bmask = VFSTOULFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
	blkoff = -1;
	bp = NULL;
restart:
	slot = ulfsdirhash_hash(dh, name, namelen);

	if (dh->dh_seqopt) {
		/*
		 * Sequential access optimisation. dh_seqoff contains the
		 * offset of the directory entry immediately following
		 * the last entry that was looked up. Check if this offset
		 * appears in the hash chain for the name we are looking for.
		 */
		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
		    i = WRAPINCR(i, dh->dh_hlen))
			if (offset == dh->dh_seqoff)
				break;
		if (offset == dh->dh_seqoff) {
			/*
			 * We found an entry with the expected offset. This
			 * is probably the entry we want, but if not, the
			 * code below will turn off seqoff and retry.
			 */
			slot = i;
		} else
			dh->dh_seqopt = 0;
	}

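	/*
	 * Main probe loop: walk the hash chain with circular linear
	 * probing, skipping DIRHASH_DEL tombstones, and read each
	 * candidate's directory block to compare the actual name.
	 */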
	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
	    slot = WRAPINCR(slot, dh->dh_hlen)) {
		if (offset == DIRHASH_DEL)
			continue;

		if (offset < 0 || offset >= ip->i_size)
			panic("ulfsdirhash_lookup: bad offset in hash array");
		if ((offset & ~bmask) != blkoff) {
			if (bp != NULL)
				brelse(bp, 0);
			blkoff = offset & ~bmask;
			if (ulfs_blkatoff(vp, (off_t)blkoff,
			    NULL, &bp, false) != 0) {
				DIRHASH_UNLOCK(dh);
				return (EJUSTRETURN);
			}
		}
		dp = (struct lfs_direct *)((char *)bp->b_data + (offset & bmask));
		if (dp->d_reclen == 0 || dp->d_reclen >
		    dirblksiz - (offset & (dirblksiz - 1))) {
			/* Corrupted directory. */
			DIRHASH_UNLOCK(dh);
			brelse(bp, 0);
			return (EJUSTRETURN);
		}
		if (dp->d_namlen == namelen &&
		    memcmp(dp->d_name, name, namelen) == 0) {
			/* Found. Get the prev offset if needed. */
			if (prevoffp != NULL) {
				if (offset & (dirblksiz - 1)) {
					prevoff = ulfsdirhash_getprev(dp,
					    offset, dirblksiz);
					if (prevoff == -1) {
						brelse(bp, 0);
						return (EJUSTRETURN);
					}
				} else
					prevoff = offset;
				*prevoffp = prevoff;
			}

			/* Check for sequential access, and update offset. */
			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
				dh->dh_seqopt = 1;
			dh->dh_seqoff = offset + LFS_DIRSIZ(0, dp, needswap);
			DIRHASH_UNLOCK(dh);

			*bpp = bp;
			*offp = offset;
			return (0);
		}

		if (dh->dh_hash == NULL) {
			DIRHASH_UNLOCK(dh);
			if (bp != NULL)
				brelse(bp, 0);
			ulfsdirhash_free(ip);
			return (EJUSTRETURN);
		}
		/*
		 * When the name doesn't match in the seqopt case, go back
		 * and search normally.
		 */
		if (dh->dh_seqopt) {
			dh->dh_seqopt = 0;
			goto restart;
		}
	}
	DIRHASH_UNLOCK(dh);
	if (bp != NULL)
		brelse(bp, 0);
	return (ENOENT);
}

/*
 * Find a directory block with room for 'slotneeded' bytes. Returns
 * the offset of the directory entry that begins the free space.
 * This will either be the offset of an existing entry that has free
 * space at the end, or the offset of an entry with d_ino == 0 at
 * the start of a DIRBLKSIZ block.
 *
 * To use the space, the caller may need to compact existing entries in
 * the directory. The total number of bytes in all of the entries involved
 * in the compaction is stored in *slotsize. In other words, all of
 * the entries that must be compacted are exactly contained in the
 * region beginning at the returned offset and spanning *slotsize bytes.
 *
 * Returns -1 if no space was found, indicating that the directory
 * must be extended.
 */
doff_t
ulfsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
{
	struct lfs_direct *dp;
	struct dirhash *dh;
	struct buf *bp;
	doff_t pos, slotstart;
	int dirblock, error, freebytes, i;
	const int needswap = ULFS_IPNEEDSWAP(ip);
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return (-1);

	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return (-1);
	}

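	/*
	 * dh_firstfree[i] caches the lowest-numbered directory block whose
	 * free space maps to summary index i (index DH_NFSTATS collects all
	 * blocks with DH_NFSTATS or more free DIRALIGN chunks; see
	 * ulfsdirhash_adjfree), so the scan below only has to test a
	 * handful of buckets rather than every block.
	 */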
	/* Find a directory block with the desired free space. */
	dirblock = -1;
	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
		if ((dirblock = dh->dh_firstfree[i]) != -1)
			break;
	if (dirblock == -1) {
		DIRHASH_UNLOCK(dh);
		return (-1);
	}

	KASSERT(dirblock < dh->dh_nblk &&
	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN));
	pos = dirblock * dirblksiz;
	error = ulfs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false);
	if (error) {
		DIRHASH_UNLOCK(dh);
		return (-1);
	}
	/* Find the first entry with free space. */
	for (i = 0; i < dirblksiz; ) {
		if (dp->d_reclen == 0) {
			DIRHASH_UNLOCK(dh);
			brelse(bp, 0);
			return (-1);
		}
		if (dp->d_ino == 0 || dp->d_reclen > LFS_DIRSIZ(0, dp, needswap))
			break;
		i += dp->d_reclen;
		dp = (struct lfs_direct *)((char *)dp + dp->d_reclen);
	}
	if (i > dirblksiz) {
		DIRHASH_UNLOCK(dh);
		brelse(bp, 0);
		return (-1);
	}
	slotstart = pos + i;

	/* Find the range of entries needed to get enough space */
	freebytes = 0;
	while (i < dirblksiz && freebytes < slotneeded) {
		freebytes += dp->d_reclen;
		if (dp->d_ino != 0)
			freebytes -= LFS_DIRSIZ(0, dp, needswap);
		if (dp->d_reclen == 0) {
			DIRHASH_UNLOCK(dh);
			brelse(bp, 0);
			return (-1);
		}
		i += dp->d_reclen;
		dp = (struct lfs_direct *)((char *)dp + dp->d_reclen);
	}
	if (i > dirblksiz) {
		DIRHASH_UNLOCK(dh);
		brelse(bp, 0);
		return (-1);
	}
	if (freebytes < slotneeded)
		panic("ulfsdirhash_findfree: free mismatch");
	DIRHASH_UNLOCK(dh);
	brelse(bp, 0);
	*slotsize = pos + i - slotstart;
	return (slotstart);
}

/*
 * Return the start of the unused space at the end of a directory, or
 * -1 if there are no trailing unused blocks.
 */
doff_t
ulfsdirhash_enduseful(struct inode *ip)
{
	struct dirhash *dh;
	int i;
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return (-1);

	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return (-1);
	}

	if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) {
		DIRHASH_UNLOCK(dh);
		return (-1);
	}

	for (i = dh->dh_dirblks - 1; i >= 0; i--)
		if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
			break;
	DIRHASH_UNLOCK(dh);
	return ((doff_t)(i + 1) * dirblksiz);
}

/*
 * Insert information into the hash about a new directory entry. dirp
 * points to a struct lfs_direct containing the entry, and offset specifies
 * the offset of this entry.
 */
void
ulfsdirhash_add(struct inode *ip, struct lfs_direct *dirp, doff_t offset)
{
	struct dirhash *dh;
	int slot;
	const int needswap = ULFS_IPNEEDSWAP(ip);
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return;

	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	KASSERT(offset < dh->dh_dirblks * dirblksiz);
	/*
	 * Normal hash usage is < 66%. If the usage gets too high then
	 * remove the hash entirely and let it be rebuilt later.
	 */
	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	/* Find a free hash slot (empty or deleted), and add the entry. */
	slot = ulfsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
	while (DH_ENTRY(dh, slot) >= 0)
		slot = WRAPINCR(slot, dh->dh_hlen);
	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
		dh->dh_hused++;
	DH_ENTRY(dh, slot) = offset;

	/* Update the per-block summary info. */
	ulfsdirhash_adjfree(dh, offset, -LFS_DIRSIZ(0, dirp, needswap), dirblksiz);
	DIRHASH_UNLOCK(dh);
}

/*
 * Remove the specified directory entry from the hash. The entry to remove
 * is defined by the name in `dirp', which must exist at the specified
 * `offset' within the directory.
 */
void
ulfsdirhash_remove(struct inode *ip, struct lfs_direct *dirp, doff_t offset)
{
	struct dirhash *dh;
	int slot;
	const int needswap = ULFS_IPNEEDSWAP(ip);
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return;

	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	KASSERT(offset < dh->dh_dirblks * dirblksiz);
	/* Find the entry */
	slot = ulfsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);

	/* Remove the hash entry. */
	ulfsdirhash_delslot(dh, slot);

	/* Update the per-block summary info. */
	ulfsdirhash_adjfree(dh, offset, LFS_DIRSIZ(0, dirp, needswap), dirblksiz);
	DIRHASH_UNLOCK(dh);
}

/*
 * Change the offset associated with a directory entry in the hash. Used
 * when compacting directory blocks.
 */
void
ulfsdirhash_move(struct inode *ip, struct lfs_direct *dirp, doff_t oldoff,
    doff_t newoff)
{
	struct dirhash *dh;
	int slot;

	if ((dh = ip->i_dirhash) == NULL)
		return;
	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	KASSERT(oldoff < dh->dh_dirblks * ip->i_lfs->um_dirblksiz &&
	    newoff < dh->dh_dirblks * ip->i_lfs->um_dirblksiz);
	/* Find the entry, and update the offset. */
	slot = ulfsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
	DH_ENTRY(dh, slot) = newoff;
	DIRHASH_UNLOCK(dh);
}

/*
 * Inform dirhash that the directory has grown by one block that
 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
 */
void
ulfsdirhash_newblk(struct inode *ip, doff_t offset)
{
	struct dirhash *dh;
	int block;
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return;
	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	KASSERT(offset == dh->dh_dirblks * dirblksiz);
	block = offset / dirblksiz;
	if (block >= dh->dh_nblk) {
		/* Out of space; must rebuild. */
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}
	dh->dh_dirblks = block + 1;

	/* Account for the new free block. */
	dh->dh_blkfree[block] = dirblksiz / DIRALIGN;
	if (dh->dh_firstfree[DH_NFSTATS] == -1)
		dh->dh_firstfree[DH_NFSTATS] = block;
	DIRHASH_UNLOCK(dh);
}

/*
 * Inform dirhash that the directory is being truncated.
 */
void
ulfsdirhash_dirtrunc(struct inode *ip, doff_t offset)
{
	struct dirhash *dh;
	int block, i;
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if ((dh = ip->i_dirhash) == NULL)
		return;

	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	KASSERT(offset <= dh->dh_dirblks * dirblksiz);
	block = howmany(offset, dirblksiz);
	/*
	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
	 * (about 20% of its original size due to the 50% extra added in
	 * ulfsdirhash_build) then free it, and let the caller rebuild
	 * if necessary.
	 */
	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	/*
	 * Remove any `first free' information pertaining to the
	 * truncated blocks. All blocks we're removing should be
	 * completely unused.
	 */
	if (dh->dh_firstfree[DH_NFSTATS] >= block)
		dh->dh_firstfree[DH_NFSTATS] = -1;
	for (i = block; i < dh->dh_dirblks; i++)
		if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
			panic("ulfsdirhash_dirtrunc: blocks in use");
	for (i = 0; i < DH_NFSTATS; i++)
		if (dh->dh_firstfree[i] >= block)
			panic("ulfsdirhash_dirtrunc: first free corrupt");
	dh->dh_dirblks = block;
	DIRHASH_UNLOCK(dh);
}

/*
 * Debugging function to check that the dirhash information about
 * a directory block matches its actual contents. Panics if a mismatch
 * is detected.
 *
 * On entry, `sbuf' should point to the start of an in-core
 * DIRBLKSIZ-sized directory block, and `offset' should contain the
 * offset from the start of the directory of that block.
 */
void
ulfsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset)
{
	struct dirhash *dh;
	struct lfs_direct *dp;
	int block, ffslot, i, nfree;
	const int needswap = ULFS_IPNEEDSWAP(ip);
	int dirblksiz = ip->i_lfs->um_dirblksiz;

	if (!ulfs_dirhashcheck)
		return;
	if ((dh = ip->i_dirhash) == NULL)
		return;

	DIRHASH_LOCK(dh);
	if (dh->dh_hash == NULL) {
		DIRHASH_UNLOCK(dh);
		ulfsdirhash_free(ip);
		return;
	}

	block = offset / dirblksiz;
	if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks)
		panic("ulfsdirhash_checkblock: bad offset");

	nfree = 0;
	for (i = 0; i < dirblksiz; i += dp->d_reclen) {
		dp = (struct lfs_direct *)(sbuf + i);
		if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz)
			panic("ulfsdirhash_checkblock: bad dir");

		if (dp->d_ino == 0) {
#if 0
			/*
			 * XXX entries with d_ino == 0 should only occur
			 * at the start of a DIRBLKSIZ block. However the
			 * ulfs code is tolerant of such entries at other
			 * offsets, and fsck does not fix them.
			 */
			if (i != 0)
				panic("ulfsdirhash_checkblock: bad dir inode");
#endif
			nfree += dp->d_reclen;
			continue;
		}

		/* Check that the entry exists (will panic if it doesn't). */
		ulfsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);

		nfree += dp->d_reclen - LFS_DIRSIZ(0, dp, needswap);
	}
	if (i != dirblksiz)
		panic("ulfsdirhash_checkblock: bad dir end");

	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
		panic("ulfsdirhash_checkblock: bad free count");

	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
	for (i = 0; i <= DH_NFSTATS; i++)
		if (dh->dh_firstfree[i] == block && i != ffslot)
			panic("ulfsdirhash_checkblock: bad first-free");
	if (dh->dh_firstfree[ffslot] == -1)
		panic("ulfsdirhash_checkblock: missing first-free entry");
	DIRHASH_UNLOCK(dh);
}

/*
 * Hash the specified filename into a dirhash slot.
 */
static int
ulfsdirhash_hash(struct dirhash *dh, const char *name, int namelen)
{
	u_int32_t hash;

	/*
	 * We hash the name and then some other bit of data that is
	 * invariant over the dirhash's lifetime. Otherwise names
	 * differing only in the last byte are placed close to one
	 * another in the table, which is bad for linear probing.
	 */
	hash = hash32_buf(name, namelen, HASH32_BUF_INIT);
	hash = hash32_buf(&dh, sizeof(dh), hash);
	return (hash % dh->dh_hlen);
}

/*
 * Adjust the number of free bytes in the block containing `offset'
 * by the value specified by `diff'.
 *
 * The caller must ensure we have exclusive access to `dh'; normally
 * that means that dh_lock should be held, but this is also called
 * from ulfsdirhash_build() where exclusive access can be assumed.
 */
static void
ulfsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz)
{
	int block, i, nfidx, ofidx;

	KASSERT(mutex_owned(&dh->dh_lock));

	/* Update the per-block summary info. */
	block = offset / dirblksiz;
	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks);
	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);

	/* Update the `first free' list if necessary. */
	if (ofidx != nfidx) {
		/* If removing, scan forward for the next block. */
		if (dh->dh_firstfree[ofidx] == block) {
			for (i = block + 1; i < dh->dh_dirblks; i++)
				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
					break;
			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
		}

		/* Make this the new `first free' if necessary */
		if (dh->dh_firstfree[nfidx] > block ||
		    dh->dh_firstfree[nfidx] == -1)
			dh->dh_firstfree[nfidx] = block;
	}
}

/*
 * Find the specified name which should have the specified offset.
 * Returns a slot number, and panics on failure.
 *
 * `dh' must be locked on entry and remains so on return.
 */
static int
ulfsdirhash_findslot(struct dirhash *dh, const char *name, int namelen,
    doff_t offset)
{
	int slot;

	KASSERT(mutex_owned(&dh->dh_lock));

	/* Find the entry. */
	KASSERT(dh->dh_hused < dh->dh_hlen);
	slot = ulfsdirhash_hash(dh, name, namelen);
	while (DH_ENTRY(dh, slot) != offset &&
	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
		slot = WRAPINCR(slot, dh->dh_hlen);
	if (DH_ENTRY(dh, slot) != offset)
		panic("ulfsdirhash_findslot: '%.*s' not found", namelen, name);

	return (slot);
}

/*
 * Remove the entry corresponding to the specified slot from the hash array.
 *
 * `dh' must be locked on entry and remains so on return.
 */
static void
ulfsdirhash_delslot(struct dirhash *dh, int slot)
{
	int i;

	KASSERT(mutex_owned(&dh->dh_lock));

	/* Mark the entry as deleted. */
	DH_ENTRY(dh, slot) = DIRHASH_DEL;

	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
		i = WRAPINCR(i, dh->dh_hlen);
	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
		i = WRAPDECR(i, dh->dh_hlen);
		while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
			dh->dh_hused--;
			i = WRAPDECR(i, dh->dh_hlen);
		}
		KASSERT(dh->dh_hused >= 0);
	}
}

/*
 * Given a directory entry and its offset, find the offset of the
 * previous entry in the same DIRBLKSIZ-sized block. Returns an
 * offset, or -1 if there is no previous entry in the block or some
 * other problem occurred.
 */
static doff_t
ulfsdirhash_getprev(struct lfs_direct *dirp, doff_t offset, int dirblksiz)
{
	struct lfs_direct *dp;
	char *blkbuf;
	doff_t blkoff, prevoff;
	int entrypos, i;

	blkoff = offset & ~(dirblksiz - 1);	/* offset of start of block */
	entrypos = offset & (dirblksiz - 1);	/* entry relative to block */
	blkbuf = (char *)dirp - entrypos;
	prevoff = blkoff;

	/* If `offset' is the start of a block, there is no previous entry. */
	if (entrypos == 0)
		return (-1);

	/* Scan from the start of the block until we get to the entry. */
	for (i = 0; i < entrypos; i += dp->d_reclen) {
		dp = (struct lfs_direct *)(blkbuf + i);
		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
			return (-1);	/* Corrupted directory. */
		prevoff = blkoff + i;
	}
	return (prevoff);
}

/*
 * Try to free up `wanted' bytes by stealing memory from existing
 * dirhashes. Returns zero with list locked if successful.
 */
static int
ulfsdirhash_recycle(int wanted)
{
	struct dirhash *dh;
	doff_t **hash;
	u_int8_t *blkfree;
	int i, mem, narrays;
	size_t hashsz, blkfreesz;

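	/*
	 * Victims are taken from the head of the list (the least recently
	 * used end); a dirhash whose score has not yet decayed to zero
	 * stops the scan.  On success we return with ulfsdirhash_lock
	 * still held; the caller (ulfsdirhash_build) drops it before
	 * retrying its memory reservation.
	 */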
	DIRHASHLIST_LOCK();
	while (wanted + ulfs_dirhashmem > ulfs_dirhashmaxmem) {
		/* Find a dirhash, and lock it. */
		if ((dh = TAILQ_FIRST(&ulfsdirhash_list)) == NULL) {
			DIRHASHLIST_UNLOCK();
			return (-1);
		}
		DIRHASH_LOCK(dh);
		KASSERT(dh->dh_hash != NULL);

		/* Decrement the score; only recycle if it becomes zero. */
		if (--dh->dh_score > 0) {
			DIRHASH_UNLOCK(dh);
			DIRHASHLIST_UNLOCK();
			return (-1);
		}

		/* Remove it from the list and detach its memory. */
		TAILQ_REMOVE(&ulfsdirhash_list, dh, dh_list);
		dh->dh_onlist = 0;
		hash = dh->dh_hash;
		hashsz = dh->dh_hashsz;
		dh->dh_hash = NULL;
		blkfree = dh->dh_blkfree;
		blkfreesz = dh->dh_blkfreesz;
		dh->dh_blkfree = NULL;
		narrays = dh->dh_narrays;
		mem = narrays * sizeof(*dh->dh_hash) +
		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
		    dh->dh_nblk * sizeof(*dh->dh_blkfree);

		/* Unlock everything, free the detached memory. */
		DIRHASH_UNLOCK(dh);
		DIRHASHLIST_UNLOCK();

		for (i = 0; i < narrays; i++)
			DIRHASH_BLKFREE(hash[i]);
		kmem_free(hash, hashsz);
		kmem_free(blkfree, blkfreesz);

		/* Account for the returned memory, and repeat if necessary. */
		DIRHASHLIST_LOCK();
		atomic_add_int(&ulfs_dirhashmem, -mem);
	}
	/* Success. */
	return (0);
}

static void
ulfsdirhash_sysctl_init(void)
{
	const struct sysctlnode *rnode, *cnode;

	sysctl_createv(&ulfsdirhash_sysctl_log, 0, NULL, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "ulfs",
		       SYSCTL_DESCR("ulfs"),
		       NULL, 0, NULL, 0,
		       CTL_VFS, CTL_CREATE, CTL_EOL);

	sysctl_createv(&ulfsdirhash_sysctl_log, 0, &rnode, &rnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "dirhash",
		       SYSCTL_DESCR("dirhash"),
		       NULL, 0, NULL, 0,
		       CTL_CREATE, CTL_EOL);

	sysctl_createv(&ulfsdirhash_sysctl_log, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "minblocks",
		       SYSCTL_DESCR("minimum hashed directory size in blocks"),
		       NULL, 0, &ulfs_dirhashminblks, 0,
		       CTL_CREATE, CTL_EOL);

	sysctl_createv(&ulfsdirhash_sysctl_log, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxmem",
		       SYSCTL_DESCR("maximum dirhash memory usage"),
		       NULL, 0, &ulfs_dirhashmaxmem, 0,
		       CTL_CREATE, CTL_EOL);

	sysctl_createv(&ulfsdirhash_sysctl_log, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
		       CTLTYPE_INT, "memused",
		       SYSCTL_DESCR("current dirhash memory usage"),
		       NULL, 0, &ulfs_dirhashmem, 0,
		       CTL_CREATE, CTL_EOL);

	sysctl_createv(&ulfsdirhash_sysctl_log, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "docheck",
		       SYSCTL_DESCR("enable extra sanity checks"),
		       NULL, 0, &ulfs_dirhashcheck, 0,
		       CTL_CREATE, CTL_EOL);
}

void
ulfsdirhash_init(void)
{

	mutex_init(&ulfsdirhash_lock, MUTEX_DEFAULT, IPL_NONE);
	ulfsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0,
	    0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL);
	ulfsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0,
	    0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL);
	TAILQ_INIT(&ulfsdirhash_list);
	ulfsdirhash_sysctl_init();
}

void
ulfsdirhash_done(void)
{

	KASSERT(TAILQ_EMPTY(&ulfsdirhash_list));
	pool_cache_destroy(ulfsdirhashblk_cache);
	pool_cache_destroy(ulfsdirhash_cache);
	mutex_destroy(&ulfsdirhash_lock);
	sysctl_teardown(&ulfsdirhash_sysctl_log);
}