/*	$NetBSD: vfs_bio.c,v 1.98 2003/12/02 03:36:33 dbj Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include "opt_softdep.h"

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.98 2003/12/02 03:36:33 dbj Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/* Macros to clear/set/test flags. */
#define	SET(t, f)	(t) |= (f)
#define	CLR(t, f)	(t) &= ~(f)
#define	ISSET(t, f)	((t) & (f))

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;
#ifndef SOFTDEP
struct bio_ops bioops;	/* I/O operation notification */
#endif

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int needbuffer;

/*
 * Buffer queue lock.
 * Take this lock first if also taking some buffer's b_interlock.
 */
struct simplelock bqueue_slock = SIMPLELOCK_INITIALIZER;

/*
 * Buffer pool for I/O buffers.
 */
struct pool bufpool;

/*
 * bread()/breadn() helper.
 */
static __inline struct buf *bio_doread(struct vnode *, daddr_t, int,
    struct ucred *, int);
int count_lock_queue(void);

/*
 * Insq/Remq for the buffer free lists.
 * Call with buffer queue locked.
 */
#define	binsheadfree(bp, dp)	TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define	binstailfree(bp, dp)	TAILQ_INSERT_TAIL(dp, bp, b_freelist)

void
bremfree(bp)
	struct buf *bp;
{
	struct bqueues *dp = NULL;

	LOCK_ASSERT(simple_lock_held(&bqueue_slock));

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 *
	 * We break the TAILQ abstraction in order to efficiently remove a
	 * buffer from its freelist without having to know exactly which
	 * freelist it is on.
	 */
	if (TAILQ_NEXT(bp, b_freelist) == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	struct buf *bp;
	struct bqueues *dp;
	u_int i, base, residual;

	/*
	 * Initialize the buffer pool.  This pool is used for buffers
	 * which are strictly I/O control blocks, not buffer cache
	 * buffers.
	 */
	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		memset((char *)bp, 0, sizeof(*bp));
		BUF_INIT(bp);
		bp->b_dev = NODEV;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * PAGE_SIZE;
		else
			bp->b_bufsize = base * PAGE_SIZE;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}
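
/*
 * Illustrative note, not part of the original source: the loop above
 * spreads the bufpages pages of buffer cache memory evenly over the
 * nbuf headers.  With hypothetical values bufpages = 1000 and
 * nbuf = 300, base = 3 and residual = 100, so the first 100 headers
 * start out with 4 pages of space each and the remaining 200 with 3.
 * A header that ends up with no pages at all goes on BQ_EMPTY rather
 * than BQ_AGE.
 */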

static __inline struct buf *
bio_doread(vp, blkno, size, cred, async)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	int async;
{
	struct buf *bp;
	struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);	/* XXX */
	struct proc *p = l->l_proc;

	bp = getblk(vp, blkno, size, 0, 0);

#ifdef DIAGNOSTIC
	if (bp == NULL) {
		panic("bio_doread: no such buf");
	}
#endif

	/*
	 * If buffer does not have data valid, start a read.
	 * Note that if buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		/* Start I/O for the buffer. */
		SET(bp->b_flags, B_READ | async);
		VOP_STRATEGY(bp);

		/* Pay for the read. */
		p->p_stats->p_ru.ru_inblock++;
	} else if (async) {
		brelse(bp);
	}

	return (bp);
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
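
/*
 * Illustrative sketch, not part of the original source: a typical
 * caller reads a logical block, examines the data and then releases
 * the buffer.  "vp", "lbn" and "bsize" are hypothetical names.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(use bp->b_data ...)
 *	brelse(bp);
 */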

/*
 * Read-ahead multiple disk blocks.  The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}
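
/*
 * Illustrative sketch, not part of the original source: a sequential
 * reader can hand breadn() the next logical block as a read-ahead
 * hint, so the following I/O is already in flight by the time it is
 * needed.  "vp", "lbn" and "bsize" are hypothetical names.
 *
 *	daddr_t ralbn = lbn + 1;
 *	int rasize = bsize;
 *
 *	error = breadn(vp, lbn, bsize, &ralbn, &rasize, 1, NOCRED, &bp);
 */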

/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{

	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}

/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, sync, wasdelayed, s;
	struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);	/* XXX */
	struct proc *p = l->l_proc;
	struct vnode *vp;
	struct mount *mp;

	KASSERT(ISSET(bp->b_flags, B_BUSY));

	vp = bp->b_vp;
	if (vp != NULL) {
		if (vp->v_type == VBLK)
			mp = vp->v_specmountpoint;
		else
			mp = vp->v_mount;
	} else {
		mp = NULL;
	}

	/*
	 * Remember buffer type, to switch on it later.  If the write was
	 * synchronous, but the file system was mounted with MNT_ASYNC,
	 * convert it to a delayed write.
	 * XXX note that this relies on delayed tape writes being converted
	 * to async, not sync writes (which is safe, but ugly).
	 */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
		bdwrite(bp);
		return (0);
	}

	/*
	 * Collect statistics on synchronous and asynchronous writes.
	 * Writes to block devices are charged to their associated
	 * filesystem (if any).
	 */
	if (mp != NULL) {
		if (sync)
			mp->mnt_stat.f_syncwrites++;
		else
			mp->mnt_stat.f_asyncwrites++;
	}

	s = splbio();
	simple_lock(&bp->b_interlock);

	wasdelayed = ISSET(bp->b_flags, B_DELWRI);

	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	/*
	 * Pay for the I/O operation and make sure the buf is on the correct
	 * vnode queue.
	 */
	if (wasdelayed)
		reassignbuf(bp, bp->b_vp);
	else
		p->p_stats->p_ru.ru_oublock++;

	/* Initiate disk write.  Make sure the appropriate party is charged. */
	V_INCR_NUMOUTPUT(bp->b_vp);
	simple_unlock(&bp->b_interlock);
	splx(s);

	VOP_STRATEGY(bp);

	if (sync) {
		/* If I/O was synchronous, wait for it to complete. */
		rv = biowait(bp);

		/* Release the buffer. */
		brelse(bp);

		return (rv);
	} else {
		return (0);
	}
}
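
/*
 * Illustrative sketch, not part of the original source: the common
 * synchronous update pattern is read-modify-write.  Note that the
 * caller does not release the buffer again after bwrite(): the
 * synchronous path brelse()s it after biowait(), and an asynchronous
 * write is released from biodone().  "vp", "lbn" and "bsize" are
 * hypothetical names.
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(modify bp->b_data ...)
 *	error = bwrite(bp);
 */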

int
vn_bwrite(v)
	void *v;
{
	struct vop_bwrite_args *ap = v;

	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(bp)
	struct buf *bp;
{
	struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);	/* XXX */
	struct proc *p = l->l_proc;
	const struct bdevsw *bdev;
	int s;

	/* If this is a tape block, write the block now. */
	bdev = bdevsw_lookup(bp->b_dev);
	if (bdev != NULL && bdev->d_type == D_TAPE) {
		bawrite(bp);
		return;
	}

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write,
	 *	(3) Make sure it's on its vnode's correct block list.
	 */
	s = splbio();
	simple_lock(&bp->b_interlock);

	KASSERT(ISSET(bp->b_flags, B_BUSY));

	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		p->p_stats->p_ru.ru_oublock++;
		reassignbuf(bp, bp->b_vp);
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	CLR(bp->b_flags, B_DONE);
	simple_unlock(&bp->b_interlock);
	splx(s);

	brelse(bp);
}
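
/*
 * Illustrative sketch, not part of the original source: a block that
 * will probably be modified again soon (e.g. one being filled a few
 * entries at a time) is usually updated with bdwrite() instead of
 * bwrite(), so that several small updates collapse into one eventual
 * disk write.  "vp", "lbn" and "bsize" are hypothetical names.
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(modify part of bp->b_data ...)
 *	bdwrite(bp);	(marks B_DELWRI and releases; no I/O is started yet)
 */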

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(bp)
	struct buf *bp;
{
	int s;

	s = splbio();
	simple_lock(&bp->b_interlock);

	KASSERT(ISSET(bp->b_flags, B_BUSY));

	SET(bp->b_flags, B_ASYNC);
	simple_unlock(&bp->b_interlock);
	splx(s);
	VOP_BWRITE(bp);
}

/*
 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
 * Call at splbio() and with the buffer interlock locked.
 * Note: called only from biodone() through ffs softdep's bioops.io_complete()
 */
void
bdirty(bp)
	struct buf *bp;
{
	struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);	/* XXX */
	struct proc *p = l->l_proc;

	LOCK_ASSERT(simple_lock_held(&bp->b_interlock));
	KASSERT(ISSET(bp->b_flags, B_BUSY));

	CLR(bp->b_flags, B_AGE);

	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		p->p_stats->p_ru.ru_oublock++;
		reassignbuf(bp, bp->b_vp);
	}
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;

	/* Block disk interrupts. */
	s = splbio();
	simple_lock(&bqueue_slock);
	simple_lock(&bp->b_interlock);

	KASSERT(ISSET(bp->b_flags, B_BUSY));
	KASSERT(!ISSET(bp->b_flags, B_CALL));

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED|B_AGE);
		wakeup(bp);
	}

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if (ISSET(bp->b_flags, B_VFLUSH)) {
		/*
		 * This is a delayed write buffer that was just flushed to
		 * disk.  It is still on the LRU queue.  If it's become
		 * invalid, then we need to move it to a different queue;
		 * otherwise leave it in its current position.
		 */
		CLR(bp->b_flags, B_VFLUSH);
		if (!ISSET(bp->b_flags, B_ERROR|B_INVAL|B_LOCKED|B_AGE))
			goto already_queued;
		else
			bremfree(bp);
	}

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put on the head of the appropriate queue.
		 */
		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
			(*bioops.io_deallocate)(bp);
		CLR(bp->b_flags, B_DONE|B_DELWRI);
		if (bp->b_vp) {
			reassignbuf(bp, bp->b_vp);
			brelvp(bp);
		}
		if (bp->b_bufsize <= 0)
			/* no data */
			bufq = &bufqueues[BQ_EMPTY];
		else
			/* invalid data */
			bufq = &bufqueues[BQ_AGE];
		binsheadfree(bp, bufq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 * If buf is AGE, but has dependencies, must put it on last
		 * bufqueue to be scanned, i.e. LRU.  This protects against
		 * the livelock where BQ_AGE only has buffers with
		 * dependencies, and we thus never get to the dependent
		 * buffers in BQ_LRU.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			/* locked in core */
			bufq = &bufqueues[BQ_LOCKED];
		else if (!ISSET(bp->b_flags, B_AGE))
			/* valid data */
			bufq = &bufqueues[BQ_LRU];
		else {
			/* stale but valid data */
			int has_deps;

			if (LIST_FIRST(&bp->b_dep) != NULL &&
			    bioops.io_countdeps)
				has_deps = (*bioops.io_countdeps)(bp, 0);
			else
				has_deps = 0;
			bufq = has_deps ? &bufqueues[BQ_LRU] :
			    &bufqueues[BQ_AGE];
		}
		binstailfree(bp, bufq);
	}

already_queued:
	/* Unlock the buffer. */
	CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE);
	SET(bp->b_flags, B_CACHE);

	/* Allow disk interrupts. */
	simple_unlock(&bp->b_interlock);
	simple_unlock(&bqueue_slock);
	splx(s);
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;

	/* Search hash chain */
	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	}

	return (NULL);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it.  Otherwise, return an empty block of the
 * correct size.  It is up to the caller to ensure that the
 * cached blocks be of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	struct buf *bp;
	int s, err;

start:
	s = splbio();
	simple_lock(&bqueue_slock);
	bp = incore(vp, blkno);
	if (bp != NULL) {
		simple_lock(&bp->b_interlock);
		if (ISSET(bp->b_flags, B_BUSY)) {
			simple_unlock(&bqueue_slock);
			if (curproc == uvm.pagedaemon_proc) {
				simple_unlock(&bp->b_interlock);
				splx(s);
				return NULL;
			}
			SET(bp->b_flags, B_WANTED);
			err = ltsleep(bp, slpflag | (PRIBIO + 1) | PNORELOCK,
			    "getblk", slptimeo, &bp->b_interlock);
			splx(s);
			if (err)
				return (NULL);
			goto start;
		}
#ifdef DIAGNOSTIC
		if (ISSET(bp->b_flags, B_DONE|B_DELWRI) &&
		    bp->b_bcount < size && vp->v_type != VBLK)
			panic("getblk: block size invariant failed");
#endif
		SET(bp->b_flags, B_BUSY);
		bremfree(bp);
	} else {
		if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) {
			simple_unlock(&bqueue_slock);
			splx(s);
			goto start;
		}

		binshash(bp, BUFHASH(vp, blkno));
		bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
		bgetvp(vp, bp);
	}
	simple_unlock(&bp->b_interlock);
	simple_unlock(&bqueue_slock);
	splx(s);
	/*
	 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
	 * if we re-size buffers here.
	 */
	if (ISSET(bp->b_flags, B_LOCKED)) {
		KASSERT(bp->b_bufsize >= size);
	} else {
		allocbuf(bp, size);
	}
	return (bp);
}
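
/*
 * Illustrative sketch, not part of the original source: when a block
 * is about to be completely overwritten its old contents are not
 * needed, so callers use getblk() directly instead of bread() and
 * fill the buffer themselves.  "vp", "lbn" and "bsize" are
 * hypothetical names.
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	memset(bp->b_data, 0, bsize);
 *	(fill in the new contents ...)
 *	bwrite(bp);		(or bdwrite(bp) for a delayed write)
 */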

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;
	int s;

	s = splbio();
	simple_lock(&bqueue_slock);
	while ((bp = getnewbuf(0, 0)) == 0)
		;

	SET(bp->b_flags, B_INVAL);
	binshash(bp, &invalhash);
	simple_unlock(&bqueue_slock);
	simple_unlock(&bp->b_interlock);
	splx(s);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
void
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	struct buf *nbp;
	vsize_t desired_size;
	int s;

	desired_size = round_page((vsize_t)size);
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

	if (bp->b_bufsize == desired_size)
		goto out;

	/*
	 * If the buffer is smaller than the desired size, we need to snarf
	 * it from other buffers.  Get buffers (via getnewbuf()), and
	 * steal their pages.
	 */
	while (bp->b_bufsize < desired_size) {
		int amt;

		/* find a buffer */
		s = splbio();
		simple_lock(&bqueue_slock);
		while ((nbp = getnewbuf(0, 0)) == NULL)
			;

		SET(nbp->b_flags, B_INVAL);
		binshash(nbp, &invalhash);

		simple_unlock(&nbp->b_interlock);
		simple_unlock(&bqueue_slock);
		splx(s);

		/* and steal its pages, up to the amount we need */
		amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
		pagemove((nbp->b_data + nbp->b_bufsize - amt),
		    bp->b_data + bp->b_bufsize, amt);
		bp->b_bufsize += amt;
		nbp->b_bufsize -= amt;

		/* reduce transfer count if we stole some data */
		if (nbp->b_bcount > nbp->b_bufsize)
			nbp->b_bcount = nbp->b_bufsize;

#ifdef DIAGNOSTIC
		if (nbp->b_bufsize < 0)
			panic("allocbuf: negative bufsize");
#endif
		brelse(nbp);
	}

	/*
	 * If we want a buffer smaller than the current size,
	 * shrink this buffer.  Grab a buf head from the EMPTY queue,
	 * move a page onto it, and put it on front of the AGE queue.
	 * If there are no free buffer headers, leave the buffer alone.
	 */
	if (bp->b_bufsize > desired_size) {
		s = splbio();
		simple_lock(&bqueue_slock);
		if ((nbp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) == NULL) {
			/* No free buffer head */
			simple_unlock(&bqueue_slock);
			splx(s);
			goto out;
		}
		/* No need to lock nbp since it came from the empty queue */
		bremfree(nbp);
		SET(nbp->b_flags, B_BUSY | B_INVAL);
		simple_unlock(&bqueue_slock);
		splx(s);

		/* move the page to it and note this change */
		pagemove(bp->b_data + desired_size,
		    nbp->b_data, bp->b_bufsize - desired_size);
		nbp->b_bufsize = bp->b_bufsize - desired_size;
		bp->b_bufsize = desired_size;
		nbp->b_bcount = 0;

		/* release the newly-filled buffer and leave */
		brelse(nbp);
	}

out:
	bp->b_bcount = size;
}
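
/*
 * Illustrative note, not part of the original source: sizes are
 * rounded up to whole pages, so with a hypothetical PAGE_SIZE of
 * 4096 a call such as allocbuf(bp, 3000) normally leaves b_bufsize
 * at 4096 (the shrink case needs a spare buffer header), while
 * b_bcount, the amount of the buffer actually used for the transfer,
 * is set to the requested 3000.
 */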

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * Called with buffer queues locked.
 * Return buffer locked.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	struct buf *bp;

start:
	LOCK_ASSERT(simple_lock_held(&bqueue_slock));

	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
	    (bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
		simple_lock(&bp->b_interlock);
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needbuffer = 1;
		ltsleep(&needbuffer, slpflag|(PRIBIO+1),
		    "getnewbuf", slptimeo, &bqueue_slock);
		return (NULL);
	}

	if (ISSET(bp->b_flags, B_VFLUSH)) {
		/*
		 * This is a delayed write buffer being flushed to disk.  Make
		 * sure it gets aged out of the queue when it's finished, and
		 * leave it off the LRU queue.
		 */
		CLR(bp->b_flags, B_VFLUSH);
		SET(bp->b_flags, B_AGE);
		simple_unlock(&bp->b_interlock);
		goto start;
	}

	/* Buffer is no longer on free lists. */
	SET(bp->b_flags, B_BUSY);

	/*
	 * If buffer was a delayed write, start it and return NULL
	 * (since we might sleep while starting the write).
	 */
	if (ISSET(bp->b_flags, B_DELWRI)) {
		/*
		 * This buffer has gone through the LRU, so make sure it gets
		 * reused ASAP.
		 */
		SET(bp->b_flags, B_AGE);
		simple_unlock(&bp->b_interlock);
		simple_unlock(&bqueue_slock);
		bawrite(bp);
		simple_lock(&bqueue_slock);
		return (NULL);
	}

	/* disassociate us from our vnode, if we had one... */
	if (bp->b_vp)
		brelvp(bp);

	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
		(*bioops.io_deallocate)(bp);

	/* clear out various other fields */
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;

	bremhash(bp);
	return (bp);
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
	struct buf *bp;
{
	int s, error;

	s = splbio();
	simple_lock(&bp->b_interlock);
	while (!ISSET(bp->b_flags, B_DONE | B_DELWRI))
		ltsleep(bp, PRIBIO + 1, "biowait", 0, &bp->b_interlock);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		error = EINTR;
	} else if (ISSET(bp->b_flags, B_ERROR))
		error = bp->b_error ? bp->b_error : EIO;
	else
		error = 0;

	simple_unlock(&bp->b_interlock);
	splx(s);
	return (error);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(bp)
	struct buf *bp;
{
	int s = splbio();

	simple_lock(&bp->b_interlock);
	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */

	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
		(*bioops.io_complete)(bp);

	if (!ISSET(bp->b_flags, B_READ))	/* wake up reader */
		vwakeup(bp);

	/*
	 * If necessary, call out.  Unlock the buffer before calling
	 * iodone() as the buffer isn't valid any more when it returns.
	 */
	if (ISSET(bp->b_flags, B_CALL)) {
		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		simple_unlock(&bp->b_interlock);
		(*bp->b_iodone)(bp);
	} else {
		if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release */
			simple_unlock(&bp->b_interlock);
			brelse(bp);
		} else {			/* or just wakeup the buffer */
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
			simple_unlock(&bp->b_interlock);
		}
	}

	splx(s);
}
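
/*
 * Illustrative sketch, not part of the original source: a disk
 * driver's interrupt handler typically finishes a transfer by
 * recording any error and the residual count and then calling
 * biodone(), which runs the B_CALL callback, brelse()s an async
 * buffer, or wakes the process sleeping in biowait().  "xfer" and
 * its fields are hypothetical names.
 *
 *	if (xfer->error) {
 *		bp->b_flags |= B_ERROR;
 *		bp->b_error = EIO;
 *	}
 *	bp->b_resid = xfer->resid;
 *	biodone(bp);
 */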

/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue()
{
	struct buf *bp;
	int n = 0;

	simple_lock(&bqueue_slock);
	TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED], b_freelist)
		n++;
	simple_unlock(&bqueue_slock);
	return (n);
}

#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	struct buf *bp;
	struct bqueues *dp;
	int counts[(MAXBSIZE / PAGE_SIZE) + 1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			counts[j] = 0;
		s = splbio();
		TAILQ_FOREACH(bp, dp, b_freelist) {
			counts[bp->b_bufsize/PAGE_SIZE]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
		printf("\n");
	}
}
#endif /* DEBUG */