/*	$NetBSD: vfs_bio.c,v 1.33 1994/10/30 21:48:10 cgd Exp $	*/

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

/* Macros to clear/set/test flags. */
#define	SET(t, f)	(t) |= (f)
#define	CLR(t, f)	(t) &= ~(f)
#define	ISSET(t, f)	((t) & (f))

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;
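/*
 * Note: hashinit() sizes bufhashtbl to a power of two and sets bufhash to
 * that size minus one, so BUFHASH() simply masks the sum of the scaled
 * vnode pointer and the logical block number down to a table index.
 */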

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp)	TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define	binstailfree(bp, dp)	TAILQ_INSERT_TAIL(dp, bp, b_freelist)

void
bremfree(bp)
	struct buf *bp;
{
	struct bqueues *dp = NULL;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct bqueues *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
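	/*
	 * Parcel the buffer-cache memory out evenly: each of the nbuf
	 * headers gets "base" clusters, and the first "residual" headers
	 * get one extra, so that exactly bufpages clusters are handed out.
	 */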
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = getblk(vp, blkno, size, 0, 0);

	/*
	 * If buffer data valid, return it.
	 * Note that if buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (ISSET(bp->b_flags, (B_DONE | B_DELWRI)))
		return (0);

	/* Start some I/O for the buffer (keeping credentials, if needed). */
	SET(bp->b_flags, B_READ);
	if (cred != NOCRED && bp->b_rcred == NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);

	/* Pay for the read. */
	curproc->p_stats->p_ru.ru_inblock++;		/* XXX */

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}

/*
 * Read-ahead multiple disk blocks.  The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rabp;
	int i;

	bp = NULL;	/* We don't have a buffer yet. */

	/* If first block not in cache, get buffer for it and read it in. */
	if (!incore(vp, blkno)) {
		bp = *bpp = getblk(vp, blkno, size, 0, 0);

		/*
		 * If buffer data not valid, we have to read it in.
		 * If it is valid, just hold on to the buffer pointer.
		 */
		if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
			/* Start I/O for the buffer (keeping credentials). */
			SET(bp->b_flags, B_READ);
			if (cred != NOCRED && bp->b_rcred == NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);

			/* Pay for the read. */
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to the next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block. */
		rabp = getblk(vp, rablks[i], rasizes[i], 0, 0);

		/*
		 * If buffer data valid, just release the buffer back into
		 * the cache.  If it's not valid, we have to read it in.
		 */
		if (ISSET(rabp->b_flags, (B_DONE | B_DELWRI)))
			brelse(rabp);
		else {
			/* Start I/O for the buffer (keeping credentials). */
			SET(rabp->b_flags, (B_READ | B_ASYNC));
			if (cred != NOCRED && rabp->b_rcred == NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);

			/* Pay for the read. */
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/*
	 * If the first block was originally in the cache (i.e. we *still*
	 * don't have a buffer), use bread to get and return it.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}

/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}

/*
 * Block write.  Described in Bach (p.56).
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, s, sync, wasdelayed;

	rv = 0;

	/* Remember buffer type, to switch on it later. */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	/*
	 * If not synchronous, pay for the I/O operation and make
	 * sure the buf is on the correct vnode queue.  We have
	 * to do this now, because if we don't, the vnode may not
	 * be properly notified that its I/O has completed.
	 */
	if (!sync)
		if (wasdelayed)
			reassignbuf(bp, bp->b_vp);
		else
			curproc->p_stats->p_ru.ru_oublock++;

	/* Initiate disk write.  Make sure the appropriate party is charged. */
	SET(bp->b_flags, B_WRITEINPROG);
	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	/*
	 * If I/O was synchronous, wait for it to complete.
	 */
	if (sync)
		rv = biowait(bp);

	/*
	 * Pay for the I/O operation, if it's not been paid for, and
	 * make sure it's on the correct vnode queue.  (Async operations
	 * were paid for above.)
	 */
	if (sync)
		if (wasdelayed)
			reassignbuf(bp, bp->b_vp);
		else
			curproc->p_stats->p_ru.ru_oublock++;

	/* Release the buffer, or, if async, make sure it gets reused ASAP. */
	if (sync)
		brelse(bp);
	else if (wasdelayed) {
		s = splbio();
		SET(bp->b_flags, B_AGE);
		splx(s);
	}
	return (rv);
}

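/*
 * vn_bwrite() is the generic VOP_BWRITE routine: file systems that need
 * no special handling of writes simply hand the buffer on to bwrite().
 */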
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(bp)
	struct buf *bp;
{

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write, and
	 *	(3) Make sure it's on its vnode's correct block list.
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		curproc->p_stats->p_ru.ru_oublock++;	/* XXX */
		reassignbuf(bp, bp->b_vp);
	}

	/* If this is a tape block, write the block now. */
	if (ISSET(bp->b_flags, B_TAPE)) {
		bwrite(bp);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->b_flags, B_DONE);
	brelse(bp);
}

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(bp)
	struct buf *bp;
{

	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	/* Block disk interrupts. */
	s = splbio();

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put it on the head of the appropriate queue.
		 */
		if (bp->b_vp)
			brelvp(bp);
		CLR(bp->b_flags, B_DELWRI);
		if (bp->b_bufsize <= 0)
			/* no data */
			bufq = &bufqueues[BQ_EMPTY];
		else
			/* invalid data */
			bufq = &bufqueues[BQ_AGE];
		binsheadfree(bp, bufq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			/* locked in core */
			bufq = &bufqueues[BQ_LOCKED];
		else if (ISSET(bp->b_flags, B_AGE))
			/* stale but valid data */
			bufq = &bufqueues[BQ_AGE];
		else
			/* valid data */
			bufq = &bufqueues[BQ_LRU];
		binstailfree(bp, bufq);
	}

	/* Unlock the buffer. */
	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));

	/* Allow disk interrupts. */
	splx(s);
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there and not
 * marked invalid, return a pointer to it; otherwise return NULL.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;

	bp = BUFHASH(vp, blkno)->lh_first;

	/* Search hash chain. */
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	}

	return (0);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it.  Otherwise, return an empty block of the
 * correct size.  It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	struct buf *bp;
	int s, err;

start:
	s = splbio();
	if (bp = incore(vp, blkno)) {	/* XXX NFS VOP_BWRITE foolishness */
		if (ISSET(bp->b_flags, B_BUSY)) {
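			/*
			 * The buffer is in use; note that we want it and
			 * sleep until it is released.  We must then look it
			 * up again from the top, since it may have been
			 * reassigned to another block while we slept.
			 */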
			SET(bp->b_flags, B_WANTED);
			err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
			    slptimeo);
			splx(s);
			if (err)
				return (NULL);
			goto start;
		}
		SET(bp->b_flags, (B_BUSY | B_CACHE));
		bremfree(bp);
		splx(s);
		allocbuf(bp, size);
	} else {
		splx(s);
		if ((bp = getnewbuf(slpflag, slptimeo)) == NULL)
			goto start;
		allocbuf(bp, size);
		bp->b_blkno = bp->b_lblkno = blkno;
		s = splbio();
		bgetvp(vp, bp);
		splx(s);
		bremhash(bp);
		binshash(bp, BUFHASH(vp, blkno));
	}
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	SET(bp->b_flags, B_INVAL);
	bremhash(bp);
	binshash(bp, &invalhash);
	allocbuf(bp, size);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;

	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	struct buf *nbp;
	vm_size_t desired_size;
	int s;

	desired_size = roundup(size, CLBYTES);
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

	if (bp->b_bufsize == desired_size)
		goto out;

	/*
	 * If the buffer is smaller than the desired size, we need to snarf
	 * memory from other buffers.  Get buffers (via getnewbuf()), and
	 * steal their pages.
	 */
	while (bp->b_bufsize < desired_size) {
		int amt;

		/* find a buffer */
		while ((nbp = getnewbuf(0, 0)) == NULL)
			;

		/* and steal its pages, up to the amount we need */
		amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
		pagemove((nbp->b_data + nbp->b_bufsize - amt),
		    bp->b_data + bp->b_bufsize, amt);
		bp->b_bufsize += amt;
		nbp->b_bufsize -= amt;

		/* reduce transfer count if we stole some data */
		if (nbp->b_bcount > nbp->b_bufsize)
			nbp->b_bcount = nbp->b_bufsize;

#ifdef DIAGNOSTIC
		if (nbp->b_bufsize < 0)
			panic("allocbuf: negative bufsize");
#endif
		if (nbp->b_bufsize == 0) {
			bremhash(nbp);
			binshash(nbp, &invalhash);
			SET(nbp->b_flags, B_INVAL);
			nbp->b_error = 0;
			nbp->b_dev = NODEV;
		}
		brelse(nbp);
	}

	/*
	 * If we want a buffer smaller than the current size,
	 * shrink this buffer.  Grab a buf head from the EMPTY queue,
	 * move a page onto it, and put it on the front of the AGE queue.
	 * If there are no free buffer headers, leave the buffer alone.
	 */
	if (bp->b_bufsize > desired_size) {
		s = splbio();
		if ((nbp = bufqueues[BQ_EMPTY].tqh_first) == NULL) {
			/* No free buffer head */
			splx(s);
			goto out;
		}
		bremfree(nbp);
		SET(nbp->b_flags, B_BUSY);
		splx(s);

		/* move the page to it and note this change */
		pagemove(bp->b_data + desired_size,
		    nbp->b_data, bp->b_bufsize - desired_size);
		nbp->b_bufsize = bp->b_bufsize - desired_size;
		bp->b_bufsize = desired_size;
		nbp->b_bcount = 0;
		SET(nbp->b_flags, B_INVAL);

		/* release the newly-filled buffer and leave */
		brelse(nbp);
	}

out:
	bp->b_bcount = size;
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	int s;

start:
	s = splbio();
	if ((bp = bufqueues[BQ_AGE].tqh_first) != NULL ||
	    (bp = bufqueues[BQ_LRU].tqh_first) != NULL) {
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needbuffer = 1;
		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* Buffer is no longer on free lists. */
	SET(bp->b_flags, B_BUSY);
	splx(s);

	/* If buffer was a delayed write, start it, and go back to the top. */
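	/*
	 * (The write is asynchronous, so the buffer only reappears on a
	 * free list via brelse() once the write completes; look for
	 * another candidate in the meantime.)
	 */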
	if (ISSET(bp->b_flags, B_DELWRI)) {
		bawrite(bp);
		goto start;
	}

	/* disassociate us from our vnode, if we had one... */
	s = splbio();
	if (bp->b_vp)
		brelvp(bp);
	splx(s);

	/* clear out various other fields */
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;

	/* nuke any credentials we were holding */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	return (bp);
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
	struct buf *bp;
{
	int s;

	s = splbio();
	while (!ISSET(bp->b_flags, B_DONE))
		tsleep(bp, PRIBIO + 1, "biowait", 0);
	splx(s);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		return (EINTR);
	} else if (ISSET(bp->b_flags, B_ERROR))
		return (bp->b_error ? bp->b_error : EIO);
	else
		return (0);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. by the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(bp)
	struct buf *bp;
{
	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */

	if (!ISSET(bp->b_flags, B_READ))	/* on a write, notify the vnode */
		vwakeup(bp);

	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		(*bp->b_iodone)(bp);
	} else if (ISSET(bp->b_flags, B_ASYNC))	/* if async, release it */
		brelse(bp);
	else {					/* or just wakeup the buffer */
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}
}

/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue()
{
	register struct buf *bp;
	register int n = 0;

	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
	    bp = bp->b_freelist.tqe_next)
		n++;
	return (n);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct bqueues *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */