1 /*	$NetBSD: vfs_bio.c,v 1.225 2010/12/12 10:30:09 hannken Exp $	*/
2
3 /*-
4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Wasabi Systems, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
66 */
67
68 /*-
69 * Copyright (c) 1994 Christopher G. Demetriou
70 *
71 * Redistribution and use in source and binary forms, with or without
72 * modification, are permitted provided that the following conditions
73 * are met:
74 * 1. Redistributions of source code must retain the above copyright
75 * notice, this list of conditions and the following disclaimer.
76 * 2. Redistributions in binary form must reproduce the above copyright
77 * notice, this list of conditions and the following disclaimer in the
78 * documentation and/or other materials provided with the distribution.
79 * 3. All advertising materials mentioning features or use of this software
80 * must display the following acknowledgement:
81 * This product includes software developed by the University of
82 * California, Berkeley and its contributors.
83 * 4. Neither the name of the University nor the names of its contributors
84 * may be used to endorse or promote products derived from this software
85 * without specific prior written permission.
86 *
87 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
88 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
89 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
90 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
91 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
92 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
93 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
94 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
95 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
96 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
97 * SUCH DAMAGE.
98 *
99 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
100 */
101
102 /*
103 * The buffer cache subsystem.
104 *
105 * Some references:
106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
107 * Leffler, et al.: The Design and Implementation of the 4.3BSD
 108  *		UNIX Operating System (Addison-Wesley, 1989)
109 *
110 * Locking
111 *
112 * There are three locks:
113 * - bufcache_lock: protects global buffer cache state.
114 * - BC_BUSY: a long term per-buffer lock.
115 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
116 *
 117  * For buffers associated with vnodes (the most common case), b_objlock points
 118  * to vnode_t::v_interlock.  Otherwise, it points to the generic buffer_lock.
119 *
120 * Lock order:
121 * bufcache_lock ->
122 * buf_t::b_objlock
123 */
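/*
 * Illustrative sketch (editorial addition, not from the original source):
 * the lock order above means bufcache_lock is acquired before a buffer's
 * b_objlock, e.g. roughly:
 *
 *	mutex_enter(&bufcache_lock);
 *	mutex_enter(bp->b_objlock);
 *	... manipulate queue and completion state ...
 *	mutex_exit(bp->b_objlock);
 *	mutex_exit(&bufcache_lock);
 *
 * (The release order may vary; only the acquisition order matters for
 * deadlock avoidance.)
 */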
124
125 #include <sys/cdefs.h>
126 __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.225 2010/12/12 10:30:09 hannken Exp $");
127
128 #include "opt_bufcache.h"
129
130 #include <sys/param.h>
131 #include <sys/systm.h>
132 #include <sys/kernel.h>
133 #include <sys/proc.h>
134 #include <sys/buf.h>
135 #include <sys/vnode.h>
136 #include <sys/mount.h>
137 #include <sys/resourcevar.h>
138 #include <sys/sysctl.h>
139 #include <sys/conf.h>
140 #include <sys/kauth.h>
141 #include <sys/fstrans.h>
142 #include <sys/intr.h>
143 #include <sys/cpu.h>
144 #include <sys/wapbl.h>
145
146 #include <uvm/uvm.h>
147
148 #include <miscfs/specfs/specdev.h>
149
150 #ifndef BUFPAGES
151 # define BUFPAGES 0
152 #endif
153
154 #ifdef BUFCACHE
155 # if (BUFCACHE < 5) || (BUFCACHE > 95)
156 # error BUFCACHE is not between 5 and 95
157 # endif
158 #else
159 # define BUFCACHE 15
160 #endif
161
162 u_int nbuf; /* desired number of buffer headers */
163 u_int bufpages = BUFPAGES; /* optional hardwired count */
164 u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
165
166 /* Function prototypes */
167 struct bqueue;
168
169 static void buf_setwm(void);
170 static int buf_trim(void);
171 static void *bufpool_page_alloc(struct pool *, int);
172 static void bufpool_page_free(struct pool *, void *);
173 static buf_t *bio_doread(struct vnode *, daddr_t, int,
174 kauth_cred_t, int);
175 static buf_t *getnewbuf(int, int, int);
176 static int buf_lotsfree(void);
177 static int buf_canrelease(void);
178 static u_long buf_mempoolidx(u_long);
179 static u_long buf_roundsize(u_long);
180 static void *buf_malloc(size_t);
181 static void buf_mrelease(void *, size_t);
182 static void binsheadfree(buf_t *, struct bqueue *);
183 static void binstailfree(buf_t *, struct bqueue *);
184 int count_lock_queue(void); /* XXX */
185 #ifdef DEBUG
186 static int checkfreelist(buf_t *, struct bqueue *, int);
187 #endif
188 static void biointr(void *);
189 static void biodone2(buf_t *);
190 static void bref(buf_t *);
191 static void brele(buf_t *);
192 static void sysctl_kern_buf_setup(void);
193 static void sysctl_vm_buf_setup(void);
194
195 /*
196 * Definitions for the buffer hash lists.
197 */
198 #define BUFHASH(dvp, lbn) \
199 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
200 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
201 u_long bufhash;
202 struct bqueue bufqueues[BQUEUES];
203
204 static kcondvar_t needbuffer_cv;
205
206 /*
207 * Buffer queue lock.
208 */
209 kmutex_t bufcache_lock;
210 kmutex_t buffer_lock;
211
212 /* Software ISR for completed transfers. */
213 static void *biodone_sih;
214
215 /* Buffer pool for I/O buffers. */
216 static pool_cache_t buf_cache;
217 static pool_cache_t bufio_cache;
218
219 /* XXX - somewhat gross.. */
220 #if MAXBSIZE == 0x2000
221 #define NMEMPOOLS 5
222 #elif MAXBSIZE == 0x4000
223 #define NMEMPOOLS 6
224 #elif MAXBSIZE == 0x8000
225 #define NMEMPOOLS 7
226 #else
227 #define NMEMPOOLS 8
228 #endif
229
230 #define MEMPOOL_INDEX_OFFSET 9 /* smallest pool is 512 bytes */
231 #if (1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) != MAXBSIZE
232 #error update vfs_bio buffer memory parameters
233 #endif
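/*
 * Worked example (illustrative): with MAXBSIZE == 0x10000 (64kB) the
 * fall-through case gives NMEMPOOLS == 8, so the pools are 512B, 1kB,
 * 2kB, 4kB, 8kB, 16kB, 32kB and 64kB -- the smallest pool is
 * 1 << MEMPOOL_INDEX_OFFSET bytes and each successive pool doubles in
 * size up to MAXBSIZE, which is what the #error check above verifies.
 */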
234
235 /* Buffer memory pools */
236 static struct pool bmempools[NMEMPOOLS];
237
238 static struct vm_map *buf_map;
239
240 /*
241 * Buffer memory pool allocator.
242 */
243 static void *
244 bufpool_page_alloc(struct pool *pp, int flags)
245 {
246
247 return (void *)uvm_km_alloc(buf_map,
248 MAXBSIZE, MAXBSIZE,
249 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
250 | UVM_KMF_WIRED);
251 }
252
253 static void
254 bufpool_page_free(struct pool *pp, void *v)
255 {
256
257 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
258 }
259
260 static struct pool_allocator bufmempool_allocator = {
261 .pa_alloc = bufpool_page_alloc,
262 .pa_free = bufpool_page_free,
263 .pa_pagesz = MAXBSIZE,
264 };
265
266 /* Buffer memory management variables */
267 u_long bufmem_valimit;
268 u_long bufmem_hiwater;
269 u_long bufmem_lowater;
270 u_long bufmem;
271
272 /*
273 * MD code can call this to set a hard limit on the amount
274 * of virtual memory used by the buffer cache.
275 */
276 int
277 buf_setvalimit(vsize_t sz)
278 {
279
280 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
281 if (sz < NMEMPOOLS * MAXBSIZE)
282 return EINVAL;
283
284 bufmem_valimit = sz;
285 return 0;
286 }
287
288 static void
289 buf_setwm(void)
290 {
291
292 bufmem_hiwater = buf_memcalc();
293 /* lowater is approx. 2% of memory (with bufcache = 15) */
294 #define BUFMEM_WMSHIFT 3
295 #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
296 if (bufmem_hiwater < BUFMEM_HIWMMIN)
297 /* Ensure a reasonable minimum value */
298 bufmem_hiwater = BUFMEM_HIWMMIN;
299 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
300 }
301
302 #ifdef DEBUG
303 int debug_verify_freelist = 0;
304 static int
305 checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
306 {
307 buf_t *b;
308
309 if (!debug_verify_freelist)
310 return 1;
311
312 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
313 if (b == bp)
314 return ison ? 1 : 0;
315 }
316
317 return ison ? 0 : 1;
318 }
319 #endif
320
321 /*
 322  * Insq/Remq for the buffer free lists.
323 * Call with buffer queue locked.
324 */
325 static void
326 binsheadfree(buf_t *bp, struct bqueue *dp)
327 {
328
329 KASSERT(mutex_owned(&bufcache_lock));
330 KASSERT(bp->b_freelistindex == -1);
331 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
332 dp->bq_bytes += bp->b_bufsize;
333 bp->b_freelistindex = dp - bufqueues;
334 }
335
336 static void
337 binstailfree(buf_t *bp, struct bqueue *dp)
338 {
339
340 KASSERT(mutex_owned(&bufcache_lock));
341 KASSERT(bp->b_freelistindex == -1);
342 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
343 dp->bq_bytes += bp->b_bufsize;
344 bp->b_freelistindex = dp - bufqueues;
345 }
346
347 void
348 bremfree(buf_t *bp)
349 {
350 struct bqueue *dp;
351 int bqidx = bp->b_freelistindex;
352
353 KASSERT(mutex_owned(&bufcache_lock));
354
355 KASSERT(bqidx != -1);
356 dp = &bufqueues[bqidx];
357 KDASSERT(checkfreelist(bp, dp, 1));
358 KASSERT(dp->bq_bytes >= bp->b_bufsize);
359 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
360 dp->bq_bytes -= bp->b_bufsize;
361
362 /* For the sysctl helper. */
363 if (bp == dp->bq_marker)
364 dp->bq_marker = NULL;
365
366 #if defined(DIAGNOSTIC)
367 bp->b_freelistindex = -1;
368 #endif /* defined(DIAGNOSTIC) */
369 }
370
371 /*
 372  * Add a reference to a buffer structure that came from buf_cache.
373 */
374 static inline void
375 bref(buf_t *bp)
376 {
377
378 KASSERT(mutex_owned(&bufcache_lock));
379 KASSERT(bp->b_refcnt > 0);
380
381 bp->b_refcnt++;
382 }
383
384 /*
385 * Free an unused buffer structure that came from buf_cache.
386 */
387 static inline void
388 brele(buf_t *bp)
389 {
390
391 KASSERT(mutex_owned(&bufcache_lock));
392 KASSERT(bp->b_refcnt > 0);
393
394 if (bp->b_refcnt-- == 1) {
395 buf_destroy(bp);
396 #ifdef DEBUG
397 memset((char *)bp, 0, sizeof(*bp));
398 #endif
399 pool_cache_put(buf_cache, bp);
400 }
401 }
402
403 /*
404 * note that for some ports this is used by pmap bootstrap code to
405 * determine kva size.
406 */
407 u_long
408 buf_memcalc(void)
409 {
410 u_long n;
411
412 /*
413 * Determine the upper bound of memory to use for buffers.
414 *
 415 	 *	- If bufpages is specified, use that as the number
 416 	 *	  of pages.
417 *
418 * - Otherwise, use bufcache as the percentage of
419 * physical memory.
420 */
421 if (bufpages != 0) {
422 n = bufpages;
423 } else {
424 if (bufcache < 5) {
425 printf("forcing bufcache %d -> 5", bufcache);
426 bufcache = 5;
427 }
428 if (bufcache > 95) {
429 printf("forcing bufcache %d -> 95", bufcache);
430 bufcache = 95;
431 }
432 n = calc_cache_size(buf_map, bufcache,
433 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
434 / PAGE_SIZE;
435 }
436
437 n <<= PAGE_SHIFT;
438 if (bufmem_valimit != 0 && n > bufmem_valimit)
439 n = bufmem_valimit;
440
441 return (n);
442 }
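/*
 * Example (illustrative figures only): with no bufpages override, the
 * default bufcache of 15 and roughly 512MB of usable memory, the value
 * computed above comes out near 15% of that, i.e. about 77MB, and is
 * then clipped to bufmem_valimit if a virtual-address limit was set.
 */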
443
444 /*
445 * Initialize buffers and hash links for buffers.
446 */
447 void
448 bufinit(void)
449 {
450 struct bqueue *dp;
451 int use_std;
452 u_int i;
453
454 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
455 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
456 cv_init(&needbuffer_cv, "needbuf");
457
458 if (bufmem_valimit != 0) {
459 vaddr_t minaddr = 0, maxaddr;
460 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
461 bufmem_valimit, 0, false, 0);
462 if (buf_map == NULL)
463 panic("bufinit: cannot allocate submap");
464 } else
465 buf_map = kernel_map;
466
467 /*
468 * Initialize buffer cache memory parameters.
469 */
470 bufmem = 0;
471 buf_setwm();
472
473 /* On "small" machines use small pool page sizes where possible */
474 use_std = (physmem < atop(16*1024*1024));
475
476 /*
477 * Also use them on systems that can map the pool pages using
478 * a direct-mapped segment.
479 */
480 #ifdef PMAP_MAP_POOLPAGE
481 use_std = 1;
482 #endif
483
484 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
485 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
486 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
487 "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
488
489 bufmempool_allocator.pa_backingmap = buf_map;
490 for (i = 0; i < NMEMPOOLS; i++) {
491 struct pool_allocator *pa;
492 struct pool *pp = &bmempools[i];
493 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
494 char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
495 if (__predict_true(size >= 1024))
496 (void)snprintf(name, 8, "buf%dk", size / 1024);
497 else
498 (void)snprintf(name, 8, "buf%db", size);
499 pa = (size <= PAGE_SIZE && use_std)
500 ? &pool_allocator_nointr
501 : &bufmempool_allocator;
502 pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE);
503 pool_setlowat(pp, 1);
504 pool_sethiwat(pp, 1);
505 }
506
507 /* Initialize the buffer queues */
508 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
509 TAILQ_INIT(&dp->bq_queue);
510 dp->bq_bytes = 0;
511 }
512
513 /*
514 * Estimate hash table size based on the amount of memory we
515 * intend to use for the buffer cache. The average buffer
516 * size is dependent on our clients (i.e. filesystems).
517 *
518 * For now, use an empirical 3K per buffer.
519 */
520 nbuf = (bufmem_hiwater / 1024) / 3;
521 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
522
523 sysctl_kern_buf_setup();
524 sysctl_vm_buf_setup();
525 }
526
527 void
528 bufinit2(void)
529 {
530
531 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
532 NULL);
533 if (biodone_sih == NULL)
534 panic("bufinit2: can't establish soft interrupt");
535 }
536
537 static int
538 buf_lotsfree(void)
539 {
540 int try, thresh;
541
542 /* Always allocate if less than the low water mark. */
543 if (bufmem < bufmem_lowater)
544 return 1;
545
546 /* Never allocate if greater than the high water mark. */
547 if (bufmem > bufmem_hiwater)
548 return 0;
549
550 /* If there's anything on the AGE list, it should be eaten. */
551 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
552 return 0;
553
554 /*
 555 	 * The probability of getting a new allocation is inversely
556 * proportional to the current size of the cache, using
557 * a granularity of 16 steps.
558 */
559 try = random() & 0x0000000fL;
560
561 /* Don't use "16 * bufmem" here to avoid a 32-bit overflow. */
562 thresh = (bufmem - bufmem_lowater) /
563 ((bufmem_hiwater - bufmem_lowater) / 16);
564
565 if (try >= thresh)
566 return 1;
567
568 /* Otherwise don't allocate. */
569 return 0;
570 }
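/*
 * Worked example (illustrative): with bufmem_lowater = 8MB,
 * bufmem_hiwater = 72MB and bufmem currently at 40MB, one step is
 * (72MB - 8MB) / 16 = 4MB, so thresh = (40MB - 8MB) / 4MB = 8 and the
 * allocation is granted whenever the 4-bit random value is >= 8,
 * i.e. with probability 1/2; the fuller the cache, the higher the
 * threshold and the lower the chance of growing it further.
 */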
571
572 /*
 573  * Return an estimate of the number of bytes we think need to be
 574  * released to help resolve low memory conditions.
575 *
576 * => called with bufcache_lock held.
577 */
578 static int
579 buf_canrelease(void)
580 {
581 int pagedemand, ninvalid = 0;
582
583 KASSERT(mutex_owned(&bufcache_lock));
584
585 if (bufmem < bufmem_lowater)
586 return 0;
587
588 if (bufmem > bufmem_hiwater)
589 return bufmem - bufmem_hiwater;
590
591 ninvalid += bufqueues[BQ_AGE].bq_bytes;
592
593 pagedemand = uvmexp.freetarg - uvmexp.free;
594 if (pagedemand < 0)
595 return ninvalid;
596 return MAX(ninvalid, MIN(2 * MAXBSIZE,
597 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
598 }
599
600 /*
601 * Buffer memory allocation helper functions
602 */
603 static u_long
604 buf_mempoolidx(u_long size)
605 {
606 u_int n = 0;
607
608 size -= 1;
609 size >>= MEMPOOL_INDEX_OFFSET;
610 while (size) {
611 size >>= 1;
612 n += 1;
613 }
614 if (n >= NMEMPOOLS)
615 panic("buf mem pool index %d", n);
616 return n;
617 }
618
619 static u_long
620 buf_roundsize(u_long size)
621 {
622 /* Round up to nearest power of 2 */
623 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
624 }
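/*
 * Example (illustrative): for a request of 3000 bytes,
 * buf_mempoolidx(3000) walks 512 -> 1k -> 2k -> 4k and returns 3, so
 * buf_roundsize(3000) yields 4096 and buf_malloc() satisfies the
 * request from the 4kB pool, bmempools[3].
 */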
625
626 static void *
627 buf_malloc(size_t size)
628 {
629 u_int n = buf_mempoolidx(size);
630 void *addr;
631
632 while (1) {
633 addr = pool_get(&bmempools[n], PR_NOWAIT);
634 if (addr != NULL)
635 break;
636
637 /* No memory, see if we can free some. If so, try again */
638 mutex_enter(&bufcache_lock);
639 if (buf_drain(1) > 0) {
640 mutex_exit(&bufcache_lock);
641 continue;
642 }
643
644 if (curlwp == uvm.pagedaemon_lwp) {
645 mutex_exit(&bufcache_lock);
646 return NULL;
647 }
648
649 /* Wait for buffers to arrive on the LRU queue */
650 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
651 mutex_exit(&bufcache_lock);
652 }
653
654 return addr;
655 }
656
657 static void
658 buf_mrelease(void *addr, size_t size)
659 {
660
661 pool_put(&bmempools[buf_mempoolidx(size)], addr);
662 }
663
664 /*
665 * bread()/breadn() helper.
666 */
667 static buf_t *
668 bio_doread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
669 int async)
670 {
671 buf_t *bp;
672 struct mount *mp;
673
674 bp = getblk(vp, blkno, size, 0, 0);
675
676 #ifdef DIAGNOSTIC
677 if (bp == NULL) {
678 panic("bio_doread: no such buf");
679 }
680 #endif
681
682 /*
 683 	 * If the buffer does not have valid data, start a read.
684 * Note that if buffer is BC_INVAL, getblk() won't return it.
685 * Therefore, it's valid if its I/O has completed or been delayed.
686 */
687 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
688 /* Start I/O for the buffer. */
689 SET(bp->b_flags, B_READ | async);
690 if (async)
691 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
692 else
693 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
694 VOP_STRATEGY(vp, bp);
695
696 /* Pay for the read. */
697 curlwp->l_ru.ru_inblock++;
698 } else if (async)
699 brelse(bp, 0);
700
701 if (vp->v_type == VBLK)
702 mp = vp->v_specmountpoint;
703 else
704 mp = vp->v_mount;
705
706 /*
707 * Collect statistics on synchronous and asynchronous reads.
708 * Reads from block devices are charged to their associated
709 * filesystem (if any).
710 */
711 if (mp != NULL) {
712 if (async == 0)
713 mp->mnt_stat.f_syncreads++;
714 else
715 mp->mnt_stat.f_asyncreads++;
716 }
717
718 return (bp);
719 }
720
721 /*
722 * Read a disk block.
 723  * This algorithm is described in Bach (p.54).
724 */
725 int
726 bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
727 int flags, buf_t **bpp)
728 {
729 buf_t *bp;
730 int error;
731
732 /* Get buffer for block. */
733 bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
734
735 /* Wait for the read to complete, and return result. */
736 error = biowait(bp);
737 if (error == 0 && (flags & B_MODIFY) != 0)
738 error = fscow_run(bp, true);
739
740 return error;
741 }
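/*
 * Typical usage (illustrative sketch only; "lbn" and "bsize" stand in
 * for caller-supplied values):
 *
 *	buf_t *bp;
 *	int error = bread(vp, lbn, bsize, NOCRED, 0, &bp);
 *	if (error) {
 *		brelse(bp, 0);
 *		return error;
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp, 0);
 */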
742
743 /*
744 * Read-ahead multiple disk blocks. The first is sync, the rest async.
745 * Trivial modification to the breada algorithm presented in Bach (p.55).
746 */
747 int
748 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
749 int *rasizes, int nrablks, kauth_cred_t cred, int flags, buf_t **bpp)
750 {
751 buf_t *bp;
752 int error, i;
753
754 bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
755
756 /*
757 * For each of the read-ahead blocks, start a read, if necessary.
758 */
759 mutex_enter(&bufcache_lock);
760 for (i = 0; i < nrablks; i++) {
761 /* If it's in the cache, just go on to next one. */
762 if (incore(vp, rablks[i]))
763 continue;
764
765 /* Get a buffer for the read-ahead block */
766 mutex_exit(&bufcache_lock);
767 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
768 mutex_enter(&bufcache_lock);
769 }
770 mutex_exit(&bufcache_lock);
771
772 /* Otherwise, we had to start a read for it; wait until it's valid. */
773 error = biowait(bp);
774 if (error == 0 && (flags & B_MODIFY) != 0)
775 error = fscow_run(bp, true);
776 return error;
777 }
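/*
 * Illustrative sketch: read logical block "lbn" and pre-fetch the two
 * blocks that follow it (hypothetical values):
 *
 *	daddr_t ra[2] = { lbn + 1, lbn + 2 };
 *	int rasz[2] = { bsize, bsize };
 *	error = breadn(vp, lbn, bsize, ra, rasz, 2, NOCRED, 0, &bp);
 */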
778
779 /*
780 * Block write. Described in Bach (p.56)
781 */
782 int
783 bwrite(buf_t *bp)
784 {
785 int rv, sync, wasdelayed;
786 struct vnode *vp;
787 struct mount *mp;
788
789 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
790 KASSERT(!cv_has_waiters(&bp->b_done));
791
792 vp = bp->b_vp;
793 if (vp != NULL) {
794 KASSERT(bp->b_objlock == &vp->v_interlock);
795 if (vp->v_type == VBLK)
796 mp = vp->v_specmountpoint;
797 else
798 mp = vp->v_mount;
799 } else {
800 mp = NULL;
801 }
802
803 if (mp && mp->mnt_wapbl) {
804 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
805 bdwrite(bp);
806 return 0;
807 }
808 }
809
810 /*
811 * Remember buffer type, to switch on it later. If the write was
812 * synchronous, but the file system was mounted with MNT_ASYNC,
813 * convert it to a delayed write.
814 * XXX note that this relies on delayed tape writes being converted
815 * to async, not sync writes (which is safe, but ugly).
816 */
817 sync = !ISSET(bp->b_flags, B_ASYNC);
818 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
819 bdwrite(bp);
820 return (0);
821 }
822
823 /*
824 * Collect statistics on synchronous and asynchronous writes.
825 * Writes to block devices are charged to their associated
826 * filesystem (if any).
827 */
828 if (mp != NULL) {
829 if (sync)
830 mp->mnt_stat.f_syncwrites++;
831 else
832 mp->mnt_stat.f_asyncwrites++;
833 }
834
835 /*
836 * Pay for the I/O operation and make sure the buf is on the correct
837 * vnode queue.
838 */
839 bp->b_error = 0;
840 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
841 CLR(bp->b_flags, B_READ);
842 if (wasdelayed) {
843 mutex_enter(&bufcache_lock);
844 mutex_enter(bp->b_objlock);
845 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
846 reassignbuf(bp, bp->b_vp);
847 mutex_exit(&bufcache_lock);
848 } else {
849 curlwp->l_ru.ru_oublock++;
850 mutex_enter(bp->b_objlock);
851 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
852 }
853 if (vp != NULL)
854 vp->v_numoutput++;
855 mutex_exit(bp->b_objlock);
856
857 /* Initiate disk write. */
858 if (sync)
859 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
860 else
861 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
862
863 VOP_STRATEGY(vp, bp);
864
865 if (sync) {
866 /* If I/O was synchronous, wait for it to complete. */
867 rv = biowait(bp);
868
869 /* Release the buffer. */
870 brelse(bp, 0);
871
872 return (rv);
873 } else {
874 return (0);
875 }
876 }
877
878 int
879 vn_bwrite(void *v)
880 {
881 struct vop_bwrite_args *ap = v;
882
883 return (bwrite(ap->a_bp));
884 }
885
886 /*
887 * Delayed write.
888 *
889 * The buffer is marked dirty, but is not queued for I/O.
890 * This routine should be used when the buffer is expected
891 * to be modified again soon, typically a small write that
892 * partially fills a buffer.
893 *
894 * NB: magnetic tapes cannot be delayed; they must be
895 * written in the order that the writes are requested.
896 *
897 * Described in Leffler, et al. (pp. 208-213).
898 */
899 void
900 bdwrite(buf_t *bp)
901 {
902
903 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
904 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
905 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
906 KASSERT(!cv_has_waiters(&bp->b_done));
907
908 /* If this is a tape block, write the block now. */
909 if (bdev_type(bp->b_dev) == D_TAPE) {
910 bawrite(bp);
911 return;
912 }
913
914 if (wapbl_vphaswapbl(bp->b_vp)) {
915 struct mount *mp = wapbl_vptomp(bp->b_vp);
916
917 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
918 WAPBL_ADD_BUF(mp, bp);
919 }
920 }
921
922 /*
923 * If the block hasn't been seen before:
924 * (1) Mark it as having been seen,
925 * (2) Charge for the write,
926 * (3) Make sure it's on its vnode's correct block list.
927 */
928 KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock);
929
930 if (!ISSET(bp->b_oflags, BO_DELWRI)) {
931 mutex_enter(&bufcache_lock);
932 mutex_enter(bp->b_objlock);
933 SET(bp->b_oflags, BO_DELWRI);
934 curlwp->l_ru.ru_oublock++;
935 reassignbuf(bp, bp->b_vp);
936 mutex_exit(&bufcache_lock);
937 } else {
938 mutex_enter(bp->b_objlock);
939 }
940 /* Otherwise, the "write" is done, so mark and release the buffer. */
941 CLR(bp->b_oflags, BO_DONE);
942 mutex_exit(bp->b_objlock);
943
944 brelse(bp, 0);
945 }
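/*
 * Typical pattern (illustrative sketch): a small metadata update reads
 * the block, changes a few bytes and schedules a delayed write:
 *
 *	error = bread(vp, lbn, bsize, NOCRED, 0, &bp);
 *	if (error == 0) {
 *		... modify part of bp->b_data ...
 *		bdwrite(bp);		(buffer marked dirty, flushed later)
 *	}
 */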
946
947 /*
948 * Asynchronous block write; just an asynchronous bwrite().
949 */
950 void
951 bawrite(buf_t *bp)
952 {
953
954 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
955
956 SET(bp->b_flags, B_ASYNC);
957 VOP_BWRITE(bp);
958 }
959
960 /*
961 * Release a buffer on to the free lists.
962 * Described in Bach (p. 46).
963 */
964 void
965 brelsel(buf_t *bp, int set)
966 {
967 struct bqueue *bufq;
968 struct vnode *vp;
969
970 KASSERT(mutex_owned(&bufcache_lock));
971 KASSERT(!cv_has_waiters(&bp->b_done));
972 KASSERT(bp->b_refcnt > 0);
973
974 SET(bp->b_cflags, set);
975
976 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
977 KASSERT(bp->b_iodone == NULL);
978
979 /* Wake up any processes waiting for any buffer to become free. */
980 cv_signal(&needbuffer_cv);
981
 982 	/* Wake up any processes waiting for _this_ buffer to become free. */
983 if (ISSET(bp->b_cflags, BC_WANTED))
984 CLR(bp->b_cflags, BC_WANTED|BC_AGE);
985
 986 	/* If it's clean, clear the copy-on-write flag. */
987 if (ISSET(bp->b_flags, B_COWDONE)) {
988 mutex_enter(bp->b_objlock);
989 if (!ISSET(bp->b_oflags, BO_DELWRI))
990 CLR(bp->b_flags, B_COWDONE);
991 mutex_exit(bp->b_objlock);
992 }
993
994 /*
995 * Determine which queue the buffer should be on, then put it there.
996 */
997
998 /* If it's locked, don't report an error; try again later. */
999 if (ISSET(bp->b_flags, B_LOCKED))
1000 bp->b_error = 0;
1001
1002 /* If it's not cacheable, or an error, mark it invalid. */
1003 if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
1004 SET(bp->b_cflags, BC_INVAL);
1005
1006 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1007 /*
1008 * This is a delayed write buffer that was just flushed to
1009 * disk. It is still on the LRU queue. If it's become
1010 * invalid, then we need to move it to a different queue;
1011 * otherwise leave it in its current position.
1012 */
1013 CLR(bp->b_cflags, BC_VFLUSH);
1014 if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
1015 !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
1016 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
1017 goto already_queued;
1018 } else {
1019 bremfree(bp);
1020 }
1021 }
1022
1023 KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
1024 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
1025 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
1026
1027 if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
1028 /*
1029 * If it's invalid or empty, dissociate it from its vnode
1030 * and put on the head of the appropriate queue.
1031 */
1032 if (ISSET(bp->b_flags, B_LOCKED)) {
1033 if (wapbl_vphaswapbl(vp = bp->b_vp)) {
1034 struct mount *mp = wapbl_vptomp(vp);
1035
1036 KASSERT(bp->b_iodone
1037 != mp->mnt_wapbl_op->wo_wapbl_biodone);
1038 WAPBL_REMOVE_BUF(mp, bp);
1039 }
1040 }
1041
1042 mutex_enter(bp->b_objlock);
1043 CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
1044 if ((vp = bp->b_vp) != NULL) {
1045 KASSERT(bp->b_objlock == &vp->v_interlock);
1046 reassignbuf(bp, bp->b_vp);
1047 brelvp(bp);
1048 mutex_exit(&vp->v_interlock);
1049 } else {
1050 KASSERT(bp->b_objlock == &buffer_lock);
1051 mutex_exit(bp->b_objlock);
1052 }
1053
1054 if (bp->b_bufsize <= 0)
1055 /* no data */
1056 goto already_queued;
1057 else
1058 /* invalid data */
1059 bufq = &bufqueues[BQ_AGE];
1060 binsheadfree(bp, bufq);
1061 } else {
1062 /*
1063 * It has valid data. Put it on the end of the appropriate
1064 * queue, so that it'll stick around for as long as possible.
 1065 		 * If the buf is AGE but has dependencies, it must be put on the
 1066 		 * last bufqueue to be scanned, i.e. LRU.  This protects against a
 1067 		 * livelock where BQ_AGE only has buffers with dependencies and
 1068 		 * we thus never get to the dependent buffers in BQ_LRU.
1069 */
1070 if (ISSET(bp->b_flags, B_LOCKED)) {
1071 /* locked in core */
1072 bufq = &bufqueues[BQ_LOCKED];
1073 } else if (!ISSET(bp->b_cflags, BC_AGE)) {
1074 /* valid data */
1075 bufq = &bufqueues[BQ_LRU];
1076 } else {
1077 /* stale but valid data */
1078 bufq = &bufqueues[BQ_AGE];
1079 }
1080 binstailfree(bp, bufq);
1081 }
1082 already_queued:
1083 /* Unlock the buffer. */
1084 CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
1085 CLR(bp->b_flags, B_ASYNC);
1086 cv_broadcast(&bp->b_busy);
1087
1088 if (bp->b_bufsize <= 0)
1089 brele(bp);
1090 }
1091
1092 void
1093 brelse(buf_t *bp, int set)
1094 {
1095
1096 mutex_enter(&bufcache_lock);
1097 brelsel(bp, set);
1098 mutex_exit(&bufcache_lock);
1099 }
1100
1101 /*
1102 * Determine if a block is in the cache.
 1103  * Just look at what would be its hash chain.  If it's there, return
1104 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1105 * we normally don't return the buffer, unless the caller explicitly
1106 * wants us to.
1107 */
1108 buf_t *
1109 incore(struct vnode *vp, daddr_t blkno)
1110 {
1111 buf_t *bp;
1112
1113 KASSERT(mutex_owned(&bufcache_lock));
1114
1115 /* Search hash chain */
1116 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1117 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1118 !ISSET(bp->b_cflags, BC_INVAL)) {
1119 KASSERT(bp->b_objlock == &vp->v_interlock);
1120 return (bp);
1121 }
1122 }
1123
1124 return (NULL);
1125 }
1126
1127 /*
1128 * Get a block of requested size that is associated with
1129 * a given vnode and block offset. If it is found in the
1130 * block cache, mark it as having been found, make it busy
1131 * and return it. Otherwise, return an empty block of the
 1132  * correct size.  It is up to the caller to ensure that
 1133  * cached blocks are of the correct size.
1134 */
1135 buf_t *
1136 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1137 {
1138 int err, preserve;
1139 buf_t *bp;
1140
1141 mutex_enter(&bufcache_lock);
1142 loop:
1143 bp = incore(vp, blkno);
1144 if (bp != NULL) {
1145 err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
1146 if (err != 0) {
1147 if (err == EPASSTHROUGH)
1148 goto loop;
1149 mutex_exit(&bufcache_lock);
1150 return (NULL);
1151 }
1152 KASSERT(!cv_has_waiters(&bp->b_done));
1153 #ifdef DIAGNOSTIC
1154 if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
1155 bp->b_bcount < size && vp->v_type != VBLK)
1156 panic("getblk: block size invariant failed");
1157 #endif
1158 bremfree(bp);
1159 preserve = 1;
1160 } else {
1161 if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
1162 goto loop;
1163
1164 if (incore(vp, blkno) != NULL) {
1165 /* The block has come into memory in the meantime. */
1166 brelsel(bp, 0);
1167 goto loop;
1168 }
1169
1170 LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
1171 bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
1172 mutex_enter(&vp->v_interlock);
1173 bgetvp(vp, bp);
1174 mutex_exit(&vp->v_interlock);
1175 preserve = 0;
1176 }
1177 mutex_exit(&bufcache_lock);
1178
1179 /*
1180 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1181 * if we re-size buffers here.
1182 */
1183 if (ISSET(bp->b_flags, B_LOCKED)) {
1184 KASSERT(bp->b_bufsize >= size);
1185 } else {
1186 if (allocbuf(bp, size, preserve)) {
1187 mutex_enter(&bufcache_lock);
1188 LIST_REMOVE(bp, b_hash);
1189 mutex_exit(&bufcache_lock);
1190 brelse(bp, BC_INVAL);
1191 return NULL;
1192 }
1193 }
1194 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1195 return (bp);
1196 }
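/*
 * Illustrative sketch: obtaining a buffer for a block that will be
 * completely overwritten, so no prior read is needed (hypothetical
 * values for "lbn" and "bsize"):
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	memset(bp->b_data, 0, bsize);
 *	... fill in the new contents ...
 *	bwrite(bp);
 */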
1197
1198 /*
1199 * Get an empty, disassociated buffer of given size.
1200 */
1201 buf_t *
1202 geteblk(int size)
1203 {
1204 buf_t *bp;
1205 int error;
1206
1207 mutex_enter(&bufcache_lock);
1208 while ((bp = getnewbuf(0, 0, 0)) == NULL)
1209 ;
1210
1211 SET(bp->b_cflags, BC_INVAL);
1212 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1213 mutex_exit(&bufcache_lock);
1214 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1215 error = allocbuf(bp, size, 0);
1216 KASSERT(error == 0);
1217 return (bp);
1218 }
1219
1220 /*
1221 * Expand or contract the actual memory allocated to a buffer.
1222 *
1223 * If the buffer shrinks, data is lost, so it's up to the
1224 * caller to have written it out *first*; this routine will not
 1225  * start a write.  If the buffer grows, it is the caller's
 1226  * responsibility to fill out the buffer's additional contents.
1227 */
1228 int
1229 allocbuf(buf_t *bp, int size, int preserve)
1230 {
1231 void *addr;
1232 vsize_t oldsize, desired_size;
1233 int oldcount;
1234 int delta;
1235
1236 desired_size = buf_roundsize(size);
1237 if (desired_size > MAXBSIZE)
1238 printf("allocbuf: buffer larger than MAXBSIZE requested");
1239
1240 oldcount = bp->b_bcount;
1241
1242 bp->b_bcount = size;
1243
1244 oldsize = bp->b_bufsize;
1245 if (oldsize == desired_size) {
1246 /*
1247 * Do not short cut the WAPBL resize, as the buffer length
1248 * could still have changed and this would corrupt the
1249 * tracking of the transaction length.
1250 */
1251 goto out;
1252 }
1253
1254 /*
1255 * If we want a buffer of a different size, re-allocate the
1256 * buffer's memory; copy old content only if needed.
1257 */
1258 addr = buf_malloc(desired_size);
1259 if (addr == NULL)
1260 return ENOMEM;
1261 if (preserve)
1262 memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
1263 if (bp->b_data != NULL)
1264 buf_mrelease(bp->b_data, oldsize);
1265 bp->b_data = addr;
1266 bp->b_bufsize = desired_size;
1267
1268 /*
1269 * Update overall buffer memory counter (protected by bufcache_lock)
1270 */
1271 delta = (long)desired_size - (long)oldsize;
1272
1273 mutex_enter(&bufcache_lock);
1274 if ((bufmem += delta) > bufmem_hiwater) {
1275 /*
1276 * Need to trim overall memory usage.
1277 */
1278 while (buf_canrelease()) {
1279 if (curcpu()->ci_schedstate.spc_flags &
1280 SPCF_SHOULDYIELD) {
1281 mutex_exit(&bufcache_lock);
1282 preempt();
1283 mutex_enter(&bufcache_lock);
1284 }
1285 if (buf_trim() == 0)
1286 break;
1287 }
1288 }
1289 mutex_exit(&bufcache_lock);
1290
1291 out:
1292 if (wapbl_vphaswapbl(bp->b_vp))
1293 WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
1294
1295 return 0;
1296 }
1297
1298 /*
1299 * Find a buffer which is available for use.
1300 * Select something from a free list.
1301 * Preference is to AGE list, then LRU list.
1302 *
1303 * Called with the buffer queues locked.
1304 * Return buffer locked.
1305 */
1306 buf_t *
1307 getnewbuf(int slpflag, int slptimeo, int from_bufq)
1308 {
1309 buf_t *bp;
1310 struct vnode *vp;
1311
1312 start:
1313 KASSERT(mutex_owned(&bufcache_lock));
1314
1315 /*
1316 * Get a new buffer from the pool.
1317 */
1318 if (!from_bufq && buf_lotsfree()) {
1319 mutex_exit(&bufcache_lock);
1320 bp = pool_cache_get(buf_cache, PR_NOWAIT);
1321 if (bp != NULL) {
1322 memset((char *)bp, 0, sizeof(*bp));
1323 buf_init(bp);
1324 SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
1325 mutex_enter(&bufcache_lock);
1326 #if defined(DIAGNOSTIC)
1327 bp->b_freelistindex = -1;
1328 #endif /* defined(DIAGNOSTIC) */
1329 return (bp);
1330 }
1331 mutex_enter(&bufcache_lock);
1332 }
1333
1334 KASSERT(mutex_owned(&bufcache_lock));
1335 if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL ||
1336 (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) {
1337 KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
1338 bremfree(bp);
1339
1340 /* Buffer is no longer on free lists. */
1341 SET(bp->b_cflags, BC_BUSY);
1342 } else {
1343 /*
1344 * XXX: !from_bufq should be removed.
1345 */
1346 if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
1347 /* wait for a free buffer of any kind */
1348 if ((slpflag & PCATCH) != 0)
1349 (void)cv_timedwait_sig(&needbuffer_cv,
1350 &bufcache_lock, slptimeo);
1351 else
1352 (void)cv_timedwait(&needbuffer_cv,
1353 &bufcache_lock, slptimeo);
1354 }
1355 return (NULL);
1356 }
1357
1358 #ifdef DIAGNOSTIC
1359 if (bp->b_bufsize <= 0)
1360 panic("buffer %p: on queue but empty", bp);
1361 #endif
1362
1363 if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1364 /*
1365 * This is a delayed write buffer being flushed to disk. Make
1366 * sure it gets aged out of the queue when it's finished, and
1367 * leave it off the LRU queue.
1368 */
1369 CLR(bp->b_cflags, BC_VFLUSH);
1370 SET(bp->b_cflags, BC_AGE);
1371 goto start;
1372 }
1373
1374 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1375 KASSERT(bp->b_refcnt > 0);
1376 KASSERT(!cv_has_waiters(&bp->b_done));
1377
1378 /*
1379 * If buffer was a delayed write, start it and return NULL
1380 * (since we might sleep while starting the write).
1381 */
1382 if (ISSET(bp->b_oflags, BO_DELWRI)) {
1383 /*
1384 * This buffer has gone through the LRU, so make sure it gets
1385 * reused ASAP.
1386 */
1387 SET(bp->b_cflags, BC_AGE);
1388 mutex_exit(&bufcache_lock);
1389 bawrite(bp);
1390 mutex_enter(&bufcache_lock);
1391 return (NULL);
1392 }
1393
1394 vp = bp->b_vp;
1395
1396 /* clear out various other fields */
1397 bp->b_cflags = BC_BUSY;
1398 bp->b_oflags = 0;
1399 bp->b_flags = 0;
1400 bp->b_dev = NODEV;
1401 bp->b_blkno = 0;
1402 bp->b_lblkno = 0;
1403 bp->b_rawblkno = 0;
1404 bp->b_iodone = 0;
1405 bp->b_error = 0;
1406 bp->b_resid = 0;
1407 bp->b_bcount = 0;
1408
1409 LIST_REMOVE(bp, b_hash);
1410
1411 /* Disassociate us from our vnode, if we had one... */
1412 if (vp != NULL) {
1413 mutex_enter(&vp->v_interlock);
1414 brelvp(bp);
1415 mutex_exit(&vp->v_interlock);
1416 }
1417
1418 return (bp);
1419 }
1420
1421 /*
1422 * Attempt to free an aged buffer off the queues.
1423 * Called with queue lock held.
1424 * Returns the amount of buffer memory freed.
1425 */
1426 static int
1427 buf_trim(void)
1428 {
1429 buf_t *bp;
1430 long size = 0;
1431
1432 KASSERT(mutex_owned(&bufcache_lock));
1433
1434 /* Instruct getnewbuf() to get buffers off the queues */
1435 if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
1436 return 0;
1437
1438 KASSERT((bp->b_cflags & BC_WANTED) == 0);
1439 size = bp->b_bufsize;
1440 bufmem -= size;
1441 if (size > 0) {
1442 buf_mrelease(bp->b_data, size);
1443 bp->b_bcount = bp->b_bufsize = 0;
1444 }
1445 /* brelse() will return the buffer to the global buffer pool */
1446 brelsel(bp, 0);
1447 return size;
1448 }
1449
1450 int
1451 buf_drain(int n)
1452 {
1453 int size = 0, sz;
1454
1455 KASSERT(mutex_owned(&bufcache_lock));
1456
1457 while (size < n && bufmem > bufmem_lowater) {
1458 sz = buf_trim();
1459 if (sz <= 0)
1460 break;
1461 size += sz;
1462 }
1463
1464 return size;
1465 }
1466
1467 /*
1468 * Wait for operations on the buffer to complete.
1469 * When they do, extract and return the I/O's error value.
1470 */
1471 int
1472 biowait(buf_t *bp)
1473 {
1474
1475 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1476 KASSERT(bp->b_refcnt > 0);
1477
1478 mutex_enter(bp->b_objlock);
1479 while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI))
1480 cv_wait(&bp->b_done, bp->b_objlock);
1481 mutex_exit(bp->b_objlock);
1482
1483 return bp->b_error;
1484 }
1485
1486 /*
1487 * Mark I/O complete on a buffer.
1488 *
1489 * If a callback has been requested, e.g. the pageout
1490 * daemon, do so. Otherwise, awaken waiting processes.
1491 *
1492 * [ Leffler, et al., says on p.247:
1493 * "This routine wakes up the blocked process, frees the buffer
1494 * for an asynchronous write, or, for a request by the pagedaemon
1495 * process, invokes a procedure specified in the buffer structure" ]
1496 *
 1497  * In real life, the pagedaemon (or other system processes) wants
 1498  * to do async stuff too, and doesn't want the buffer brelse()'d.
1499 * (for swap pager, that puts swap buffers on the free lists (!!!),
1500 * for the vn device, that puts malloc'd buffers on the free lists!)
1501 */
1502 void
1503 biodone(buf_t *bp)
1504 {
1505 int s;
1506
1507 KASSERT(!ISSET(bp->b_oflags, BO_DONE));
1508
1509 if (cpu_intr_p()) {
1510 /* From interrupt mode: defer to a soft interrupt. */
1511 s = splvm();
1512 TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
1513 softint_schedule(biodone_sih);
1514 splx(s);
1515 } else {
1516 /* Process now - the buffer may be freed soon. */
1517 biodone2(bp);
1518 }
1519 }
1520
1521 static void
1522 biodone2(buf_t *bp)
1523 {
1524 void (*callout)(buf_t *);
1525
1526 mutex_enter(bp->b_objlock);
1527 /* Note that the transfer is done. */
1528 if (ISSET(bp->b_oflags, BO_DONE))
1529 panic("biodone2 already");
1530 CLR(bp->b_flags, B_COWDONE);
1531 SET(bp->b_oflags, BO_DONE);
1532 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1533
1534 /* Wake up waiting writers. */
1535 if (!ISSET(bp->b_flags, B_READ))
1536 vwakeup(bp);
1537
1538 if ((callout = bp->b_iodone) != NULL) {
1539 /* Note callout done, then call out. */
1540 KASSERT(!cv_has_waiters(&bp->b_done));
1541 KERNEL_LOCK(1, NULL); /* XXXSMP */
1542 bp->b_iodone = NULL;
1543 mutex_exit(bp->b_objlock);
1544 (*callout)(bp);
1545 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */
1546 } else if (ISSET(bp->b_flags, B_ASYNC)) {
1547 /* If async, release. */
1548 KASSERT(!cv_has_waiters(&bp->b_done));
1549 mutex_exit(bp->b_objlock);
1550 brelse(bp, 0);
1551 } else {
1552 /* Otherwise just wake up waiters in biowait(). */
1553 cv_broadcast(&bp->b_done);
1554 mutex_exit(bp->b_objlock);
1555 }
1556 }
1557
1558 static void
1559 biointr(void *cookie)
1560 {
1561 struct cpu_info *ci;
1562 buf_t *bp;
1563 int s;
1564
1565 ci = curcpu();
1566
1567 while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
1568 KASSERT(curcpu() == ci);
1569
1570 s = splvm();
1571 bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
1572 TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
1573 splx(s);
1574
1575 biodone2(bp);
1576 }
1577 }
1578
1579 /*
1580 * Return a count of buffers on the "locked" queue.
1581 */
1582 int
1583 count_lock_queue(void)
1584 {
1585 buf_t *bp;
1586 int n = 0;
1587
1588 mutex_enter(&bufcache_lock);
1589 TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist)
1590 n++;
1591 mutex_exit(&bufcache_lock);
1592 return (n);
1593 }
1594
1595 /*
 1596  * Wait for all buffers to complete I/O.
 1597  * Return the number of "stuck" buffers.
1598 */
1599 int
1600 buf_syncwait(void)
1601 {
1602 buf_t *bp;
1603 int iter, nbusy, nbusy_prev = 0, dcount, ihash;
1604
1605 dcount = 10000;
1606 for (iter = 0; iter < 20;) {
1607 mutex_enter(&bufcache_lock);
1608 nbusy = 0;
1609 for (ihash = 0; ihash < bufhash+1; ihash++) {
1610 LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1611 if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY)
1612 nbusy += ((bp->b_flags & B_READ) == 0);
1613 }
1614 }
1615 mutex_exit(&bufcache_lock);
1616
1617 if (nbusy == 0)
1618 break;
1619 if (nbusy_prev == 0)
1620 nbusy_prev = nbusy;
1621 printf("%d ", nbusy);
1622 kpause("bflush", false, MAX(1, hz / 25 * iter), NULL);
1623 if (nbusy >= nbusy_prev) /* we didn't flush anything */
1624 iter++;
1625 else
1626 nbusy_prev = nbusy;
1627 }
1628
1629 if (nbusy) {
1630 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
1631 printf("giving up\nPrinting vnodes for busy buffers\n");
1632 for (ihash = 0; ihash < bufhash+1; ihash++) {
1633 LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1634 if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY &&
1635 (bp->b_flags & B_READ) == 0)
1636 vprint(NULL, bp->b_vp);
1637 }
1638 }
1639 #endif
1640 }
1641
1642 return nbusy;
1643 }
1644
1645 static void
1646 sysctl_fillbuf(buf_t *i, struct buf_sysctl *o)
1647 {
1648
1649 o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
1650 o->b_error = i->b_error;
1651 o->b_prio = i->b_prio;
1652 o->b_dev = i->b_dev;
1653 o->b_bufsize = i->b_bufsize;
1654 o->b_bcount = i->b_bcount;
1655 o->b_resid = i->b_resid;
1656 o->b_addr = PTRTOUINT64(i->b_data);
1657 o->b_blkno = i->b_blkno;
1658 o->b_rawblkno = i->b_rawblkno;
1659 o->b_iodone = PTRTOUINT64(i->b_iodone);
1660 o->b_proc = PTRTOUINT64(i->b_proc);
1661 o->b_vp = PTRTOUINT64(i->b_vp);
1662 o->b_saveaddr = PTRTOUINT64(i->b_saveaddr);
1663 o->b_lblkno = i->b_lblkno;
1664 }
1665
1666 #define KERN_BUFSLOP 20
1667 static int
1668 sysctl_dobuf(SYSCTLFN_ARGS)
1669 {
1670 buf_t *bp;
1671 struct buf_sysctl bs;
1672 struct bqueue *bq;
1673 char *dp;
1674 u_int i, op, arg;
1675 size_t len, needed, elem_size, out_size;
1676 int error, elem_count, retries;
1677
1678 if (namelen == 1 && name[0] == CTL_QUERY)
1679 return (sysctl_query(SYSCTLFN_CALL(rnode)));
1680
1681 if (namelen != 4)
1682 return (EINVAL);
1683
1684 retries = 100;
1685 retry:
1686 dp = oldp;
1687 len = (oldp != NULL) ? *oldlenp : 0;
1688 op = name[0];
1689 arg = name[1];
1690 elem_size = name[2];
1691 elem_count = name[3];
1692 out_size = MIN(sizeof(bs), elem_size);
1693
1694 /*
1695 * at the moment, these are just "placeholders" to make the
1696 * API for retrieving kern.buf data more extensible in the
1697 * future.
1698 *
1699 * XXX kern.buf currently has "netbsd32" issues. hopefully
1700 * these will be resolved at a later point.
1701 */
1702 if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
1703 elem_size < 1 || elem_count < 0)
1704 return (EINVAL);
1705
1706 error = 0;
1707 needed = 0;
1708 sysctl_unlock();
1709 mutex_enter(&bufcache_lock);
1710 for (i = 0; i < BQUEUES; i++) {
1711 bq = &bufqueues[i];
1712 TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
1713 bq->bq_marker = bp;
1714 if (len >= elem_size && elem_count > 0) {
1715 sysctl_fillbuf(bp, &bs);
1716 mutex_exit(&bufcache_lock);
1717 error = copyout(&bs, dp, out_size);
1718 mutex_enter(&bufcache_lock);
1719 if (error)
1720 break;
1721 if (bq->bq_marker != bp) {
1722 /*
1723 * This sysctl node is only for
1724 * statistics. Retry; if the
1725 * queue keeps changing, then
1726 * bail out.
1727 */
1728 if (retries-- == 0) {
1729 error = EAGAIN;
1730 break;
1731 }
1732 mutex_exit(&bufcache_lock);
1733 goto retry;
1734 }
1735 dp += elem_size;
1736 len -= elem_size;
1737 }
1738 needed += elem_size;
1739 if (elem_count > 0 && elem_count != INT_MAX)
1740 elem_count--;
1741 }
1742 if (error != 0)
1743 break;
1744 }
1745 mutex_exit(&bufcache_lock);
1746 sysctl_relock();
1747
1748 *oldlenp = needed;
1749 if (oldp == NULL)
1750 *oldlenp += KERN_BUFSLOP * sizeof(buf_t);
1751
1752 return (error);
1753 }
1754
1755 static int
1756 sysctl_bufvm_update(SYSCTLFN_ARGS)
1757 {
1758 int t, error, rv;
1759 struct sysctlnode node;
1760
1761 node = *rnode;
1762 node.sysctl_data = &t;
1763 t = *(int *)rnode->sysctl_data;
1764 error = sysctl_lookup(SYSCTLFN_CALL(&node));
1765 if (error || newp == NULL)
1766 return (error);
1767
1768 if (t < 0)
1769 return EINVAL;
1770 if (rnode->sysctl_data == &bufcache) {
1771 if (t > 100)
1772 return (EINVAL);
1773 bufcache = t;
1774 buf_setwm();
1775 } else if (rnode->sysctl_data == &bufmem_lowater) {
1776 if (bufmem_hiwater - t < 16)
1777 return (EINVAL);
1778 bufmem_lowater = t;
1779 } else if (rnode->sysctl_data == &bufmem_hiwater) {
1780 if (t - bufmem_lowater < 16)
1781 return (EINVAL);
1782 bufmem_hiwater = t;
1783 } else
1784 return (EINVAL);
1785
1786 /* Drain until below new high water mark */
1787 sysctl_unlock();
1788 mutex_enter(&bufcache_lock);
1789 while ((t = bufmem - bufmem_hiwater) >= 0) {
1790 rv = buf_drain(t / (2 * 1024));
1791 if (rv <= 0)
1792 break;
1793 }
1794 mutex_exit(&bufcache_lock);
1795 sysctl_relock();
1796
1797 return 0;
1798 }
1799
1800 static struct sysctllog *vfsbio_sysctllog;
1801
1802 static void
1803 sysctl_kern_buf_setup(void)
1804 {
1805
1806 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1807 CTLFLAG_PERMANENT,
1808 CTLTYPE_NODE, "kern", NULL,
1809 NULL, 0, NULL, 0,
1810 CTL_KERN, CTL_EOL);
1811 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1812 CTLFLAG_PERMANENT,
1813 CTLTYPE_NODE, "buf",
1814 SYSCTL_DESCR("Kernel buffer cache information"),
1815 sysctl_dobuf, 0, NULL, 0,
1816 CTL_KERN, KERN_BUF, CTL_EOL);
1817 }
1818
1819 static void
1820 sysctl_vm_buf_setup(void)
1821 {
1822
1823 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1824 CTLFLAG_PERMANENT,
1825 CTLTYPE_NODE, "vm", NULL,
1826 NULL, 0, NULL, 0,
1827 CTL_VM, CTL_EOL);
1828 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1829 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1830 CTLTYPE_INT, "bufcache",
1831 SYSCTL_DESCR("Percentage of physical memory to use for "
1832 "buffer cache"),
1833 sysctl_bufvm_update, 0, &bufcache, 0,
1834 CTL_VM, CTL_CREATE, CTL_EOL);
1835 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1836 CTLFLAG_PERMANENT|CTLFLAG_READONLY,
1837 CTLTYPE_INT, "bufmem",
1838 SYSCTL_DESCR("Amount of kernel memory used by buffer "
1839 "cache"),
1840 NULL, 0, &bufmem, 0,
1841 CTL_VM, CTL_CREATE, CTL_EOL);
1842 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1843 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1844 CTLTYPE_INT, "bufmem_lowater",
1845 SYSCTL_DESCR("Minimum amount of kernel memory to "
1846 "reserve for buffer cache"),
1847 sysctl_bufvm_update, 0, &bufmem_lowater, 0,
1848 CTL_VM, CTL_CREATE, CTL_EOL);
1849 sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
1850 CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1851 CTLTYPE_INT, "bufmem_hiwater",
1852 SYSCTL_DESCR("Maximum amount of kernel memory to use "
1853 "for buffer cache"),
1854 sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
1855 CTL_VM, CTL_CREATE, CTL_EOL);
1856 }
1857
1858 #ifdef DEBUG
1859 /*
1860 * Print out statistics on the current allocation of the buffer pool.
1861 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1862 * in vfs_syscalls.c using sysctl.
1863 */
1864 void
1865 vfs_bufstats(void)
1866 {
1867 int i, j, count;
1868 buf_t *bp;
1869 struct bqueue *dp;
1870 int counts[(MAXBSIZE / PAGE_SIZE) + 1];
1871 static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
1872
1873 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1874 count = 0;
1875 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1876 counts[j] = 0;
1877 TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
1878 counts[bp->b_bufsize/PAGE_SIZE]++;
1879 count++;
1880 }
1881 printf("%s: total-%d", bname[i], count);
1882 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1883 if (counts[j] != 0)
1884 printf(", %d-%d", j * PAGE_SIZE, counts[j]);
1885 printf("\n");
1886 }
1887 }
1888 #endif /* DEBUG */
1889
1890 /* ------------------------------ */
1891
1892 buf_t *
1893 getiobuf(struct vnode *vp, bool waitok)
1894 {
1895 buf_t *bp;
1896
1897 bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
1898 if (bp == NULL)
1899 return bp;
1900
1901 buf_init(bp);
1902
1903 if ((bp->b_vp = vp) == NULL)
1904 bp->b_objlock = &buffer_lock;
1905 else
1906 bp->b_objlock = &vp->v_interlock;
1907
1908 return bp;
1909 }
1910
1911 void
1912 putiobuf(buf_t *bp)
1913 {
1914
1915 buf_destroy(bp);
1916 pool_cache_put(bufio_cache, bp);
1917 }
1918
1919 /*
1920 * nestiobuf_iodone: b_iodone callback for nested buffers.
1921 */
1922
1923 void
1924 nestiobuf_iodone(buf_t *bp)
1925 {
1926 buf_t *mbp = bp->b_private;
1927 int error;
1928 int donebytes;
1929
1930 KASSERT(bp->b_bcount <= bp->b_bufsize);
1931 KASSERT(mbp != bp);
1932
1933 error = bp->b_error;
1934 if (bp->b_error == 0 &&
1935 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
1936 /*
 1937 		 * Not all of the data was transferred, so raise an error.  We
 1938 		 * have no way to propagate these conditions to mbp.
1939 */
1940 error = EIO;
1941 }
1942
1943 donebytes = bp->b_bufsize;
1944
1945 putiobuf(bp);
1946 nestiobuf_done(mbp, donebytes, error);
1947 }
1948
1949 /*
1950 * nestiobuf_setup: setup a "nested" buffer.
1951 *
1952 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
1953 * => 'bp' should be a buffer allocated by getiobuf.
1954 * => 'offset' is a byte offset in the master buffer.
1955 * => 'size' is a size in bytes of this nested buffer.
1956 */
1957
1958 void
1959 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
1960 {
1961 const int b_read = mbp->b_flags & B_READ;
1962 struct vnode *vp = mbp->b_vp;
1963
1964 KASSERT(mbp->b_bcount >= offset + size);
1965 bp->b_vp = vp;
1966 bp->b_dev = mbp->b_dev;
1967 bp->b_objlock = mbp->b_objlock;
1968 bp->b_cflags = BC_BUSY;
1969 bp->b_flags = B_ASYNC | b_read;
1970 bp->b_iodone = nestiobuf_iodone;
1971 bp->b_data = (char *)mbp->b_data + offset;
1972 bp->b_resid = bp->b_bcount = size;
1973 bp->b_bufsize = bp->b_bcount;
1974 bp->b_private = mbp;
1975 BIO_COPYPRIO(bp, mbp);
1976 if (!b_read && vp != NULL) {
1977 mutex_enter(&vp->v_interlock);
1978 vp->v_numoutput++;
1979 mutex_exit(&vp->v_interlock);
1980 }
1981 }
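/*
 * Illustrative sketch (error handling omitted; "issue_io" is a
 * hypothetical strategy call): splitting a master buffer into two
 * halves.  Callers typically set mbp->b_resid to the total byte count
 * beforehand, since nestiobuf_done() subtracts from it:
 *
 *	size_t half = mbp->b_bcount / 2;
 *	buf_t *b1 = getiobuf(mbp->b_vp, true);
 *	buf_t *b2 = getiobuf(mbp->b_vp, true);
 *	mbp->b_resid = mbp->b_bcount;
 *	nestiobuf_setup(mbp, b1, 0, half);
 *	nestiobuf_setup(mbp, b2, half, mbp->b_bcount - half);
 *	issue_io(b1);
 *	issue_io(b2);
 *
 * Each nested buffer runs nestiobuf_iodone() on completion, which
 * accounts its bytes against mbp via nestiobuf_done() and calls
 * biodone(mbp) once the whole transfer has finished.
 */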
1982
1983 /*
1984 * nestiobuf_done: propagate completion to the master buffer.
1985 *
 1986  * => 'donebytes' specifies how many bytes of 'mbp' have been completed.
1987 * => 'error' is an errno(2) that 'donebytes' has been completed with.
1988 */
1989
1990 void
1991 nestiobuf_done(buf_t *mbp, int donebytes, int error)
1992 {
1993
1994 if (donebytes == 0) {
1995 return;
1996 }
1997 mutex_enter(mbp->b_objlock);
1998 KASSERT(mbp->b_resid >= donebytes);
1999 mbp->b_resid -= donebytes;
2000 if (error)
2001 mbp->b_error = error;
2002 if (mbp->b_resid == 0) {
2003 mutex_exit(mbp->b_objlock);
2004 biodone(mbp);
2005 } else
2006 mutex_exit(mbp->b_objlock);
2007 }
2008
2009 void
2010 buf_init(buf_t *bp)
2011 {
2012
2013 cv_init(&bp->b_busy, "biolock");
2014 cv_init(&bp->b_done, "biowait");
2015 bp->b_dev = NODEV;
2016 bp->b_error = 0;
2017 bp->b_flags = 0;
2018 bp->b_cflags = 0;
2019 bp->b_oflags = 0;
2020 bp->b_objlock = &buffer_lock;
2021 bp->b_iodone = NULL;
2022 bp->b_refcnt = 1;
2023 bp->b_dev = NODEV;
2024 bp->b_vnbufs.le_next = NOLIST;
2025 BIO_SETPRIO(bp, BPRIO_DEFAULT);
2026 }
2027
2028 void
2029 buf_destroy(buf_t *bp)
2030 {
2031
2032 cv_destroy(&bp->b_done);
2033 cv_destroy(&bp->b_busy);
2034 }
2035
2036 int
2037 bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
2038 {
2039 int error;
2040
2041 KASSERT(mutex_owned(&bufcache_lock));
2042
2043 if ((bp->b_cflags & BC_BUSY) != 0) {
2044 if (curlwp == uvm.pagedaemon_lwp)
2045 return EDEADLK;
2046 bp->b_cflags |= BC_WANTED;
2047 bref(bp);
2048 if (interlock != NULL)
2049 mutex_exit(interlock);
2050 if (intr) {
2051 error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
2052 timo);
2053 } else {
2054 error = cv_timedwait(&bp->b_busy, &bufcache_lock,
2055 timo);
2056 }
2057 brele(bp);
2058 if (interlock != NULL)
2059 mutex_enter(interlock);
2060 if (error != 0)
2061 return error;
2062 return EPASSTHROUGH;
2063 }
2064 bp->b_cflags |= BC_BUSY;
2065
2066 return 0;
2067 }
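/*
 * Illustrative calling convention (mirrors the loop in getblk()):
 *
 *	mutex_enter(&bufcache_lock);
 *  retry:
 *	bp = incore(vp, blkno);
 *	if (bp != NULL) {
 *		error = bbusy(bp, false, 0, NULL);
 *		if (error == EPASSTHROUGH)
 *			goto retry;	(lost a race; look the buffer up again)
 *	}
 *
 * A zero return means BC_BUSY is now held by the caller.
 */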
2068