/*	$NetBSD: vfs_bio.c,v 1.306 2024/12/07 02:27:38 riastradh Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran, and by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * The buffer cache subsystem.
 *
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison Wesley, 1989)
 *
 * Locking
 *
 * There are three locks:
 * - bufcache_lock: protects global buffer cache state.
 * - BC_BUSY: a long term per-buffer lock.
 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
 *
 * For buffers associated with vnodes (the most common case) b_objlock points
 * to the vnode_t::v_interlock.  Otherwise, it points to the generic
 * buffer_lock.
 *
 * Lock order:
 *	bufcache_lock ->
 *		buf_t::b_objlock
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.306 2024/12/07 02:27:38 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_biohist.h"
#include "opt_bufcache.h"
#include "opt_dtrace.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/bitops.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/fstrans.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>	/* extern struct uvm uvm */

#include <miscfs/specfs/specdev.h>

SDT_PROVIDER_DEFINE(io);

SDT_PROBE_DEFINE4(io, kernel, , bbusy__start,
    "struct buf *"/*bp*/,
    "bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/);
SDT_PROBE_DEFINE5(io, kernel, , bbusy__done,
    "struct buf *"/*bp*/,
    "bool"/*intr*/,
    "int"/*timo*/,
    "kmutex_t *"/*interlock*/,
    "int"/*error*/);
SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start);
SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE3(io, kernel, , getblk__start,
    "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/);
SDT_PROBE_DEFINE4(io, kernel, , getblk__done,
    "struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/,
    "struct buf *"/*bp*/);
SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);

#ifndef BUFPAGES
# define BUFPAGES 0
#endif

#ifdef BUFCACHE
# if (BUFCACHE < 5) || (BUFCACHE > 95)
#  error BUFCACHE is not between 5 and 95
# endif
#else
# define BUFCACHE 15
#endif

u_int   nbuf;                   /* desired number of buffer headers */
u_int   bufpages = BUFPAGES;    /* optional hardwired count */
u_int   bufcache = BUFCACHE;    /* max % of RAM to use for buffer cache */

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES         3       /* number of free buffer queues */

#define BQ_LOCKED       0       /* super-blocks &c */
#define BQ_LRU          1       /* lru, useful buffers */
#define BQ_AGE          2       /* rubbish */

struct bqueue {
        TAILQ_HEAD(, buf) bq_queue;
        uint64_t bq_bytes;
        buf_t *bq_marker;
};
static struct bqueue bufqueues[BQUEUES] __cacheline_aligned;

/* Function prototypes */
static void buf_setwm(void);
static int buf_trim(void);
static void *bufpool_page_alloc(struct pool *, int);
static void bufpool_page_free(struct pool *, void *);
static buf_t *bio_doread(struct vnode *, daddr_t, int, int);
static buf_t *getnewbuf(int, int, int);
static int buf_lotsfree(void);
static int buf_canrelease(void);
static u_long buf_mempoolidx(u_long);
static u_long buf_roundsize(u_long);
static void *buf_alloc(size_t);
static void buf_mrelease(void *, size_t);
static void binsheadfree(buf_t *, struct bqueue *);
static void binstailfree(buf_t *, struct bqueue *);
#ifdef DEBUG
static int checkfreelist(buf_t *, struct bqueue *, int);
#endif
static void biointr(void *);
static void biodone2(buf_t *);
static void sysctl_kern_buf_setup(void);
static void sysctl_vm_buf_setup(void);

/* Initialization for biohist */

#include <sys/biohist.h>

BIOHIST_DEFINE(biohist);

void
biohist_init(void)
{

        BIOHIST_INIT(biohist, BIOHIST_SIZE);
}

/*
 * Definitions for the buffer hash lists.
 */
#define BUFHASH(dvp, lbn)       \
        (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long  bufhash;

static int bufhash_stats(struct hashstat_sysctl *, bool);

static kcondvar_t needbuffer_cv;

/*
 * Buffer queue lock.
 */
kmutex_t bufcache_lock __cacheline_aligned;
kmutex_t buffer_lock __cacheline_aligned;

/* Software ISR for completed transfers. */
static void *biodone_sih;

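/*
 * Illustrative sketch of the locking protocol described at the top of
 * this file (not copied from any particular caller; getblk() below is
 * the canonical implementation).  A consumer that wants exclusive use
 * of a cached buffer typically does something like:
 *
 *	mutex_enter(&bufcache_lock);
 *	bp = incore(vp, blkno);
 *	if (bp != NULL && bbusy(bp, false, 0, NULL) == 0)
 *		bremfree(bp);		(owned via BC_BUSY from here on)
 *	mutex_exit(&bufcache_lock);
 *	...
 *	brelse(bp, 0);			(drop BC_BUSY and requeue)
 *
 * I/O completion (biowait() vs. biodone()) is then serialized by the
 * buffer's b_objlock.
 */
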
/* Buffer pool for I/O buffers. */
static pool_cache_t buf_cache;
static pool_cache_t bufio_cache;

#define MEMPOOL_INDEX_OFFSET    (ilog2(DEV_BSIZE))      /* smallest pool is 512 bytes */
#define NMEMPOOLS               (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
__CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE);

/* Buffer memory pools */
static struct pool bmempools[NMEMPOOLS];

static struct vm_map *buf_map;

/*
 * Buffer memory pool allocator.
 */
static void *
bufpool_page_alloc(struct pool *pp, int flags)
{

        return (void *)uvm_km_alloc(buf_map,
            MAXBSIZE, MAXBSIZE,
            ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK)
            | UVM_KMF_WIRED);
}

static void
bufpool_page_free(struct pool *pp, void *v)
{

        uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
}

static struct pool_allocator bufmempool_allocator = {
        .pa_alloc = bufpool_page_alloc,
        .pa_free = bufpool_page_free,
        .pa_pagesz = MAXBSIZE,
};

/* Buffer memory management variables */
u_long bufmem_valimit;
u_long bufmem_hiwater;
u_long bufmem_lowater;
u_long bufmem;

/*
 * MD code can call this to set a hard limit on the amount
 * of virtual memory used by the buffer cache.
 */
int
buf_setvalimit(vsize_t sz)
{

        /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
        if (sz < NMEMPOOLS * MAXBSIZE)
                return SET_ERROR(EINVAL);

        bufmem_valimit = sz;
        return 0;
}

static void
buf_setwm(void)
{

        bufmem_hiwater = buf_memcalc();
        /* lowater is approx. 2% of memory (with bufcache = 15) */
#define BUFMEM_WMSHIFT  3
#define BUFMEM_HIWMMIN  (64 * 1024 << BUFMEM_WMSHIFT)
        if (bufmem_hiwater < BUFMEM_HIWMMIN)
                /* Ensure a reasonable minimum value */
                bufmem_hiwater = BUFMEM_HIWMMIN;
        bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
}

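/*
 * Worked example (illustrative numbers only): if buf_memcalc() returns
 * 512 MB, buf_setwm() above leaves bufmem_hiwater at 512 MB and
 * bufmem_lowater at 64 MB, i.e. one eighth (hiwater >> BUFMEM_WMSHIFT).
 */
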
#ifdef DEBUG
int debug_verify_freelist = 0;
static int
checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
{
        buf_t *b;

        if (!debug_verify_freelist)
                return 1;

        TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
                if (b == bp)
                        return ison ? 1 : 0;
        }

        return ison ? 0 : 1;
}
#endif

/*
 * Insq/Remq for the buffer free lists.
 * Call with buffer queue locked.
 */
static void
binsheadfree(buf_t *bp, struct bqueue *dp)
{

        KASSERT(mutex_owned(&bufcache_lock));
        KASSERT(bp->b_freelistindex == -1);
        TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
        dp->bq_bytes += bp->b_bufsize;
        bp->b_freelistindex = dp - bufqueues;
}

static void
binstailfree(buf_t *bp, struct bqueue *dp)
{

        KASSERT(mutex_owned(&bufcache_lock));
        KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
            "bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
        TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
        dp->bq_bytes += bp->b_bufsize;
        bp->b_freelistindex = dp - bufqueues;
}

void
bremfree(buf_t *bp)
{
        struct bqueue *dp;
        int bqidx = bp->b_freelistindex;

        KASSERT(mutex_owned(&bufcache_lock));

        KASSERT(bqidx != -1);
        dp = &bufqueues[bqidx];
        KDASSERT(checkfreelist(bp, dp, 1));
        KASSERT(dp->bq_bytes >= bp->b_bufsize);
        TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
        dp->bq_bytes -= bp->b_bufsize;

        /* For the sysctl helper. */
        if (bp == dp->bq_marker)
                dp->bq_marker = NULL;

#if defined(DIAGNOSTIC)
        bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
}

/*
 * note that for some ports this is used by pmap bootstrap code to
 * determine kva size.
 */
u_long
buf_memcalc(void)
{
        u_long n;
        vsize_t mapsz = 0;

        /*
         * Determine the upper bound of memory to use for buffers.
         *
         * - If bufpages is specified, use that as the number
         *   of pages.
         *
         * - Otherwise, use bufcache as the percentage of
         *   physical memory.
         */
        if (bufpages != 0) {
                n = bufpages;
        } else {
                if (bufcache < 5) {
                        printf("forcing bufcache %d -> 5", bufcache);
                        bufcache = 5;
                }
                if (bufcache > 95) {
                        printf("forcing bufcache %d -> 95", bufcache);
                        bufcache = 95;
                }
                if (buf_map != NULL)
                        mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
                n = calc_cache_size(mapsz, bufcache,
                    (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
                    / PAGE_SIZE;
        }

        n <<= PAGE_SHIFT;
        if (bufmem_valimit != 0 && n > bufmem_valimit)
                n = bufmem_valimit;

        return n;
}

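/*
 * For example (illustrative numbers only): with the default bufcache of
 * 15, no hardwired bufpages and 4 GB of RAM, buf_memcalc() comes out at
 * roughly 600 MB, further clamped by bufmem_valimit and by how much
 * kernel VA calc_cache_size() is prepared to hand out.
 */
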
/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit(void)
{
        struct bqueue *dp;
        int use_std;
        u_int i;

        biodone_vfs = biodone;

        mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
        mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&needbuffer_cv, "needbuf");

        if (bufmem_valimit != 0) {
                vaddr_t minaddr = 0, maxaddr;
                buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
                    bufmem_valimit, 0, false, 0);
                if (buf_map == NULL)
                        panic("bufinit: cannot allocate submap");
        } else
                buf_map = kernel_map;

        /*
         * Initialize buffer cache memory parameters.
         */
        bufmem = 0;
        buf_setwm();

        /* On "small" machines use small pool page sizes where possible */
        use_std = (physmem < atop(16*1024*1024));

        /*
         * Also use them on systems that can map the pool pages using
         * a direct-mapped segment.
         */
#ifdef PMAP_MAP_POOLPAGE
        use_std = 1;
#endif

        buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
            "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
        bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
            "biopl", NULL, IPL_BIO, NULL, NULL, NULL);

        for (i = 0; i < NMEMPOOLS; i++) {
                struct pool_allocator *pa;
                struct pool *pp = &bmempools[i];
                u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
                char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */

                if (__predict_false(size >= 1048576))
                        (void)snprintf(name, 8, "buf%um", size / 1048576);
                else if (__predict_true(size >= 1024))
                        (void)snprintf(name, 8, "buf%uk", size / 1024);
                else
                        (void)snprintf(name, 8, "buf%ub", size);
                pa = (size <= PAGE_SIZE && use_std)
                    ? &pool_allocator_nointr
                    : &bufmempool_allocator;
                pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE);
                pool_setlowat(pp, 1);
                pool_sethiwat(pp, 1);
        }

        /* Initialize the buffer queues */
        for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
                TAILQ_INIT(&dp->bq_queue);
                dp->bq_bytes = 0;
        }

        /*
         * Estimate hash table size based on the amount of memory we
         * intend to use for the buffer cache. The average buffer
         * size is dependent on our clients (i.e. filesystems).
         *
         * For now, use an empirical 3K per buffer.
         */
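        /*
         * Worked example (illustrative numbers only): with bufmem_hiwater
         * at 96 MB this gives (98304 / 3) == 32768 buffer headers, which
         * hashinit() below turns into a similarly sized hash table.
         */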
        nbuf = (bufmem_hiwater / 1024) / 3;
        bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);

        sysctl_kern_buf_setup();
        sysctl_vm_buf_setup();
        hashstat_register("bufhash", bufhash_stats);
}

void
bufinit2(void)
{

        biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
            NULL);
        if (biodone_sih == NULL)
                panic("bufinit2: can't establish soft interrupt");
}

static int
buf_lotsfree(void)
{
        u_long guess;

        /* Always allocate if less than the low water mark. */
        if (bufmem < bufmem_lowater)
                return 1;

        /* Never allocate if greater than the high water mark. */
        if (bufmem > bufmem_hiwater)
                return 0;

        /* If there's anything on the AGE list, it should be eaten. */
        if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
                return 0;

        /*
         * The probability of getting a new allocation is inversely
         * proportional to the current size of the cache above
         * the low water mark.  Divide the total first to avoid overflows
         * in the product.
         */
        guess = cprng_fast32() % 16;

        if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >=
            (bufmem - bufmem_lowater))
                return 1;

        /* Otherwise don't allocate. */
        return 0;
}

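/*
 * Worked example for the test in buf_lotsfree() above (illustrative
 * numbers only): with bufmem_lowater = 64 MB, bufmem_hiwater = 512 MB
 * and bufmem = 176 MB, (hiwater - lowater) / 16 is 28 MB and
 * (bufmem - lowater) is 112 MB, so the test passes for guess >= 4,
 * i.e. with probability 12/16.  The fuller the cache gets, the smaller
 * the chance that a fresh allocation is allowed.
 */
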
/*
 * Return estimate of bytes we think need to be
 * released to help resolve low memory conditions.
 *
 * => called with bufcache_lock held.
 */
static int
buf_canrelease(void)
{
        int pagedemand, ninvalid = 0;

        KASSERT(mutex_owned(&bufcache_lock));

        if (bufmem < bufmem_lowater)
                return 0;

        if (bufmem > bufmem_hiwater)
                return bufmem - bufmem_hiwater;

        ninvalid += bufqueues[BQ_AGE].bq_bytes;

        pagedemand = uvmexp.freetarg - uvm_availmem(false);
        if (pagedemand < 0)
                return ninvalid;
        return MAX(ninvalid, MIN(2 * MAXBSIZE,
            MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
}

/*
 * Buffer memory allocation helper functions
 */
static u_long
buf_mempoolidx(u_long size)
{
        u_int n = 0;

        size -= 1;
        size >>= MEMPOOL_INDEX_OFFSET;
        while (size) {
                size >>= 1;
                n += 1;
        }
        if (n >= NMEMPOOLS)
                panic("buf mem pool index %d", n);
        return n;
}

static u_long
buf_roundsize(u_long size)
{

        /* Round up to nearest power of 2 */
        return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
}

static void *
buf_alloc(size_t size)
{
        u_int n = buf_mempoolidx(size);
        void *addr;

        while (1) {
                addr = pool_get(&bmempools[n], PR_NOWAIT);
                if (addr != NULL)
                        break;

                /* No memory, see if we can free some. If so, try again */
                mutex_enter(&bufcache_lock);
                if (buf_drain(1) > 0) {
                        mutex_exit(&bufcache_lock);
                        continue;
                }

                if (curlwp == uvm.pagedaemon_lwp) {
                        mutex_exit(&bufcache_lock);
                        return NULL;
                }

                /* Wait for buffers to arrive on the LRU queue */
                cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
                mutex_exit(&bufcache_lock);
        }

        return addr;
}

static void
buf_mrelease(void *addr, size_t size)
{

        pool_put(&bmempools[buf_mempoolidx(size)], addr);
}

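/*
 * Worked example (illustrative numbers only): a request for 3000 bytes
 * has buf_mempoolidx() == 3 and buf_roundsize() == 4096, so buf_alloc()
 * serves it from the fourth pool, whose buffers are 4 KB each
 * (1 << (3 + MEMPOOL_INDEX_OFFSET) with DEV_BSIZE == 512).
 */
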
/*
 * bread()/breadn() helper.
 */
static buf_t *
bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
{
        buf_t *bp;
        struct mount *mp;

        bp = getblk(vp, blkno, size, 0, 0);

        /*
         * getblk() may return NULL if we are the pagedaemon.
         */
        if (bp == NULL) {
                KASSERT(curlwp == uvm.pagedaemon_lwp);
                return NULL;
        }

        /*
         * If buffer does not have data valid, start a read.
         * Note that if buffer is BC_INVAL, getblk() won't return it.
         * Therefore, it's valid if its I/O has completed or been delayed.
         */
        if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
                /* Start I/O for the buffer. */
                SET(bp->b_flags, B_READ | async);
                if (async)
                        BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
                else
                        BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
                VOP_STRATEGY(vp, bp);

                /* Pay for the read. */
                curlwp->l_ru.ru_inblock++;
        } else if (async)
                brelse(bp, 0);

        if (vp->v_type == VBLK)
                mp = spec_node_getmountedfs(vp);
        else
                mp = vp->v_mount;

        /*
         * Collect statistics on synchronous and asynchronous reads.
         * Reads from block devices are charged to their associated
         * filesystem (if any).
         */
        if (mp != NULL) {
                if (async == 0)
                        mp->mnt_stat.f_syncreads++;
                else
                        mp->mnt_stat.f_asyncreads++;
        }

        return bp;
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp)
{
        buf_t *bp;
        int error;

        BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);

        /* Get buffer for block. */
        bp = *bpp = bio_doread(vp, blkno, size, 0);
        if (bp == NULL)
                return SET_ERROR(ENOMEM);

        /* Wait for the read to complete, and return result. */
        error = biowait(bp);
        if (error == 0 && (flags & B_MODIFY) != 0)
                error = fscow_run(bp, true);
        if (error) {
                brelse(bp, 0);
                *bpp = NULL;
        }

        return error;
}

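/*
 * Typical bread() usage (illustrative sketch only, error handling
 * trimmed; "bsize" stands for the file system block size):
 *
 *	buf_t *bp;
 *	int error = bread(vp, blkno, bsize, 0, &bp);
 *	if (error == 0) {
 *		... inspect bp->b_data ...
 *		brelse(bp, 0);
 *	}
 *
 * On failure bread() has already released the buffer and cleared *bpp.
 */
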
/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
    int *rasizes, int nrablks, int flags, buf_t **bpp)
{
        buf_t *bp;
        int error, i;

        BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);

        bp = *bpp = bio_doread(vp, blkno, size, 0);
        if (bp == NULL)
                return SET_ERROR(ENOMEM);

        /*
         * For each of the read-ahead blocks, start a read, if necessary.
         */
        mutex_enter(&bufcache_lock);
        for (i = 0; i < nrablks; i++) {
                /* If it's in the cache, just go on to next one. */
                if (incore(vp, rablks[i]))
                        continue;

                /* Get a buffer for the read-ahead block */
                mutex_exit(&bufcache_lock);
                (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
                mutex_enter(&bufcache_lock);
        }
        mutex_exit(&bufcache_lock);

        /* Otherwise, we had to start a read for it; wait until it's valid. */
        error = biowait(bp);
        if (error == 0 && (flags & B_MODIFY) != 0)
                error = fscow_run(bp, true);
        if (error) {
                brelse(bp, 0);
                *bpp = NULL;
        }

        return error;
}

/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(buf_t *bp)
{
        int rv, sync, wasdelayed;
        struct vnode *vp;
        struct mount *mp;

        BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
            (uintptr_t)bp, 0, 0, 0);

        KASSERT(ISSET(bp->b_cflags, BC_BUSY));
        KASSERT(!cv_has_waiters(&bp->b_done));

        vp = bp->b_vp;

        /*
         * dholland 20160728 AFAICT vp==NULL must be impossible as it
         * will crash upon reaching VOP_STRATEGY below... see further
         * analysis on tech-kern.
         */
        KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");

        if (vp != NULL) {
                KASSERT(bp->b_objlock == vp->v_interlock);
                if (vp->v_type == VBLK)
                        mp = spec_node_getmountedfs(vp);
                else
                        mp = vp->v_mount;
        } else {
                mp = NULL;
        }

        if (mp && mp->mnt_wapbl) {
                if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
                        bdwrite(bp);
                        return 0;
                }
        }

        /*
         * Remember buffer type, to switch on it later.  If the write was
         * synchronous, but the file system was mounted with MNT_ASYNC,
         * convert it to a delayed write.
         * XXX note that this relies on delayed tape writes being converted
         * to async, not sync writes (which is safe, but ugly).
         */
        sync = !ISSET(bp->b_flags, B_ASYNC);
        if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
                bdwrite(bp);
                return 0;
        }

        /*
         * Collect statistics on synchronous and asynchronous writes.
         * Writes to block devices are charged to their associated
         * filesystem (if any).
         */
        if (mp != NULL) {
                if (sync)
                        mp->mnt_stat.f_syncwrites++;
                else
                        mp->mnt_stat.f_asyncwrites++;
        }

        /*
         * Pay for the I/O operation and make sure the buf is on the correct
         * vnode queue.
         */
        bp->b_error = 0;
        wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
        CLR(bp->b_flags, B_READ);
        if (wasdelayed) {
                mutex_enter(&bufcache_lock);
                mutex_enter(bp->b_objlock);
                CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
                reassignbuf(bp, bp->b_vp);
                /* Wake anyone trying to busy the buffer via vnode's lists. */
                cv_broadcast(&bp->b_busy);
                mutex_exit(&bufcache_lock);
        } else {
                curlwp->l_ru.ru_oublock++;
                mutex_enter(bp->b_objlock);
                CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
        }
        if (vp != NULL)
                vp->v_numoutput++;
        mutex_exit(bp->b_objlock);

        /* Initiate disk write. */
        if (sync)
                BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
        else
                BIO_SETPRIO(bp, BPRIO_TIMELIMITED);

        VOP_STRATEGY(vp, bp);

        if (sync) {
                /* If I/O was synchronous, wait for it to complete. */
                rv = biowait(bp);

                /* Release the buffer. */
                brelse(bp, 0);

                return rv;
        } else {
                return 0;
        }
}

int
vn_bwrite(void *v)
{
        struct vop_bwrite_args *ap = v;

        return bwrite(ap->a_bp);
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(buf_t *bp)
{

        BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
            (uintptr_t)bp, 0, 0, 0);

        KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
            bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
        KASSERT(ISSET(bp->b_cflags, BC_BUSY));
        KASSERT(!cv_has_waiters(&bp->b_done));

        /* If this is a tape block, write the block now. */
        if (bdev_type(bp->b_dev) == D_TAPE) {
                bawrite(bp);
                return;
        }

        if (wapbl_vphaswapbl(bp->b_vp)) {
                struct mount *mp = wapbl_vptomp(bp->b_vp);

                if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
                        WAPBL_ADD_BUF(mp, bp);
                }
        }

        /*
         * If the block hasn't been seen before:
         *      (1) Mark it as having been seen,
         *      (2) Charge for the write,
         *      (3) Make sure it's on its vnode's correct block list.
         */
        KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock);

        if (!ISSET(bp->b_oflags, BO_DELWRI)) {
                mutex_enter(&bufcache_lock);
                mutex_enter(bp->b_objlock);
                SET(bp->b_oflags, BO_DELWRI);
                curlwp->l_ru.ru_oublock++;
                reassignbuf(bp, bp->b_vp);
                /* Wake anyone trying to busy the buffer via vnode's lists. */
                cv_broadcast(&bp->b_busy);
                mutex_exit(&bufcache_lock);
        } else {
                mutex_enter(bp->b_objlock);
        }
        /* Otherwise, the "write" is done, so mark and release the buffer. */
        CLR(bp->b_oflags, BO_DONE);
        mutex_exit(bp->b_objlock);

        brelse(bp, 0);
}

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(buf_t *bp)
{

        KASSERT(ISSET(bp->b_cflags, BC_BUSY));
        KASSERT(bp->b_vp != NULL);

        SET(bp->b_flags, B_ASYNC);
        VOP_BWRITE(bp->b_vp, bp);
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelsel(buf_t *bp, int set)
{
        struct bqueue *bufq;
        struct vnode *vp;

        SDT_PROBE2(io, kernel, , brelse, bp, set);

        KASSERT(bp != NULL);
        KASSERT(mutex_owned(&bufcache_lock));
        KASSERT(!cv_has_waiters(&bp->b_done));

        SET(bp->b_cflags, set);

        KASSERT(ISSET(bp->b_cflags, BC_BUSY));
        KASSERT(bp->b_iodone == NULL);

        /* Wake up any processes waiting for any buffer to become free. */
        cv_signal(&needbuffer_cv);

        /* Wake up any processes waiting for _this_ buffer to become free */
        if (ISSET(bp->b_cflags, BC_WANTED))
                CLR(bp->b_cflags, BC_WANTED|BC_AGE);

        /* If it's clean, clear the copy-on-write flag. */
        if (ISSET(bp->b_flags, B_COWDONE)) {
                mutex_enter(bp->b_objlock);
                if (!ISSET(bp->b_oflags, BO_DELWRI))
                        CLR(bp->b_flags, B_COWDONE);
                mutex_exit(bp->b_objlock);
        }

        /*
         * Determine which queue the buffer should be on, then put it there.
         */

        /* If it's locked, don't report an error; try again later. */
        if (ISSET(bp->b_flags, B_LOCKED))
                bp->b_error = 0;

        /* If it's not cacheable, or an error, mark it invalid. */
        if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
                SET(bp->b_cflags, BC_INVAL);

        if (ISSET(bp->b_cflags, BC_VFLUSH)) {
                /*
                 * This is a delayed write buffer that was just flushed to
                 * disk.  It is still on the LRU queue.  If it's become
                 * invalid, then we need to move it to a different queue;
                 * otherwise leave it in its current position.
                 */
                CLR(bp->b_cflags, BC_VFLUSH);
                if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
                    !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
                        KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
                        goto already_queued;
                } else {
                        bremfree(bp);
                }
        }

        KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
        KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
        KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));

        if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
                /*
                 * If it's invalid or empty, dissociate it from its vnode
                 * and put on the head of the appropriate queue.
                 */
                if (ISSET(bp->b_flags, B_LOCKED)) {
                        if (wapbl_vphaswapbl(vp = bp->b_vp)) {
                                struct mount *mp = wapbl_vptomp(vp);

                                KASSERT(bp->b_iodone !=
                                    mp->mnt_wapbl_op->wo_wapbl_biodone);
                                WAPBL_REMOVE_BUF(mp, bp);
                        }
                }

                mutex_enter(bp->b_objlock);
                CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
                if ((vp = bp->b_vp) != NULL) {
                        KASSERT(bp->b_objlock == vp->v_interlock);
                        reassignbuf(bp, bp->b_vp);
                        brelvp(bp);
                        mutex_exit(vp->v_interlock);
                } else {
                        KASSERT(bp->b_objlock == &buffer_lock);
                        mutex_exit(bp->b_objlock);
                }
                /* We want to dispose of the buffer, so wake everybody. */
                cv_broadcast(&bp->b_busy);
                if (bp->b_bufsize <= 0)
                        /* no data */
                        goto already_queued;
                else
                        /* invalid data */
                        bufq = &bufqueues[BQ_AGE];
                binsheadfree(bp, bufq);
        } else {
                /*
                 * It has valid data.  Put it on the end of the appropriate
                 * queue, so that it'll stick around for as long as possible.
                 * If buf is AGE, but has dependencies, must put it on last
                 * bufqueue to be scanned, ie LRU. This protects against the
                 * livelock where BQ_AGE only has buffers with dependencies,
                 * and we thus never get to the dependent buffers in BQ_LRU.
                 */
                if (ISSET(bp->b_flags, B_LOCKED)) {
                        /* locked in core */
                        bufq = &bufqueues[BQ_LOCKED];
                } else if (!ISSET(bp->b_cflags, BC_AGE)) {
                        /* valid data */
                        bufq = &bufqueues[BQ_LRU];
                } else {
                        /* stale but valid data */
                        bufq = &bufqueues[BQ_AGE];
                }
                binstailfree(bp, bufq);
        }
already_queued:
        /* Unlock the buffer. */
        CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
        CLR(bp->b_flags, B_ASYNC);

        /*
         * Wake only the highest priority waiter on the lock, in order to
         * prevent a thundering herd: many LWPs simultaneously awakening and
         * competing for the buffer's lock.  Testing in 2019 revealed this
         * to reduce contention on bufcache_lock tenfold during a kernel
         * compile.
         * Here and elsewhere, when the buffer is changing
         * identity, being disposed of, or moving from one list to another,
         * we wake all lock requestors.
         */
        if (bp->b_bufsize <= 0) {
                cv_broadcast(&bp->b_busy);
                buf_destroy(bp);
#ifdef DEBUG
                memset((char *)bp, 0, sizeof(*bp));
#endif
                pool_cache_put(buf_cache, bp);
        } else
                cv_signal(&bp->b_busy);
}

void
brelse(buf_t *bp, int set)
{

        mutex_enter(&bufcache_lock);
        brelsel(bp, set);
        mutex_exit(&bufcache_lock);
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
buf_t *
incore(struct vnode *vp, daddr_t blkno)
{
        buf_t *bp;

        KASSERT(mutex_owned(&bufcache_lock));

        /* Search hash chain */
        LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
                if (bp->b_lblkno == blkno && bp->b_vp == vp &&
                    !ISSET(bp->b_cflags, BC_INVAL)) {
                        KASSERT(bp->b_objlock == vp->v_interlock);
                        return (bp);
                }
        }

        return NULL;
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
buf_t *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
        int err, preserve;
        buf_t *bp;

        mutex_enter(&bufcache_lock);
        SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size);
loop:
        bp = incore(vp, blkno);
        if (bp != NULL) {
                err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
                if (err != 0) {
                        if (err == EPASSTHROUGH)
                                goto loop;
                        mutex_exit(&bufcache_lock);
                        SDT_PROBE4(io, kernel, , getblk__done,
                            vp, blkno, size, NULL);
                        return NULL;
                }
                KASSERT(!cv_has_waiters(&bp->b_done));
#ifdef DIAGNOSTIC
                if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
                    bp->b_bcount < size && vp->v_type != VBLK)
                        panic("getblk: block size invariant failed");
#endif
                bremfree(bp);
                preserve = 1;
        } else {
                if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
                        goto loop;

                if (incore(vp, blkno) != NULL) {
                        /* The block has come into memory in the meantime. */
                        brelsel(bp, 0);
                        goto loop;
                }

                LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
                bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
                mutex_enter(vp->v_interlock);
                bgetvp(vp, bp);
                mutex_exit(vp->v_interlock);
                preserve = 0;
        }
        mutex_exit(&bufcache_lock);

        /*
         * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
         * if we re-size buffers here.
         */
        if (ISSET(bp->b_flags, B_LOCKED)) {
                KASSERT(bp->b_bufsize >= size);
        } else {
                if (allocbuf(bp, size, preserve)) {
                        mutex_enter(&bufcache_lock);
                        LIST_REMOVE(bp, b_hash);
                        brelsel(bp, BC_INVAL);
                        mutex_exit(&bufcache_lock);
                        SDT_PROBE4(io, kernel, , getblk__done,
                            vp, blkno, size, NULL);
                        return NULL;
                }
        }
        BIO_SETPRIO(bp, BPRIO_DEFAULT);
        SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp);
        return bp;
}

/*
 * Get an empty, disassociated buffer of given size.
 */
buf_t *
geteblk(int size)
{
        buf_t *bp;
        int error __diagused;

        mutex_enter(&bufcache_lock);
        while ((bp = getnewbuf(0, 0, 0)) == NULL)
                continue;

        SET(bp->b_cflags, BC_INVAL);
        LIST_INSERT_HEAD(&invalhash, bp, b_hash);
        mutex_exit(&bufcache_lock);
        BIO_SETPRIO(bp, BPRIO_DEFAULT);
        error = allocbuf(bp, size, 0);
        KASSERT(error == 0);
        return bp;
}

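/*
 * Typical getblk() usage for a metadata update (illustrative sketch
 * only; "bsize" stands for the file system block size):
 *
 *	bp = getblk(vp, blkno, bsize, 0, 0);
 *	if (bp != NULL) {
 *		... fill in bp->b_data ...
 *		bdwrite(bp);		(or bwrite(bp) for a sync write)
 *	}
 *
 * getblk() may return NULL, e.g. for the pagedaemon or when the wait
 * for a busy buffer is interrupted, so callers must be prepared for it.
 */
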
/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
int
allocbuf(buf_t *bp, int size, int preserve)
{
        void *addr;
        vsize_t oldsize, desired_size;
        int oldcount;
        int delta;

        desired_size = buf_roundsize(size);
        if (desired_size > MAXBSIZE)
                printf("allocbuf: buffer larger than MAXBSIZE requested");

        oldcount = bp->b_bcount;

        bp->b_bcount = size;

        oldsize = bp->b_bufsize;
        if (oldsize == desired_size) {
                /*
                 * Do not short cut the WAPBL resize, as the buffer length
                 * could still have changed and this would corrupt the
                 * tracking of the transaction length.
                 */
                goto out;
        }

        /*
         * If we want a buffer of a different size, re-allocate the
         * buffer's memory; copy old content only if needed.
         */
        addr = buf_alloc(desired_size);
        if (addr == NULL)
                return SET_ERROR(ENOMEM);
        if (preserve)
                memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
        if (bp->b_data != NULL)
                buf_mrelease(bp->b_data, oldsize);
        bp->b_data = addr;
        bp->b_bufsize = desired_size;

        /*
         * Update overall buffer memory counter (protected by bufcache_lock)
         */
        delta = (long)desired_size - (long)oldsize;

        mutex_enter(&bufcache_lock);
        if ((bufmem += delta) > bufmem_hiwater) {
                /*
                 * Need to trim overall memory usage.
                 */
                while (buf_canrelease()) {
                        if (preempt_needed()) {
                                mutex_exit(&bufcache_lock);
                                preempt();
                                mutex_enter(&bufcache_lock);
                        }
                        if (buf_trim() == 0)
                                break;
                }
        }
        mutex_exit(&bufcache_lock);

out:
        if (wapbl_vphaswapbl(bp->b_vp)) {
                WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp,
                    oldsize, oldcount);
        }

        return 0;
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * Called with the buffer queues locked.
 * Return buffer locked.
 */
static buf_t *
getnewbuf(int slpflag, int slptimeo, int from_bufq)
{
        buf_t *bp;
        struct vnode *vp;
        struct mount *transmp = NULL;

        SDT_PROBE0(io, kernel, , getnewbuf__start);

start:
        KASSERT(mutex_owned(&bufcache_lock));

        /*
         * Get a new buffer from the pool.
         */
        if (!from_bufq && buf_lotsfree()) {
                mutex_exit(&bufcache_lock);
                bp = pool_cache_get(buf_cache, PR_NOWAIT);
                if (bp != NULL) {
                        memset((char *)bp, 0, sizeof(*bp));
                        buf_init(bp);
                        SET(bp->b_cflags, BC_BUSY);     /* mark buffer busy */
                        mutex_enter(&bufcache_lock);
#if defined(DIAGNOSTIC)
                        bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
                        SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
                        return bp;
                }
                mutex_enter(&bufcache_lock);
        }

        KASSERT(mutex_owned(&bufcache_lock));
        if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
                KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
        } else {
                TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
                        if (ISSET(bp->b_cflags, BC_VFLUSH) ||
                            !ISSET(bp->b_oflags, BO_DELWRI))
                                break;
                        if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) {
                                KASSERT(transmp == NULL);
                                transmp = bp->b_vp->v_mount;
                                break;
                        }
                }
        }
        if (bp != NULL) {
                KASSERT(!ISSET(bp->b_cflags, BC_BUSY) ||
                    ISSET(bp->b_cflags, BC_VFLUSH));
                bremfree(bp);

                /* Buffer is no longer on free lists. */
                SET(bp->b_cflags, BC_BUSY);

                /* Wake anyone trying to lock the old identity. */
                cv_broadcast(&bp->b_busy);
        } else {
                /*
                 * XXX: !from_bufq should be removed.
                 */
                if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
                        /* wait for a free buffer of any kind */
                        if ((slpflag & PCATCH) != 0)
                                (void)cv_timedwait_sig(&needbuffer_cv,
                                    &bufcache_lock, slptimeo);
                        else
                                (void)cv_timedwait(&needbuffer_cv,
                                    &bufcache_lock, slptimeo);
                }
                SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
                return NULL;
        }

#ifdef DIAGNOSTIC
        if (bp->b_bufsize <= 0)
                panic("buffer %p: on queue but empty", bp);
#endif

        if (ISSET(bp->b_cflags, BC_VFLUSH)) {
                /*
                 * This is a delayed write buffer being flushed to disk.  Make
                 * sure it gets aged out of the queue when it's finished, and
                 * leave it off the LRU queue.
1470 1.50 mycroft */ 1471 1.183 ad CLR(bp->b_cflags, BC_VFLUSH); 1472 1.183 ad SET(bp->b_cflags, BC_AGE); 1473 1.50 mycroft goto start; 1474 1.50 mycroft } 1475 1.50 mycroft 1476 1.202 ad KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1477 1.305 riastrad KASSERT(!cv_has_waiters(&bp->b_done)); 1478 1.31 cgd 1479 1.75 chs /* 1480 1.75 chs * If buffer was a delayed write, start it and return NULL 1481 1.75 chs * (since we might sleep while starting the write). 1482 1.75 chs */ 1483 1.183 ad if (ISSET(bp->b_oflags, BO_DELWRI)) { 1484 1.50 mycroft /* 1485 1.50 mycroft * This buffer has gone through the LRU, so make sure it gets 1486 1.50 mycroft * reused ASAP. 1487 1.50 mycroft */ 1488 1.183 ad SET(bp->b_cflags, BC_AGE); 1489 1.183 ad mutex_exit(&bufcache_lock); 1490 1.50 mycroft bawrite(bp); 1491 1.277 hannken KASSERT(transmp != NULL); 1492 1.277 hannken fstrans_done(transmp); 1493 1.183 ad mutex_enter(&bufcache_lock); 1494 1.288 riastrad SDT_PROBE1(io, kernel, , getnewbuf__done, NULL); 1495 1.306 riastrad return NULL; 1496 1.31 cgd } 1497 1.31 cgd 1498 1.277 hannken KASSERT(transmp == NULL); 1499 1.277 hannken 1500 1.183 ad vp = bp->b_vp; 1501 1.59 fvdl 1502 1.31 cgd /* clear out various other fields */ 1503 1.183 ad bp->b_cflags = BC_BUSY; 1504 1.183 ad bp->b_oflags = 0; 1505 1.183 ad bp->b_flags = 0; 1506 1.31 cgd bp->b_dev = NODEV; 1507 1.183 ad bp->b_blkno = 0; 1508 1.183 ad bp->b_lblkno = 0; 1509 1.183 ad bp->b_rawblkno = 0; 1510 1.31 cgd bp->b_iodone = 0; 1511 1.31 cgd bp->b_error = 0; 1512 1.31 cgd bp->b_resid = 0; 1513 1.31 cgd bp->b_bcount = 0; 1514 1.142 perry 1515 1.183 ad LIST_REMOVE(bp, b_hash); 1516 1.183 ad 1517 1.183 ad /* Disassociate us from our vnode, if we had one... */ 1518 1.183 ad if (vp != NULL) { 1519 1.230 rmind mutex_enter(vp->v_interlock); 1520 1.183 ad brelvp(bp); 1521 1.230 rmind mutex_exit(vp->v_interlock); 1522 1.183 ad } 1523 1.183 ad 1524 1.288 riastrad SDT_PROBE1(io, kernel, , getnewbuf__done, bp); 1525 1.306 riastrad return bp; 1526 1.31 cgd } 1527 1.31 cgd 1528 1.31 cgd /* 1529 1.297 chs * Invalidate the specified buffer if it exists. 1530 1.297 chs */ 1531 1.297 chs void 1532 1.297 chs binvalbuf(struct vnode *vp, daddr_t blkno) 1533 1.297 chs { 1534 1.297 chs buf_t *bp; 1535 1.297 chs int err; 1536 1.297 chs 1537 1.297 chs mutex_enter(&bufcache_lock); 1538 1.297 chs 1539 1.304 riastrad loop: 1540 1.297 chs bp = incore(vp, blkno); 1541 1.297 chs if (bp != NULL) { 1542 1.297 chs err = bbusy(bp, 0, 0, NULL); 1543 1.297 chs if (err == EPASSTHROUGH) 1544 1.297 chs goto loop; 1545 1.297 chs bremfree(bp); 1546 1.297 chs if (ISSET(bp->b_oflags, BO_DELWRI)) { 1547 1.297 chs SET(bp->b_cflags, BC_NOCACHE); 1548 1.297 chs mutex_exit(&bufcache_lock); 1549 1.297 chs bwrite(bp); 1550 1.297 chs } else { 1551 1.297 chs brelsel(bp, BC_INVAL); 1552 1.297 chs mutex_exit(&bufcache_lock); 1553 1.297 chs } 1554 1.297 chs } else 1555 1.297 chs mutex_exit(&bufcache_lock); 1556 1.297 chs } 1557 1.297 chs 1558 1.297 chs /* 1559 1.100 pk * Attempt to free an aged buffer off the queues. 1560 1.183 ad * Called with queue lock held. 1561 1.100 pk * Returns the amount of buffer memory freed. 
1562 1.100 pk */ 1563 1.130 yamt static int 1564 1.101 thorpej buf_trim(void) 1565 1.100 pk { 1566 1.183 ad buf_t *bp; 1567 1.245 christos long size; 1568 1.100 pk 1569 1.183 ad KASSERT(mutex_owned(&bufcache_lock)); 1570 1.183 ad 1571 1.100 pk /* Instruct getnewbuf() to get buffers off the queues */ 1572 1.101 thorpej if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) 1573 1.100 pk return 0; 1574 1.100 pk 1575 1.183 ad KASSERT((bp->b_cflags & BC_WANTED) == 0); 1576 1.100 pk size = bp->b_bufsize; 1577 1.100 pk bufmem -= size; 1578 1.100 pk if (size > 0) { 1579 1.100 pk buf_mrelease(bp->b_data, size); 1580 1.100 pk bp->b_bcount = bp->b_bufsize = 0; 1581 1.100 pk } 1582 1.100 pk /* brelse() will return the buffer to the global buffer pool */ 1583 1.183 ad brelsel(bp, 0); 1584 1.100 pk return size; 1585 1.100 pk } 1586 1.100 pk 1587 1.101 thorpej int 1588 1.101 thorpej buf_drain(int n) 1589 1.100 pk { 1590 1.183 ad int size = 0, sz; 1591 1.100 pk 1592 1.183 ad KASSERT(mutex_owned(&bufcache_lock)); 1593 1.116 yamt 1594 1.134 enami while (size < n && bufmem > bufmem_lowater) { 1595 1.134 enami sz = buf_trim(); 1596 1.134 enami if (sz <= 0) 1597 1.134 enami break; 1598 1.134 enami size += sz; 1599 1.134 enami } 1600 1.114 tls 1601 1.100 pk return size; 1602 1.100 pk } 1603 1.100 pk 1604 1.100 pk /* 1605 1.31 cgd * Wait for operations on the buffer to complete. 1606 1.31 cgd * When they do, extract and return the I/O's error value. 1607 1.31 cgd */ 1608 1.31 cgd int 1609 1.183 ad biowait(buf_t *bp) 1610 1.31 cgd { 1611 1.142 perry 1612 1.266 pgoyette BIOHIST_FUNC(__func__); 1613 1.264 pgoyette 1614 1.202 ad KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1615 1.202 ad 1616 1.259 riz SDT_PROBE1(io, kernel, , wait__start, bp); 1617 1.259 riz 1618 1.183 ad mutex_enter(bp->b_objlock); 1619 1.264 pgoyette 1620 1.276 pgoyette BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx", 1621 1.304 riastrad (uintptr_t)bp, bp->b_oflags, 1622 1.276 pgoyette (uintptr_t)__builtin_return_address(0), 0); 1623 1.264 pgoyette 1624 1.264 pgoyette while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) { 1625 1.304 riastrad BIOHIST_LOG(biohist, "waiting bp=%#jx", 1626 1.304 riastrad (uintptr_t)bp, 0, 0, 0); 1627 1.183 ad cv_wait(&bp->b_done, bp->b_objlock); 1628 1.264 pgoyette } 1629 1.183 ad mutex_exit(bp->b_objlock); 1630 1.183 ad 1631 1.259 riz SDT_PROBE1(io, kernel, , wait__done, bp); 1632 1.259 riz 1633 1.276 pgoyette BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0); 1634 1.264 pgoyette 1635 1.183 ad return bp->b_error; 1636 1.31 cgd } 1637 1.31 cgd 1638 1.31 cgd /* 1639 1.31 cgd * Mark I/O complete on a buffer. 1640 1.31 cgd * 1641 1.31 cgd * If a callback has been requested, e.g. the pageout 1642 1.31 cgd * daemon, do so. Otherwise, awaken waiting processes. 1643 1.31 cgd * 1644 1.31 cgd * [ Leffler, et al., says on p.247: 1645 1.31 cgd * "This routine wakes up the blocked process, frees the buffer 1646 1.31 cgd * for an asynchronous write, or, for a request by the pagedaemon 1647 1.31 cgd * process, invokes a procedure specified in the buffer structure" ] 1648 1.31 cgd * 1649 1.31 cgd * In real life, the pagedaemon (or other system processes) wants 1650 1.263 dholland * to do async stuff too, and doesn't want the buffer brelse()'d. 1651 1.31 cgd * (for swap pager, that puts swap buffers on the free lists (!!!), 1652 1.229 rmind * for the vn device, that puts allocated buffers on the free lists!) 
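 *
 * For illustration only (the driver names below are hypothetical, not
 * interfaces defined in this file), an asynchronous consumer arranges
 * the callback before issuing the I/O:
 *
 *	bp->b_iodone = mydriver_iodone;
 *	bp->b_private = sc;
 *	VOP_STRATEGY(vp, bp);
 *
 * mydriver_iodone() is then called from biodone2() with b_objlock
 * already dropped, and because biodone() does not release such a buffer
 * the callback itself must dispose of it, e.g. with putiobuf() or
 * brelse().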
1653 1.31 cgd */ 1654 1.31 cgd void 1655 1.183 ad biodone(buf_t *bp) 1656 1.183 ad { 1657 1.183 ad int s; 1658 1.183 ad 1659 1.266 pgoyette BIOHIST_FUNC(__func__); 1660 1.264 pgoyette 1661 1.183 ad KASSERT(!ISSET(bp->b_oflags, BO_DONE)); 1662 1.183 ad 1663 1.183 ad if (cpu_intr_p()) { 1664 1.183 ad /* From interrupt mode: defer to a soft interrupt. */ 1665 1.183 ad s = splvm(); 1666 1.183 ad TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); 1667 1.264 pgoyette 1668 1.276 pgoyette BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled", 1669 1.276 pgoyette (uintptr_t)bp, 0, 0, 0); 1670 1.183 ad softint_schedule(biodone_sih); 1671 1.183 ad splx(s); 1672 1.183 ad } else { 1673 1.183 ad /* Process now - the buffer may be freed soon. */ 1674 1.183 ad biodone2(bp); 1675 1.183 ad } 1676 1.183 ad } 1677 1.183 ad 1678 1.259 riz SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/); 1679 1.259 riz 1680 1.183 ad static void 1681 1.183 ad biodone2(buf_t *bp) 1682 1.31 cgd { 1683 1.183 ad void (*callout)(buf_t *); 1684 1.183 ad 1685 1.259 riz SDT_PROBE1(io, kernel, ,done, bp); 1686 1.259 riz 1687 1.266 pgoyette BIOHIST_FUNC(__func__); 1688 1.276 pgoyette BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); 1689 1.264 pgoyette 1690 1.183 ad mutex_enter(bp->b_objlock); 1691 1.183 ad /* Note that the transfer is done. */ 1692 1.183 ad if (ISSET(bp->b_oflags, BO_DONE)) 1693 1.183 ad panic("biodone2 already"); 1694 1.186 hannken CLR(bp->b_flags, B_COWDONE); 1695 1.183 ad SET(bp->b_oflags, BO_DONE); 1696 1.108 yamt BIO_SETPRIO(bp, BPRIO_DEFAULT); 1697 1.31 cgd 1698 1.183 ad /* Wake up waiting writers. */ 1699 1.183 ad if (!ISSET(bp->b_flags, B_READ)) 1700 1.31 cgd vwakeup(bp); 1701 1.31 cgd 1702 1.183 ad if ((callout = bp->b_iodone) != NULL) { 1703 1.276 pgoyette BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout, 1704 1.276 pgoyette 0, 0, 0); 1705 1.264 pgoyette 1706 1.183 ad /* Note callout done, then call out. */ 1707 1.201 ad KASSERT(!cv_has_waiters(&bp->b_done)); 1708 1.183 ad bp->b_iodone = NULL; 1709 1.183 ad mutex_exit(bp->b_objlock); 1710 1.183 ad (*callout)(bp); 1711 1.183 ad } else if (ISSET(bp->b_flags, B_ASYNC)) { 1712 1.183 ad /* If async, release. */ 1713 1.266 pgoyette BIOHIST_LOG(biohist, "async", 0, 0, 0, 0); 1714 1.201 ad KASSERT(!cv_has_waiters(&bp->b_done)); 1715 1.183 ad mutex_exit(bp->b_objlock); 1716 1.183 ad brelse(bp, 0); 1717 1.59 fvdl } else { 1718 1.183 ad /* Otherwise just wake up waiters in biowait(). 
*/ 1719 1.266 pgoyette BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0); 1720 1.183 ad cv_broadcast(&bp->b_done); 1721 1.183 ad mutex_exit(bp->b_objlock); 1722 1.31 cgd } 1723 1.183 ad } 1724 1.183 ad 1725 1.183 ad static void 1726 1.183 ad biointr(void *cookie) 1727 1.183 ad { 1728 1.183 ad struct cpu_info *ci; 1729 1.183 ad buf_t *bp; 1730 1.183 ad int s; 1731 1.183 ad 1732 1.266 pgoyette BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist); 1733 1.264 pgoyette 1734 1.183 ad ci = curcpu(); 1735 1.60 fvdl 1736 1.265 pgoyette s = splvm(); 1737 1.183 ad while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { 1738 1.183 ad KASSERT(curcpu() == ci); 1739 1.183 ad 1740 1.183 ad bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); 1741 1.183 ad TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); 1742 1.183 ad splx(s); 1743 1.183 ad 1744 1.276 pgoyette BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0); 1745 1.183 ad biodone2(bp); 1746 1.265 pgoyette 1747 1.265 pgoyette s = splvm(); 1748 1.183 ad } 1749 1.265 pgoyette splx(s); 1750 1.31 cgd } 1751 1.31 cgd 1752 1.117 atatat static void 1753 1.278 maxv sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o) 1754 1.117 atatat { 1755 1.278 maxv const bool allowaddr = get_expose_address(curproc); 1756 1.278 maxv 1757 1.278 maxv memset(o, 0, sizeof(*o)); 1758 1.117 atatat 1759 1.183 ad o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; 1760 1.117 atatat o->b_error = i->b_error; 1761 1.117 atatat o->b_prio = i->b_prio; 1762 1.117 atatat o->b_dev = i->b_dev; 1763 1.117 atatat o->b_bufsize = i->b_bufsize; 1764 1.117 atatat o->b_bcount = i->b_bcount; 1765 1.117 atatat o->b_resid = i->b_resid; 1766 1.278 maxv COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr); 1767 1.117 atatat o->b_blkno = i->b_blkno; 1768 1.117 atatat o->b_rawblkno = i->b_rawblkno; 1769 1.278 maxv COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr); 1770 1.278 maxv COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr); 1771 1.278 maxv COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr); 1772 1.278 maxv COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr); 1773 1.117 atatat o->b_lblkno = i->b_lblkno; 1774 1.117 atatat } 1775 1.117 atatat 1776 1.100 pk static int 1777 1.100 pk sysctl_dobuf(SYSCTLFN_ARGS) 1778 1.100 pk { 1779 1.183 ad buf_t *bp; 1780 1.117 atatat struct buf_sysctl bs; 1781 1.183 ad struct bqueue *bq; 1782 1.100 pk char *dp; 1783 1.117 atatat u_int i, op, arg; 1784 1.117 atatat size_t len, needed, elem_size, out_size; 1785 1.183 ad int error, elem_count, retries; 1786 1.117 atatat 1787 1.117 atatat if (namelen == 1 && name[0] == CTL_QUERY) 1788 1.306 riastrad return sysctl_query(SYSCTLFN_CALL(rnode)); 1789 1.117 atatat 1790 1.117 atatat if (namelen != 4) 1791 1.306 riastrad return SET_ERROR(EINVAL); 1792 1.100 pk 1793 1.183 ad retries = 100; 1794 1.304 riastrad retry: 1795 1.100 pk dp = oldp; 1796 1.117 atatat len = (oldp != NULL) ? *oldlenp : 0; 1797 1.117 atatat op = name[0]; 1798 1.117 atatat arg = name[1]; 1799 1.117 atatat elem_size = name[2]; 1800 1.117 atatat elem_count = name[3]; 1801 1.117 atatat out_size = MIN(sizeof(bs), elem_size); 1802 1.117 atatat 1803 1.117 atatat /* 1804 1.117 atatat * at the moment, these are just "placeholders" to make the 1805 1.117 atatat * API for retrieving kern.buf data more extensible in the 1806 1.117 atatat * future. 1807 1.117 atatat * 1808 1.117 atatat * XXX kern.buf currently has "netbsd32" issues. hopefully 1809 1.117 atatat * these will be resolved at a later point. 
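 *
 * For illustration only (the consumer and sizing policy are assumptions,
 * not interfaces defined in this file), a userland reader of this node
 * supplies the four extra name components checked below:
 *
 *	int mib[6] = { CTL_KERN, KERN_BUF, KERN_BUF_ALL, KERN_BUF_ALL,
 *		       sizeof(struct buf_sysctl), INT_MAX };
 *	size_t len = 0;
 *	sysctl(mib, 6, NULL, &len, NULL, 0);	   (size estimate only)
 *
 * then allocates len bytes and repeats the call with a buffer to fetch
 * the records.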
1810 1.117 atatat */ 1811 1.117 atatat if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || 1812 1.117 atatat elem_size < 1 || elem_count < 0) 1813 1.306 riastrad return SET_ERROR(EINVAL); 1814 1.117 atatat 1815 1.301 simonb if (oldp == NULL) { 1816 1.301 simonb /* count only, don't run through the buffer queues */ 1817 1.304 riastrad needed = pool_cache_nget(buf_cache) - 1818 1.304 riastrad pool_cache_nput(buf_cache); 1819 1.301 simonb *oldlenp = (needed + KERN_BUFSLOP) * elem_size; 1820 1.301 simonb 1821 1.301 simonb return 0; 1822 1.301 simonb } 1823 1.301 simonb 1824 1.100 pk error = 0; 1825 1.100 pk needed = 0; 1826 1.185 ad sysctl_unlock(); 1827 1.183 ad mutex_enter(&bufcache_lock); 1828 1.100 pk for (i = 0; i < BQUEUES; i++) { 1829 1.183 ad bq = &bufqueues[i]; 1830 1.183 ad TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { 1831 1.183 ad bq->bq_marker = bp; 1832 1.117 atatat if (len >= elem_size && elem_count > 0) { 1833 1.117 atatat sysctl_fillbuf(bp, &bs); 1834 1.183 ad mutex_exit(&bufcache_lock); 1835 1.117 atatat error = copyout(&bs, dp, out_size); 1836 1.183 ad mutex_enter(&bufcache_lock); 1837 1.100 pk if (error) 1838 1.183 ad break; 1839 1.183 ad if (bq->bq_marker != bp) { 1840 1.183 ad /* 1841 1.183 ad * This sysctl node is only for 1842 1.183 ad * statistics. Retry; if the 1843 1.183 ad * queue keeps changing, then 1844 1.183 ad * bail out. 1845 1.183 ad */ 1846 1.183 ad if (retries-- == 0) { 1847 1.306 riastrad error = SET_ERROR(EAGAIN); 1848 1.183 ad break; 1849 1.183 ad } 1850 1.183 ad mutex_exit(&bufcache_lock); 1851 1.233 rmind sysctl_relock(); 1852 1.183 ad goto retry; 1853 1.183 ad } 1854 1.100 pk dp += elem_size; 1855 1.100 pk len -= elem_size; 1856 1.100 pk } 1857 1.218 mrg needed += elem_size; 1858 1.218 mrg if (elem_count > 0 && elem_count != INT_MAX) 1859 1.218 mrg elem_count--; 1860 1.100 pk } 1861 1.183 ad if (error != 0) 1862 1.183 ad break; 1863 1.100 pk } 1864 1.183 ad mutex_exit(&bufcache_lock); 1865 1.185 ad sysctl_relock(); 1866 1.100 pk 1867 1.117 atatat *oldlenp = needed; 1868 1.100 pk 1869 1.306 riastrad return error; 1870 1.100 pk } 1871 1.100 pk 1872 1.100 pk static int 1873 1.183 ad sysctl_bufvm_update(SYSCTLFN_ARGS) 1874 1.100 pk { 1875 1.238 dsl int error, rv; 1876 1.100 pk struct sysctlnode node; 1877 1.239 dsl unsigned int temp_bufcache; 1878 1.239 dsl unsigned long temp_water; 1879 1.100 pk 1880 1.238 dsl /* Take a copy of the supplied node and its data */ 1881 1.100 pk node = *rnode; 1882 1.239 dsl if (node.sysctl_data == &bufcache) { 1883 1.304 riastrad node.sysctl_data = &temp_bufcache; 1884 1.304 riastrad temp_bufcache = *(unsigned int *)rnode->sysctl_data; 1885 1.239 dsl } else { 1886 1.304 riastrad node.sysctl_data = &temp_water; 1887 1.304 riastrad temp_water = *(unsigned long *)rnode->sysctl_data; 1888 1.239 dsl } 1889 1.238 dsl 1890 1.238 dsl /* Update the copy */ 1891 1.100 pk error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1892 1.100 pk if (error || newp == NULL) 1893 1.306 riastrad return error; 1894 1.100 pk 1895 1.183 ad if (rnode->sysctl_data == &bufcache) { 1896 1.239 dsl if (temp_bufcache > 100) 1897 1.306 riastrad return SET_ERROR(EINVAL); 1898 1.239 dsl bufcache = temp_bufcache; 1899 1.183 ad buf_setwm(); 1900 1.183 ad } else if (rnode->sysctl_data == &bufmem_lowater) { 1901 1.239 dsl if (bufmem_hiwater - temp_water < 16) 1902 1.306 riastrad return SET_ERROR(EINVAL); 1903 1.239 dsl bufmem_lowater = temp_water; 1904 1.117 atatat } else if (rnode->sysctl_data == &bufmem_hiwater) { 1905 1.239 dsl if (temp_water - bufmem_lowater < 16) 1906 
1.306 riastrad return SET_ERROR(EINVAL); 1907 1.239 dsl bufmem_hiwater = temp_water; 1908 1.100 pk } else 1909 1.306 riastrad return SET_ERROR(EINVAL); 1910 1.100 pk 1911 1.183 ad /* Drain until below new high water mark */ 1912 1.185 ad sysctl_unlock(); 1913 1.183 ad mutex_enter(&bufcache_lock); 1914 1.238 dsl while (bufmem > bufmem_hiwater) { 1915 1.238 dsl rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024)); 1916 1.183 ad if (rv <= 0) 1917 1.183 ad break; 1918 1.183 ad } 1919 1.183 ad mutex_exit(&bufcache_lock); 1920 1.185 ad sysctl_relock(); 1921 1.100 pk 1922 1.100 pk return 0; 1923 1.100 pk } 1924 1.100 pk 1925 1.215 pooka static struct sysctllog *vfsbio_sysctllog; 1926 1.215 pooka 1927 1.215 pooka static void 1928 1.215 pooka sysctl_kern_buf_setup(void) 1929 1.100 pk { 1930 1.100 pk 1931 1.215 pooka sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1932 1.304 riastrad CTLFLAG_PERMANENT, 1933 1.304 riastrad CTLTYPE_NODE, "buf", 1934 1.304 riastrad SYSCTL_DESCR("Kernel buffer cache information"), 1935 1.304 riastrad sysctl_dobuf, 0, NULL, 0, 1936 1.304 riastrad CTL_KERN, KERN_BUF, CTL_EOL); 1937 1.104 atatat } 1938 1.104 atatat 1939 1.215 pooka static void 1940 1.215 pooka sysctl_vm_buf_setup(void) 1941 1.104 atatat { 1942 1.104 atatat 1943 1.215 pooka sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1944 1.304 riastrad CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1945 1.304 riastrad CTLTYPE_INT, "bufcache", 1946 1.304 riastrad SYSCTL_DESCR("Percentage of physical memory to use for " 1947 1.304 riastrad "buffer cache"), 1948 1.304 riastrad sysctl_bufvm_update, 0, &bufcache, 0, 1949 1.304 riastrad CTL_VM, CTL_CREATE, CTL_EOL); 1950 1.215 pooka sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1951 1.304 riastrad CTLFLAG_PERMANENT|CTLFLAG_READONLY, 1952 1.304 riastrad CTLTYPE_LONG, "bufmem", 1953 1.304 riastrad SYSCTL_DESCR("Amount of kernel memory used by buffer cache"), 1954 1.304 riastrad NULL, 0, &bufmem, 0, 1955 1.304 riastrad CTL_VM, CTL_CREATE, CTL_EOL); 1956 1.215 pooka sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1957 1.304 riastrad CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1958 1.304 riastrad CTLTYPE_LONG, "bufmem_lowater", 1959 1.304 riastrad SYSCTL_DESCR("Minimum amount of kernel memory to reserve for " 1960 1.304 riastrad "buffer cache"), 1961 1.304 riastrad sysctl_bufvm_update, 0, &bufmem_lowater, 0, 1962 1.304 riastrad CTL_VM, CTL_CREATE, CTL_EOL); 1963 1.215 pooka sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL, 1964 1.304 riastrad CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1965 1.304 riastrad CTLTYPE_LONG, "bufmem_hiwater", 1966 1.304 riastrad SYSCTL_DESCR("Maximum amount of kernel memory to use for " 1967 1.304 riastrad "buffer cache"), 1968 1.304 riastrad sysctl_bufvm_update, 0, &bufmem_hiwater, 0, 1969 1.304 riastrad CTL_VM, CTL_CREATE, CTL_EOL); 1970 1.100 pk } 1971 1.100 pk 1972 1.298 simonb static int 1973 1.298 simonb bufhash_stats(struct hashstat_sysctl *hs, bool fill) 1974 1.298 simonb { 1975 1.298 simonb buf_t *bp; 1976 1.298 simonb uint64_t chain; 1977 1.298 simonb 1978 1.298 simonb strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name)); 1979 1.298 simonb strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc)); 1980 1.298 simonb if (!fill) 1981 1.298 simonb return 0; 1982 1.298 simonb 1983 1.298 simonb hs->hash_size = bufhash + 1; 1984 1.298 simonb 1985 1.298 simonb for (size_t i = 0; i < hs->hash_size; i++) { 1986 1.298 simonb chain = 0; 1987 1.298 simonb 1988 1.298 simonb mutex_enter(&bufcache_lock); 1989 1.298 simonb LIST_FOREACH(bp, &bufhashtbl[i], b_hash) { 
1990 1.298 simonb chain++; 1991 1.298 simonb } 1992 1.298 simonb mutex_exit(&bufcache_lock); 1993 1.298 simonb 1994 1.298 simonb if (chain > 0) { 1995 1.298 simonb hs->hash_used++; 1996 1.298 simonb hs->hash_items += chain; 1997 1.298 simonb if (chain > hs->hash_maxchain) 1998 1.298 simonb hs->hash_maxchain = chain; 1999 1.298 simonb } 2000 1.298 simonb preempt_point(); 2001 1.298 simonb } 2002 1.298 simonb 2003 1.298 simonb return 0; 2004 1.298 simonb } 2005 1.298 simonb 2006 1.36 cgd #ifdef DEBUG 2007 1.31 cgd /* 2008 1.31 cgd * Print out statistics on the current allocation of the buffer pool. 2009 1.31 cgd * Can be enabled to print out on every ``sync'' by setting "syncprt" 2010 1.31 cgd * in vfs_syscalls.c using sysctl. 2011 1.31 cgd */ 2012 1.31 cgd void 2013 1.101 thorpej vfs_bufstats(void) 2014 1.31 cgd { 2015 1.183 ad int i, j, count; 2016 1.183 ad buf_t *bp; 2017 1.131 yamt struct bqueue *dp; 2018 1.261 christos int counts[MAXBSIZE / MIN_PAGE_SIZE + 1]; 2019 1.145 christos static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" }; 2020 1.71 thorpej 2021 1.31 cgd for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { 2022 1.31 cgd count = 0; 2023 1.261 christos memset(counts, 0, sizeof(counts)); 2024 1.131 yamt TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) { 2025 1.261 christos counts[bp->b_bufsize / PAGE_SIZE]++; 2026 1.31 cgd count++; 2027 1.31 cgd } 2028 1.48 christos printf("%s: total-%d", bname[i], count); 2029 1.261 christos for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++) 2030 1.31 cgd if (counts[j] != 0) 2031 1.71 thorpej printf(", %d-%d", j * PAGE_SIZE, counts[j]); 2032 1.48 christos printf("\n"); 2033 1.31 cgd } 2034 1.31 cgd } 2035 1.36 cgd #endif /* DEBUG */ 2036 1.149 yamt 2037 1.150 yamt /* ------------------------------ */ 2038 1.150 yamt 2039 1.183 ad buf_t * 2040 1.183 ad getiobuf(struct vnode *vp, bool waitok) 2041 1.149 yamt { 2042 1.183 ad buf_t *bp; 2043 1.149 yamt 2044 1.183 ad bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); 2045 1.183 ad if (bp == NULL) 2046 1.183 ad return bp; 2047 1.149 yamt 2048 1.183 ad buf_init(bp); 2049 1.149 yamt 2050 1.268 skrll if ((bp->b_vp = vp) != NULL) { 2051 1.268 skrll bp->b_objlock = vp->v_interlock; 2052 1.269 skrll } else { 2053 1.268 skrll KASSERT(bp->b_objlock == &buffer_lock); 2054 1.269 skrll } 2055 1.270 riastrad 2056 1.183 ad return bp; 2057 1.149 yamt } 2058 1.149 yamt 2059 1.149 yamt void 2060 1.183 ad putiobuf(buf_t *bp) 2061 1.149 yamt { 2062 1.149 yamt 2063 1.183 ad buf_destroy(bp); 2064 1.183 ad pool_cache_put(bufio_cache, bp); 2065 1.149 yamt } 2066 1.152 yamt 2067 1.152 yamt /* 2068 1.152 yamt * nestiobuf_iodone: b_iodone callback for nested buffers. 2069 1.152 yamt */ 2070 1.152 yamt 2071 1.167 reinoud void 2072 1.183 ad nestiobuf_iodone(buf_t *bp) 2073 1.152 yamt { 2074 1.183 ad buf_t *mbp = bp->b_private; 2075 1.152 yamt int error; 2076 1.155 reinoud int donebytes; 2077 1.152 yamt 2078 1.155 reinoud KASSERT(bp->b_bcount <= bp->b_bufsize); 2079 1.152 yamt KASSERT(mbp != bp); 2080 1.155 reinoud 2081 1.195 reinoud error = bp->b_error; 2082 1.183 ad if (bp->b_error == 0 && 2083 1.183 ad (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { 2084 1.155 reinoud /* 2085 1.285 msaitoh * Not all got transferred, raise an error. We have no way to 2086 1.155 reinoud * propagate these conditions to mbp. 
2087 1.155 reinoud */ 2088 1.306 riastrad error = SET_ERROR(EIO); 2089 1.152 yamt } 2090 1.155 reinoud 2091 1.156 yamt donebytes = bp->b_bufsize; 2092 1.155 reinoud 2093 1.152 yamt putiobuf(bp); 2094 1.152 yamt nestiobuf_done(mbp, donebytes, error); 2095 1.152 yamt } 2096 1.152 yamt 2097 1.152 yamt /* 2098 1.152 yamt * nestiobuf_setup: setup a "nested" buffer. 2099 1.152 yamt * 2100 1.152 yamt * => 'mbp' is a "master" buffer which is being divided into sub pieces. 2101 1.190 yamt * => 'bp' should be a buffer allocated by getiobuf. 2102 1.152 yamt * => 'offset' is a byte offset in the master buffer. 2103 1.152 yamt * => 'size' is a size in bytes of this nested buffer. 2104 1.152 yamt */ 2105 1.152 yamt 2106 1.152 yamt void 2107 1.183 ad nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) 2108 1.152 yamt { 2109 1.295 jdolecek const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS); 2110 1.152 yamt struct vnode *vp = mbp->b_vp; 2111 1.152 yamt 2112 1.152 yamt KASSERT(mbp->b_bcount >= offset + size); 2113 1.152 yamt bp->b_vp = vp; 2114 1.210 hannken bp->b_dev = mbp->b_dev; 2115 1.183 ad bp->b_objlock = mbp->b_objlock; 2116 1.183 ad bp->b_cflags = BC_BUSY; 2117 1.272 jdolecek bp->b_flags = B_ASYNC | b_pass; 2118 1.152 yamt bp->b_iodone = nestiobuf_iodone; 2119 1.170 christos bp->b_data = (char *)mbp->b_data + offset; 2120 1.152 yamt bp->b_resid = bp->b_bcount = size; 2121 1.152 yamt bp->b_bufsize = bp->b_bcount; 2122 1.152 yamt bp->b_private = mbp; 2123 1.152 yamt BIO_COPYPRIO(bp, mbp); 2124 1.272 jdolecek if (BUF_ISWRITE(bp) && vp != NULL) { 2125 1.230 rmind mutex_enter(vp->v_interlock); 2126 1.183 ad vp->v_numoutput++; 2127 1.230 rmind mutex_exit(vp->v_interlock); 2128 1.152 yamt } 2129 1.152 yamt } 2130 1.152 yamt 2131 1.152 yamt /* 2132 1.152 yamt * nestiobuf_done: propagate completion to the master buffer. 2133 1.152 yamt * 2134 1.152 yamt * => 'donebytes' specifies how many bytes in the 'mbp' is completed. 2135 1.152 yamt * => 'error' is an errno(2) that 'donebytes' has been completed with. 
2136 1.152 yamt */ 2137 1.152 yamt 2138 1.152 yamt void 2139 1.183 ad nestiobuf_done(buf_t *mbp, int donebytes, int error) 2140 1.152 yamt { 2141 1.152 yamt 2142 1.152 yamt if (donebytes == 0) { 2143 1.152 yamt return; 2144 1.152 yamt } 2145 1.183 ad mutex_enter(mbp->b_objlock); 2146 1.152 yamt KASSERT(mbp->b_resid >= donebytes); 2147 1.152 yamt mbp->b_resid -= donebytes; 2148 1.195 reinoud if (error) 2149 1.195 reinoud mbp->b_error = error; 2150 1.152 yamt if (mbp->b_resid == 0) { 2151 1.226 reinoud if (mbp->b_error) 2152 1.226 reinoud mbp->b_resid = mbp->b_bcount; 2153 1.183 ad mutex_exit(mbp->b_objlock); 2154 1.183 ad biodone(mbp); 2155 1.183 ad } else 2156 1.183 ad mutex_exit(mbp->b_objlock); 2157 1.183 ad } 2158 1.183 ad 2159 1.183 ad void 2160 1.183 ad buf_init(buf_t *bp) 2161 1.183 ad { 2162 1.183 ad 2163 1.183 ad cv_init(&bp->b_busy, "biolock"); 2164 1.183 ad cv_init(&bp->b_done, "biowait"); 2165 1.183 ad bp->b_dev = NODEV; 2166 1.183 ad bp->b_error = 0; 2167 1.183 ad bp->b_flags = 0; 2168 1.204 reinoud bp->b_cflags = 0; 2169 1.183 ad bp->b_oflags = 0; 2170 1.183 ad bp->b_objlock = &buffer_lock; 2171 1.183 ad bp->b_iodone = NULL; 2172 1.202 ad bp->b_dev = NODEV; 2173 1.202 ad bp->b_vnbufs.le_next = NOLIST; 2174 1.183 ad BIO_SETPRIO(bp, BPRIO_DEFAULT); 2175 1.183 ad } 2176 1.183 ad 2177 1.183 ad void 2178 1.183 ad buf_destroy(buf_t *bp) 2179 1.183 ad { 2180 1.183 ad 2181 1.183 ad cv_destroy(&bp->b_done); 2182 1.183 ad cv_destroy(&bp->b_busy); 2183 1.183 ad } 2184 1.183 ad 2185 1.183 ad int 2186 1.188 ad bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock) 2187 1.183 ad { 2188 1.183 ad int error; 2189 1.183 ad 2190 1.183 ad KASSERT(mutex_owned(&bufcache_lock)); 2191 1.183 ad 2192 1.288 riastrad SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock); 2193 1.288 riastrad 2194 1.183 ad if ((bp->b_cflags & BC_BUSY) != 0) { 2195 1.288 riastrad if (curlwp == uvm.pagedaemon_lwp) { 2196 1.306 riastrad error = SET_ERROR(EDEADLK); 2197 1.288 riastrad goto out; 2198 1.288 riastrad } 2199 1.183 ad bp->b_cflags |= BC_WANTED; 2200 1.188 ad if (interlock != NULL) 2201 1.188 ad mutex_exit(interlock); 2202 1.183 ad if (intr) { 2203 1.183 ad error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock, 2204 1.183 ad timo); 2205 1.183 ad } else { 2206 1.183 ad error = cv_timedwait(&bp->b_busy, &bufcache_lock, 2207 1.183 ad timo); 2208 1.152 yamt } 2209 1.291 ad /* 2210 1.291 ad * At this point the buffer may be gone: don't touch it 2211 1.291 ad * again. The caller needs to find it again and retry. 2212 1.291 ad */ 2213 1.188 ad if (interlock != NULL) 2214 1.188 ad mutex_enter(interlock); 2215 1.291 ad if (error == 0) 2216 1.306 riastrad error = SET_ERROR(EPASSTHROUGH); 2217 1.291 ad } else { 2218 1.291 ad bp->b_cflags |= BC_BUSY; 2219 1.291 ad error = 0; 2220 1.152 yamt } 2221 1.183 ad 2222 1.288 riastrad out: SDT_PROBE5(io, kernel, , bbusy__done, 2223 1.288 riastrad bp, intr, timo, interlock, error); 2224 1.289 riastrad return error; 2225 1.152 yamt } 2226 1.274 chs 2227 1.274 chs /* 2228 1.274 chs * Nothing outside this file should really need to know about nbuf, 2229 1.274 chs * but a few things still want to read it, so give them a way to do that. 2230 1.274 chs */ 2231 1.279 msaitoh u_int 2232 1.274 chs buf_nbuf(void) 2233 1.274 chs { 2234 1.274 chs 2235 1.274 chs return nbuf; 2236 1.274 chs } 2237
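
/*
 * Illustrative sketch only, not part of this file's interfaces: how a
 * caller might use getiobuf(), nestiobuf_setup() and nestiobuf_done()
 * to split one master buffer into MAXPHYS-sized sub-transfers.  The
 * function name, the flat block mapping and the direct VOP_STRATEGY()
 * dispatch are assumptions made for the example, mbp is assumed to be
 * a busy (BC_BUSY) buffer already prepared by the caller, and the code
 * is kept under "notyet" so it is never compiled.
 */
#ifdef notyet
static int
example_split_io(struct vnode *vp, buf_t *mbp)
{
	buf_t *bp;
	size_t resid, sz;
	int offset;

	/* nestiobuf_done() counts b_resid down to zero, then biodone()s mbp. */
	mbp->b_resid = mbp->b_bcount;

	for (offset = 0, resid = mbp->b_bcount; resid > 0;
	    offset += sz, resid -= sz) {
		sz = MIN(resid, (size_t)MAXPHYS);
		bp = getiobuf(vp, true);		/* may sleep */
		nestiobuf_setup(mbp, bp, offset, sz);	/* sets B_ASYNC, b_iodone */
		bp->b_blkno = mbp->b_blkno + btodb(offset);
		VOP_STRATEGY(vp, bp);	/* nestiobuf_iodone() runs on completion */
	}

	/* Once every sub-buffer has finished, biodone(mbp) wakes us here. */
	return biowait(mbp);
}
#endif	/* notyet */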