Home | History | Annotate | Line # | Download | only in kern
kern_physio.c revision 1.70
      1 /*	$NetBSD: kern_physio.c,v 1.70 2005/12/17 05:26:41 yamt Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1982, 1986, 1990, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  * (c) UNIX System Laboratories, Inc.
      7  * All or some portions of this file are derived from material licensed
      8  * to the University of California by American Telephone and Telegraph
      9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
     10  * the permission of UNIX System Laboratories, Inc.
     11  *
     12  * Redistribution and use in source and binary forms, with or without
     13  * modification, are permitted provided that the following conditions
     14  * are met:
     15  * 1. Redistributions of source code must retain the above copyright
     16  *    notice, this list of conditions and the following disclaimer.
     17  * 2. Redistributions in binary form must reproduce the above copyright
     18  *    notice, this list of conditions and the following disclaimer in the
     19  *    documentation and/or other materials provided with the distribution.
     20  * 3. Neither the name of the University nor the names of its contributors
     21  *    may be used to endorse or promote products derived from this software
     22  *    without specific prior written permission.
     23  *
     24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     34  * SUCH DAMAGE.
     35  *
     36  *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
     37  */
     38 
     39 /*-
     40  * Copyright (c) 1994 Christopher G. Demetriou
     41  *
     42  * Redistribution and use in source and binary forms, with or without
     43  * modification, are permitted provided that the following conditions
     44  * are met:
     45  * 1. Redistributions of source code must retain the above copyright
     46  *    notice, this list of conditions and the following disclaimer.
     47  * 2. Redistributions in binary form must reproduce the above copyright
     48  *    notice, this list of conditions and the following disclaimer in the
     49  *    documentation and/or other materials provided with the distribution.
     50  * 3. All advertising materials mentioning features or use of this software
     51  *    must display the following acknowledgement:
     52  *	This product includes software developed by the University of
     53  *	California, Berkeley and its contributors.
     54  * 4. Neither the name of the University nor the names of its contributors
     55  *    may be used to endorse or promote products derived from this software
     56  *    without specific prior written permission.
     57  *
     58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68  * SUCH DAMAGE.
     69  *
     70  *	@(#)kern_physio.c	8.1 (Berkeley) 6/10/93
     71  */
     72 
     73 #include <sys/cdefs.h>
     74 __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.70 2005/12/17 05:26:41 yamt Exp $");
     75 
     76 #include <sys/param.h>
     77 #include <sys/systm.h>
     78 #include <sys/buf.h>
     79 #include <sys/proc.h>
     80 #include <sys/once.h>
     81 #include <sys/workqueue.h>
     82 
     83 #include <uvm/uvm_extern.h>
     84 
/* one-time initialization state; physio() passes it to RUN_ONCE() with physio_init */
ONCE_DECL(physio_initialized);
/* created by physio_init(); physio_biodone() enqueues completed buffers on it */
struct workqueue *physio_workqueue;
     87 
     88 /*
     89  * The routines implemented in this file are described in:
     90  *	Leffler, et al.: The Design and Implementation of the 4.3BSD
     91  *	    UNIX Operating System (Addison-Wesley, 1989)
     92  * on pages 231-233.
     93  *
     94  * The routines "getphysbuf" and "putphysbuf" steal and return a swap
     95  * buffer.  Leffler, et al., says that swap buffers are used to do the
     96  * I/O, so raw I/O requests don't have to be single-threaded.  Of course,
     97  * NetBSD doesn't use "swap buffers" -- we have our own memory pool for
     98  * buffer descriptors.
     99  */
    100 
/* #define	PHYSIO_DEBUG */
/*
 * DPRINTF takes one parenthesized printf argument list, i.e. it is
 * invoked as DPRINTF(("fmt", args...)).  It expands to a printf call
 * when PHYSIO_DEBUG is defined and to nothing otherwise.
 */
#if defined(PHYSIO_DEBUG)
#define	DPRINTF(a)	printf a
#else /* defined(PHYSIO_DEBUG) */
#define	DPRINTF(a)	/* nothing */
#endif /* defined(PHYSIO_DEBUG) */

/*
 * abuse these members/flags of struct buf
 *
 * b_running:   on the master buffer, the number of requests handed to
 *              the strategy routine and not yet finished by physio_done().
 * b_endoffset: on the master buffer, the byte offset at which the
 *              lowest short (failed or EOM) transfer ended; -1 if none.
 * B_DONTFREE:  set on a caller-supplied buffer so that putphysbuf()
 *              does not return it to the pool.
 */
#define	b_running	b_freelistindex
#define	b_endoffset	b_lblkno
#define	B_DONTFREE	B_AGE
    112 
    113 /*
    114  * allocate a buffer structure for use in physical I/O.
    115  */
    116 static struct buf *
    117 getphysbuf(void)
    118 {
    119 	struct buf *bp;
    120 	int s;
    121 
    122 	s = splbio();
    123 	bp = pool_get(&bufpool, PR_WAITOK);
    124 	splx(s);
    125 	BUF_INIT(bp);
    126 	bp->b_error = 0;
    127 	bp->b_flags = B_BUSY;
    128 	return(bp);
    129 }
    130 
    131 /*
    132  * get rid of a swap buffer structure which has been used in physical I/O.
    133  */
    134 static void
    135 putphysbuf(struct buf *bp)
    136 {
    137 	int s;
    138 
    139 	if ((bp->b_flags & B_DONTFREE) != 0) {
    140 		return;
    141 	}
    142 
    143 	if (__predict_false(bp->b_flags & B_WANTED))
    144 		panic("putphysbuf: private buf B_WANTED");
    145 	s = splbio();
    146 	pool_put(&bufpool, bp);
    147 	splx(s);
    148 }
    149 
    150 static void
    151 physio_done(struct work *wk, void *dummy)
    152 {
    153 	struct buf *bp = (void *)wk;
    154 	size_t todo = bp->b_bufsize;
    155 	size_t done = bp->b_bcount - bp->b_resid;
    156 	struct buf *mbp = bp->b_private;
    157 
    158 	KASSERT(&bp->b_work == wk);
    159 	KASSERT(bp->b_bcount <= todo);
    160 	KASSERT(bp->b_resid <= bp->b_bcount);
    161 	KASSERT((bp->b_flags & B_PHYS) != 0);
    162 	KASSERT(dummy == NULL);
    163 
    164 	vunmapbuf(bp, todo);
    165 	uvm_vsunlock(bp->b_proc, bp->b_data, todo);
    166 
    167 	simple_lock(&mbp->b_interlock);
    168 	if (__predict_false(done != todo)) {
    169 		off_t endoffset = dbtob(bp->b_blkno) + done;
    170 
    171 		/*
    172 		 * we got an error or hit EOM.
    173 		 *
    174 		 * we only care about the first one.
    175 		 * ie. the one at the lowest offset.
    176 		 */
    177 
    178 		KASSERT(mbp->b_endoffset != endoffset);
    179 		DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
    180 		    ", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
    181 		    __func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
    182 		    bp->b_blkno, bp->b_bcount, bp->b_flags));
    183 
    184 		if (mbp->b_endoffset == -1 || endoffset < mbp->b_endoffset) {
    185 			int error;
    186 
    187 			if ((bp->b_flags & B_ERROR) != 0) {
    188 				if (bp->b_error == 0) {
    189 					error = EIO; /* XXX */
    190 				} else {
    191 					error = bp->b_error;
    192 				}
    193 			} else {
    194 				error = 0; /* EOM */
    195 			}
    196 
    197 			DPRINTF(("%s: mbp=%p, error %d -> %d, endoff %" PRIu64
    198 			    " -> %" PRIu64 "\n",
    199 			    __func__, mbp,
    200 			    mbp->b_error, error,
    201 			    mbp->b_endoffset, endoffset));
    202 
    203 			mbp->b_endoffset = endoffset;
    204 			mbp->b_error = error;
    205 		}
    206 		mbp->b_flags |= B_ERROR;
    207 	} else {
    208 		KASSERT((bp->b_flags & B_ERROR) == 0);
    209 	}
    210 
    211 	mbp->b_running--;
    212 	if ((mbp->b_flags & B_WANTED) != 0) {
    213 		mbp->b_flags &= ~B_WANTED;
    214 		wakeup(mbp);
    215 	}
    216 	simple_unlock(&mbp->b_interlock);
    217 
    218 	putphysbuf(bp);
    219 }
    220 
    221 static void
    222 physio_biodone(struct buf *bp)
    223 {
    224 #if defined(DIAGNOSTIC)
    225 	struct buf *mbp = bp->b_private;
    226 	size_t todo = bp->b_bufsize;
    227 
    228 	KASSERT(mbp->b_running > 0);
    229 	KASSERT(bp->b_bcount <= todo);
    230 	KASSERT(bp->b_resid <= bp->b_bcount);
    231 #endif /* defined(DIAGNOSTIC) */
    232 
    233 	workqueue_enqueue(physio_workqueue, &bp->b_work);
    234 }
    235 
    236 static int
    237 physio_wait(struct buf *bp, int n, const char *wchan)
    238 {
    239 	int error = 0;
    240 
    241 	LOCK_ASSERT(simple_lock_held(&bp->b_interlock));
    242 
    243 	while (bp->b_running > n) {
    244 		bp->b_flags |= B_WANTED;
    245 		error = ltsleep(bp, PRIBIO + 1, wchan, 0, &bp->b_interlock);
    246 		if (error) {
    247 			break;
    248 		}
    249 	}
    250 
    251 	return error;
    252 }
    253 
    254 static void
    255 physio_init(void)
    256 {
    257 
    258 	KASSERT(physio_workqueue == NULL);
    259 
    260 	if (workqueue_create(&physio_workqueue, "physiod",
    261 	    physio_done, NULL, PRIBIO, 0/* IPL_BIO notyet */, 0)) {
    262 		panic("physiod create");
    263 	}
    264 }
    265 
    266 #define	PHYSIO_CONCURRENCY	16	/* XXX tune */
    267 
    268 /*
    269  * Do "physical I/O" on behalf of a user.  "Physical I/O" is I/O directly
    270  * from the raw device to user buffers, and bypasses the buffer cache.
    271  *
    272  * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
    273  */
    274 int
    275 physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
    276     void (*min_phys)(struct buf *), struct uio *uio)
    277 {
    278 	struct iovec *iovp;
    279 	struct lwp *l = curlwp;
    280 	struct proc *p = l->l_proc;
    281 	int i, s;
    282 	int error = 0;
    283 	int error2;
    284 	struct buf *bp = NULL;
    285 	struct buf *mbp;
    286 	int concurrency = PHYSIO_CONCURRENCY - 1;
    287 
    288 	RUN_ONCE(&physio_initialized, physio_init);
    289 
    290 	DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
    291 	    __func__, uio->uio_offset, uio->uio_resid));
    292 
    293 	flags &= B_READ | B_WRITE;
    294 
    295 	/* Make sure we have a buffer, creating one if necessary. */
    296 	if (obp != NULL) {
    297 		/* [raise the processor priority level to splbio;] */
    298 		s = splbio();
    299 		simple_lock(&obp->b_interlock);
    300 
    301 		/* [while the buffer is marked busy] */
    302 		while (obp->b_flags & B_BUSY) {
    303 			/* [mark the buffer wanted] */
    304 			obp->b_flags |= B_WANTED;
    305 			/* [wait until the buffer is available] */
    306 			ltsleep(obp, PRIBIO+1, "physbuf", 0, &bp->b_interlock);
    307 		}
    308 
    309 		/* Mark it busy, so nobody else will use it. */
    310 		obp->b_flags = B_BUSY | B_DONTFREE;
    311 
    312 		/* [lower the priority level] */
    313 		simple_unlock(&obp->b_interlock);
    314 		splx(s);
    315 
    316 		concurrency = 0; /* see "XXXkludge" comment below */
    317 	}
    318 
    319 	mbp = getphysbuf();
    320 	mbp->b_running = 0;
    321 	mbp->b_endoffset = -1;
    322 
    323 	PHOLD(l);
    324 
    325 	for (i = 0; i < uio->uio_iovcnt; i++) {
    326 		boolean_t sync = TRUE;
    327 
    328 		iovp = &uio->uio_iov[i];
    329 		while (iovp->iov_len > 0) {
    330 			size_t todo;
    331 			vaddr_t endp;
    332 
    333 			simple_lock(&mbp->b_interlock);
    334 			if ((mbp->b_flags & B_ERROR) != 0) {
    335 				goto done_locked;
    336 			}
    337 			error = physio_wait(mbp, sync ? 0 : concurrency,
    338 			    "physio1");
    339 			if (error) {
    340 				goto done_locked;
    341 			}
    342 			simple_unlock(&mbp->b_interlock);
    343 			if (obp != NULL) {
    344 				/*
    345 				 * XXXkludge
    346 				 * some drivers use "obp" as an identifier.
    347 				 */
    348 				bp = obp;
    349 			} else {
    350 				bp = getphysbuf();
    351 			}
    352 			bp->b_dev = dev;
    353 			bp->b_proc = p;
    354 			bp->b_private = mbp;
    355 			bp->b_vp = NULL;
    356 
    357 			/*
    358 			 * [mark the buffer busy for physical I/O]
    359 			 * (i.e. set B_PHYS (because it's an I/O to user
    360 			 * memory, and B_RAW, because B_RAW is to be
    361 			 * "Set by physio for raw transfers.", in addition
    362 			 * to the "busy" and read/write flag.)
    363 			 */
    364 			bp->b_flags = (bp->b_flags & B_DONTFREE) |
    365 			    B_BUSY | B_PHYS | B_RAW | B_CALL | flags;
    366 			bp->b_iodone = physio_biodone;
    367 
    368 			/* [set up the buffer for a maximum-sized transfer] */
    369 			bp->b_blkno = btodb(uio->uio_offset);
    370 			if (dbtob(bp->b_blkno) != uio->uio_offset) {
    371 				error = EINVAL;
    372 				goto done;
    373 			}
    374 			bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
    375 			bp->b_data = iovp->iov_base;
    376 
    377 			/*
    378 			 * [call minphys to bound the transfer size]
    379 			 * and remember the amount of data to transfer,
    380 			 * for later comparison.
    381 			 */
    382 			(*min_phys)(bp);
    383 			todo = bp->b_bufsize = bp->b_bcount;
    384 #if defined(DIAGNOSTIC)
    385 			if (todo > MAXPHYS)
    386 				panic("todo(%zu) > MAXPHYS; minphys broken",
    387 				    todo);
    388 #endif /* defined(DIAGNOSTIC) */
    389 
    390 			sync = FALSE;
    391 			endp = (vaddr_t)bp->b_data + todo;
    392 			if (trunc_page(endp) != endp) {
    393 				/*
    394 				 * following requests can overlap.
    395 				 * note that uvm_vslock does round_page.
    396 				 */
    397 				sync = TRUE;
    398 			}
    399 
    400 			/*
    401 			 * [lock the part of the user address space involved
    402 			 *    in the transfer]
    403 			 * Beware vmapbuf(); it clobbers b_data and
    404 			 * saves it in b_saveaddr.  However, vunmapbuf()
    405 			 * restores it.
    406 			 */
    407 			error = uvm_vslock(p, bp->b_data, todo,
    408 			    (flags & B_READ) ?  VM_PROT_WRITE : VM_PROT_READ);
    409 			if (error) {
    410 				goto done;
    411 			}
    412 			vmapbuf(bp, todo);
    413 
    414 			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
    415 
    416 			simple_lock(&mbp->b_interlock);
    417 			mbp->b_running++;
    418 			simple_unlock(&mbp->b_interlock);
    419 
    420 			/* [call strategy to start the transfer] */
    421 			(*strategy)(bp);
    422 			bp = NULL;
    423 
    424 			iovp->iov_len -= todo;
    425 			iovp->iov_base = (caddr_t)iovp->iov_base + todo;
    426 			uio->uio_offset += todo;
    427 			uio->uio_resid -= todo;
    428 		}
    429 	}
    430 
    431 done:
    432 	simple_lock(&mbp->b_interlock);
    433 done_locked:
    434 	error2 = physio_wait(mbp, 0, "physio2");
    435 	if (error == 0) {
    436 		error = error2;
    437 	}
    438 	simple_unlock(&mbp->b_interlock);
    439 
    440 	if ((mbp->b_flags & B_ERROR) != 0) {
    441 		off_t delta;
    442 
    443 		delta = uio->uio_offset - mbp->b_endoffset;
    444 		KASSERT(delta > 0);
    445 		uio->uio_resid += delta;
    446 		/* uio->uio_offset = mbp->b_endoffset; */
    447 	} else {
    448 		KASSERT(mbp->b_endoffset == -1);
    449 	}
    450 	if (bp != NULL) {
    451 		putphysbuf(bp);
    452 	}
    453 	if (error == 0) {
    454 		error = mbp->b_error;
    455 	}
    456 	putphysbuf(mbp);
    457 
    458 	/*
    459 	 * [clean up the state of the buffer]
    460 	 * Remember if somebody wants it, so we can wake them up below.
    461 	 * Also, if we had to steal it, give it back.
    462 	 */
    463 	if (obp != NULL) {
    464 		KASSERT((obp->b_flags & B_BUSY) != 0);
    465 		KASSERT((obp->b_flags & B_DONTFREE) != 0);
    466 
    467 		/*
    468 		 * [if another process is waiting for the raw I/O buffer,
    469 		 *    wake up processes waiting to do physical I/O;
    470 		 */
    471 		s = splbio();
    472 		simple_lock(&obp->b_interlock);
    473 		obp->b_flags &=
    474 		    ~(B_BUSY | B_PHYS | B_RAW | B_CALL | B_DONTFREE);
    475 		if ((obp->b_flags & B_WANTED) != 0) {
    476 			obp->b_flags &= ~B_WANTED;
    477 			wakeup(obp);
    478 		}
    479 		simple_unlock(&obp->b_interlock);
    480 		splx(s);
    481 	}
    482 	PRELE(l);
    483 
    484 	DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
    485 	    __func__, uio->uio_offset, uio->uio_resid));
    486 
    487 	return error;
    488 }
    489 
    490 /*
    491  * Leffler, et al., says on p. 231:
    492  * "The minphys() routine is called by physio() to adjust the
    493  * size of each I/O transfer before the latter is passed to
    494  * the strategy routine..."
    495  *
    496  * so, just adjust the buffer's count accounting to MAXPHYS here,
    497  * and return the new count;
    498  */
    499 void
    500 minphys(struct buf *bp)
    501 {
    502 
    503 	if (bp->b_bcount > MAXPHYS)
    504 		bp->b_bcount = MAXPHYS;
    505 }
    506