kern_physio.c revision 1.62 1 1.62 yamt /* $NetBSD: kern_physio.c,v 1.62 2005/10/29 11:23:19 yamt Exp $ */
2 1.20 cgd
3 1.20 cgd /*-
4 1.20 cgd * Copyright (c) 1982, 1986, 1990, 1993
5 1.20 cgd * The Regents of the University of California. All rights reserved.
6 1.20 cgd * (c) UNIX System Laboratories, Inc.
7 1.20 cgd * All or some portions of this file are derived from material licensed
8 1.20 cgd * to the University of California by American Telephone and Telegraph
9 1.20 cgd * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 1.20 cgd * the permission of UNIX System Laboratories, Inc.
11 1.20 cgd *
12 1.20 cgd * Redistribution and use in source and binary forms, with or without
13 1.20 cgd * modification, are permitted provided that the following conditions
14 1.20 cgd * are met:
15 1.20 cgd * 1. Redistributions of source code must retain the above copyright
16 1.20 cgd * notice, this list of conditions and the following disclaimer.
17 1.20 cgd * 2. Redistributions in binary form must reproduce the above copyright
18 1.20 cgd * notice, this list of conditions and the following disclaimer in the
19 1.20 cgd * documentation and/or other materials provided with the distribution.
20 1.57 agc * 3. Neither the name of the University nor the names of its contributors
21 1.57 agc * may be used to endorse or promote products derived from this software
22 1.57 agc * without specific prior written permission.
23 1.57 agc *
24 1.57 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 1.57 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 1.57 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 1.57 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 1.57 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 1.57 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 1.57 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 1.57 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 1.57 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 1.57 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 1.57 agc * SUCH DAMAGE.
35 1.57 agc *
36 1.57 agc * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
37 1.57 agc */
38 1.57 agc
39 1.57 agc /*-
40 1.57 agc * Copyright (c) 1994 Christopher G. Demetriou
41 1.57 agc *
42 1.57 agc * Redistribution and use in source and binary forms, with or without
43 1.57 agc * modification, are permitted provided that the following conditions
44 1.57 agc * are met:
45 1.57 agc * 1. Redistributions of source code must retain the above copyright
46 1.57 agc * notice, this list of conditions and the following disclaimer.
47 1.57 agc * 2. Redistributions in binary form must reproduce the above copyright
48 1.57 agc * notice, this list of conditions and the following disclaimer in the
49 1.57 agc * documentation and/or other materials provided with the distribution.
50 1.20 cgd * 3. All advertising materials mentioning features or use of this software
51 1.20 cgd * must display the following acknowledgement:
52 1.20 cgd * This product includes software developed by the University of
53 1.20 cgd * California, Berkeley and its contributors.
54 1.20 cgd * 4. Neither the name of the University nor the names of its contributors
55 1.20 cgd * may be used to endorse or promote products derived from this software
56 1.20 cgd * without specific prior written permission.
57 1.20 cgd *
58 1.20 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 1.20 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 1.20 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 1.20 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 1.20 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 1.20 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 1.20 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 1.20 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 1.20 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 1.20 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 1.20 cgd * SUCH DAMAGE.
69 1.20 cgd *
70 1.20 cgd * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
71 1.20 cgd */
72 1.51 lukem
73 1.51 lukem #include <sys/cdefs.h>
74 1.62 yamt __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.62 2005/10/29 11:23:19 yamt Exp $");
75 1.30 mrg
76 1.20 cgd #include <sys/param.h>
77 1.20 cgd #include <sys/systm.h>
78 1.20 cgd #include <sys/buf.h>
79 1.20 cgd #include <sys/proc.h>
80 1.62 yamt #include <sys/once.h>
81 1.62 yamt #include <sys/workqueue.h>
82 1.26 christos
83 1.29 mrg #include <uvm/uvm_extern.h>
84 1.29 mrg
/*
 * One-time initialization control: physio() passes this to RUN_ONCE()
 * so that physio_init() is executed before the first transfer.
 */
ONCE_DECL(physio_initialized);
/*
 * Workqueue created by physio_init().  physio_biodone() enqueues
 * completed per-chunk buffers on it; physio_done() is its handler.
 */
struct workqueue *physio_workqueue;
87 1.62 yamt
/*
 * The routines implemented in this file are described in:
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *	    UNIX Operating System (Addison-Wesley, 1989)
 * on pages 231-233.
 *
 * The routines "getphysbuf" and "putphysbuf" steal and return a swap
 * buffer.  Leffler, et al., says that swap buffers are used to do the
 * I/O, so raw I/O requests don't have to be single-threaded.  Of course,
 * NetBSD doesn't use "swap buffers" -- we have our own memory pool for
 * buffer descriptors.
 */
100 1.20 cgd
101 1.61 thorpej /*
102 1.61 thorpej * allocate a buffer structure for use in physical I/O.
103 1.61 thorpej */
104 1.61 thorpej static struct buf *
105 1.61 thorpej getphysbuf(void)
106 1.61 thorpej {
107 1.61 thorpej struct buf *bp;
108 1.61 thorpej int s;
109 1.61 thorpej
110 1.61 thorpej s = splbio();
111 1.61 thorpej bp = pool_get(&bufpool, PR_WAITOK);
112 1.61 thorpej splx(s);
113 1.61 thorpej BUF_INIT(bp);
114 1.62 yamt bp->b_error = 0;
115 1.61 thorpej return(bp);
116 1.61 thorpej }
117 1.61 thorpej
118 1.61 thorpej /*
119 1.61 thorpej * get rid of a swap buffer structure which has been used in physical I/O.
120 1.61 thorpej */
121 1.61 thorpej static void
122 1.61 thorpej putphysbuf(struct buf *bp)
123 1.61 thorpej {
124 1.61 thorpej int s;
125 1.61 thorpej
126 1.61 thorpej if (__predict_false(bp->b_flags & B_WANTED))
127 1.61 thorpej panic("putphysbuf: private buf B_WANTED");
128 1.61 thorpej s = splbio();
129 1.61 thorpej pool_put(&bufpool, bp);
130 1.61 thorpej splx(s);
131 1.61 thorpej }
132 1.20 cgd
133 1.62 yamt /* abuse these members of struct buf */
134 1.62 yamt #define b_running b_freelistindex
135 1.62 yamt #define b_eomoffset b_lblkno
136 1.62 yamt
137 1.62 yamt static void
138 1.62 yamt physio_done(struct work *wk, void *dummy)
139 1.62 yamt {
140 1.62 yamt struct buf *bp = (void *)wk;
141 1.62 yamt size_t todo = bp->b_bufsize;
142 1.62 yamt struct buf *mbp = bp->b_private;
143 1.62 yamt
144 1.62 yamt KASSERT(&bp->b_work == wk);
145 1.62 yamt KASSERT(bp->b_bcount <= todo);
146 1.62 yamt KASSERT(bp->b_resid <= bp->b_bcount);
147 1.62 yamt KASSERT((bp->b_flags & B_PHYS) != 0);
148 1.62 yamt KASSERT(dummy == NULL);
149 1.62 yamt
150 1.62 yamt vunmapbuf(bp, todo);
151 1.62 yamt uvm_vsunlock(bp->b_proc, bp->b_data, todo);
152 1.62 yamt
153 1.62 yamt simple_lock(&mbp->b_interlock);
154 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0) {
155 1.62 yamt goto done;
156 1.62 yamt }
157 1.62 yamt if ((bp->b_flags & B_ERROR) != 0) {
158 1.62 yamt if (bp->b_error == 0) {
159 1.62 yamt mbp->b_error = EIO; /* XXX */
160 1.62 yamt } else {
161 1.62 yamt mbp->b_error = bp->b_error;
162 1.62 yamt }
163 1.62 yamt mbp->b_flags |= B_ERROR;
164 1.62 yamt goto done;
165 1.62 yamt }
166 1.62 yamt KASSERT(bp->b_resid == 0); /* XXX */
167 1.62 yamt if (bp->b_bcount != todo) {
168 1.62 yamt #if defined(DIAGNOSTIC)
169 1.62 yamt off_t eomoffset = dbtob(bp->b_blkno);
170 1.62 yamt
171 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0 &&
172 1.62 yamt mbp->b_eomoffset != eomoffset) {
173 1.62 yamt panic("%s: eom mismatch", __func__);
174 1.62 yamt }
175 1.62 yamt mbp->b_eomoffset = eomoffset;
176 1.62 yamt #endif /* defined(DIAGNOSTIC) */
177 1.62 yamt mbp->b_flags |= B_ERROR;
178 1.62 yamt mbp->b_error = 0;
179 1.62 yamt goto done;
180 1.62 yamt }
181 1.62 yamt done:
182 1.62 yamt mbp->b_resid -= bp->b_bcount - bp->b_resid;
183 1.62 yamt mbp->b_running--;
184 1.62 yamt if ((mbp->b_flags & B_WANTED) != 0) {
185 1.62 yamt mbp->b_flags &= ~B_WANTED;
186 1.62 yamt wakeup(mbp);
187 1.62 yamt }
188 1.62 yamt simple_unlock(&mbp->b_interlock);
189 1.62 yamt
190 1.62 yamt putphysbuf(bp);
191 1.62 yamt }
192 1.62 yamt
193 1.62 yamt static void
194 1.62 yamt physio_biodone(struct buf *bp)
195 1.62 yamt {
196 1.62 yamt struct buf *mbp = bp->b_private;
197 1.62 yamt size_t todo = bp->b_bufsize;
198 1.62 yamt
199 1.62 yamt KASSERT(mbp->b_running > 0);
200 1.62 yamt KASSERT(todo <= mbp->b_resid);
201 1.62 yamt KASSERT(bp->b_bcount <= todo);
202 1.62 yamt KASSERT(bp->b_resid <= bp->b_bcount);
203 1.62 yamt
204 1.62 yamt workqueue_enqueue(physio_workqueue, &bp->b_work);
205 1.62 yamt }
206 1.62 yamt
207 1.62 yamt static int
208 1.62 yamt physio_wait(struct buf *bp, int n, const char *wchan)
209 1.62 yamt {
210 1.62 yamt int error = 0;
211 1.62 yamt
212 1.62 yamt LOCK_ASSERT(simple_lock_held(&bp->b_interlock));
213 1.62 yamt
214 1.62 yamt while (bp->b_running > n) {
215 1.62 yamt bp->b_flags |= B_WANTED;
216 1.62 yamt error = ltsleep(bp, PRIBIO + 1, wchan, 0, &bp->b_interlock);
217 1.62 yamt if (error) {
218 1.62 yamt break;
219 1.62 yamt }
220 1.62 yamt }
221 1.62 yamt
222 1.62 yamt return error;
223 1.62 yamt }
224 1.62 yamt
225 1.62 yamt static void
226 1.62 yamt physio_init(void)
227 1.62 yamt {
228 1.62 yamt
229 1.62 yamt KASSERT(physio_workqueue == NULL);
230 1.62 yamt
231 1.62 yamt if (workqueue_create(&physio_workqueue, "physiod",
232 1.62 yamt physio_done, NULL, PRIBIO, IPL_BIO, 0)) {
233 1.62 yamt panic("physiod create");
234 1.62 yamt }
235 1.62 yamt }
236 1.62 yamt
237 1.62 yamt #define PHYSIO_CONCURRENCY 16 /* XXX tune */
238 1.62 yamt
239 1.20 cgd /*
240 1.20 cgd * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
241 1.20 cgd * from the raw device to user buffers, and bypasses the buffer cache.
242 1.20 cgd *
243 1.20 cgd * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
244 1.20 cgd */
245 1.20 cgd int
246 1.62 yamt physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
247 1.61 thorpej void (*min_phys)(struct buf *), struct uio *uio)
248 1.20 cgd {
249 1.20 cgd struct iovec *iovp;
250 1.54 thorpej struct lwp *l = curlwp;
251 1.54 thorpej struct proc *p = l->l_proc;
252 1.62 yamt int i, s;
253 1.62 yamt int error = 0;
254 1.62 yamt int error2;
255 1.62 yamt size_t todo;
256 1.62 yamt struct buf *bp = NULL;
257 1.62 yamt struct buf *mbp;
258 1.62 yamt
259 1.62 yamt RUN_ONCE(&physio_initialized, physio_init);
260 1.20 cgd
261 1.53 hannken flags &= B_READ | B_WRITE;
262 1.20 cgd
263 1.20 cgd /* Make sure we have a buffer, creating one if necessary. */
264 1.62 yamt if (obp != NULL) {
265 1.29 mrg /* [raise the processor priority level to splbio;] */
266 1.29 mrg s = splbio();
267 1.20 cgd
268 1.29 mrg /* [while the buffer is marked busy] */
269 1.62 yamt while (obp->b_flags & B_BUSY) {
270 1.29 mrg /* [mark the buffer wanted] */
271 1.62 yamt obp->b_flags |= B_WANTED;
272 1.29 mrg /* [wait until the buffer is available] */
273 1.62 yamt tsleep(obp, PRIBIO+1, "physbuf", 0);
274 1.29 mrg }
275 1.20 cgd
276 1.29 mrg /* Mark it busy, so nobody else will use it. */
277 1.62 yamt obp->b_flags |= B_BUSY;
278 1.20 cgd
279 1.29 mrg /* [lower the priority level] */
280 1.29 mrg splx(s);
281 1.29 mrg }
282 1.20 cgd
283 1.62 yamt mbp = getphysbuf();
284 1.62 yamt mbp->b_resid = uio->uio_resid;
285 1.62 yamt mbp->b_running = 0;
286 1.62 yamt mbp->b_flags = 0;
287 1.62 yamt
288 1.62 yamt PHOLD(l);
289 1.20 cgd
290 1.20 cgd for (i = 0; i < uio->uio_iovcnt; i++) {
291 1.20 cgd iovp = &uio->uio_iov[i];
292 1.20 cgd while (iovp->iov_len > 0) {
293 1.62 yamt simple_lock(&mbp->b_interlock);
294 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0) {
295 1.62 yamt error = mbp->b_error;
296 1.62 yamt goto done_locked;
297 1.62 yamt }
298 1.62 yamt error = physio_wait(mbp, PHYSIO_CONCURRENCY - 1,
299 1.62 yamt "physio1");
300 1.62 yamt if (error) {
301 1.62 yamt goto done_locked;
302 1.62 yamt }
303 1.62 yamt simple_unlock(&mbp->b_interlock);
304 1.62 yamt bp = getphysbuf();
305 1.62 yamt bp->b_dev = dev;
306 1.62 yamt bp->b_proc = p;
307 1.62 yamt bp->b_private = mbp;
308 1.62 yamt bp->b_vp = NULL;
309 1.47 chs
310 1.20 cgd /*
311 1.20 cgd * [mark the buffer busy for physical I/O]
312 1.20 cgd * (i.e. set B_PHYS (because it's an I/O to user
313 1.20 cgd * memory, and B_RAW, because B_RAW is to be
314 1.20 cgd * "Set by physio for raw transfers.", in addition
315 1.20 cgd * to the "busy" and read/write flag.)
316 1.20 cgd */
317 1.62 yamt bp->b_flags = B_BUSY | B_PHYS | B_RAW | B_CALL | flags;
318 1.62 yamt bp->b_iodone = physio_biodone;
319 1.20 cgd
320 1.20 cgd /* [set up the buffer for a maximum-sized transfer] */
321 1.20 cgd bp->b_blkno = btodb(uio->uio_offset);
322 1.62 yamt if (dbtob(bp->b_blkno) != uio->uio_offset) {
323 1.62 yamt error = EINVAL;
324 1.62 yamt goto done;
325 1.62 yamt }
326 1.62 yamt bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
327 1.20 cgd bp->b_data = iovp->iov_base;
328 1.24 mycroft
329 1.20 cgd /*
330 1.48 wiz * [call minphys to bound the transfer size]
331 1.20 cgd * and remember the amount of data to transfer,
332 1.20 cgd * for later comparison.
333 1.20 cgd */
334 1.59 simonb (*min_phys)(bp);
335 1.62 yamt todo = bp->b_bufsize = bp->b_bcount;
336 1.62 yamt #if defined(DIAGNOSTIC)
337 1.22 cgd if (todo > MAXPHYS)
338 1.62 yamt panic("todo(%zu) > MAXPHYS; minphys broken",
339 1.62 yamt todo);
340 1.62 yamt #endif /* defined(DIAGNOSTIC) */
341 1.20 cgd
342 1.20 cgd /*
343 1.20 cgd * [lock the part of the user address space involved
344 1.20 cgd * in the transfer]
345 1.20 cgd * Beware vmapbuf(); it clobbers b_data and
346 1.20 cgd * saves it in b_saveaddr. However, vunmapbuf()
347 1.20 cgd * restores it.
348 1.20 cgd */
349 1.47 chs error = uvm_vslock(p, bp->b_data, todo,
350 1.62 yamt (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
351 1.47 chs if (error) {
352 1.62 yamt goto done;
353 1.37 thorpej }
354 1.20 cgd vmapbuf(bp, todo);
355 1.20 cgd
356 1.58 yamt BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
357 1.58 yamt
358 1.62 yamt simple_lock(&mbp->b_interlock);
359 1.62 yamt mbp->b_running++;
360 1.62 yamt simple_unlock(&mbp->b_interlock);
361 1.62 yamt
362 1.20 cgd /* [call strategy to start the transfer] */
363 1.20 cgd (*strategy)(bp);
364 1.62 yamt bp = NULL;
365 1.20 cgd
366 1.62 yamt iovp->iov_len -= todo;
367 1.62 yamt iovp->iov_base = (caddr_t)iovp->iov_base + todo;
368 1.62 yamt uio->uio_offset += todo;
369 1.62 yamt uio->uio_resid -= todo;
370 1.20 cgd }
371 1.20 cgd }
372 1.20 cgd
373 1.20 cgd done:
374 1.62 yamt simple_lock(&mbp->b_interlock);
375 1.62 yamt done_locked:
376 1.62 yamt error2 = physio_wait(mbp, 0, "physio2");
377 1.62 yamt if (error == 0) {
378 1.62 yamt error = error2;
379 1.62 yamt }
380 1.62 yamt simple_unlock(&mbp->b_interlock);
381 1.62 yamt KASSERT((mbp->b_flags & B_ERROR) != 0 ||
382 1.62 yamt mbp->b_resid == uio->uio_resid);
383 1.62 yamt #if defined(DIAGNOSTIC)
384 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0 && mbp->b_error == 0 &&
385 1.62 yamt uio->uio_offset - mbp->b_resid != mbp->b_eomoffset) {
386 1.62 yamt panic("%s: eom", __func__);
387 1.62 yamt }
388 1.62 yamt #endif /* defined(DIAGNOSTIC) */
389 1.62 yamt uio->uio_resid = mbp->b_resid;
390 1.62 yamt if (bp != NULL) {
391 1.62 yamt putphysbuf(bp);
392 1.62 yamt }
393 1.62 yamt if (error == 0) {
394 1.62 yamt error = mbp->b_error;
395 1.62 yamt }
396 1.62 yamt putphysbuf(mbp);
397 1.62 yamt
398 1.20 cgd /*
399 1.20 cgd * [clean up the state of the buffer]
400 1.20 cgd * Remember if somebody wants it, so we can wake them up below.
401 1.20 cgd * Also, if we had to steal it, give it back.
402 1.20 cgd */
403 1.62 yamt if (obp != NULL) {
404 1.62 yamt s = splbio();
405 1.20 cgd /*
406 1.20 cgd * [if another process is waiting for the raw I/O buffer,
407 1.20 cgd * wake up processes waiting to do physical I/O;
408 1.20 cgd */
409 1.62 yamt if (obp->b_flags & B_WANTED) {
410 1.62 yamt obp->b_flags &= ~B_WANTED;
411 1.62 yamt wakeup(obp);
412 1.20 cgd }
413 1.62 yamt splx(s);
414 1.20 cgd }
415 1.62 yamt PRELE(l);
416 1.20 cgd
417 1.62 yamt return error;
418 1.20 cgd }
419 1.20 cgd
420 1.20 cgd /*
421 1.20 cgd * Leffler, et al., says on p. 231:
422 1.20 cgd * "The minphys() routine is called by physio() to adjust the
423 1.20 cgd * size of each I/O transfer before the latter is passed to
424 1.24 mycroft * the strategy routine..."
425 1.20 cgd *
426 1.20 cgd * so, just adjust the buffer's count accounting to MAXPHYS here,
427 1.20 cgd * and return the new count;
428 1.20 cgd */
429 1.24 mycroft void
430 1.61 thorpej minphys(struct buf *bp)
431 1.20 cgd {
432 1.20 cgd
433 1.24 mycroft if (bp->b_bcount > MAXPHYS)
434 1.24 mycroft bp->b_bcount = MAXPHYS;
435 1.20 cgd }
436