kern_physio.c revision 1.64 1 1.64 yamt /* $NetBSD: kern_physio.c,v 1.64 2005/10/30 09:17:02 yamt Exp $ */
2 1.20 cgd
3 1.20 cgd /*-
4 1.20 cgd * Copyright (c) 1982, 1986, 1990, 1993
5 1.20 cgd * The Regents of the University of California. All rights reserved.
6 1.20 cgd * (c) UNIX System Laboratories, Inc.
7 1.20 cgd * All or some portions of this file are derived from material licensed
8 1.20 cgd * to the University of California by American Telephone and Telegraph
9 1.20 cgd * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 1.20 cgd * the permission of UNIX System Laboratories, Inc.
11 1.20 cgd *
12 1.20 cgd * Redistribution and use in source and binary forms, with or without
13 1.20 cgd * modification, are permitted provided that the following conditions
14 1.20 cgd * are met:
15 1.20 cgd * 1. Redistributions of source code must retain the above copyright
16 1.20 cgd * notice, this list of conditions and the following disclaimer.
17 1.20 cgd * 2. Redistributions in binary form must reproduce the above copyright
18 1.20 cgd * notice, this list of conditions and the following disclaimer in the
19 1.20 cgd * documentation and/or other materials provided with the distribution.
20 1.57 agc * 3. Neither the name of the University nor the names of its contributors
21 1.57 agc * may be used to endorse or promote products derived from this software
22 1.57 agc * without specific prior written permission.
23 1.57 agc *
24 1.57 agc * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 1.57 agc * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 1.57 agc * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 1.57 agc * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 1.57 agc * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 1.57 agc * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 1.57 agc * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 1.57 agc * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 1.57 agc * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 1.57 agc * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 1.57 agc * SUCH DAMAGE.
35 1.57 agc *
36 1.57 agc * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
37 1.57 agc */
38 1.57 agc
39 1.57 agc /*-
40 1.57 agc * Copyright (c) 1994 Christopher G. Demetriou
41 1.57 agc *
42 1.57 agc * Redistribution and use in source and binary forms, with or without
43 1.57 agc * modification, are permitted provided that the following conditions
44 1.57 agc * are met:
45 1.57 agc * 1. Redistributions of source code must retain the above copyright
46 1.57 agc * notice, this list of conditions and the following disclaimer.
47 1.57 agc * 2. Redistributions in binary form must reproduce the above copyright
48 1.57 agc * notice, this list of conditions and the following disclaimer in the
49 1.57 agc * documentation and/or other materials provided with the distribution.
50 1.20 cgd * 3. All advertising materials mentioning features or use of this software
51 1.20 cgd * must display the following acknowledgement:
52 1.20 cgd * This product includes software developed by the University of
53 1.20 cgd * California, Berkeley and its contributors.
54 1.20 cgd * 4. Neither the name of the University nor the names of its contributors
55 1.20 cgd * may be used to endorse or promote products derived from this software
56 1.20 cgd * without specific prior written permission.
57 1.20 cgd *
58 1.20 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 1.20 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 1.20 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 1.20 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 1.20 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 1.20 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 1.20 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 1.20 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 1.20 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 1.20 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 1.20 cgd * SUCH DAMAGE.
69 1.20 cgd *
70 1.20 cgd * @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
71 1.20 cgd */
72 1.51 lukem
73 1.51 lukem #include <sys/cdefs.h>
74 1.64 yamt __KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.64 2005/10/30 09:17:02 yamt Exp $");
75 1.30 mrg
76 1.20 cgd #include <sys/param.h>
77 1.20 cgd #include <sys/systm.h>
78 1.20 cgd #include <sys/buf.h>
79 1.20 cgd #include <sys/proc.h>
80 1.62 yamt #include <sys/once.h>
81 1.62 yamt #include <sys/workqueue.h>
82 1.26 christos
83 1.29 mrg #include <uvm/uvm_extern.h>
84 1.29 mrg
/* One-time initialization control; physio() passes it to RUN_ONCE(). */
ONCE_DECL(physio_initialized);

/*
 * Workqueue to which completed physio sub-transfers are handed; it is
 * created by physio_init() and fed by physio_biodone().
 */
struct workqueue *physio_workqueue;
87 1.62 yamt
88 1.20 cgd /*
89 1.20 cgd * The routines implemented in this file are described in:
90 1.20 cgd * Leffler, et al.: The Design and Implementation of the 4.3BSD
91 1.20 cgd * UNIX Operating System (Addison-Wesley, 1989)
92 1.20 cgd * on pages 231-233.
93 1.20 cgd *
94 1.20 cgd * The routines "getphysbuf" and "putphysbuf" steal and return a swap
95 1.20 cgd * buffer. Leffler, et al., says that swap buffers are used to do the
96 1.61 thorpej * I/O, so raw I/O requests don't have to be single-threaded. Of course,
97 1.61 thorpej * NetBSD doesn't use "swap buffers" -- we have our own memory pool for
98 1.61 thorpej * buffer descriptors.
99 1.20 cgd */
100 1.20 cgd
101 1.61 thorpej /*
102 1.61 thorpej * allocate a buffer structure for use in physical I/O.
103 1.61 thorpej */
104 1.61 thorpej static struct buf *
105 1.61 thorpej getphysbuf(void)
106 1.61 thorpej {
107 1.61 thorpej struct buf *bp;
108 1.61 thorpej int s;
109 1.61 thorpej
110 1.61 thorpej s = splbio();
111 1.61 thorpej bp = pool_get(&bufpool, PR_WAITOK);
112 1.61 thorpej splx(s);
113 1.61 thorpej BUF_INIT(bp);
114 1.62 yamt bp->b_error = 0;
115 1.61 thorpej return(bp);
116 1.61 thorpej }
117 1.61 thorpej
118 1.61 thorpej /*
119 1.61 thorpej * get rid of a swap buffer structure which has been used in physical I/O.
120 1.61 thorpej */
121 1.61 thorpej static void
122 1.61 thorpej putphysbuf(struct buf *bp)
123 1.61 thorpej {
124 1.61 thorpej int s;
125 1.61 thorpej
126 1.61 thorpej if (__predict_false(bp->b_flags & B_WANTED))
127 1.61 thorpej panic("putphysbuf: private buf B_WANTED");
128 1.61 thorpej s = splbio();
129 1.61 thorpej pool_put(&bufpool, bp);
130 1.61 thorpej splx(s);
131 1.61 thorpej }
132 1.20 cgd
/*
 * Abuse these members of struct buf: on the per-request "master" buffer
 * that physio() allocates, they are borrowed for physio's own bookkeeping.
 *
 *	b_running	number of sub-transfers issued and not yet finished
 *	b_eomoffset	dbtob(b_blkno) of the sub-transfer that completed
 *			short (only maintained under DIAGNOSTIC)
 */
#define	b_running	b_freelistindex
#define	b_eomoffset	b_lblkno
136 1.62 yamt
137 1.62 yamt static void
138 1.62 yamt physio_done(struct work *wk, void *dummy)
139 1.62 yamt {
140 1.62 yamt struct buf *bp = (void *)wk;
141 1.62 yamt size_t todo = bp->b_bufsize;
142 1.62 yamt struct buf *mbp = bp->b_private;
143 1.62 yamt
144 1.62 yamt KASSERT(&bp->b_work == wk);
145 1.62 yamt KASSERT(bp->b_bcount <= todo);
146 1.62 yamt KASSERT(bp->b_resid <= bp->b_bcount);
147 1.62 yamt KASSERT((bp->b_flags & B_PHYS) != 0);
148 1.62 yamt KASSERT(dummy == NULL);
149 1.62 yamt
150 1.62 yamt vunmapbuf(bp, todo);
151 1.62 yamt uvm_vsunlock(bp->b_proc, bp->b_data, todo);
152 1.62 yamt
153 1.62 yamt simple_lock(&mbp->b_interlock);
154 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0) {
155 1.62 yamt goto done;
156 1.62 yamt }
157 1.62 yamt if ((bp->b_flags & B_ERROR) != 0) {
158 1.62 yamt if (bp->b_error == 0) {
159 1.62 yamt mbp->b_error = EIO; /* XXX */
160 1.62 yamt } else {
161 1.62 yamt mbp->b_error = bp->b_error;
162 1.62 yamt }
163 1.62 yamt mbp->b_flags |= B_ERROR;
164 1.62 yamt goto done;
165 1.62 yamt }
166 1.62 yamt KASSERT(bp->b_resid == 0); /* XXX */
167 1.62 yamt if (bp->b_bcount != todo) {
168 1.62 yamt #if defined(DIAGNOSTIC)
169 1.62 yamt off_t eomoffset = dbtob(bp->b_blkno);
170 1.62 yamt
171 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0 &&
172 1.62 yamt mbp->b_eomoffset != eomoffset) {
173 1.62 yamt panic("%s: eom mismatch", __func__);
174 1.62 yamt }
175 1.62 yamt mbp->b_eomoffset = eomoffset;
176 1.62 yamt #endif /* defined(DIAGNOSTIC) */
177 1.62 yamt mbp->b_flags |= B_ERROR;
178 1.62 yamt mbp->b_error = 0;
179 1.62 yamt goto done;
180 1.62 yamt }
181 1.62 yamt done:
182 1.62 yamt mbp->b_resid -= bp->b_bcount - bp->b_resid;
183 1.62 yamt mbp->b_running--;
184 1.62 yamt if ((mbp->b_flags & B_WANTED) != 0) {
185 1.62 yamt mbp->b_flags &= ~B_WANTED;
186 1.62 yamt wakeup(mbp);
187 1.62 yamt }
188 1.62 yamt simple_unlock(&mbp->b_interlock);
189 1.62 yamt
190 1.62 yamt putphysbuf(bp);
191 1.62 yamt }
192 1.62 yamt
193 1.62 yamt static void
194 1.62 yamt physio_biodone(struct buf *bp)
195 1.62 yamt {
196 1.63 yamt #if defined(DIAGNOSTIC)
197 1.62 yamt struct buf *mbp = bp->b_private;
198 1.62 yamt size_t todo = bp->b_bufsize;
199 1.62 yamt
200 1.62 yamt KASSERT(mbp->b_running > 0);
201 1.62 yamt KASSERT(todo <= mbp->b_resid);
202 1.62 yamt KASSERT(bp->b_bcount <= todo);
203 1.62 yamt KASSERT(bp->b_resid <= bp->b_bcount);
204 1.63 yamt #endif /* defined(DIAGNOSTIC) */
205 1.62 yamt
206 1.62 yamt workqueue_enqueue(physio_workqueue, &bp->b_work);
207 1.62 yamt }
208 1.62 yamt
209 1.62 yamt static int
210 1.62 yamt physio_wait(struct buf *bp, int n, const char *wchan)
211 1.62 yamt {
212 1.62 yamt int error = 0;
213 1.62 yamt
214 1.62 yamt LOCK_ASSERT(simple_lock_held(&bp->b_interlock));
215 1.62 yamt
216 1.62 yamt while (bp->b_running > n) {
217 1.62 yamt bp->b_flags |= B_WANTED;
218 1.62 yamt error = ltsleep(bp, PRIBIO + 1, wchan, 0, &bp->b_interlock);
219 1.62 yamt if (error) {
220 1.62 yamt break;
221 1.62 yamt }
222 1.62 yamt }
223 1.62 yamt
224 1.62 yamt return error;
225 1.62 yamt }
226 1.62 yamt
227 1.62 yamt static void
228 1.62 yamt physio_init(void)
229 1.62 yamt {
230 1.62 yamt
231 1.62 yamt KASSERT(physio_workqueue == NULL);
232 1.62 yamt
233 1.62 yamt if (workqueue_create(&physio_workqueue, "physiod",
234 1.64 yamt physio_done, NULL, PRIBIO, 0/* IPL_BIO notyet */, 0)) {
235 1.62 yamt panic("physiod create");
236 1.62 yamt }
237 1.62 yamt }
238 1.62 yamt
239 1.62 yamt #define PHYSIO_CONCURRENCY 16 /* XXX tune */
240 1.62 yamt
241 1.20 cgd /*
242 1.20 cgd * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
243 1.20 cgd * from the raw device to user buffers, and bypasses the buffer cache.
244 1.20 cgd *
245 1.20 cgd * Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
246 1.20 cgd */
247 1.20 cgd int
248 1.62 yamt physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
249 1.61 thorpej void (*min_phys)(struct buf *), struct uio *uio)
250 1.20 cgd {
251 1.20 cgd struct iovec *iovp;
252 1.54 thorpej struct lwp *l = curlwp;
253 1.54 thorpej struct proc *p = l->l_proc;
254 1.62 yamt int i, s;
255 1.62 yamt int error = 0;
256 1.62 yamt int error2;
257 1.62 yamt size_t todo;
258 1.62 yamt struct buf *bp = NULL;
259 1.62 yamt struct buf *mbp;
260 1.62 yamt
261 1.62 yamt RUN_ONCE(&physio_initialized, physio_init);
262 1.20 cgd
263 1.53 hannken flags &= B_READ | B_WRITE;
264 1.20 cgd
265 1.20 cgd /* Make sure we have a buffer, creating one if necessary. */
266 1.62 yamt if (obp != NULL) {
267 1.29 mrg /* [raise the processor priority level to splbio;] */
268 1.29 mrg s = splbio();
269 1.20 cgd
270 1.29 mrg /* [while the buffer is marked busy] */
271 1.62 yamt while (obp->b_flags & B_BUSY) {
272 1.29 mrg /* [mark the buffer wanted] */
273 1.62 yamt obp->b_flags |= B_WANTED;
274 1.29 mrg /* [wait until the buffer is available] */
275 1.62 yamt tsleep(obp, PRIBIO+1, "physbuf", 0);
276 1.29 mrg }
277 1.20 cgd
278 1.29 mrg /* Mark it busy, so nobody else will use it. */
279 1.62 yamt obp->b_flags |= B_BUSY;
280 1.20 cgd
281 1.29 mrg /* [lower the priority level] */
282 1.29 mrg splx(s);
283 1.29 mrg }
284 1.20 cgd
285 1.62 yamt mbp = getphysbuf();
286 1.62 yamt mbp->b_resid = uio->uio_resid;
287 1.62 yamt mbp->b_running = 0;
288 1.62 yamt mbp->b_flags = 0;
289 1.62 yamt
290 1.62 yamt PHOLD(l);
291 1.20 cgd
292 1.20 cgd for (i = 0; i < uio->uio_iovcnt; i++) {
293 1.20 cgd iovp = &uio->uio_iov[i];
294 1.20 cgd while (iovp->iov_len > 0) {
295 1.62 yamt simple_lock(&mbp->b_interlock);
296 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0) {
297 1.62 yamt error = mbp->b_error;
298 1.62 yamt goto done_locked;
299 1.62 yamt }
300 1.62 yamt error = physio_wait(mbp, PHYSIO_CONCURRENCY - 1,
301 1.62 yamt "physio1");
302 1.62 yamt if (error) {
303 1.62 yamt goto done_locked;
304 1.62 yamt }
305 1.62 yamt simple_unlock(&mbp->b_interlock);
306 1.62 yamt bp = getphysbuf();
307 1.62 yamt bp->b_dev = dev;
308 1.62 yamt bp->b_proc = p;
309 1.62 yamt bp->b_private = mbp;
310 1.62 yamt bp->b_vp = NULL;
311 1.47 chs
312 1.20 cgd /*
313 1.20 cgd * [mark the buffer busy for physical I/O]
314 1.20 cgd * (i.e. set B_PHYS (because it's an I/O to user
315 1.20 cgd * memory, and B_RAW, because B_RAW is to be
316 1.20 cgd * "Set by physio for raw transfers.", in addition
317 1.20 cgd * to the "busy" and read/write flag.)
318 1.20 cgd */
319 1.62 yamt bp->b_flags = B_BUSY | B_PHYS | B_RAW | B_CALL | flags;
320 1.62 yamt bp->b_iodone = physio_biodone;
321 1.20 cgd
322 1.20 cgd /* [set up the buffer for a maximum-sized transfer] */
323 1.20 cgd bp->b_blkno = btodb(uio->uio_offset);
324 1.62 yamt if (dbtob(bp->b_blkno) != uio->uio_offset) {
325 1.62 yamt error = EINVAL;
326 1.62 yamt goto done;
327 1.62 yamt }
328 1.62 yamt bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
329 1.20 cgd bp->b_data = iovp->iov_base;
330 1.24 mycroft
331 1.20 cgd /*
332 1.48 wiz * [call minphys to bound the transfer size]
333 1.20 cgd * and remember the amount of data to transfer,
334 1.20 cgd * for later comparison.
335 1.20 cgd */
336 1.59 simonb (*min_phys)(bp);
337 1.62 yamt todo = bp->b_bufsize = bp->b_bcount;
338 1.62 yamt #if defined(DIAGNOSTIC)
339 1.22 cgd if (todo > MAXPHYS)
340 1.62 yamt panic("todo(%zu) > MAXPHYS; minphys broken",
341 1.62 yamt todo);
342 1.62 yamt #endif /* defined(DIAGNOSTIC) */
343 1.20 cgd
344 1.20 cgd /*
345 1.20 cgd * [lock the part of the user address space involved
346 1.20 cgd * in the transfer]
347 1.20 cgd * Beware vmapbuf(); it clobbers b_data and
348 1.20 cgd * saves it in b_saveaddr. However, vunmapbuf()
349 1.20 cgd * restores it.
350 1.20 cgd */
351 1.47 chs error = uvm_vslock(p, bp->b_data, todo,
352 1.62 yamt (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
353 1.47 chs if (error) {
354 1.62 yamt goto done;
355 1.37 thorpej }
356 1.20 cgd vmapbuf(bp, todo);
357 1.20 cgd
358 1.58 yamt BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
359 1.58 yamt
360 1.62 yamt simple_lock(&mbp->b_interlock);
361 1.62 yamt mbp->b_running++;
362 1.62 yamt simple_unlock(&mbp->b_interlock);
363 1.62 yamt
364 1.20 cgd /* [call strategy to start the transfer] */
365 1.20 cgd (*strategy)(bp);
366 1.62 yamt bp = NULL;
367 1.20 cgd
368 1.62 yamt iovp->iov_len -= todo;
369 1.62 yamt iovp->iov_base = (caddr_t)iovp->iov_base + todo;
370 1.62 yamt uio->uio_offset += todo;
371 1.62 yamt uio->uio_resid -= todo;
372 1.20 cgd }
373 1.20 cgd }
374 1.20 cgd
375 1.20 cgd done:
376 1.62 yamt simple_lock(&mbp->b_interlock);
377 1.62 yamt done_locked:
378 1.62 yamt error2 = physio_wait(mbp, 0, "physio2");
379 1.62 yamt if (error == 0) {
380 1.62 yamt error = error2;
381 1.62 yamt }
382 1.62 yamt simple_unlock(&mbp->b_interlock);
383 1.62 yamt KASSERT((mbp->b_flags & B_ERROR) != 0 ||
384 1.62 yamt mbp->b_resid == uio->uio_resid);
385 1.62 yamt #if defined(DIAGNOSTIC)
386 1.62 yamt if ((mbp->b_flags & B_ERROR) != 0 && mbp->b_error == 0 &&
387 1.62 yamt uio->uio_offset - mbp->b_resid != mbp->b_eomoffset) {
388 1.62 yamt panic("%s: eom", __func__);
389 1.62 yamt }
390 1.62 yamt #endif /* defined(DIAGNOSTIC) */
391 1.62 yamt uio->uio_resid = mbp->b_resid;
392 1.62 yamt if (bp != NULL) {
393 1.62 yamt putphysbuf(bp);
394 1.62 yamt }
395 1.62 yamt if (error == 0) {
396 1.62 yamt error = mbp->b_error;
397 1.62 yamt }
398 1.62 yamt putphysbuf(mbp);
399 1.62 yamt
400 1.20 cgd /*
401 1.20 cgd * [clean up the state of the buffer]
402 1.20 cgd * Remember if somebody wants it, so we can wake them up below.
403 1.20 cgd * Also, if we had to steal it, give it back.
404 1.20 cgd */
405 1.62 yamt if (obp != NULL) {
406 1.62 yamt s = splbio();
407 1.20 cgd /*
408 1.20 cgd * [if another process is waiting for the raw I/O buffer,
409 1.20 cgd * wake up processes waiting to do physical I/O;
410 1.20 cgd */
411 1.62 yamt if (obp->b_flags & B_WANTED) {
412 1.62 yamt obp->b_flags &= ~B_WANTED;
413 1.62 yamt wakeup(obp);
414 1.20 cgd }
415 1.62 yamt splx(s);
416 1.20 cgd }
417 1.62 yamt PRELE(l);
418 1.20 cgd
419 1.62 yamt return error;
420 1.20 cgd }
421 1.20 cgd
422 1.20 cgd /*
423 1.20 cgd * Leffler, et al., says on p. 231:
424 1.20 cgd * "The minphys() routine is called by physio() to adjust the
425 1.20 cgd * size of each I/O transfer before the latter is passed to
426 1.24 mycroft * the strategy routine..."
427 1.20 cgd *
428 1.20 cgd * so, just adjust the buffer's count accounting to MAXPHYS here,
429 1.20 cgd * and return the new count;
430 1.20 cgd */
431 1.24 mycroft void
432 1.61 thorpej minphys(struct buf *bp)
433 1.20 cgd {
434 1.20 cgd
435 1.24 mycroft if (bp->b_bcount > MAXPHYS)
436 1.24 mycroft bp->b_bcount = MAXPHYS;
437 1.20 cgd }
438