/*	$NetBSD: rumpblk.c,v 1.10 2009/03/19 03:05:14 uebayasi Exp $	*/

/*
 * Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by the
 * Finnish Cultural Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Block device emulation.  Presents a block device interface and
 * uses rumpuser system calls to satisfy I/O requests.
 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.10 2009/03/19 03:05:14 uebayasi Exp $");
38
39 #include <sys/param.h>
40 #include <sys/buf.h>
41 #include <sys/conf.h>
42 #include <sys/disklabel.h>
43 #include <sys/fcntl.h>
44 #include <sys/kmem.h>
45 #include <sys/malloc.h>
46 #include <sys/stat.h>
47
48 #include <rump/rumpuser.h>
49
50 #include "rump_private.h"
51 #include "rump_vfs_private.h"
52
#define RUMPBLK_SIZE 16
/* Per-minor state.  A slot is free iff rblk_path == NULL. */
static struct rblkdev {
	char *rblk_path;	/* host path backing this minor, NULL = free */
	int rblk_fd;		/* host fd, -1 when the device is not open */
	uint8_t *rblk_mem;	/* mmap window; NULL means read/write path */
	size_t rblk_size;	/* length of the mmap window in bytes */

	struct partition *rblk_curpi;	/* partition handed out by DIOCGPART */
	struct partition rblk_pi;	/* synthetic partition for reg. files */
	struct disklabel rblk_dl;	/* disklabel: synthetic or from host */
} minors[RUMPBLK_SIZE];
64
/*
 * Prototypes for the devsw entry points.  Note that dump and size are
 * declared here but the switch tables below use nodump/nosize instead.
 */
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);
74
/* Block device switch: normal operation. */
static const struct bdevsw rumpblk_bdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_strategy, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

/* Block device switch: fault-injecting strategy (see rumpblk_init()). */
static const struct bdevsw rumpblk_bdevsw_fail = {
	rumpblk_open, rumpblk_close, rumpblk_strategy_fail, rumpblk_ioctl,
	nodump, nosize, D_DISK
};

/* Character device switch (read/write are unimplemented panic stubs). */
static const struct cdevsw rumpblk_cdevsw = {
	rumpblk_open, rumpblk_close, rumpblk_read, rumpblk_write,
	rumpblk_ioctl, nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};
89
/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;		/* failure rate; 0 = fault injection off */
static unsigned randstate;	/* private PRNG state, see gimmerand() */
94
95 int
96 rumpblk_init(void)
97 {
98 char buf[64];
99 int rumpblk = RUMPBLK;
100 int error;
101
102 if (rumpuser_getenv("RUMP_BLKFAIL", buf, sizeof(buf), &error) == 0) {
103 blkfail = strtoul(buf, NULL, 10);
104 /* fail everything */
105 if (blkfail > BLKFAIL_MAX)
106 blkfail = BLKFAIL_MAX;
107 if (rumpuser_getenv("RUMP_BLKFAIL_SEED", buf, sizeof(buf),
108 &error) == 0) {
109 randstate = strtoul(buf, NULL, 10);
110 } else {
111 randstate = arc4random(); /* XXX: not enough entropy */
112 }
113 printf("rumpblk: FAULT INJECTION ACTIVE! every %d out of"
114 " %d I/O will fail. key %u\n", blkfail, BLKFAIL_MAX,
115 randstate);
116 } else {
117 blkfail = 0;
118 }
119
120 if (blkfail) {
121 return devsw_attach("rumpblk", &rumpblk_bdevsw_fail, &rumpblk,
122 &rumpblk_cdevsw, &rumpblk);
123 } else {
124 return devsw_attach("rumpblk", &rumpblk_bdevsw, &rumpblk,
125 &rumpblk_cdevsw, &rumpblk);
126 }
127 }
128
129 int
130 rumpblk_register(const char *path)
131 {
132 size_t len;
133 int i;
134
135 for (i = 0; i < RUMPBLK_SIZE; i++)
136 if (minors[i].rblk_path && strcmp(minors[i].rblk_path, path) == 0)
137 return i;
138
139 for (i = 0; i < RUMPBLK_SIZE; i++)
140 if (minors[i].rblk_path == NULL)
141 break;
142 if (i == RUMPBLK_SIZE)
143 return -1;
144
145 len = strlen(path);
146 minors[i].rblk_path = malloc(len + 1, M_TEMP, M_WAITOK);
147 strcpy(minors[i].rblk_path, path);
148 minors[i].rblk_fd = -1;
149 return i;
150 }
151
152 int
153 rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
154 {
155 struct rblkdev *rblk = &minors[minor(dev)];
156 uint8_t *mem = NULL;
157 uint64_t fsize;
158 int ft, dummy;
159 int error, fd;
160
161 KASSERT(rblk->rblk_fd == -1);
162 fd = rumpuser_open(rblk->rblk_path, OFLAGS(flag), &error);
163 if (error)
164 return error;
165
166 if (rumpuser_getfileinfo(rblk->rblk_path, &fsize, &ft, &error) == -1) {
167 rumpuser_close(fd, &dummy);
168 return error;
169 }
170
171 if (ft == RUMPUSER_FT_REG) {
172 /*
173 * Try to mmap the file if it's size is max. half of
174 * the address space. If mmap fails due to e.g. limits,
175 * we fall back to the read/write path. This test is only
176 * to prevent size_t vs. off_t wraparounds.
177 */
178 if (fsize < UINT64_C(1) << (sizeof(void *) * 8 - 1)) {
179 int mmflags;
180
181 mmflags = 0;
182 if (flag & FREAD)
183 mmflags |= RUMPUSER_FILEMMAP_READ;
184 if (flag & FWRITE)
185 mmflags |= RUMPUSER_FILEMMAP_WRITE;
186 mem = rumpuser_filemmap(fd, 0, fsize, mmflags, &error);
187 }
188
189 memset(&rblk->rblk_dl, 0, sizeof(rblk->rblk_dl));
190
191 rblk->rblk_size = fsize;
192 rblk->rblk_pi.p_size = fsize >> DEV_BSHIFT;
193 rblk->rblk_dl.d_secsize = DEV_BSIZE;
194 rblk->rblk_curpi = &rblk->rblk_pi;
195 } else {
196 if (rumpuser_ioctl(fd,DIOCGDINFO, &rblk->rblk_dl, &error) != -1) {
197 rumpuser_close(fd, &dummy);
198 return error;
199 }
200
201 rblk->rblk_curpi = &rblk->rblk_dl.d_partitions[0];
202 }
203 rblk->rblk_fd = fd;
204 rblk->rblk_mem = mem;
205
206 return 0;
207 }
208
209 int
210 rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
211 {
212 struct rblkdev *rblk = &minors[minor(dev)];
213 int dummy;
214
215 if (rblk->rblk_mem) {
216 KASSERT(rblk->rblk_size);
217 rumpuser_memsync(rblk->rblk_mem, rblk->rblk_size, &dummy);
218 rumpuser_unmap(rblk->rblk_mem, rblk->rblk_size);
219 rblk->rblk_mem = NULL;
220 }
221 rumpuser_close(rblk->rblk_fd, &dummy);
222 rblk->rblk_fd = -1;
223
224 return 0;
225 }
226
227 int
228 rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
229 {
230 struct rblkdev *rblk = &minors[minor(dev)];
231 int rv, error;
232
233 if (xfer == DIOCGPART) {
234 struct partinfo *pi = (struct partinfo *)addr;
235
236 pi->part = rblk->rblk_curpi;
237 pi->disklab = &rblk->rblk_dl;
238
239 return 0;
240 }
241
242 rv = rumpuser_ioctl(rblk->rblk_fd, xfer, addr, &error);
243 if (rv == -1)
244 return error;
245
246 return 0;
247 }
248
/*
 * Raw character-device read: unimplemented stub.  Block I/O goes
 * through rumpblk_strategy(); hitting this path is a bug.
 */
int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

	panic("%s: unimplemented", __func__);
}
255
/*
 * Raw character-device write: unimplemented stub.  Block I/O goes
 * through rumpblk_strategy(); hitting this path is a bug.
 */
int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

	panic("%s: unimplemented", __func__);
}
262
263 static void
264 dostrategy(struct buf *bp)
265 {
266 struct rblkdev *rblk = &minors[minor(bp->b_dev)];
267 off_t off;
268 int async, error;
269
270 off = bp->b_blkno << DEV_BSHIFT;
271 async = bp->b_flags & B_ASYNC;
272 DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
273 " (0x%" PRIx64 " - 0x%" PRIx64")\n",
274 bp->b_bcount, BUF_ISREAD(bp) "READ" : "WRITE",
275 off, off, (off + bp->b_bcount)));
276
277 /* mem optimization? handle here and return */
278 if (rblk->rblk_mem) {
279 uint8_t *ioaddr = rblk->rblk_mem + off;
280 if (BUF_ISREAD(bp)) {
281 memcpy(bp->b_data, ioaddr, bp->b_bcount);
282 } else {
283 memcpy(ioaddr, bp->b_data, bp->b_bcount);
284 }
285
286 /* synchronous write, sync necessary bits back to disk */
287 if (BUF_ISWRITE(bp) && !async) {
288 rumpuser_memsync(ioaddr, bp->b_bcount, &error);
289 }
290 rump_biodone(bp, bp->b_bcount, 0);
291
292 return;
293 }
294
295 /*
296 * Do I/O. We have different paths for async and sync I/O.
297 * Async I/O is done by passing a request to rumpuser where
298 * it is executed. The rumpuser routine then calls
299 * biodone() to signal any waiters in the kernel. I/O's are
300 * executed in series. Technically executing them in parallel
301 * would produce better results, but then we'd need either
302 * more threads or posix aio. Maybe worth investigating
303 * this later.
304 *
305 * Synchronous I/O is done directly in the context mainly to
306 * avoid unnecessary scheduling with the I/O thread.
307 */
308 if (async && rump_threads) {
309 struct rumpuser_aio *rua;
310
311 rumpuser_mutex_enter(&rumpuser_aio_mtx);
312 /*
313 * Check if our buffer is full. Doing it this way
314 * throttles the I/O a bit if we have a massive
315 * async I/O burst.
316 */
317 if ((rumpuser_aio_head+1) % N_AIOS == rumpuser_aio_tail) {
318 rumpuser_mutex_exit(&rumpuser_aio_mtx);
319 goto syncfallback;
320 }
321
322 rua = &rumpuser_aios[rumpuser_aio_head];
323 KASSERT(rua->rua_bp == NULL);
324 rua->rua_fd = rblk->rblk_fd;
325 rua->rua_data = bp->b_data;
326 rua->rua_dlen = bp->b_bcount;
327 rua->rua_off = off;
328 rua->rua_bp = bp;
329 rua->rua_op = BUF_ISREAD(bp);
330
331 /* insert into queue & signal */
332 rumpuser_aio_head = (rumpuser_aio_head+1) % N_AIOS;
333 rumpuser_cv_signal(&rumpuser_aio_cv);
334 rumpuser_mutex_exit(&rumpuser_aio_mtx);
335 } else {
336 syncfallback:
337 if (BUF_ISREAD(bp)) {
338 rumpuser_read_bio(rblk->rblk_fd, bp->b_data,
339 bp->b_bcount, off, rump_biodone, bp);
340 } else {
341 rumpuser_write_bio(rblk->rblk_fd, bp->b_data,
342 bp->b_bcount, off, rump_biodone, bp);
343 }
344 if (!async) {
345 if (BUF_ISWRITE(bp))
346 rumpuser_fsync(rblk->rblk_fd, &error);
347 }
348 }
349 }
350
/*
 * Normal strategy entry point: thin wrapper around dostrategy().
 * The fault-injecting alternative is rumpblk_strategy_fail() below.
 */
void
rumpblk_strategy(struct buf *bp)
{

	dostrategy(bp);
}
357
358 /*
359 * Simple random number generator. This is private so that we can
360 * very repeatedly control which blocks will fail.
361 *
362 * <mlelstv> pooka, rand()
363 * <mlelstv> [paste]
364 */
365 static unsigned
366 gimmerand(void)
367 {
368
369 return (randstate = randstate * 1103515245 + 12345) % (0x80000000L);
370 }
371
372 /*
373 * Block device with very simple fault injection. Fails every
374 * n out of BLKFAIL_MAX I/O with EIO. n is determined by the env
375 * variable RUMP_BLKFAIL.
376 */
377 void
378 rumpblk_strategy_fail(struct buf *bp)
379 {
380
381 if (gimmerand() % BLKFAIL_MAX >= blkfail) {
382 dostrategy(bp);
383 } else {
384 printf("block fault injection: failing I/O on block %lld\n",
385 (long long)bp->b_blkno);
386 bp->b_error = EIO;
387 biodone(bp);
388 }
389 }
390