ld_ataraid.c revision 1.12 1 /* $NetBSD: ld_ataraid.c,v 1.12 2004/09/17 23:21:53 enami Exp $ */
2
3 /*
4 * Copyright (c) 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Jason R. Thorpe for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project by
20 * Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 * or promote products derived from this software without specific prior
23 * written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Support for ATA RAID logical disks.
40 *
41 * Note that all the RAID happens in software here; the ATA RAID
42 * controllers we're dealing with (Promise, etc.) only support
43 * configuration data on the component disks, with the BIOS supporting
44 * booting from the RAID volumes.
45 */
46
47 #include <sys/cdefs.h>
48 __KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.12 2004/09/17 23:21:53 enami Exp $");
49
50 #include "rnd.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/conf.h>
55 #include <sys/kernel.h>
56 #include <sys/device.h>
57 #include <sys/buf.h>
58 #include <sys/dkio.h>
59 #include <sys/disk.h>
60 #include <sys/disklabel.h>
61 #include <sys/fcntl.h>
62 #include <sys/malloc.h>
63 #include <sys/vnode.h>
64 #if NRND > 0
65 #include <sys/rnd.h>
66 #endif
67
68 #include <miscfs/specfs/specdev.h>
69
70 #include <dev/ldvar.h>
71
72 #include <dev/ata/ata_raidvar.h>
73
74 struct ld_ataraid_softc {
75 struct ld_softc sc_ld;
76
77 struct ataraid_array_info *sc_aai;
78 struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS];
79
80 void (*sc_iodone)(struct buf *);
81 };
82
83 static int ld_ataraid_match(struct device *, struct cfdata *, void *);
84 static void ld_ataraid_attach(struct device *, struct device *, void *);
85
86 static int ld_ataraid_dump(struct ld_softc *, void *, int, int);
87
88 static int ld_ataraid_start_span(struct ld_softc *, struct buf *);
89
90 static int ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
91 static void ld_ataraid_iodone_raid0(struct buf *);
92
93 CFATTACH_DECL(ld_ataraid, sizeof(struct ld_ataraid_softc),
94 ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
95
96 static int ld_ataraid_initialized;
97 static struct pool ld_ataraid_cbufpl;
98
99 struct cbuf {
100 struct buf cb_buf; /* new I/O buf */
101 struct buf *cb_obp; /* ptr. to original I/O buf */
102 struct ld_ataraid_softc *cb_sc; /* pointer to ld softc */
103 u_int cb_comp; /* target component */
104 SIMPLEQ_ENTRY(cbuf) cb_q; /* fifo of component buffers */
105 struct cbuf *cb_other; /* other cbuf in case of mirror */
106 int cb_flags;
107 #define CBUF_IODONE 0x00000001 /* I/O is already successfully done */
108 };
109
/*
 * Allocate / release a component buffer from the cbuf pool.
 *
 * CBUF_GET() never sleeps (PR_NOWAIT) and therefore may yield NULL;
 * callers must check the result.  Both macros expand to a single
 * expression with no trailing semicolon so they can be used anywhere
 * an expression is valid.
 */
#define	CBUF_GET()	pool_get(&ld_ataraid_cbufpl, PR_NOWAIT)
#define	CBUF_PUT(cbp)	pool_put(&ld_ataraid_cbufpl, (cbp))
112
/*
 * ld_ataraid_match:
 *
 *	Autoconfiguration match routine.  Every candidate is accepted
 *	unconditionally; none of the arguments is examined.
 */
static int
ld_ataraid_match(struct device *parent, struct cfdata *match, void *aux)
{
	return 1;
}
119
120 static void
121 ld_ataraid_attach(struct device *parent, struct device *self, void *aux)
122 {
123 struct ld_ataraid_softc *sc = (void *) self;
124 struct ld_softc *ld = &sc->sc_ld;
125 struct ataraid_array_info *aai = aux;
126 const char *level;
127 struct vnode *vp;
128 char unklev[32];
129 u_int i;
130
131 if (ld_ataraid_initialized == 0) {
132 ld_ataraid_initialized = 1;
133 pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
134 0, 0, "ldcbuf", NULL);
135 }
136
137 sc->sc_aai = aai; /* this data persists */
138
139 ld->sc_maxxfer = MAXPHYS * aai->aai_width; /* XXX */
140 ld->sc_secperunit = aai->aai_capacity;
141 ld->sc_secsize = 512; /* XXX */
142 ld->sc_maxqueuecnt = 128; /* XXX */
143 ld->sc_dump = ld_ataraid_dump;
144
145 switch (aai->aai_level) {
146 case AAI_L_SPAN:
147 level = "SPAN";
148 ld->sc_start = ld_ataraid_start_span;
149 sc->sc_iodone = ld_ataraid_iodone_raid0;
150 break;
151
152 case AAI_L_RAID0:
153 level = "RAID-0";
154 ld->sc_start = ld_ataraid_start_raid0;
155 sc->sc_iodone = ld_ataraid_iodone_raid0;
156 break;
157
158 case AAI_L_RAID1:
159 level = "RAID-1";
160 ld->sc_start = ld_ataraid_start_raid0;
161 sc->sc_iodone = ld_ataraid_iodone_raid0;
162 break;
163
164 case AAI_L_RAID0 | AAI_L_RAID1:
165 level = "RAID-10";
166 ld->sc_start = ld_ataraid_start_raid0;
167 sc->sc_iodone = ld_ataraid_iodone_raid0;
168 break;
169
170 default:
171 snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
172 aai->aai_level);
173 level = unklev;
174 }
175
176 aprint_naive(": ATA %s array\n", level);
177 aprint_normal(": %s ATA %s array\n",
178 ata_raid_type_name(aai->aai_type), level);
179
180 if (ld->sc_start == NULL) {
181 aprint_error("%s: unsupported array type\n",
182 ld->sc_dv.dv_xname);
183 return;
184 }
185
186 /*
187 * We get a geometry from the device; use it.
188 */
189 ld->sc_nheads = aai->aai_heads;
190 ld->sc_nsectors = aai->aai_sectors;
191 ld->sc_ncylinders = aai->aai_cylinders;
192
193 /*
194 * Configure all the component disks.
195 */
196 for (i = 0; i < aai->aai_ndisks; i++) {
197 struct ataraid_disk_info *adi = &aai->aai_disks[i];
198 int bmajor, error;
199 dev_t dev;
200
201 bmajor = devsw_name2blk(adi->adi_dev->dv_xname, NULL, 0);
202 dev = MAKEDISKDEV(bmajor, adi->adi_dev->dv_unit, RAW_PART);
203 error = bdevvp(dev, &vp);
204 if (error)
205 break;
206 error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED, 0);
207 if (error) {
208 vput(vp);
209 /*
210 * XXX This is bogus. We should just mark the
211 * XXX component as FAILED, and write-back new
212 * XXX config blocks.
213 */
214 break;
215 }
216
217 VOP_UNLOCK(vp, 0);
218 sc->sc_vnodes[i] = vp;
219 }
220 if (i == aai->aai_ndisks) {
221 ld->sc_flags = LDF_ENABLED;
222 goto finish;
223 }
224
225 for (i = 0; i < aai->aai_ndisks; i++) {
226 vp = sc->sc_vnodes[i];
227 sc->sc_vnodes[i] = NULL;
228 if (vp != NULL)
229 (void) vn_close(vp, FREAD|FWRITE, NOCRED, curproc);
230 }
231
232 finish:
233 ldattach(ld);
234 }
235
236 static struct cbuf *
237 ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
238 u_int comp, daddr_t bn, caddr_t addr, long bcount)
239 {
240 struct cbuf *cbp;
241
242 cbp = CBUF_GET();
243 if (cbp == NULL)
244 return (NULL);
245 BUF_INIT(&cbp->cb_buf);
246 cbp->cb_buf.b_flags = bp->b_flags | B_CALL;
247 cbp->cb_buf.b_iodone = sc->sc_iodone;
248 cbp->cb_buf.b_proc = bp->b_proc;
249 cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
250 cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
251 cbp->cb_buf.b_data = addr;
252 cbp->cb_buf.b_bcount = bcount;
253
254 /* Context for iodone */
255 cbp->cb_obp = bp;
256 cbp->cb_sc = sc;
257 cbp->cb_comp = comp;
258 cbp->cb_other = NULL;
259 cbp->cb_flags = 0;
260
261 return (cbp);
262 }
263
264 static int
265 ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
266 {
267 struct ld_ataraid_softc *sc = (void *) ld;
268 struct ataraid_array_info *aai = sc->sc_aai;
269 struct ataraid_disk_info *adi;
270 SIMPLEQ_HEAD(, cbuf) cbufq;
271 struct cbuf *cbp;
272 caddr_t addr;
273 daddr_t bn;
274 long bcount, rcount;
275 u_int comp;
276
277 /* Allocate component buffers. */
278 SIMPLEQ_INIT(&cbufq);
279 addr = bp->b_data;
280
281 /* Find the first component. */
282 comp = 0;
283 adi = &aai->aai_disks[comp];
284 bn = bp->b_rawblkno;
285 while (bn >= adi->adi_compsize) {
286 bn -= adi->adi_compsize;
287 adi = &aai->aai_disks[++comp];
288 }
289
290 bp->b_resid = bp->b_bcount;
291
292 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
293 rcount = bp->b_bcount;
294 if ((adi->adi_compsize - bn) < btodb(rcount))
295 rcount = dbtob(adi->adi_compsize - bn);
296
297 cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
298 if (cbp == NULL) {
299 /* Free the already allocated component buffers. */
300 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
301 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
302 CBUF_PUT(cbp);
303 }
304 return (EAGAIN);
305 }
306
307 /*
308 * For a span, we always know we advance to the next disk,
309 * and always start at offset 0 on that disk.
310 */
311 adi = &aai->aai_disks[++comp];
312 bn = 0;
313
314 SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
315 addr += rcount;
316 }
317
318 /* Now fire off the requests. */
319 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
320 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
321 if ((cbp->cb_buf.b_flags & B_READ) == 0)
322 cbp->cb_buf.b_vp->v_numoutput++;
323 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
324 }
325
326 return (0);
327 }
328
329 static int
330 ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
331 {
332 struct ld_ataraid_softc *sc = (void *) ld;
333 struct ataraid_array_info *aai = sc->sc_aai;
334 struct ataraid_disk_info *adi;
335 SIMPLEQ_HEAD(, cbuf) cbufq;
336 struct cbuf *cbp, *other_cbp;
337 caddr_t addr;
338 daddr_t bn, cbn, tbn, off;
339 long bcount, rcount;
340 u_int comp;
341 const int read = bp->b_flags & B_READ;
342 const int mirror = aai->aai_level & AAI_L_RAID1;
343 int error;
344
345 /* Allocate component buffers. */
346 SIMPLEQ_INIT(&cbufq);
347 addr = bp->b_data;
348 bn = bp->b_rawblkno;
349
350 bp->b_resid = bp->b_bcount;
351
352 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
353 tbn = bn / aai->aai_interleave;
354 off = bn % aai->aai_interleave;
355
356 if (__predict_false(tbn == aai->aai_capacity /
357 aai->aai_interleave)) {
358 /* Last stripe. */
359 daddr_t sz = (aai->aai_capacity -
360 (tbn * aai->aai_interleave)) /
361 aai->aai_width;
362 comp = off / sz;
363 cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
364 (off % sz);
365 rcount = min(bcount, dbtob(sz));
366 } else {
367 comp = tbn % aai->aai_width;
368 cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
369 off;
370 rcount = min(bcount, dbtob(aai->aai_interleave - off));
371 }
372
373 /*
374 * See if a component is valid.
375 */
376 try_mirror:
377 adi = &aai->aai_disks[comp];
378 if ((adi->adi_status & ADI_S_ONLINE) == 0) {
379 if (mirror && comp < aai->aai_width) {
380 comp += aai->aai_width;
381 goto try_mirror;
382 }
383
384 /*
385 * No component available.
386 */
387 error = EIO;
388 goto free_and_exit;
389 }
390
391 cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
392 if (cbp == NULL) {
393 resource_shortage:
394 error = EAGAIN;
395 free_and_exit:
396 /* Free the already allocated component buffers. */
397 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
398 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
399 CBUF_PUT(cbp);
400 }
401 return (error);
402 }
403 SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
404 if (mirror && !read && comp < aai->aai_width) {
405 comp += aai->aai_width;
406 adi = &aai->aai_disks[comp];
407 if (adi->adi_status & ADI_S_ONLINE) {
408 other_cbp = ld_ataraid_make_cbuf(sc, bp,
409 comp, cbn, addr, rcount);
410 if (other_cbp == NULL)
411 goto resource_shortage;
412 SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
413 other_cbp->cb_other = cbp;
414 cbp->cb_other = other_cbp;
415 }
416 }
417 bn += btodb(rcount);
418 addr += rcount;
419 }
420
421 /* Now fire off the requests. */
422 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
423 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
424 if ((cbp->cb_buf.b_flags & B_READ) == 0)
425 cbp->cb_buf.b_vp->v_numoutput++;
426 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
427 }
428
429 return (0);
430 }
431
432 /*
433 * Called at interrupt time. Mark the component as done and if all
434 * components are done, take an "interrupt".
435 */
436 static void
437 ld_ataraid_iodone_raid0(struct buf *vbp)
438 {
439 struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
440 struct buf *bp = cbp->cb_obp;
441 struct ld_ataraid_softc *sc = cbp->cb_sc;
442 struct ataraid_array_info *aai = sc->sc_aai;
443 struct ataraid_disk_info *adi;
444 long count;
445 int s, iodone;
446
447 s = splbio();
448
449 iodone = cbp->cb_flags & CBUF_IODONE;
450 other_cbp = cbp->cb_other;
451 if (other_cbp != NULL)
452 /* You are alone */
453 other_cbp->cb_other = NULL;
454
455 if (cbp->cb_buf.b_flags & B_ERROR) {
456 /*
457 * Mark this component broken.
458 */
459 adi = &aai->aai_disks[cbp->cb_comp];
460 adi->adi_status &= ~ADI_S_ONLINE;
461
462 printf("%s: error %d on component %d (%s)\n",
463 sc->sc_ld.sc_dv.dv_xname, bp->b_error, cbp->cb_comp,
464 adi->adi_dev->dv_xname);
465
466 /*
467 * If we didn't see an error yet and we are reading
468 * RAID1 disk, try another component.
469 */
470 if ((bp->b_flags & B_ERROR) == 0 &&
471 (cbp->cb_buf.b_flags & B_READ) != 0 &&
472 (aai->aai_level & AAI_L_RAID1) != 0 &&
473 cbp->cb_comp < aai->aai_width) {
474 cbp->cb_comp += aai->aai_width;
475 adi = &aai->aai_disks[cbp->cb_comp];
476 if (adi->adi_status & ADI_S_ONLINE) {
477 cbp->cb_buf.b_flags &= ~B_ERROR;
478 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
479 goto out;
480 }
481 }
482
483 if (iodone || other_cbp != NULL)
484 /*
485 * If I/O on other component successfully done
486 * or the I/O is still in progress, no need
487 * to tell an error to upper layer.
488 */
489 ;
490 else {
491 bp->b_flags |= B_ERROR;
492 bp->b_error = cbp->cb_buf.b_error ?
493 cbp->cb_buf.b_error : EIO;
494 }
495
496 /* XXX Update component config blocks. */
497
498 } else {
499 /*
500 * If other I/O is still in progress, tell it that
501 * our I/O is successfully done.
502 */
503 if (other_cbp != NULL)
504 other_cbp->cb_flags |= CBUF_IODONE;
505 }
506 count = cbp->cb_buf.b_bcount;
507 CBUF_PUT(cbp);
508
509 if (other_cbp != NULL)
510 goto out;
511
512 /* If all done, "interrupt". */
513 bp->b_resid -= count;
514 if (bp->b_resid < 0)
515 panic("ld_ataraid_iodone_raid0: count");
516 if (bp->b_resid == 0)
517 lddone(&sc->sc_ld, bp);
518
519 out:
520 splx(s);
521 }
522
/*
 * ld_ataraid_dump:
 *
 *	Dump hook installed in the ld softc.  No dump is performed: the
 *	arguments are ignored and the call always fails with EIO.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data, int blkno, int blkcnt)
{
	return EIO;
}
529