/*	$NetBSD: ld_ataraid.c,v 1.25 2008/03/21 21:54:59 ad Exp $	*/
2
3 /*
4 * Copyright (c) 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Jason R. Thorpe for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project by
20 * Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 * or promote products derived from this software without specific prior
23 * written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Support for ATA RAID logical disks.
40 *
41 * Note that all the RAID happens in software here; the ATA RAID
42 * controllers we're dealing with (Promise, etc.) only support
43 * configuration data on the component disks, with the BIOS supporting
44 * booting from the RAID volumes.
45 */
46
47 #include <sys/cdefs.h>
48 __KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.25 2008/03/21 21:54:59 ad Exp $");
49
50 #include "rnd.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/conf.h>
55 #include <sys/kernel.h>
56 #include <sys/device.h>
57 #include <sys/buf.h>
58 #include <sys/bufq.h>
59 #include <sys/dkio.h>
60 #include <sys/disk.h>
61 #include <sys/disklabel.h>
62 #include <sys/fcntl.h>
63 #include <sys/malloc.h>
64 #include <sys/vnode.h>
65 #include <sys/kauth.h>
66 #if NRND > 0
67 #include <sys/rnd.h>
68 #endif
69
70 #include <miscfs/specfs/specdev.h>
71
72 #include <dev/ldvar.h>
73
74 #include <dev/ata/ata_raidvar.h>
75
76 struct ld_ataraid_softc {
77 struct ld_softc sc_ld;
78
79 struct ataraid_array_info *sc_aai;
80 struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS];
81
82 void (*sc_iodone)(struct buf *);
83 };
84
85 static int ld_ataraid_match(struct device *, struct cfdata *, void *);
86 static void ld_ataraid_attach(struct device *, struct device *, void *);
87
88 static int ld_ataraid_dump(struct ld_softc *, void *, int, int);
89
90 static int ld_ataraid_start_span(struct ld_softc *, struct buf *);
91
92 static int ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
93 static void ld_ataraid_iodone_raid0(struct buf *);
94
95 CFATTACH_DECL(ld_ataraid, sizeof(struct ld_ataraid_softc),
96 ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
97
98 static int ld_ataraid_initialized;
99 static struct pool ld_ataraid_cbufpl;
100
101 struct cbuf {
102 struct buf cb_buf; /* new I/O buf */
103 struct buf *cb_obp; /* ptr. to original I/O buf */
104 struct ld_ataraid_softc *cb_sc; /* pointer to ld softc */
105 u_int cb_comp; /* target component */
106 SIMPLEQ_ENTRY(cbuf) cb_q; /* fifo of component buffers */
107 struct cbuf *cb_other; /* other cbuf in case of mirror */
108 int cb_flags;
109 #define CBUF_IODONE 0x00000001 /* I/O is already successfully done */
110 };
111
/*
 * Component buffer allocation helpers, backed by ld_ataraid_cbufpl.
 *
 * CBUF_GET() hands PR_NOWAIT to pool_get(); the caller in this file
 * checks the result for NULL.  The expansion carries no trailing
 * semicolon, so it behaves like an ordinary expression and can be used
 * inside a condition or an initializer.
 */
#define	CBUF_GET()	pool_get(&ld_ataraid_cbufpl, PR_NOWAIT)
#define	CBUF_PUT(cbp)	pool_put(&ld_ataraid_cbufpl, (cbp))
114
/*
 * ld_ataraid_match:
 *
 *	Autoconfiguration match routine.  It accepts unconditionally:
 *	none of the arguments are examined and the result is always 1.
 */
static int
ld_ataraid_match(struct device *parent,
    struct cfdata *match, void *aux)
{
	(void)parent;
	(void)match;
	(void)aux;

	return 1;
}
122
123 static void
124 ld_ataraid_attach(struct device *parent, struct device *self,
125 void *aux)
126 {
127 struct ld_ataraid_softc *sc = (void *) self;
128 struct ld_softc *ld = &sc->sc_ld;
129 struct ataraid_array_info *aai = aux;
130 const char *level;
131 struct vnode *vp;
132 char unklev[32];
133 u_int i;
134
135 if (ld_ataraid_initialized == 0) {
136 ld_ataraid_initialized = 1;
137 pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
138 0, 0, "ldcbuf", NULL, IPL_BIO);
139 }
140
141 sc->sc_aai = aai; /* this data persists */
142
143 ld->sc_maxxfer = MAXPHYS * aai->aai_width; /* XXX */
144 ld->sc_secperunit = aai->aai_capacity;
145 ld->sc_secsize = 512; /* XXX */
146 ld->sc_maxqueuecnt = 128; /* XXX */
147 ld->sc_dump = ld_ataraid_dump;
148
149 switch (aai->aai_level) {
150 case AAI_L_SPAN:
151 level = "SPAN";
152 ld->sc_start = ld_ataraid_start_span;
153 sc->sc_iodone = ld_ataraid_iodone_raid0;
154 break;
155
156 case AAI_L_RAID0:
157 level = "RAID-0";
158 ld->sc_start = ld_ataraid_start_raid0;
159 sc->sc_iodone = ld_ataraid_iodone_raid0;
160 break;
161
162 case AAI_L_RAID1:
163 level = "RAID-1";
164 ld->sc_start = ld_ataraid_start_raid0;
165 sc->sc_iodone = ld_ataraid_iodone_raid0;
166 break;
167
168 case AAI_L_RAID0 | AAI_L_RAID1:
169 level = "RAID-10";
170 ld->sc_start = ld_ataraid_start_raid0;
171 sc->sc_iodone = ld_ataraid_iodone_raid0;
172 break;
173
174 default:
175 snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
176 aai->aai_level);
177 level = unklev;
178 }
179
180 aprint_naive(": ATA %s array\n", level);
181 aprint_normal(": %s ATA %s array\n",
182 ata_raid_type_name(aai->aai_type), level);
183
184 if (ld->sc_start == NULL) {
185 aprint_error("%s: unsupported array type\n",
186 ld->sc_dv.dv_xname);
187 return;
188 }
189
190 /*
191 * We get a geometry from the device; use it.
192 */
193 ld->sc_nheads = aai->aai_heads;
194 ld->sc_nsectors = aai->aai_sectors;
195 ld->sc_ncylinders = aai->aai_cylinders;
196
197 /*
198 * Configure all the component disks.
199 */
200 for (i = 0; i < aai->aai_ndisks; i++) {
201 struct ataraid_disk_info *adi = &aai->aai_disks[i];
202 int bmajor, error;
203 dev_t dev;
204
205 bmajor = devsw_name2blk(adi->adi_dev->dv_xname, NULL, 0);
206 dev = MAKEDISKDEV(bmajor, device_unit(adi->adi_dev), RAW_PART);
207 error = bdevvp(dev, &vp);
208 if (error)
209 break;
210 error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED);
211 if (error) {
212 vput(vp);
213 /*
214 * XXX This is bogus. We should just mark the
215 * XXX component as FAILED, and write-back new
216 * XXX config blocks.
217 */
218 break;
219 }
220
221 VOP_UNLOCK(vp, 0);
222 sc->sc_vnodes[i] = vp;
223 }
224 if (i == aai->aai_ndisks) {
225 ld->sc_flags = LDF_ENABLED;
226 goto finish;
227 }
228
229 for (i = 0; i < aai->aai_ndisks; i++) {
230 vp = sc->sc_vnodes[i];
231 sc->sc_vnodes[i] = NULL;
232 if (vp != NULL)
233 (void) vn_close(vp, FREAD|FWRITE, NOCRED);
234 }
235
236 finish:
237 ldattach(ld);
238 }
239
240 static struct cbuf *
241 ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
242 u_int comp, daddr_t bn, void *addr, long bcount)
243 {
244 struct cbuf *cbp;
245
246 cbp = CBUF_GET();
247 if (cbp == NULL)
248 return (NULL);
249 buf_init(&cbp->cb_buf);
250 cbp->cb_buf.b_flags = bp->b_flags;
251 cbp->cb_buf.b_oflags = bp->b_oflags;
252 cbp->cb_buf.b_cflags = bp->b_cflags;
253 cbp->cb_buf.b_iodone = sc->sc_iodone;
254 cbp->cb_buf.b_proc = bp->b_proc;
255 cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
256 cbp->cb_buf.b_objlock = &sc->sc_vnodes[comp]->v_interlock;
257 cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
258 cbp->cb_buf.b_data = addr;
259 cbp->cb_buf.b_bcount = bcount;
260
261 /* Context for iodone */
262 cbp->cb_obp = bp;
263 cbp->cb_sc = sc;
264 cbp->cb_comp = comp;
265 cbp->cb_other = NULL;
266 cbp->cb_flags = 0;
267
268 return (cbp);
269 }
270
271 static int
272 ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
273 {
274 struct ld_ataraid_softc *sc = (void *) ld;
275 struct ataraid_array_info *aai = sc->sc_aai;
276 struct ataraid_disk_info *adi;
277 SIMPLEQ_HEAD(, cbuf) cbufq;
278 struct cbuf *cbp;
279 char *addr;
280 daddr_t bn;
281 long bcount, rcount;
282 u_int comp;
283
284 /* Allocate component buffers. */
285 SIMPLEQ_INIT(&cbufq);
286 addr = bp->b_data;
287
288 /* Find the first component. */
289 comp = 0;
290 adi = &aai->aai_disks[comp];
291 bn = bp->b_rawblkno;
292 while (bn >= adi->adi_compsize) {
293 bn -= adi->adi_compsize;
294 adi = &aai->aai_disks[++comp];
295 }
296
297 bp->b_resid = bp->b_bcount;
298
299 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
300 rcount = bp->b_bcount;
301 if ((adi->adi_compsize - bn) < btodb(rcount))
302 rcount = dbtob(adi->adi_compsize - bn);
303
304 cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
305 if (cbp == NULL) {
306 /* Free the already allocated component buffers. */
307 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
308 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
309 buf_destroy(&cbp->cb_buf);
310 CBUF_PUT(cbp);
311 }
312 return (EAGAIN);
313 }
314
315 /*
316 * For a span, we always know we advance to the next disk,
317 * and always start at offset 0 on that disk.
318 */
319 adi = &aai->aai_disks[++comp];
320 bn = 0;
321
322 SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
323 addr += rcount;
324 }
325
326 /* Now fire off the requests. */
327 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
328 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
329 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
330 mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
331 cbp->cb_buf.b_vp->v_numoutput++;
332 mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
333 }
334 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
335 }
336
337 return (0);
338 }
339
340 static int
341 ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
342 {
343 struct ld_ataraid_softc *sc = (void *) ld;
344 struct ataraid_array_info *aai = sc->sc_aai;
345 struct ataraid_disk_info *adi;
346 SIMPLEQ_HEAD(, cbuf) cbufq;
347 struct cbuf *cbp, *other_cbp;
348 char *addr;
349 daddr_t bn, cbn, tbn, off;
350 long bcount, rcount;
351 u_int comp;
352 const int read = bp->b_flags & B_READ;
353 const int mirror = aai->aai_level & AAI_L_RAID1;
354 int error;
355
356 /* Allocate component buffers. */
357 SIMPLEQ_INIT(&cbufq);
358 addr = bp->b_data;
359 bn = bp->b_rawblkno;
360
361 bp->b_resid = bp->b_bcount;
362
363 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
364 tbn = bn / aai->aai_interleave;
365 off = bn % aai->aai_interleave;
366
367 if (__predict_false(tbn == aai->aai_capacity /
368 aai->aai_interleave)) {
369 /* Last stripe. */
370 daddr_t sz = (aai->aai_capacity -
371 (tbn * aai->aai_interleave)) /
372 aai->aai_width;
373 comp = off / sz;
374 cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
375 (off % sz);
376 rcount = min(bcount, dbtob(sz));
377 } else {
378 comp = tbn % aai->aai_width;
379 cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
380 off;
381 rcount = min(bcount, dbtob(aai->aai_interleave - off));
382 }
383
384 /*
385 * See if a component is valid.
386 */
387 try_mirror:
388 adi = &aai->aai_disks[comp];
389 if ((adi->adi_status & ADI_S_ONLINE) == 0) {
390 if (mirror && comp < aai->aai_width) {
391 comp += aai->aai_width;
392 goto try_mirror;
393 }
394
395 /*
396 * No component available.
397 */
398 error = EIO;
399 goto free_and_exit;
400 }
401
402 cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
403 if (cbp == NULL) {
404 resource_shortage:
405 error = EAGAIN;
406 free_and_exit:
407 /* Free the already allocated component buffers. */
408 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
409 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
410 buf_destroy(&cbp->cb_buf);
411 CBUF_PUT(cbp);
412 }
413 return (error);
414 }
415 SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
416 if (mirror && !read && comp < aai->aai_width) {
417 comp += aai->aai_width;
418 adi = &aai->aai_disks[comp];
419 if (adi->adi_status & ADI_S_ONLINE) {
420 other_cbp = ld_ataraid_make_cbuf(sc, bp,
421 comp, cbn, addr, rcount);
422 if (other_cbp == NULL)
423 goto resource_shortage;
424 SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
425 other_cbp->cb_other = cbp;
426 cbp->cb_other = other_cbp;
427 }
428 }
429 bn += btodb(rcount);
430 addr += rcount;
431 }
432
433 /* Now fire off the requests. */
434 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
435 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
436 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
437 mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
438 cbp->cb_buf.b_vp->v_numoutput++;
439 mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
440 }
441 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
442 }
443
444 return (0);
445 }
446
447 /*
448 * Called at interrupt time. Mark the component as done and if all
449 * components are done, take an "interrupt".
450 */
451 static void
452 ld_ataraid_iodone_raid0(struct buf *vbp)
453 {
454 struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
455 struct buf *bp = cbp->cb_obp;
456 struct ld_ataraid_softc *sc = cbp->cb_sc;
457 struct ataraid_array_info *aai = sc->sc_aai;
458 struct ataraid_disk_info *adi;
459 long count;
460 int s, iodone;
461
462 s = splbio();
463
464 iodone = cbp->cb_flags & CBUF_IODONE;
465 other_cbp = cbp->cb_other;
466 if (other_cbp != NULL)
467 /* You are alone */
468 other_cbp->cb_other = NULL;
469
470 if (cbp->cb_buf.b_error != 0) {
471 /*
472 * Mark this component broken.
473 */
474 adi = &aai->aai_disks[cbp->cb_comp];
475 adi->adi_status &= ~ADI_S_ONLINE;
476
477 printf("%s: error %d on component %d (%s)\n",
478 sc->sc_ld.sc_dv.dv_xname, bp->b_error, cbp->cb_comp,
479 adi->adi_dev->dv_xname);
480
481 /*
482 * If we didn't see an error yet and we are reading
483 * RAID1 disk, try another component.
484 */
485 if (bp->b_error == 0 &&
486 (cbp->cb_buf.b_flags & B_READ) != 0 &&
487 (aai->aai_level & AAI_L_RAID1) != 0 &&
488 cbp->cb_comp < aai->aai_width) {
489 cbp->cb_comp += aai->aai_width;
490 adi = &aai->aai_disks[cbp->cb_comp];
491 if (adi->adi_status & ADI_S_ONLINE) {
492 cbp->cb_buf.b_error = 0;
493 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
494 goto out;
495 }
496 }
497
498 if (iodone || other_cbp != NULL)
499 /*
500 * If I/O on other component successfully done
501 * or the I/O is still in progress, no need
502 * to tell an error to upper layer.
503 */
504 ;
505 else {
506 bp->b_error = cbp->cb_buf.b_error ?
507 cbp->cb_buf.b_error : EIO;
508 }
509
510 /* XXX Update component config blocks. */
511
512 } else {
513 /*
514 * If other I/O is still in progress, tell it that
515 * our I/O is successfully done.
516 */
517 if (other_cbp != NULL)
518 other_cbp->cb_flags |= CBUF_IODONE;
519 }
520 count = cbp->cb_buf.b_bcount;
521 CBUF_PUT(cbp);
522
523 if (other_cbp != NULL)
524 goto out;
525
526 /* If all done, "interrupt". */
527 bp->b_resid -= count;
528 if (bp->b_resid < 0)
529 panic("ld_ataraid_iodone_raid0: count");
530 if (bp->b_resid == 0)
531 lddone(&sc->sc_ld, bp);
532
533 out:
534 splx(s);
535 }
536
/*
 * ld_ataraid_dump:
 *
 *	Dump entry point installed in the ld softc.  Dumping is not
 *	implemented: the arguments are ignored and EIO is always
 *	returned.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data,
    int blkno, int blkcnt)
{
	(void)sc;
	(void)data;
	(void)blkno;
	(void)blkcnt;

	return EIO;
}
544