ld_ataraid.c revision 1.28 1 /* $NetBSD: ld_ataraid.c,v 1.28 2008/08/24 09:19:03 hannken Exp $ */
2
3 /*
4 * Copyright (c) 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Jason R. Thorpe for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project by
20 * Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 * or promote products derived from this software without specific prior
23 * written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 /*
39 * Support for ATA RAID logical disks.
40 *
41 * Note that all the RAID happens in software here; the ATA RAID
42 * controllers we're dealing with (Promise, etc.) only support
43 * configuration data on the component disks, with the BIOS supporting
44 * booting from the RAID volumes.
45 */
46
47 #include <sys/cdefs.h>
48 __KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.28 2008/08/24 09:19:03 hannken Exp $");
49
50 #include "rnd.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/conf.h>
55 #include <sys/kernel.h>
56 #include <sys/device.h>
57 #include <sys/buf.h>
58 #include <sys/bufq.h>
59 #include <sys/dkio.h>
60 #include <sys/disk.h>
61 #include <sys/disklabel.h>
62 #include <sys/fcntl.h>
63 #include <sys/malloc.h>
64 #include <sys/vnode.h>
65 #include <sys/kauth.h>
66 #if NRND > 0
67 #include <sys/rnd.h>
68 #endif
69
70 #include <miscfs/specfs/specdev.h>
71
72 #include <dev/ldvar.h>
73
74 #include <dev/ata/ata_raidvar.h>
75
76 struct ld_ataraid_softc {
77 struct ld_softc sc_ld;
78
79 struct ataraid_array_info *sc_aai;
80 struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS];
81
82 void (*sc_iodone)(struct buf *);
83 };
84
85 static int ld_ataraid_match(struct device *, struct cfdata *, void *);
86 static void ld_ataraid_attach(struct device *, struct device *, void *);
87
88 static int ld_ataraid_dump(struct ld_softc *, void *, int, int);
89
90 static int ld_ataraid_start_span(struct ld_softc *, struct buf *);
91
92 static int ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
93 static void ld_ataraid_iodone_raid0(struct buf *);
94
95 CFATTACH_DECL_NEW(ld_ataraid, sizeof(struct ld_ataraid_softc),
96 ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
97
98 static int ld_ataraid_initialized;
99 static struct pool ld_ataraid_cbufpl;
100
101 struct cbuf {
102 struct buf cb_buf; /* new I/O buf */
103 struct buf *cb_obp; /* ptr. to original I/O buf */
104 struct ld_ataraid_softc *cb_sc; /* pointer to ld softc */
105 u_int cb_comp; /* target component */
106 SIMPLEQ_ENTRY(cbuf) cb_q; /* fifo of component buffers */
107 struct cbuf *cb_other; /* other cbuf in case of mirror */
108 int cb_flags;
109 #define CBUF_IODONE 0x00000001 /* I/O is already successfully done */
110 };
111
112 #define CBUF_GET() pool_get(&ld_ataraid_cbufpl, PR_NOWAIT);
113 #define CBUF_PUT(cbp) pool_put(&ld_ataraid_cbufpl, (cbp))
114
115 static int
116 ld_ataraid_match(device_t parent, cfdata_t match, void *aux)
117 {
118
119 return (1);
120 }
121
122 static void
123 ld_ataraid_attach(device_t parent, device_t self, void *aux)
124 {
125 struct ld_ataraid_softc *sc = device_private(self);
126 struct ld_softc *ld = &sc->sc_ld;
127 struct ataraid_array_info *aai = aux;
128 const char *level;
129 struct vnode *vp;
130 char unklev[32];
131 u_int i;
132
133 if (ld_ataraid_initialized == 0) {
134 ld_ataraid_initialized = 1;
135 pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
136 0, 0, "ldcbuf", NULL, IPL_BIO);
137 }
138
139 sc->sc_aai = aai; /* this data persists */
140
141 ld->sc_maxxfer = MAXPHYS * aai->aai_width; /* XXX */
142 ld->sc_secperunit = aai->aai_capacity;
143 ld->sc_secsize = 512; /* XXX */
144 ld->sc_maxqueuecnt = 128; /* XXX */
145 ld->sc_dump = ld_ataraid_dump;
146
147 switch (aai->aai_level) {
148 case AAI_L_SPAN:
149 level = "SPAN";
150 ld->sc_start = ld_ataraid_start_span;
151 sc->sc_iodone = ld_ataraid_iodone_raid0;
152 break;
153
154 case AAI_L_RAID0:
155 level = "RAID-0";
156 ld->sc_start = ld_ataraid_start_raid0;
157 sc->sc_iodone = ld_ataraid_iodone_raid0;
158 break;
159
160 case AAI_L_RAID1:
161 level = "RAID-1";
162 ld->sc_start = ld_ataraid_start_raid0;
163 sc->sc_iodone = ld_ataraid_iodone_raid0;
164 break;
165
166 case AAI_L_RAID0 | AAI_L_RAID1:
167 level = "RAID-10";
168 ld->sc_start = ld_ataraid_start_raid0;
169 sc->sc_iodone = ld_ataraid_iodone_raid0;
170 break;
171
172 default:
173 snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
174 aai->aai_level);
175 level = unklev;
176 }
177
178 aprint_naive(": ATA %s array\n", level);
179 aprint_normal(": %s ATA %s array\n",
180 ata_raid_type_name(aai->aai_type), level);
181
182 if (ld->sc_start == NULL) {
183 aprint_error_dev(&ld->sc_dv, "unsupported array type\n");
184 return;
185 }
186
187 /*
188 * We get a geometry from the device; use it.
189 */
190 ld->sc_nheads = aai->aai_heads;
191 ld->sc_nsectors = aai->aai_sectors;
192 ld->sc_ncylinders = aai->aai_cylinders;
193
194 /*
195 * Configure all the component disks.
196 */
197 for (i = 0; i < aai->aai_ndisks; i++) {
198 struct ataraid_disk_info *adi = &aai->aai_disks[i];
199 int bmajor, error;
200 dev_t dev;
201
202 bmajor = devsw_name2blk(device_xname(adi->adi_dev), NULL, 0);
203 dev = MAKEDISKDEV(bmajor, device_unit(adi->adi_dev), RAW_PART);
204 error = bdevvp(dev, &vp);
205 if (error)
206 break;
207 error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED);
208 if (error) {
209 vput(vp);
210 /*
211 * XXX This is bogus. We should just mark the
212 * XXX component as FAILED, and write-back new
213 * XXX config blocks.
214 */
215 break;
216 }
217
218 VOP_UNLOCK(vp, 0);
219 sc->sc_vnodes[i] = vp;
220 }
221 if (i == aai->aai_ndisks) {
222 ld->sc_flags = LDF_ENABLED;
223 goto finish;
224 }
225
226 for (i = 0; i < aai->aai_ndisks; i++) {
227 vp = sc->sc_vnodes[i];
228 sc->sc_vnodes[i] = NULL;
229 if (vp != NULL)
230 (void) vn_close(vp, FREAD|FWRITE, NOCRED);
231 }
232
233 finish:
234 ldattach(ld);
235 }
236
237 static struct cbuf *
238 ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
239 u_int comp, daddr_t bn, void *addr, long bcount)
240 {
241 struct cbuf *cbp;
242
243 cbp = CBUF_GET();
244 if (cbp == NULL)
245 return (NULL);
246 buf_init(&cbp->cb_buf);
247 cbp->cb_buf.b_flags = bp->b_flags;
248 cbp->cb_buf.b_oflags = bp->b_oflags;
249 cbp->cb_buf.b_cflags = bp->b_cflags;
250 cbp->cb_buf.b_iodone = sc->sc_iodone;
251 cbp->cb_buf.b_proc = bp->b_proc;
252 cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
253 cbp->cb_buf.b_objlock = &sc->sc_vnodes[comp]->v_interlock;
254 cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
255 cbp->cb_buf.b_data = addr;
256 cbp->cb_buf.b_bcount = bcount;
257
258 /* Context for iodone */
259 cbp->cb_obp = bp;
260 cbp->cb_sc = sc;
261 cbp->cb_comp = comp;
262 cbp->cb_other = NULL;
263 cbp->cb_flags = 0;
264
265 return (cbp);
266 }
267
268 static int
269 ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
270 {
271 struct ld_ataraid_softc *sc = (void *) ld;
272 struct ataraid_array_info *aai = sc->sc_aai;
273 struct ataraid_disk_info *adi;
274 SIMPLEQ_HEAD(, cbuf) cbufq;
275 struct cbuf *cbp;
276 char *addr;
277 daddr_t bn;
278 long bcount, rcount;
279 u_int comp;
280
281 /* Allocate component buffers. */
282 SIMPLEQ_INIT(&cbufq);
283 addr = bp->b_data;
284
285 /* Find the first component. */
286 comp = 0;
287 adi = &aai->aai_disks[comp];
288 bn = bp->b_rawblkno;
289 while (bn >= adi->adi_compsize) {
290 bn -= adi->adi_compsize;
291 adi = &aai->aai_disks[++comp];
292 }
293
294 bp->b_resid = bp->b_bcount;
295
296 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
297 rcount = bp->b_bcount;
298 if ((adi->adi_compsize - bn) < btodb(rcount))
299 rcount = dbtob(adi->adi_compsize - bn);
300
301 cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
302 if (cbp == NULL) {
303 /* Free the already allocated component buffers. */
304 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
305 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
306 buf_destroy(&cbp->cb_buf);
307 CBUF_PUT(cbp);
308 }
309 return (EAGAIN);
310 }
311
312 /*
313 * For a span, we always know we advance to the next disk,
314 * and always start at offset 0 on that disk.
315 */
316 adi = &aai->aai_disks[++comp];
317 bn = 0;
318
319 SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
320 addr += rcount;
321 }
322
323 /* Now fire off the requests. */
324 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
325 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
326 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
327 mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
328 cbp->cb_buf.b_vp->v_numoutput++;
329 mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
330 }
331 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
332 }
333
334 return (0);
335 }
336
337 static int
338 ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
339 {
340 struct ld_ataraid_softc *sc = (void *) ld;
341 struct ataraid_array_info *aai = sc->sc_aai;
342 struct ataraid_disk_info *adi;
343 SIMPLEQ_HEAD(, cbuf) cbufq;
344 struct cbuf *cbp, *other_cbp;
345 char *addr;
346 daddr_t bn, cbn, tbn, off;
347 long bcount, rcount;
348 u_int comp;
349 const int read = bp->b_flags & B_READ;
350 const int mirror = aai->aai_level & AAI_L_RAID1;
351 int error;
352
353 /* Allocate component buffers. */
354 SIMPLEQ_INIT(&cbufq);
355 addr = bp->b_data;
356 bn = bp->b_rawblkno;
357
358 bp->b_resid = bp->b_bcount;
359
360 for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
361 tbn = bn / aai->aai_interleave;
362 off = bn % aai->aai_interleave;
363
364 if (__predict_false(tbn == aai->aai_capacity /
365 aai->aai_interleave)) {
366 /* Last stripe. */
367 daddr_t sz = (aai->aai_capacity -
368 (tbn * aai->aai_interleave)) /
369 aai->aai_width;
370 comp = off / sz;
371 cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
372 (off % sz);
373 rcount = min(bcount, dbtob(sz));
374 } else {
375 comp = tbn % aai->aai_width;
376 cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
377 off;
378 rcount = min(bcount, dbtob(aai->aai_interleave - off));
379 }
380
381 /*
382 * See if a component is valid.
383 */
384 try_mirror:
385 adi = &aai->aai_disks[comp];
386 if ((adi->adi_status & ADI_S_ONLINE) == 0) {
387 if (mirror && comp < aai->aai_width) {
388 comp += aai->aai_width;
389 goto try_mirror;
390 }
391
392 /*
393 * No component available.
394 */
395 error = EIO;
396 goto free_and_exit;
397 }
398
399 cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
400 if (cbp == NULL) {
401 resource_shortage:
402 error = EAGAIN;
403 free_and_exit:
404 /* Free the already allocated component buffers. */
405 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
406 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
407 buf_destroy(&cbp->cb_buf);
408 CBUF_PUT(cbp);
409 }
410 return (error);
411 }
412 SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
413 if (mirror && !read && comp < aai->aai_width) {
414 comp += aai->aai_width;
415 adi = &aai->aai_disks[comp];
416 if (adi->adi_status & ADI_S_ONLINE) {
417 other_cbp = ld_ataraid_make_cbuf(sc, bp,
418 comp, cbn, addr, rcount);
419 if (other_cbp == NULL)
420 goto resource_shortage;
421 SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
422 other_cbp->cb_other = cbp;
423 cbp->cb_other = other_cbp;
424 }
425 }
426 bn += btodb(rcount);
427 addr += rcount;
428 }
429
430 /* Now fire off the requests. */
431 while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
432 SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
433 if ((cbp->cb_buf.b_flags & B_READ) == 0) {
434 mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
435 cbp->cb_buf.b_vp->v_numoutput++;
436 mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
437 }
438 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
439 }
440
441 return (0);
442 }
443
444 /*
445 * Called at interrupt time. Mark the component as done and if all
446 * components are done, take an "interrupt".
447 */
448 static void
449 ld_ataraid_iodone_raid0(struct buf *vbp)
450 {
451 struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
452 struct buf *bp = cbp->cb_obp;
453 struct ld_ataraid_softc *sc = cbp->cb_sc;
454 struct ataraid_array_info *aai = sc->sc_aai;
455 struct ataraid_disk_info *adi;
456 long count;
457 int s, iodone;
458
459 s = splbio();
460
461 iodone = cbp->cb_flags & CBUF_IODONE;
462 other_cbp = cbp->cb_other;
463 if (other_cbp != NULL)
464 /* You are alone */
465 other_cbp->cb_other = NULL;
466
467 if (cbp->cb_buf.b_error != 0) {
468 /*
469 * Mark this component broken.
470 */
471 adi = &aai->aai_disks[cbp->cb_comp];
472 adi->adi_status &= ~ADI_S_ONLINE;
473
474 printf("%s: error %d on component %d (%s)\n",
475 device_xname(&sc->sc_ld.sc_dv), bp->b_error, cbp->cb_comp,
476 device_xname(adi->adi_dev));
477
478 /*
479 * If we didn't see an error yet and we are reading
480 * RAID1 disk, try another component.
481 */
482 if (bp->b_error == 0 &&
483 (cbp->cb_buf.b_flags & B_READ) != 0 &&
484 (aai->aai_level & AAI_L_RAID1) != 0 &&
485 cbp->cb_comp < aai->aai_width) {
486 cbp->cb_comp += aai->aai_width;
487 adi = &aai->aai_disks[cbp->cb_comp];
488 if (adi->adi_status & ADI_S_ONLINE) {
489 cbp->cb_buf.b_error = 0;
490 VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
491 goto out;
492 }
493 }
494
495 if (iodone || other_cbp != NULL)
496 /*
497 * If I/O on other component successfully done
498 * or the I/O is still in progress, no need
499 * to tell an error to upper layer.
500 */
501 ;
502 else {
503 bp->b_error = cbp->cb_buf.b_error ?
504 cbp->cb_buf.b_error : EIO;
505 }
506
507 /* XXX Update component config blocks. */
508
509 } else {
510 /*
511 * If other I/O is still in progress, tell it that
512 * our I/O is successfully done.
513 */
514 if (other_cbp != NULL)
515 other_cbp->cb_flags |= CBUF_IODONE;
516 }
517 count = cbp->cb_buf.b_bcount;
518 buf_destroy(&cbp->cb_buf);
519 CBUF_PUT(cbp);
520
521 if (other_cbp != NULL)
522 goto out;
523
524 /* If all done, "interrupt". */
525 bp->b_resid -= count;
526 if (bp->b_resid < 0)
527 panic("ld_ataraid_iodone_raid0: count");
528 if (bp->b_resid == 0)
529 lddone(&sc->sc_ld, bp);
530
531 out:
532 splx(s);
533 }
534
/*
 * Dump routine installed as the unit's sc_dump hook.
 *
 * Dumping to an ATA RAID array is not implemented: the arguments are
 * ignored and EIO is always returned.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data,
    int blkno, int blkcnt)
{
	const int unsupported = EIO;

	return unsupported;
}
542