Home | History | Annotate | Line # | Download | only in ata
ld_ataraid.c revision 1.25
      1 /*	$NetBSD: ld_ataraid.c,v 1.25 2008/03/21 21:54:59 ad Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 2003 Wasabi Systems, Inc.
      5  * All rights reserved.
      6  *
      7  * Written by Jason R. Thorpe for Wasabi Systems, Inc.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  * 3. All advertising materials mentioning features or use of this software
     18  *    must display the following acknowledgement:
     19  *	This product includes software developed for the NetBSD Project by
     20  *	Wasabi Systems, Inc.
     21  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
     22  *    or promote products derived from this software without specific prior
     23  *    written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
     26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 
     38 /*
     39  * Support for ATA RAID logical disks.
     40  *
     41  * Note that all the RAID happens in software here; the ATA RAID
     42  * controllers we're dealing with (Promise, etc.) only support
     43  * configuration data on the component disks, with the BIOS supporting
     44  * booting from the RAID volumes.
     45  */
     46 
     47 #include <sys/cdefs.h>
     48 __KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.25 2008/03/21 21:54:59 ad Exp $");
     49 
     50 #include "rnd.h"
     51 
     52 #include <sys/param.h>
     53 #include <sys/systm.h>
     54 #include <sys/conf.h>
     55 #include <sys/kernel.h>
     56 #include <sys/device.h>
     57 #include <sys/buf.h>
     58 #include <sys/bufq.h>
     59 #include <sys/dkio.h>
     60 #include <sys/disk.h>
     61 #include <sys/disklabel.h>
     62 #include <sys/fcntl.h>
     63 #include <sys/malloc.h>
     64 #include <sys/vnode.h>
     65 #include <sys/kauth.h>
     66 #if NRND > 0
     67 #include <sys/rnd.h>
     68 #endif
     69 
     70 #include <miscfs/specfs/specdev.h>
     71 
     72 #include <dev/ldvar.h>
     73 
     74 #include <dev/ata/ata_raidvar.h>
     75 
/*
 * Per-array state for one ATA RAID logical disk.
 */
struct ld_ataraid_softc {
	/*
	 * Generic logical-disk state.  The start routines receive a
	 * struct ld_softc pointer and cast it back to this structure,
	 * so this member has to stay first.
	 */
	struct ld_softc sc_ld;

	/* Array description handed to the attach routine. */
	struct ataraid_array_info *sc_aai;
	/* Opened vnode of each component disk, indexed by component. */
	struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS];

	/* Completion callback installed on every component buffer. */
	void	(*sc_iodone)(struct buf *);
};
     84 
/* Autoconfiguration entry points, registered via CFATTACH_DECL below. */
static int	ld_ataraid_match(struct device *, struct cfdata *, void *);
static void	ld_ataraid_attach(struct device *, struct device *, void *);

/* Crash dump hook stored in ld_softc.sc_dump by the attach routine. */
static int	ld_ataraid_dump(struct ld_softc *, void *, int, int);

/* Start routine used for SPAN arrays. */
static int	ld_ataraid_start_span(struct ld_softc *, struct buf *);

/*
 * Start routine used for RAID-0, RAID-1 and RAID-10 arrays, and the
 * completion handler shared by every supported level (SPAN included).
 */
static int	ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
static void	ld_ataraid_iodone_raid0(struct buf *);

/*
 * Attachment declaration: softc size plus the match and attach
 * routines; the two remaining hooks are left NULL.
 */
CFATTACH_DECL(ld_ataraid, sizeof(struct ld_ataraid_softc),
    ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
     97 
/* Set to 1 by the first attach, which also initializes the pool below. */
static int ld_ataraid_initialized;
/* Pool that struct cbuf component buffers are allocated from. */
static struct pool ld_ataraid_cbufpl;
    100 
/*
 * Component buffer: one piece of an original request, aimed at a single
 * component disk.  The completion handler casts the struct buf pointer
 * it is given back to a struct cbuf, so cb_buf has to stay first.
 */
struct cbuf {
	struct buf	cb_buf;		/* new I/O buf */
	struct buf	*cb_obp;	/* ptr. to original I/O buf */
	struct ld_ataraid_softc *cb_sc;	/* pointer to ld softc */
	u_int		cb_comp;	/* target component */
	SIMPLEQ_ENTRY(cbuf) cb_q;	/* fifo of component buffers */
	struct cbuf	*cb_other;	/* other cbuf in case of mirror */
	int		cb_flags;	/* CBUF_* flags, see below */
#define	CBUF_IODONE	0x00000001	/* I/O is already successfully done */
};
    111 
/*
 * Component buffer pool accessors.
 *
 * CBUF_GET() allocates with PR_NOWAIT; callers check the result for
 * NULL.  The expansion deliberately carries no trailing semicolon so
 * the macro behaves like an ordinary expression.
 * CBUF_PUT() hands a component buffer back to the pool.
 */
#define	CBUF_GET()	pool_get(&ld_ataraid_cbufpl, PR_NOWAIT)
#define	CBUF_PUT(cbp)	pool_put(&ld_ataraid_cbufpl, (cbp))
    114 
/*
 * Match routine registered through CFATTACH_DECL.  No argument is
 * examined; every candidate is accepted.
 */
static int
ld_ataraid_match(struct device *parent,
    struct cfdata *match, void *aux)
{
	(void)parent;
	(void)match;
	(void)aux;

	return 1;
}
    122 
    123 static void
    124 ld_ataraid_attach(struct device *parent, struct device *self,
    125     void *aux)
    126 {
    127 	struct ld_ataraid_softc *sc = (void *) self;
    128 	struct ld_softc *ld = &sc->sc_ld;
    129 	struct ataraid_array_info *aai = aux;
    130 	const char *level;
    131 	struct vnode *vp;
    132 	char unklev[32];
    133 	u_int i;
    134 
    135 	if (ld_ataraid_initialized == 0) {
    136 		ld_ataraid_initialized = 1;
    137 		pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
    138 		    0, 0, "ldcbuf", NULL, IPL_BIO);
    139 	}
    140 
    141 	sc->sc_aai = aai;	/* this data persists */
    142 
    143 	ld->sc_maxxfer = MAXPHYS * aai->aai_width;	/* XXX */
    144 	ld->sc_secperunit = aai->aai_capacity;
    145 	ld->sc_secsize = 512;				/* XXX */
    146 	ld->sc_maxqueuecnt = 128;			/* XXX */
    147 	ld->sc_dump = ld_ataraid_dump;
    148 
    149 	switch (aai->aai_level) {
    150 	case AAI_L_SPAN:
    151 		level = "SPAN";
    152 		ld->sc_start = ld_ataraid_start_span;
    153 		sc->sc_iodone = ld_ataraid_iodone_raid0;
    154 		break;
    155 
    156 	case AAI_L_RAID0:
    157 		level = "RAID-0";
    158 		ld->sc_start = ld_ataraid_start_raid0;
    159 		sc->sc_iodone = ld_ataraid_iodone_raid0;
    160 		break;
    161 
    162 	case AAI_L_RAID1:
    163 		level = "RAID-1";
    164 		ld->sc_start = ld_ataraid_start_raid0;
    165 		sc->sc_iodone = ld_ataraid_iodone_raid0;
    166 		break;
    167 
    168 	case AAI_L_RAID0 | AAI_L_RAID1:
    169 		level = "RAID-10";
    170 		ld->sc_start = ld_ataraid_start_raid0;
    171 		sc->sc_iodone = ld_ataraid_iodone_raid0;
    172 		break;
    173 
    174 	default:
    175 		snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
    176 		    aai->aai_level);
    177 		level = unklev;
    178 	}
    179 
    180 	aprint_naive(": ATA %s array\n", level);
    181 	aprint_normal(": %s ATA %s array\n",
    182 	    ata_raid_type_name(aai->aai_type), level);
    183 
    184 	if (ld->sc_start == NULL) {
    185 		aprint_error("%s: unsupported array type\n",
    186 		    ld->sc_dv.dv_xname);
    187 		return;
    188 	}
    189 
    190 	/*
    191 	 * We get a geometry from the device; use it.
    192 	 */
    193 	ld->sc_nheads = aai->aai_heads;
    194 	ld->sc_nsectors = aai->aai_sectors;
    195 	ld->sc_ncylinders = aai->aai_cylinders;
    196 
    197 	/*
    198 	 * Configure all the component disks.
    199 	 */
    200 	for (i = 0; i < aai->aai_ndisks; i++) {
    201 		struct ataraid_disk_info *adi = &aai->aai_disks[i];
    202 		int bmajor, error;
    203 		dev_t dev;
    204 
    205 		bmajor = devsw_name2blk(adi->adi_dev->dv_xname, NULL, 0);
    206 		dev = MAKEDISKDEV(bmajor, device_unit(adi->adi_dev), RAW_PART);
    207 		error = bdevvp(dev, &vp);
    208 		if (error)
    209 			break;
    210 		error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED);
    211 		if (error) {
    212 			vput(vp);
    213 			/*
    214 			 * XXX This is bogus.  We should just mark the
    215 			 * XXX component as FAILED, and write-back new
    216 			 * XXX config blocks.
    217 			 */
    218 			break;
    219 		}
    220 
    221 		VOP_UNLOCK(vp, 0);
    222 		sc->sc_vnodes[i] = vp;
    223 	}
    224 	if (i == aai->aai_ndisks) {
    225 		ld->sc_flags = LDF_ENABLED;
    226 		goto finish;
    227 	}
    228 
    229 	for (i = 0; i < aai->aai_ndisks; i++) {
    230 		vp = sc->sc_vnodes[i];
    231 		sc->sc_vnodes[i] = NULL;
    232 		if (vp != NULL)
    233 			(void) vn_close(vp, FREAD|FWRITE, NOCRED);
    234 	}
    235 
    236  finish:
    237 	ldattach(ld);
    238 }
    239 
    240 static struct cbuf *
    241 ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
    242     u_int comp, daddr_t bn, void *addr, long bcount)
    243 {
    244 	struct cbuf *cbp;
    245 
    246 	cbp = CBUF_GET();
    247 	if (cbp == NULL)
    248 		return (NULL);
    249 	buf_init(&cbp->cb_buf);
    250 	cbp->cb_buf.b_flags = bp->b_flags;
    251 	cbp->cb_buf.b_oflags = bp->b_oflags;
    252 	cbp->cb_buf.b_cflags = bp->b_cflags;
    253 	cbp->cb_buf.b_iodone = sc->sc_iodone;
    254 	cbp->cb_buf.b_proc = bp->b_proc;
    255 	cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
    256 	cbp->cb_buf.b_objlock = &sc->sc_vnodes[comp]->v_interlock;
    257 	cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
    258 	cbp->cb_buf.b_data = addr;
    259 	cbp->cb_buf.b_bcount = bcount;
    260 
    261 	/* Context for iodone */
    262 	cbp->cb_obp = bp;
    263 	cbp->cb_sc = sc;
    264 	cbp->cb_comp = comp;
    265 	cbp->cb_other = NULL;
    266 	cbp->cb_flags = 0;
    267 
    268 	return (cbp);
    269 }
    270 
    271 static int
    272 ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
    273 {
    274 	struct ld_ataraid_softc *sc = (void *) ld;
    275 	struct ataraid_array_info *aai = sc->sc_aai;
    276 	struct ataraid_disk_info *adi;
    277 	SIMPLEQ_HEAD(, cbuf) cbufq;
    278 	struct cbuf *cbp;
    279 	char *addr;
    280 	daddr_t bn;
    281 	long bcount, rcount;
    282 	u_int comp;
    283 
    284 	/* Allocate component buffers. */
    285 	SIMPLEQ_INIT(&cbufq);
    286 	addr = bp->b_data;
    287 
    288 	/* Find the first component. */
    289 	comp = 0;
    290 	adi = &aai->aai_disks[comp];
    291 	bn = bp->b_rawblkno;
    292 	while (bn >= adi->adi_compsize) {
    293 		bn -= adi->adi_compsize;
    294 		adi = &aai->aai_disks[++comp];
    295 	}
    296 
    297 	bp->b_resid = bp->b_bcount;
    298 
    299 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
    300 		rcount = bp->b_bcount;
    301 		if ((adi->adi_compsize - bn) < btodb(rcount))
    302 			rcount = dbtob(adi->adi_compsize - bn);
    303 
    304 		cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
    305 		if (cbp == NULL) {
    306 			/* Free the already allocated component buffers. */
    307 			while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
    308 				SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
    309 				buf_destroy(&cbp->cb_buf);
    310 				CBUF_PUT(cbp);
    311 			}
    312 			return (EAGAIN);
    313 		}
    314 
    315 		/*
    316 		 * For a span, we always know we advance to the next disk,
    317 		 * and always start at offset 0 on that disk.
    318 		 */
    319 		adi = &aai->aai_disks[++comp];
    320 		bn = 0;
    321 
    322 		SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
    323 		addr += rcount;
    324 	}
    325 
    326 	/* Now fire off the requests. */
    327 	while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
    328 		SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
    329 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
    330 			mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
    331 			cbp->cb_buf.b_vp->v_numoutput++;
    332 			mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
    333 		}
    334 		VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
    335 	}
    336 
    337 	return (0);
    338 }
    339 
    340 static int
    341 ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
    342 {
    343 	struct ld_ataraid_softc *sc = (void *) ld;
    344 	struct ataraid_array_info *aai = sc->sc_aai;
    345 	struct ataraid_disk_info *adi;
    346 	SIMPLEQ_HEAD(, cbuf) cbufq;
    347 	struct cbuf *cbp, *other_cbp;
    348 	char *addr;
    349 	daddr_t bn, cbn, tbn, off;
    350 	long bcount, rcount;
    351 	u_int comp;
    352 	const int read = bp->b_flags & B_READ;
    353 	const int mirror = aai->aai_level & AAI_L_RAID1;
    354 	int error;
    355 
    356 	/* Allocate component buffers. */
    357 	SIMPLEQ_INIT(&cbufq);
    358 	addr = bp->b_data;
    359 	bn = bp->b_rawblkno;
    360 
    361 	bp->b_resid = bp->b_bcount;
    362 
    363 	for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
    364 		tbn = bn / aai->aai_interleave;
    365 		off = bn % aai->aai_interleave;
    366 
    367 		if (__predict_false(tbn == aai->aai_capacity /
    368 					   aai->aai_interleave)) {
    369 			/* Last stripe. */
    370 			daddr_t sz = (aai->aai_capacity -
    371 				      (tbn * aai->aai_interleave)) /
    372 				     aai->aai_width;
    373 			comp = off / sz;
    374 			cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
    375 			    (off % sz);
    376 			rcount = min(bcount, dbtob(sz));
    377 		} else {
    378 			comp = tbn % aai->aai_width;
    379 			cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
    380 			    off;
    381 			rcount = min(bcount, dbtob(aai->aai_interleave - off));
    382 		}
    383 
    384 		/*
    385 		 * See if a component is valid.
    386 		 */
    387 try_mirror:
    388 		adi = &aai->aai_disks[comp];
    389 		if ((adi->adi_status & ADI_S_ONLINE) == 0) {
    390 			if (mirror && comp < aai->aai_width) {
    391 				comp += aai->aai_width;
    392 				goto try_mirror;
    393 			}
    394 
    395 			/*
    396 			 * No component available.
    397 			 */
    398 			error = EIO;
    399 			goto free_and_exit;
    400 		}
    401 
    402 		cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
    403 		if (cbp == NULL) {
    404 resource_shortage:
    405 			error = EAGAIN;
    406 free_and_exit:
    407 			/* Free the already allocated component buffers. */
    408 			while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
    409 				SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
    410 				buf_destroy(&cbp->cb_buf);
    411 				CBUF_PUT(cbp);
    412 			}
    413 			return (error);
    414 		}
    415 		SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
    416 		if (mirror && !read && comp < aai->aai_width) {
    417 			comp += aai->aai_width;
    418 			adi = &aai->aai_disks[comp];
    419 			if (adi->adi_status & ADI_S_ONLINE) {
    420 				other_cbp = ld_ataraid_make_cbuf(sc, bp,
    421 				    comp, cbn, addr, rcount);
    422 				if (other_cbp == NULL)
    423 					goto resource_shortage;
    424 				SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
    425 				other_cbp->cb_other = cbp;
    426 				cbp->cb_other = other_cbp;
    427 			}
    428 		}
    429 		bn += btodb(rcount);
    430 		addr += rcount;
    431 	}
    432 
    433 	/* Now fire off the requests. */
    434 	while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
    435 		SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
    436 		if ((cbp->cb_buf.b_flags & B_READ) == 0) {
    437 			mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
    438 			cbp->cb_buf.b_vp->v_numoutput++;
    439 			mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
    440 		}
    441 		VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
    442 	}
    443 
    444 	return (0);
    445 }
    446 
    447 /*
    448  * Called at interrupt time.  Mark the component as done and if all
    449  * components are done, take an "interrupt".
    450  */
    451 static void
    452 ld_ataraid_iodone_raid0(struct buf *vbp)
    453 {
    454 	struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
    455 	struct buf *bp = cbp->cb_obp;
    456 	struct ld_ataraid_softc *sc = cbp->cb_sc;
    457 	struct ataraid_array_info *aai = sc->sc_aai;
    458 	struct ataraid_disk_info *adi;
    459 	long count;
    460 	int s, iodone;
    461 
    462 	s = splbio();
    463 
    464 	iodone = cbp->cb_flags & CBUF_IODONE;
    465 	other_cbp = cbp->cb_other;
    466 	if (other_cbp != NULL)
    467 		/* You are alone */
    468 		other_cbp->cb_other = NULL;
    469 
    470 	if (cbp->cb_buf.b_error != 0) {
    471 		/*
    472 		 * Mark this component broken.
    473 		 */
    474 		adi = &aai->aai_disks[cbp->cb_comp];
    475 		adi->adi_status &= ~ADI_S_ONLINE;
    476 
    477 		printf("%s: error %d on component %d (%s)\n",
    478 		    sc->sc_ld.sc_dv.dv_xname, bp->b_error, cbp->cb_comp,
    479 		    adi->adi_dev->dv_xname);
    480 
    481 		/*
    482 		 * If we didn't see an error yet and we are reading
    483 		 * RAID1 disk, try another component.
    484 		 */
    485 		if (bp->b_error == 0 &&
    486 		    (cbp->cb_buf.b_flags & B_READ) != 0 &&
    487 		    (aai->aai_level & AAI_L_RAID1) != 0 &&
    488 		    cbp->cb_comp < aai->aai_width) {
    489 			cbp->cb_comp += aai->aai_width;
    490 			adi = &aai->aai_disks[cbp->cb_comp];
    491 			if (adi->adi_status & ADI_S_ONLINE) {
    492 				cbp->cb_buf.b_error = 0;
    493 				VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
    494 				goto out;
    495 			}
    496 		}
    497 
    498 		if (iodone || other_cbp != NULL)
    499 			/*
    500 			 * If I/O on other component successfully done
    501 			 * or the I/O is still in progress, no need
    502 			 * to tell an error to upper layer.
    503 			 */
    504 			;
    505 		else {
    506 			bp->b_error = cbp->cb_buf.b_error ?
    507 			    cbp->cb_buf.b_error : EIO;
    508 		}
    509 
    510 		/* XXX Update component config blocks. */
    511 
    512 	} else {
    513 		/*
    514 		 * If other I/O is still in progress, tell it that
    515 		 * our I/O is successfully done.
    516 		 */
    517 		if (other_cbp != NULL)
    518 			other_cbp->cb_flags |= CBUF_IODONE;
    519 	}
    520 	count = cbp->cb_buf.b_bcount;
    521 	CBUF_PUT(cbp);
    522 
    523 	if (other_cbp != NULL)
    524 		goto out;
    525 
    526 	/* If all done, "interrupt". */
    527 	bp->b_resid -= count;
    528 	if (bp->b_resid < 0)
    529 		panic("ld_ataraid_iodone_raid0: count");
    530 	if (bp->b_resid == 0)
    531 		lddone(&sc->sc_ld, bp);
    532 
    533 out:
    534 	splx(s);
    535 }
    536 
/*
 * Crash dump hook installed in ld_softc.sc_dump.  Dumping to the
 * array is not implemented: the arguments are ignored and EIO is
 * returned for every call.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data,
    int blkno, int blkcnt)
{
	(void)sc;
	(void)data;
	(void)blkno;
	(void)blkcnt;

	return EIO;
}
    544