      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.225 2007/03/04 06:02:38 christos Exp $	*/
      2 /*-
      3  * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This code is derived from software contributed to The NetBSD Foundation
      7  * by Greg Oster; Jason R. Thorpe.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  * 3. All advertising materials mentioning features or use of this software
     18  *    must display the following acknowledgement:
     19  *        This product includes software developed by the NetBSD
     20  *        Foundation, Inc. and its contributors.
     21  * 4. Neither the name of The NetBSD Foundation nor the names of its
     22  *    contributors may be used to endorse or promote products derived
     23  *    from this software without specific prior written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 
     38 /*
     39  * Copyright (c) 1990, 1993
     40  *      The Regents of the University of California.  All rights reserved.
     41  *
     42  * This code is derived from software contributed to Berkeley by
     43  * the Systems Programming Group of the University of Utah Computer
     44  * Science Department.
     45  *
     46  * Redistribution and use in source and binary forms, with or without
     47  * modification, are permitted provided that the following conditions
     48  * are met:
     49  * 1. Redistributions of source code must retain the above copyright
     50  *    notice, this list of conditions and the following disclaimer.
     51  * 2. Redistributions in binary form must reproduce the above copyright
     52  *    notice, this list of conditions and the following disclaimer in the
     53  *    documentation and/or other materials provided with the distribution.
     54  * 3. Neither the name of the University nor the names of its contributors
     55  *    may be used to endorse or promote products derived from this software
     56  *    without specific prior written permission.
     57  *
     58  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     59  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     60  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     61  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     62  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     63  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     64  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68  * SUCH DAMAGE.
     69  *
     70  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     71  *
     72  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     73  */
     74 
     75 /*
     76  * Copyright (c) 1988 University of Utah.
     77  *
     78  * This code is derived from software contributed to Berkeley by
     79  * the Systems Programming Group of the University of Utah Computer
     80  * Science Department.
     81  *
     82  * Redistribution and use in source and binary forms, with or without
     83  * modification, are permitted provided that the following conditions
     84  * are met:
     85  * 1. Redistributions of source code must retain the above copyright
     86  *    notice, this list of conditions and the following disclaimer.
     87  * 2. Redistributions in binary form must reproduce the above copyright
     88  *    notice, this list of conditions and the following disclaimer in the
     89  *    documentation and/or other materials provided with the distribution.
     90  * 3. All advertising materials mentioning features or use of this software
     91  *    must display the following acknowledgement:
     92  *      This product includes software developed by the University of
     93  *      California, Berkeley and its contributors.
     94  * 4. Neither the name of the University nor the names of its contributors
     95  *    may be used to endorse or promote products derived from this software
     96  *    without specific prior written permission.
     97  *
     98  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     99  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    100  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    101  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
    102  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    103  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    104  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    105  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    106  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    107  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    108  * SUCH DAMAGE.
    109  *
    110  * from: Utah $Hdr: cd.c 1.6 90/11/28$
    111  *
    112  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
    113  */
    114 
    115 /*
    116  * Copyright (c) 1995 Carnegie-Mellon University.
    117  * All rights reserved.
    118  *
    119  * Authors: Mark Holland, Jim Zelenka
    120  *
    121  * Permission to use, copy, modify and distribute this software and
    122  * its documentation is hereby granted, provided that both the copyright
    123  * notice and this permission notice appear in all copies of the
    124  * software, derivative works or modified versions, and any portions
    125  * thereof, and that both notices appear in supporting documentation.
    126  *
    127  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
    128  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
    129  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
    130  *
    131  * Carnegie Mellon requests users of this software to return to
    132  *
    133  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
    134  *  School of Computer Science
    135  *  Carnegie Mellon University
    136  *  Pittsburgh PA 15213-3890
    137  *
    138  * any improvements or extensions that they make and grant Carnegie the
    139  * rights to redistribute these changes.
    140  */
    141 
    142 /***********************************************************
    143  *
    144  * rf_kintf.c -- the kernel interface routines for RAIDframe
    145  *
    146  ***********************************************************/
    147 
    148 #include <sys/cdefs.h>
    149 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.225 2007/03/04 06:02:38 christos Exp $");
    150 
    151 #include <sys/param.h>
    152 #include <sys/errno.h>
    153 #include <sys/pool.h>
    154 #include <sys/proc.h>
    155 #include <sys/queue.h>
    156 #include <sys/disk.h>
    157 #include <sys/device.h>
    158 #include <sys/stat.h>
    159 #include <sys/ioctl.h>
    160 #include <sys/fcntl.h>
    161 #include <sys/systm.h>
    162 #include <sys/namei.h>
    163 #include <sys/vnode.h>
    164 #include <sys/disklabel.h>
    165 #include <sys/conf.h>
    166 #include <sys/lock.h>
    167 #include <sys/buf.h>
    168 #include <sys/bufq.h>
    169 #include <sys/user.h>
    170 #include <sys/reboot.h>
    171 #include <sys/kauth.h>
    172 
    173 #include <dev/raidframe/raidframevar.h>
    174 #include <dev/raidframe/raidframeio.h>
    175 #include "raid.h"
    176 #include "opt_raid_autoconfig.h"
    177 #include "rf_raid.h"
    178 #include "rf_copyback.h"
    179 #include "rf_dag.h"
    180 #include "rf_dagflags.h"
    181 #include "rf_desc.h"
    182 #include "rf_diskqueue.h"
    183 #include "rf_etimer.h"
    184 #include "rf_general.h"
    185 #include "rf_kintf.h"
    186 #include "rf_options.h"
    187 #include "rf_driver.h"
    188 #include "rf_parityscan.h"
    189 #include "rf_threadstuff.h"
    190 
    191 #ifdef DEBUG
    192 int     rf_kdebug_level = 0;
    193 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    194 #else				/* DEBUG */
    195 #define db1_printf(a) { }
    196 #endif				/* DEBUG */
    197 
    198 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    199 
    200 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
    201 
    202 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    203 						 * spare table */
    204 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    205 						 * installation process */
    206 
    207 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    208 
    209 /* prototypes */
    210 static void KernelWakeupFunc(struct buf *);
    211 static void InitBP(struct buf *, struct vnode *, unsigned,
    212     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    213     void *, int, struct proc *);
    214 static void raidinit(RF_Raid_t *);
    215 
    216 void raidattach(int);
    217 static int raid_match(struct device *, struct cfdata *, void *);
    218 static void raid_attach(struct device *, struct device *, void *);
    219 static int raid_detach(struct device *, int);
    220 
    221 dev_type_open(raidopen);
    222 dev_type_close(raidclose);
    223 dev_type_read(raidread);
    224 dev_type_write(raidwrite);
    225 dev_type_ioctl(raidioctl);
    226 dev_type_strategy(raidstrategy);
    227 dev_type_dump(raiddump);
    228 dev_type_size(raidsize);
    229 
    230 const struct bdevsw raid_bdevsw = {
    231 	raidopen, raidclose, raidstrategy, raidioctl,
    232 	raiddump, raidsize, D_DISK
    233 };
    234 
    235 const struct cdevsw raid_cdevsw = {
    236 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    237 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    238 };
    239 
    240 /* XXX Not sure if the following should be replacing the raidPtrs above,
    241    or if it should be used in conjunction with that...
    242 */
    243 
    244 struct raid_softc {
    245 	struct device *sc_dev;
    246 	int     sc_flags;	/* flags */
    247 	int     sc_cflags;	/* configuration flags */
    248 	uint64_t sc_size;	/* size of the raid device */
    249 	char    sc_xname[20];	/* XXX external name */
    250 	struct disk sc_dkdev;	/* generic disk device info */
    251 	struct bufq_state *buf_queue;	/* used for the device queue */
    252 };
    253 /* sc_flags */
    254 #define RAIDF_INITED	0x01	/* unit has been initialized */
    255 #define RAIDF_WLABEL	0x02	/* label area is writable */
    256 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    257 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    258 #define RAIDF_LOCKED	0x80	/* unit is locked */
    259 
    260 #define	raidunit(x)	DISKUNIT(x)
    261 int numraid = 0;
    262 
    263 extern struct cfdriver raid_cd;
    264 CFATTACH_DECL(raid, sizeof(struct raid_softc),
    265     raid_match, raid_attach, raid_detach, NULL);
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
    276  * Even if it is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
    284 #ifndef RAIDOUTSTANDING
    285 #define RAIDOUTSTANDING   6
    286 #endif
    287 
    288 #define RAIDLABELDEV(dev)	\
    289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 struct raid_softc *raid_softc;
    293 
    294 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    295 				     struct disklabel *);
    296 static void raidgetdisklabel(dev_t);
    297 static void raidmakedisklabel(struct raid_softc *);
    298 
    299 static int raidlock(struct raid_softc *);
    300 static void raidunlock(struct raid_softc *);
    301 
    302 static void rf_markalldirty(RF_Raid_t *);
    303 
    304 void rf_ReconThread(struct rf_recon_req *);
    305 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    306 void rf_CopybackThread(RF_Raid_t *raidPtr);
    307 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    308 int rf_autoconfig(struct device *self);
    309 void rf_buildroothack(RF_ConfigSet_t *);
    310 
    311 RF_AutoConfig_t *rf_find_raid_components(void);
    312 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    314 static int rf_reasonable_label(RF_ComponentLabel_t *);
    315 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    316 int rf_set_autoconfig(RF_Raid_t *, int);
    317 int rf_set_rootpartition(RF_Raid_t *, int);
    318 void rf_release_all_vps(RF_ConfigSet_t *);
    319 void rf_cleanup_config_set(RF_ConfigSet_t *);
    320 int rf_have_enough_components(RF_ConfigSet_t *);
    321 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    322 
    323 static int raidautoconfig = 0; /* Debugging, mostly.  Set to 0 to not
    324 				  allow autoconfig to take place.
    325 				  Note that this is overridden by having
    326 				  RAID_AUTOCONFIG as an option in the
    327 				  kernel config file.  */
    328 
    329 struct RF_Pools_s rf_pools;
    330 
    331 void
    332 raidattach(int num)
    333 {
    334 	int raidID;
    335 	int i, rc;
    336 
    337 #ifdef DEBUG
    338 	printf("raidattach: Asked for %d units\n", num);
    339 #endif
    340 
    341 	if (num <= 0) {
    342 #ifdef DIAGNOSTIC
    343 		panic("raidattach: count <= 0");
    344 #endif
    345 		return;
    346 	}
    347 	/* This is where all the initialization stuff gets done. */
    348 
    349 	numraid = num;
    350 
    351 	/* Make some space for requested number of units... */
    352 
    353 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    354 	if (raidPtrs == NULL) {
    355 		panic("raidPtrs is NULL!!");
    356 	}
    357 
    358 	rf_mutex_init(&rf_sparet_wait_mutex);
    359 
    360 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    361 
    362 	for (i = 0; i < num; i++)
    363 		raidPtrs[i] = NULL;
    364 	rc = rf_BootRaidframe();
    365 	if (rc == 0)
    366 		printf("Kernelized RAIDframe activated\n");
    367 	else
    368 		panic("Serious error booting RAID!!");
    369 
    370 	/* put together some data structures like the CCD device does.  This
    371 	 * lets us lock the device and what-not when it gets opened. */
    372 
    373 	raid_softc = (struct raid_softc *)
    374 		malloc(num * sizeof(struct raid_softc),
    375 		       M_RAIDFRAME, M_NOWAIT);
    376 	if (raid_softc == NULL) {
    377 		printf("WARNING: no memory for RAIDframe driver\n");
    378 		return;
    379 	}
    380 
    381 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    382 
    383 	for (raidID = 0; raidID < num; raidID++) {
    384 		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
    385 
    386 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    387 			  (RF_Raid_t *));
    388 		if (raidPtrs[raidID] == NULL) {
    389 			printf("WARNING: raidPtrs[%d] is NULL\n", raidID);
    390 			numraid = raidID;
    391 			return;
    392 		}
    393 	}
    394 
    395 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    396 		printf("config_cfattach_attach failed?\n");
    397 	}
    398 
    399 #ifdef RAID_AUTOCONFIG
    400 	raidautoconfig = 1;
    401 #endif
    402 
    403 	/*
    404 	 * Register a finalizer which will be used to auto-config RAID
    405 	 * sets once all real hardware devices have been found.
    406 	 */
    407 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    408 		printf("WARNING: unable to register RAIDframe finalizer\n");
    409 }
    410 
    411 int
    412 rf_autoconfig(struct device *self)
    413 {
    414 	RF_AutoConfig_t *ac_list;
    415 	RF_ConfigSet_t *config_sets;
    416 	int i;
    417 
    418 	if (raidautoconfig == 0)
    419 		return (0);
    420 
    421 	/* XXX This code can only be run once. */
    422 	raidautoconfig = 0;
    423 
    424 	/* 1. locate all RAID components on the system */
    425 #ifdef DEBUG
    426 	printf("Searching for RAID components...\n");
    427 #endif
    428 	ac_list = rf_find_raid_components();
    429 
    430 	/* 2. Sort them into their respective sets. */
    431 	config_sets = rf_create_auto_sets(ac_list);
    432 
    433 	/*
    434 	 * 3. Evaluate each set and configure the valid ones.
    435 	 * This gets done in rf_buildroothack().
    436 	 */
    437 	rf_buildroothack(config_sets);
    438 
    439 	for (i = 0; i < numraid; i++)
    440 		if (raidPtrs[i] != NULL && raidPtrs[i]->valid)
    441 			dkwedge_discover(&raid_softc[i].sc_dkdev);
    442 
    443 	return 1;
    444 }
    445 
    446 void
    447 rf_buildroothack(RF_ConfigSet_t *config_sets)
    448 {
    449 	RF_ConfigSet_t *cset;
    450 	RF_ConfigSet_t *next_cset;
    451 	int retcode;
    452 	int raidID;
    453 	int rootID;
    454 	int num_root;
    455 
    456 	rootID = 0;
    457 	num_root = 0;
    458 	cset = config_sets;
    459 	while(cset != NULL ) {
    460 		next_cset = cset->next;
    461 		if (rf_have_enough_components(cset) &&
    462 		    cset->ac->clabel->autoconfigure==1) {
    463 			retcode = rf_auto_config_set(cset,&raidID);
    464 			if (!retcode) {
    465 #ifdef DEBUG
    466 				printf("raid%d: configured ok\n", raidID);
    467 #endif
    468 				if (cset->rootable) {
    469 					rootID = raidID;
    470 					num_root++;
    471 				}
    472 			} else {
    473 				/* The autoconfig didn't work :( */
    474 #ifdef DEBUG
    475 				printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    476 #endif
    477 				rf_release_all_vps(cset);
    478 			}
    479 		} else {
    480 #ifdef DEBUG
    481 			printf("raid%d: not enough components\n", raidID);
    482 #endif
    483 			/* we're not autoconfiguring this set...
    484 			   release the associated resources */
    485 			rf_release_all_vps(cset);
    486 		}
    487 		/* cleanup */
    488 		rf_cleanup_config_set(cset);
    489 		cset = next_cset;
    490 	}
    491 
    492 	/* if the user has specified what the root device should be
    493 	   then we don't touch booted_device or boothowto... */
    494 
    495 	if (rootspec != NULL)
    496 		return;
    497 
    498 	/* we found something bootable... */
    499 
    500 	if (num_root == 1) {
    501 		booted_device = raid_softc[rootID].sc_dev;
    502 	} else if (num_root > 1) {
    503 		/* we can't guess.. require the user to answer... */
    504 		boothowto |= RB_ASKNAME;
    505 	}
    506 }
    507 
    508 
    509 int
    510 raidsize(dev_t dev)
    511 {
    512 	struct raid_softc *rs;
    513 	struct disklabel *lp;
    514 	int     part, unit, omask, size;
    515 
    516 	unit = raidunit(dev);
    517 	if (unit >= numraid)
    518 		return (-1);
    519 	rs = &raid_softc[unit];
    520 
    521 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    522 		return (-1);
    523 
    524 	part = DISKPART(dev);
    525 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    526 	lp = rs->sc_dkdev.dk_label;
    527 
    528 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    529 		return (-1);
    530 
    531 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    532 		size = -1;
    533 	else
    534 		size = lp->d_partitions[part].p_size *
    535 		    (lp->d_secsize / DEV_BSIZE);
    536 
    537 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    538 		return (-1);
    539 
    540 	return (size);
    541 
    542 }
    543 
    544 int
    545 raiddump(dev_t dev, daddr_t blkno, void *va,
    546     size_t  size)
    547 {
    548 	/* Not implemented. */
    549 	return ENXIO;
    550 }
    551 /* ARGSUSED */
    552 int
    553 raidopen(dev_t dev, int flags, int fmt,
    554     struct lwp *l)
    555 {
    556 	int     unit = raidunit(dev);
    557 	struct raid_softc *rs;
    558 	struct disklabel *lp;
    559 	int     part, pmask;
    560 	int     error = 0;
    561 
    562 	if (unit >= numraid)
    563 		return (ENXIO);
    564 	rs = &raid_softc[unit];
    565 
    566 	if ((error = raidlock(rs)) != 0)
    567 		return (error);
    568 	lp = rs->sc_dkdev.dk_label;
    569 
    570 	part = DISKPART(dev);
    571 
    572 	/*
    573 	 * If there are wedges, and this is not RAW_PART, then we
    574 	 * need to fail.
    575 	 */
    576 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    577 		error = EBUSY;
    578 		goto bad;
    579 	}
    580 	pmask = (1 << part);
    581 
    582 	if ((rs->sc_flags & RAIDF_INITED) &&
    583 	    (rs->sc_dkdev.dk_openmask == 0))
    584 		raidgetdisklabel(dev);
    585 
    586 	/* make sure that this partition exists */
    587 
    588 	if (part != RAW_PART) {
    589 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    590 		    ((part >= lp->d_npartitions) ||
    591 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    592 			error = ENXIO;
    593 			goto bad;
    594 		}
    595 	}
    596 	/* Prevent this unit from being unconfigured while open. */
    597 	switch (fmt) {
    598 	case S_IFCHR:
    599 		rs->sc_dkdev.dk_copenmask |= pmask;
    600 		break;
    601 
    602 	case S_IFBLK:
    603 		rs->sc_dkdev.dk_bopenmask |= pmask;
    604 		break;
    605 	}
    606 
    607 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    608 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    609 		/* First one... mark things as dirty... Note that we *MUST*
    610 		 have done a configure before this.  I DO NOT WANT TO BE
    611 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    612 		 THAT THEY BELONG TOGETHER!!!!! */
    613 		/* XXX should check to see if we're only open for reading
    614 		   here... If so, we needn't do this, but then need some
    615 		   other way of keeping track of what's happened.. */
    616 
    617 		rf_markalldirty( raidPtrs[unit] );
    618 	}
    619 
    620 
    621 	rs->sc_dkdev.dk_openmask =
    622 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    623 
    624 bad:
    625 	raidunlock(rs);
    626 
    627 	return (error);
    628 
    629 
    630 }
    631 /* ARGSUSED */
    632 int
    633 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    634 {
    635 	int     unit = raidunit(dev);
    636 	struct cfdata *cf;
    637 	struct raid_softc *rs;
    638 	int     error = 0;
    639 	int     part;
    640 
    641 	if (unit >= numraid)
    642 		return (ENXIO);
    643 	rs = &raid_softc[unit];
    644 
    645 	if ((error = raidlock(rs)) != 0)
    646 		return (error);
    647 
    648 	part = DISKPART(dev);
    649 
    650 	/* ...that much closer to allowing unconfiguration... */
    651 	switch (fmt) {
    652 	case S_IFCHR:
    653 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    654 		break;
    655 
    656 	case S_IFBLK:
    657 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    658 		break;
    659 	}
    660 	rs->sc_dkdev.dk_openmask =
    661 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    662 
    663 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    664 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    665 		/* Last one... device is not unconfigured yet.
    666 		   Device shutdown has taken care of setting the
    667 		   clean bits if RAIDF_INITED is not set;
    668 		   mark things as clean... */
    669 
    670 		rf_update_component_labels(raidPtrs[unit],
    671 						 RF_FINAL_COMPONENT_UPDATE);
    672 		if (doing_shutdown) {
    673 			/* last one, and we're going down, so
    674 			   lights out for this RAID set too. */
    675 			error = rf_Shutdown(raidPtrs[unit]);
    676 
    677 			/* It's no longer initialized... */
    678 			rs->sc_flags &= ~RAIDF_INITED;
    679 
    680 			/* detach the device */
    681 
    682 			cf = device_cfdata(rs->sc_dev);
    683 			error = config_detach(rs->sc_dev, DETACH_QUIET);
    684 			free(cf, M_RAIDFRAME);
    685 
    686 			/* Detach the disk. */
    687 			pseudo_disk_detach(&rs->sc_dkdev);
    688 		}
    689 	}
    690 
    691 	raidunlock(rs);
    692 	return (0);
    693 
    694 }
    695 
    696 void
    697 raidstrategy(struct buf *bp)
    698 {
    699 	int s;
    700 
    701 	unsigned int raidID = raidunit(bp->b_dev);
    702 	RF_Raid_t *raidPtr;
    703 	struct raid_softc *rs = &raid_softc[raidID];
    704 	int     wlabel;
    705 
    706 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
    707 		bp->b_error = ENXIO;
    708 		bp->b_flags |= B_ERROR;
    709 		goto done;
    710 	}
    711 	if (raidID >= numraid || !raidPtrs[raidID]) {
    712 		bp->b_error = ENODEV;
    713 		bp->b_flags |= B_ERROR;
    714 		goto done;
    715 	}
    716 	raidPtr = raidPtrs[raidID];
    717 	if (!raidPtr->valid) {
    718 		bp->b_error = ENODEV;
    719 		bp->b_flags |= B_ERROR;
    720 		goto done;
    721 	}
    722 	if (bp->b_bcount == 0) {
    723 		db1_printf(("b_bcount is zero..\n"));
    724 		goto done;
    725 	}
    726 
    727 	/*
    728 	 * Do bounds checking and adjust transfer.  If there's an
    729 	 * error, the bounds check will flag that for us.
    730 	 */
    731 
    732 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    733 	if (DISKPART(bp->b_dev) == RAW_PART) {
    734 		uint64_t size; /* device size in DEV_BSIZE unit */
    735 
    736 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    737 			size = raidPtr->totalSectors <<
    738 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    739 		} else {
    740 			size = raidPtr->totalSectors >>
    741 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    742 		}
    743 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    744 			goto done;
    745 		}
    746 	} else {
    747 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    748 			db1_printf(("Bounds check failed!!:%d %d\n",
    749 				(int) bp->b_blkno, (int) wlabel));
    750 			goto done;
    751 		}
    752 	}
    753 	s = splbio();
    754 
    755 	bp->b_resid = 0;
    756 
    757 	/* stuff it onto our queue */
    758 	BUFQ_PUT(rs->buf_queue, bp);
    759 
    760 	/* schedule the IO to happen at the next convenient time */
    761 	wakeup(&(raidPtrs[raidID]->iodone));
    762 
    763 	splx(s);
    764 	return;
    765 
    766 done:
    767 	bp->b_resid = bp->b_bcount;
    768 	biodone(bp);
    769 }
    770 /* ARGSUSED */
    771 int
    772 raidread(dev_t dev, struct uio *uio, int flags)
    773 {
    774 	int     unit = raidunit(dev);
    775 	struct raid_softc *rs;
    776 
    777 	if (unit >= numraid)
    778 		return (ENXIO);
    779 	rs = &raid_softc[unit];
    780 
    781 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    782 		return (ENXIO);
    783 
    784 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    785 
    786 }
    787 /* ARGSUSED */
    788 int
    789 raidwrite(dev_t dev, struct uio *uio, int flags)
    790 {
    791 	int     unit = raidunit(dev);
    792 	struct raid_softc *rs;
    793 
    794 	if (unit >= numraid)
    795 		return (ENXIO);
    796 	rs = &raid_softc[unit];
    797 
    798 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    799 		return (ENXIO);
    800 
    801 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    802 
    803 }
    804 
    805 int
    806 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    807 {
    808 	int     unit = raidunit(dev);
    809 	int     error = 0;
    810 	int     part, pmask;
    811 	struct cfdata *cf;
    812 	struct raid_softc *rs;
    813 	RF_Config_t *k_cfg, *u_cfg;
    814 	RF_Raid_t *raidPtr;
    815 	RF_RaidDisk_t *diskPtr;
    816 	RF_AccTotals_t *totals;
    817 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    818 	u_char *specific_buf;
    819 	int retcode = 0;
    820 	int column;
    821 	int raidid;
    822 	struct rf_recon_req *rrcopy, *rr;
    823 	RF_ComponentLabel_t *clabel;
    824 	RF_ComponentLabel_t *ci_label;
    825 	RF_ComponentLabel_t **clabel_ptr;
    826 	RF_SingleComponent_t *sparePtr,*componentPtr;
    827 	RF_SingleComponent_t component;
    828 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    829 	int i, j, d;
    830 #ifdef __HAVE_OLD_DISKLABEL
    831 	struct disklabel newlabel;
    832 #endif
    833 	struct dkwedge_info *dkw;
    834 
    835 	if (unit >= numraid)
    836 		return (ENXIO);
    837 	rs = &raid_softc[unit];
    838 	raidPtr = raidPtrs[unit];
    839 
    840 	db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
    841 		(int) DISKPART(dev), (int) unit, (int) cmd));
    842 
    843 	/* Must be open for writes for these commands... */
    844 	switch (cmd) {
    845 #ifdef DIOCGSECTORSIZE
    846 	case DIOCGSECTORSIZE:
    847 		*(u_int *)data = raidPtr->bytesPerSector;
    848 		return 0;
    849 	case DIOCGMEDIASIZE:
    850 		*(off_t *)data =
    851 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
    852 		return 0;
    853 #endif
    854 	case DIOCSDINFO:
    855 	case DIOCWDINFO:
    856 #ifdef __HAVE_OLD_DISKLABEL
    857 	case ODIOCWDINFO:
    858 	case ODIOCSDINFO:
    859 #endif
    860 	case DIOCWLABEL:
    861 	case DIOCAWEDGE:
    862 	case DIOCDWEDGE:
    863 		if ((flag & FWRITE) == 0)
    864 			return (EBADF);
    865 	}
    866 
    867 	/* Must be initialized for these... */
    868 	switch (cmd) {
    869 	case DIOCGDINFO:
    870 	case DIOCSDINFO:
    871 	case DIOCWDINFO:
    872 #ifdef __HAVE_OLD_DISKLABEL
    873 	case ODIOCGDINFO:
    874 	case ODIOCWDINFO:
    875 	case ODIOCSDINFO:
    876 	case ODIOCGDEFLABEL:
    877 #endif
    878 	case DIOCGPART:
    879 	case DIOCWLABEL:
    880 	case DIOCGDEFLABEL:
    881 	case DIOCAWEDGE:
    882 	case DIOCDWEDGE:
    883 	case DIOCLWEDGES:
    884 	case RAIDFRAME_SHUTDOWN:
    885 	case RAIDFRAME_REWRITEPARITY:
    886 	case RAIDFRAME_GET_INFO:
    887 	case RAIDFRAME_RESET_ACCTOTALS:
    888 	case RAIDFRAME_GET_ACCTOTALS:
    889 	case RAIDFRAME_KEEP_ACCTOTALS:
    890 	case RAIDFRAME_GET_SIZE:
    891 	case RAIDFRAME_FAIL_DISK:
    892 	case RAIDFRAME_COPYBACK:
    893 	case RAIDFRAME_CHECK_RECON_STATUS:
    894 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
    895 	case RAIDFRAME_GET_COMPONENT_LABEL:
    896 	case RAIDFRAME_SET_COMPONENT_LABEL:
    897 	case RAIDFRAME_ADD_HOT_SPARE:
    898 	case RAIDFRAME_REMOVE_HOT_SPARE:
    899 	case RAIDFRAME_INIT_LABELS:
    900 	case RAIDFRAME_REBUILD_IN_PLACE:
    901 	case RAIDFRAME_CHECK_PARITY:
    902 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
    903 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
    904 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
    905 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
    906 	case RAIDFRAME_SET_AUTOCONFIG:
    907 	case RAIDFRAME_SET_ROOT:
    908 	case RAIDFRAME_DELETE_COMPONENT:
    909 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
    910 		if ((rs->sc_flags & RAIDF_INITED) == 0)
    911 			return (ENXIO);
    912 	}
    913 
    914 	switch (cmd) {
    915 
    916 		/* configure the system */
    917 	case RAIDFRAME_CONFIGURE:
    918 
    919 		if (raidPtr->valid) {
    920 			/* There is a valid RAID set running on this unit! */
    921 			printf("raid%d: Device already configured!\n",unit);
    922 			return(EINVAL);
    923 		}
    924 
    925 		/* copy-in the configuration information */
    926 		/* data points to a pointer to the configuration structure */
    927 
    928 		u_cfg = *((RF_Config_t **) data);
    929 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
    930 		if (k_cfg == NULL) {
    931 			return (ENOMEM);
    932 		}
    933 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
    934 		if (retcode) {
    935 			RF_Free(k_cfg, sizeof(RF_Config_t));
    936 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
    937 				retcode));
    938 			return (retcode);
    939 		}
    940 		/* allocate a buffer for the layout-specific data, and copy it
    941 		 * in */
    942 		if (k_cfg->layoutSpecificSize) {
    943 			if (k_cfg->layoutSpecificSize > 10000) {
    944 				/* sanity check */
    945 				RF_Free(k_cfg, sizeof(RF_Config_t));
    946 				return (EINVAL);
    947 			}
    948 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
    949 			    (u_char *));
    950 			if (specific_buf == NULL) {
    951 				RF_Free(k_cfg, sizeof(RF_Config_t));
    952 				return (ENOMEM);
    953 			}
    954 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
    955 			    k_cfg->layoutSpecificSize);
    956 			if (retcode) {
    957 				RF_Free(k_cfg, sizeof(RF_Config_t));
    958 				RF_Free(specific_buf,
    959 					k_cfg->layoutSpecificSize);
    960 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
    961 					retcode));
    962 				return (retcode);
    963 			}
    964 		} else
    965 			specific_buf = NULL;
    966 		k_cfg->layoutSpecific = specific_buf;
    967 
    968 		/* should do some kind of sanity check on the configuration.
    969 		 * Store the sum of all the bytes in the last byte? */
    970 
    971 		/* configure the system */
    972 
    973 		/*
    974 		 * Clear the entire RAID descriptor, just to make sure
    975 		 *  there is no stale data left in the case of a
    976 		 *  reconfiguration
    977 		 */
    978 		memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
    979 		raidPtr->raidid = unit;
    980 
    981 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
    982 
    983 		if (retcode == 0) {
    984 
    985 			/* allow this many simultaneous IO's to
    986 			   this RAID device */
    987 			raidPtr->openings = RAIDOUTSTANDING;
    988 
    989 			raidinit(raidPtr);
    990 			rf_markalldirty(raidPtr);
    991 		}
    992 		/* free the buffers.  No return code here. */
    993 		if (k_cfg->layoutSpecificSize) {
    994 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
    995 		}
    996 		RF_Free(k_cfg, sizeof(RF_Config_t));
    997 
    998 		return (retcode);
    999 
   1000 		/* shutdown the system */
   1001 	case RAIDFRAME_SHUTDOWN:
   1002 
   1003 		if ((error = raidlock(rs)) != 0)
   1004 			return (error);
   1005 
   1006 		/*
   1007 		 * If somebody has a partition mounted, we shouldn't
   1008 		 * shutdown.
   1009 		 */
   1010 
   1011 		part = DISKPART(dev);
   1012 		pmask = (1 << part);
   1013 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1014 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1015 			(rs->sc_dkdev.dk_copenmask & pmask))) {
   1016 			raidunlock(rs);
   1017 			return (EBUSY);
   1018 		}
   1019 
   1020 		retcode = rf_Shutdown(raidPtr);
   1021 
   1022 		/* It's no longer initialized... */
   1023 		rs->sc_flags &= ~RAIDF_INITED;
   1024 
   1025 		/* free the pseudo device attach bits */
   1026 
   1027 		cf = device_cfdata(rs->sc_dev);
   1028 		/* XXX this causes us to not return any errors
   1029 		   from the above call to rf_Shutdown() */
   1030 		retcode = config_detach(rs->sc_dev, DETACH_QUIET);
   1031 		free(cf, M_RAIDFRAME);
   1032 
   1033 		/* Detach the disk. */
   1034 		pseudo_disk_detach(&rs->sc_dkdev);
   1035 
   1036 		raidunlock(rs);
   1037 
   1038 		return (retcode);
   1039 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1040 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1041 		/* need to read the component label for the disk indicated
   1042 		   by row,column in clabel */
   1043 
   1044 		/* For practice, let's get it directly from disk, rather
   1045 		   than from the in-core copy */
   1046 		RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
   1047 			   (RF_ComponentLabel_t *));
   1048 		if (clabel == NULL)
   1049 			return (ENOMEM);
   1050 
   1051 		retcode = copyin( *clabel_ptr, clabel,
   1052 				  sizeof(RF_ComponentLabel_t));
   1053 
   1054 		if (retcode) {
   1055 			RF_Free( clabel, sizeof(RF_ComponentLabel_t));
   1056 			return(retcode);
   1057 		}
   1058 
   1059 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1060 
   1061 		column = clabel->column;
   1062 
   1063 		if ((column < 0) || (column >= raidPtr->numCol +
   1064 				     raidPtr->numSpare)) {
   1065 			RF_Free( clabel, sizeof(RF_ComponentLabel_t));
   1066 			return(EINVAL);
   1067 		}
   1068 
   1069 		retcode = raidread_component_label(raidPtr->Disks[column].dev,
   1070 				raidPtr->raid_cinfo[column].ci_vp,
   1071 				clabel );
   1072 
   1073 		if (retcode == 0) {
   1074 			retcode = copyout(clabel, *clabel_ptr,
   1075 					  sizeof(RF_ComponentLabel_t));
   1076 		}
   1077 		RF_Free(clabel, sizeof(RF_ComponentLabel_t));
   1078 		return (retcode);
   1079 
   1080 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1081 		clabel = (RF_ComponentLabel_t *) data;
   1082 
   1083 		/* XXX check the label for valid stuff... */
   1084 		/* Note that some things *should not* get modified --
   1085 		   the user should be re-initing the labels instead of
   1086 		   trying to patch things.
   1087 		   */
   1088 
   1089 		raidid = raidPtr->raidid;
   1090 #ifdef DEBUG
   1091 		printf("raid%d: Got component label:\n", raidid);
   1092 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1093 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1094 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1095 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1096 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1097 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1098 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1099 #endif
   1100 		clabel->row = 0;
   1101 		column = clabel->column;
   1102 
   1103 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1104 			return(EINVAL);
   1105 		}
   1106 
   1107 		/* XXX this isn't allowed to do anything for now :-) */
   1108 
   1109 		/* XXX and before it is, we need to fill in the rest
   1110 		   of the fields!?!?!?! */
   1111 #if 0
   1112 		raidwrite_component_label(
   1113 		     raidPtr->Disks[column].dev,
   1114 			    raidPtr->raid_cinfo[column].ci_vp,
   1115 			    clabel );
   1116 #endif
   1117 		return (0);
   1118 
   1119 	case RAIDFRAME_INIT_LABELS:
   1120 		clabel = (RF_ComponentLabel_t *) data;
   1121 		/*
   1122 		   we only want the serial number from
   1123 		   the above.  We get all the rest of the information
   1124 		   from the config that was used to create this RAID
   1125 		   set.
   1126 		   */
   1127 
   1128 		raidPtr->serial_number = clabel->serial_number;
   1129 
   1130 		RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
   1131 			  (RF_ComponentLabel_t *));
   1132 		if (ci_label == NULL)
   1133 			return (ENOMEM);
   1134 
   1135 		raid_init_component_label(raidPtr, ci_label);
   1136 		ci_label->serial_number = clabel->serial_number;
   1137 		ci_label->row = 0; /* we don't pretend to support more */
   1138 
   1139 		for(column=0;column<raidPtr->numCol;column++) {
   1140 			diskPtr = &raidPtr->Disks[column];
   1141 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1142 				ci_label->partitionSize = diskPtr->partitionSize;
   1143 				ci_label->column = column;
   1144 				raidwrite_component_label(
   1145 							  raidPtr->Disks[column].dev,
   1146 							  raidPtr->raid_cinfo[column].ci_vp,
   1147 							  ci_label );
   1148 			}
   1149 		}
   1150 		RF_Free(ci_label, sizeof(RF_ComponentLabel_t));
   1151 
   1152 		return (retcode);
   1153 	case RAIDFRAME_SET_AUTOCONFIG:
   1154 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1155 		printf("raid%d: New autoconfig value is: %d\n",
   1156 		       raidPtr->raidid, d);
   1157 		*(int *) data = d;
   1158 		return (retcode);
   1159 
   1160 	case RAIDFRAME_SET_ROOT:
   1161 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1162 		printf("raid%d: New rootpartition value is: %d\n",
   1163 		       raidPtr->raidid, d);
   1164 		*(int *) data = d;
   1165 		return (retcode);
   1166 
   1167 		/* initialize all parity */
   1168 	case RAIDFRAME_REWRITEPARITY:
   1169 
   1170 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1171 			/* Parity for RAID 0 is trivially correct */
   1172 			raidPtr->parity_good = RF_RAID_CLEAN;
   1173 			return(0);
   1174 		}
   1175 
   1176 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1177 			/* Re-write is already in progress! */
   1178 			return(EINVAL);
   1179 		}
   1180 
   1181 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1182 					   rf_RewriteParityThread,
   1183 					   raidPtr,"raid_parity");
   1184 		return (retcode);
   1185 
   1186 
   1187 	case RAIDFRAME_ADD_HOT_SPARE:
   1188 		sparePtr = (RF_SingleComponent_t *) data;
   1189 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1190 		retcode = rf_add_hot_spare(raidPtr, &component);
   1191 		return(retcode);
   1192 
   1193 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1194 		return(retcode);
   1195 
   1196 	case RAIDFRAME_DELETE_COMPONENT:
   1197 		componentPtr = (RF_SingleComponent_t *)data;
   1198 		memcpy( &component, componentPtr,
   1199 			sizeof(RF_SingleComponent_t));
   1200 		retcode = rf_delete_component(raidPtr, &component);
   1201 		return(retcode);
   1202 
   1203 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1204 		componentPtr = (RF_SingleComponent_t *)data;
   1205 		memcpy( &component, componentPtr,
   1206 			sizeof(RF_SingleComponent_t));
   1207 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1208 		return(retcode);
   1209 
   1210 	case RAIDFRAME_REBUILD_IN_PLACE:
   1211 
   1212 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1213 			/* Can't do this on a RAID 0!! */
   1214 			return(EINVAL);
   1215 		}
   1216 
   1217 		if (raidPtr->recon_in_progress == 1) {
   1218 			/* a reconstruct is already in progress! */
   1219 			return(EINVAL);
   1220 		}
   1221 
   1222 		componentPtr = (RF_SingleComponent_t *) data;
   1223 		memcpy( &component, componentPtr,
   1224 			sizeof(RF_SingleComponent_t));
   1225 		component.row = 0; /* we don't support any more */
   1226 		column = component.column;
   1227 
   1228 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1229 			return(EINVAL);
   1230 		}
   1231 
   1232 		RF_LOCK_MUTEX(raidPtr->mutex);
   1233 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1234 		    (raidPtr->numFailures > 0)) {
   1235 			/* XXX 0 above shouldn't be constant!!! */
   1236 			/* some component other than this has failed.
   1237 			   Let's not make things worse than they already
   1238 			   are... */
   1239 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1240 			       raidPtr->raidid);
   1241 			printf("raid%d:     Col: %d   Too many failures.\n",
   1242 			       raidPtr->raidid, column);
   1243 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1244 			return (EINVAL);
   1245 		}
   1246 		if (raidPtr->Disks[column].status ==
   1247 		    rf_ds_reconstructing) {
   1248 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1249 			       raidPtr->raidid);
   1250 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1251 
   1252 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1253 			return (EINVAL);
   1254 		}
   1255 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1256 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1257 			return (EINVAL);
   1258 		}
   1259 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1260 
   1261 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1262 		if (rrcopy == NULL)
   1263 			return(ENOMEM);
   1264 
   1265 		rrcopy->raidPtr = (void *) raidPtr;
   1266 		rrcopy->col = column;
   1267 
   1268 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1269 					   rf_ReconstructInPlaceThread,
   1270 					   rrcopy,"raid_reconip");
   1271 		return(retcode);
   1272 
   1273 	case RAIDFRAME_GET_INFO:
   1274 		if (!raidPtr->valid)
   1275 			return (ENODEV);
   1276 		ucfgp = (RF_DeviceConfig_t **) data;
   1277 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1278 			  (RF_DeviceConfig_t *));
   1279 		if (d_cfg == NULL)
   1280 			return (ENOMEM);
   1281 		d_cfg->rows = 1; /* there is only 1 row now */
   1282 		d_cfg->cols = raidPtr->numCol;
   1283 		d_cfg->ndevs = raidPtr->numCol;
   1284 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1285 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1286 			return (ENOMEM);
   1287 		}
   1288 		d_cfg->nspares = raidPtr->numSpare;
   1289 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1290 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1291 			return (ENOMEM);
   1292 		}
   1293 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1294 		d = 0;
   1295 		for (j = 0; j < d_cfg->cols; j++) {
   1296 			d_cfg->devs[d] = raidPtr->Disks[j];
   1297 			d++;
   1298 		}
   1299 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1300 			d_cfg->spares[i] = raidPtr->Disks[j];
   1301 		}
   1302 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1303 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1304 
   1305 		return (retcode);
   1306 
   1307 	case RAIDFRAME_CHECK_PARITY:
   1308 		*(int *) data = raidPtr->parity_good;
   1309 		return (0);
   1310 
   1311 	case RAIDFRAME_RESET_ACCTOTALS:
   1312 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1313 		return (0);
   1314 
   1315 	case RAIDFRAME_GET_ACCTOTALS:
   1316 		totals = (RF_AccTotals_t *) data;
   1317 		*totals = raidPtr->acc_totals;
   1318 		return (0);
   1319 
   1320 	case RAIDFRAME_KEEP_ACCTOTALS:
   1321 		raidPtr->keep_acc_totals = *(int *)data;
   1322 		return (0);
   1323 
   1324 	case RAIDFRAME_GET_SIZE:
   1325 		*(int *) data = raidPtr->totalSectors;
   1326 		return (0);
   1327 
   1328 		/* fail a disk & optionally start reconstruction */
   1329 	case RAIDFRAME_FAIL_DISK:
   1330 
   1331 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1332 			/* Can't do this on a RAID 0!! */
   1333 			return(EINVAL);
   1334 		}
   1335 
   1336 		rr = (struct rf_recon_req *) data;
   1337 		rr->row = 0;
   1338 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1339 			return (EINVAL);
   1340 
   1341 
   1342 		RF_LOCK_MUTEX(raidPtr->mutex);
   1343 		if (raidPtr->status == rf_rs_reconstructing) {
   1344 			/* you can't fail a disk while we're reconstructing! */
   1345 			/* XXX wrong for RAID6 */
   1346 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1347 			return (EINVAL);
   1348 		}
   1349 		if ((raidPtr->Disks[rr->col].status ==
   1350 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1351 			/* some other component has failed.  Let's not make
   1352 			   things worse. XXX wrong for RAID6 */
   1353 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1354 			return (EINVAL);
   1355 		}
   1356 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1357 			/* Can't fail a spared disk! */
   1358 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1359 			return (EINVAL);
   1360 		}
   1361 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1362 
   1363 		/* make a copy of the recon request so that we don't rely on
   1364 		 * the user's buffer */
   1365 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1366 		if (rrcopy == NULL)
   1367 			return(ENOMEM);
   1368 		memcpy(rrcopy, rr, sizeof(*rr));
   1369 		rrcopy->raidPtr = (void *) raidPtr;
   1370 
   1371 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1372 					   rf_ReconThread,
   1373 					   rrcopy,"raid_recon");
   1374 		return (0);
   1375 
   1376 		/* invoke a copyback operation after recon on whatever disk
   1377 		 * needs it, if any */
   1378 	case RAIDFRAME_COPYBACK:
   1379 
   1380 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1381 			/* This makes no sense on a RAID 0!! */
   1382 			return(EINVAL);
   1383 		}
   1384 
   1385 		if (raidPtr->copyback_in_progress == 1) {
   1386 			/* Copyback is already in progress! */
   1387 			return(EINVAL);
   1388 		}
   1389 
   1390 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1391 					   rf_CopybackThread,
   1392 					   raidPtr,"raid_copyback");
   1393 		return (retcode);
   1394 
   1395 		/* return the percentage completion of reconstruction */
   1396 	case RAIDFRAME_CHECK_RECON_STATUS:
   1397 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1398 			/* This makes no sense on a RAID 0, so tell the
   1399 			   user it's done. */
   1400 			*(int *) data = 100;
   1401 			return(0);
   1402 		}
   1403 		if (raidPtr->status != rf_rs_reconstructing)
   1404 			*(int *) data = 100;
   1405 		else {
   1406 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1407 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1408 			} else {
   1409 				*(int *) data = 0;
   1410 			}
   1411 		}
   1412 		return (0);
   1413 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1414 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1415 		if (raidPtr->status != rf_rs_reconstructing) {
   1416 			progressInfo.remaining = 0;
   1417 			progressInfo.completed = 100;
   1418 			progressInfo.total = 100;
   1419 		} else {
   1420 			progressInfo.total =
   1421 				raidPtr->reconControl->numRUsTotal;
   1422 			progressInfo.completed =
   1423 				raidPtr->reconControl->numRUsComplete;
   1424 			progressInfo.remaining = progressInfo.total -
   1425 				progressInfo.completed;
   1426 		}
   1427 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1428 				  sizeof(RF_ProgressInfo_t));
   1429 		return (retcode);
   1430 
   1431 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1432 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1433 			/* This makes no sense on a RAID 0, so tell the
   1434 			   user it's done. */
   1435 			*(int *) data = 100;
   1436 			return(0);
   1437 		}
   1438 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1439 			*(int *) data = 100 *
   1440 				raidPtr->parity_rewrite_stripes_done /
   1441 				raidPtr->Layout.numStripe;
   1442 		} else {
   1443 			*(int *) data = 100;
   1444 		}
   1445 		return (0);
   1446 
   1447 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1448 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1449 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1450 			progressInfo.total = raidPtr->Layout.numStripe;
   1451 			progressInfo.completed =
   1452 				raidPtr->parity_rewrite_stripes_done;
   1453 			progressInfo.remaining = progressInfo.total -
   1454 				progressInfo.completed;
   1455 		} else {
   1456 			progressInfo.remaining = 0;
   1457 			progressInfo.completed = 100;
   1458 			progressInfo.total = 100;
   1459 		}
   1460 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1461 				  sizeof(RF_ProgressInfo_t));
   1462 		return (retcode);
   1463 
   1464 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1465 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1466 			/* This makes no sense on a RAID 0 */
   1467 			*(int *) data = 100;
   1468 			return(0);
   1469 		}
   1470 		if (raidPtr->copyback_in_progress == 1) {
   1471 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1472 				raidPtr->Layout.numStripe;
   1473 		} else {
   1474 			*(int *) data = 100;
   1475 		}
   1476 		return (0);
   1477 
   1478 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1479 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1480 		if (raidPtr->copyback_in_progress == 1) {
   1481 			progressInfo.total = raidPtr->Layout.numStripe;
   1482 			progressInfo.completed =
   1483 				raidPtr->copyback_stripes_done;
   1484 			progressInfo.remaining = progressInfo.total -
   1485 				progressInfo.completed;
   1486 		} else {
   1487 			progressInfo.remaining = 0;
   1488 			progressInfo.completed = 100;
   1489 			progressInfo.total = 100;
   1490 		}
   1491 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1492 				  sizeof(RF_ProgressInfo_t));
   1493 		return (retcode);
   1494 
   1495 		/* the sparetable daemon calls this to wait for the kernel to
   1496 		 * need a spare table. this ioctl does not return until a
   1497 		 * spare table is needed. XXX -- calling mpsleep here in the
   1498 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1499 		 * -- I should either compute the spare table in the kernel,
   1500 		 * or have a different -- XXX XXX -- interface (a different
   1501 		 * character device) for delivering the table     -- XXX */
   1502 #if 0
   1503 	case RAIDFRAME_SPARET_WAIT:
   1504 		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1505 		while (!rf_sparet_wait_queue)
   1506 			mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
   1507 		waitreq = rf_sparet_wait_queue;
   1508 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1509 		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1510 
   1511 		/* structure assignment */
   1512 		*((RF_SparetWait_t *) data) = *waitreq;
   1513 
   1514 		RF_Free(waitreq, sizeof(*waitreq));
   1515 		return (0);
   1516 
   1517 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1518 		 * code in it that will cause the daemon to exit */
   1519 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1520 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1521 		waitreq->fcol = -1;
   1522 		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1523 		waitreq->next = rf_sparet_wait_queue;
   1524 		rf_sparet_wait_queue = waitreq;
   1525 		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1526 		wakeup(&rf_sparet_wait_queue);
   1527 		return (0);
   1528 
   1529 		/* used by the spare table daemon to deliver a spare table
   1530 		 * into the kernel */
   1531 	case RAIDFRAME_SEND_SPARET:
   1532 
   1533 		/* install the spare table */
   1534 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1535 
   1536 		/* respond to the requestor.  the return status of the spare
   1537 		 * table installation is passed in the "fcol" field */
   1538 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1539 		waitreq->fcol = retcode;
   1540 		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1541 		waitreq->next = rf_sparet_resp_queue;
   1542 		rf_sparet_resp_queue = waitreq;
   1543 		wakeup(&rf_sparet_resp_queue);
   1544 		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1545 
   1546 		return (retcode);
   1547 #endif
   1548 
   1549 	default:
    1550 		break; /* leave the switch; the os-specific code below handles the rest */
   1551 
   1552 	}
   1553 
   1554 	if (!raidPtr->valid)
   1555 		return (EINVAL);
   1556 
   1557 	/*
   1558 	 * Add support for "regular" device ioctls here.
   1559 	 */
   1560 
   1561 	switch (cmd) {
   1562 	case DIOCGDINFO:
   1563 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1564 		break;
   1565 #ifdef __HAVE_OLD_DISKLABEL
   1566 	case ODIOCGDINFO:
   1567 		newlabel = *(rs->sc_dkdev.dk_label);
   1568 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1569 			return ENOTTY;
   1570 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1571 		break;
   1572 #endif
   1573 
   1574 	case DIOCGPART:
   1575 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1576 		((struct partinfo *) data)->part =
   1577 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1578 		break;
   1579 
   1580 	case DIOCWDINFO:
   1581 	case DIOCSDINFO:
   1582 #ifdef __HAVE_OLD_DISKLABEL
   1583 	case ODIOCWDINFO:
   1584 	case ODIOCSDINFO:
   1585 #endif
   1586 	{
   1587 		struct disklabel *lp;
   1588 #ifdef __HAVE_OLD_DISKLABEL
   1589 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1590 			memset(&newlabel, 0, sizeof newlabel);
   1591 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1592 			lp = &newlabel;
   1593 		} else
   1594 #endif
   1595 		lp = (struct disklabel *)data;
   1596 
   1597 		if ((error = raidlock(rs)) != 0)
   1598 			return (error);
   1599 
   1600 		rs->sc_flags |= RAIDF_LABELLING;
   1601 
   1602 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1603 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1604 		if (error == 0) {
   1605 			if (cmd == DIOCWDINFO
   1606 #ifdef __HAVE_OLD_DISKLABEL
   1607 			    || cmd == ODIOCWDINFO
   1608 #endif
   1609 			   )
   1610 				error = writedisklabel(RAIDLABELDEV(dev),
   1611 				    raidstrategy, rs->sc_dkdev.dk_label,
   1612 				    rs->sc_dkdev.dk_cpulabel);
   1613 		}
   1614 		rs->sc_flags &= ~RAIDF_LABELLING;
   1615 
   1616 		raidunlock(rs);
   1617 
   1618 		if (error)
   1619 			return (error);
   1620 		break;
   1621 	}
   1622 
   1623 	case DIOCWLABEL:
   1624 		if (*(int *) data != 0)
   1625 			rs->sc_flags |= RAIDF_WLABEL;
   1626 		else
   1627 			rs->sc_flags &= ~RAIDF_WLABEL;
   1628 		break;
   1629 
   1630 	case DIOCGDEFLABEL:
   1631 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1632 		break;
   1633 
   1634 #ifdef __HAVE_OLD_DISKLABEL
   1635 	case ODIOCGDEFLABEL:
   1636 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1637 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1638 			return ENOTTY;
   1639 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1640 		break;
   1641 #endif
   1642 
   1643 	case DIOCAWEDGE:
   1644 	case DIOCDWEDGE:
   1645 	    	dkw = (void *)data;
   1646 
   1647 		/* If the ioctl happens here, the parent is us. */
   1648 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1649 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1650 
   1651 	case DIOCLWEDGES:
   1652 		return dkwedge_list(&rs->sc_dkdev,
   1653 		    (struct dkwedge_list *)data, l);
   1654 
   1655 	default:
   1656 		retcode = ENOTTY;
   1657 	}
   1658 	return (retcode);
   1659 
   1660 }
   1661 
   1662 
   1663 /* raidinit -- complete the rest of the initialization for the
   1664    RAIDframe device.  */
   1665 
   1666 
   1667 static void
   1668 raidinit(RF_Raid_t *raidPtr)
   1669 {
   1670 	struct cfdata *cf;
   1671 	struct raid_softc *rs;
   1672 	int     unit;
   1673 
   1674 	unit = raidPtr->raidid;
   1675 
   1676 	rs = &raid_softc[unit];
   1677 
   1678 	/* XXX should check return code first... */
   1679 	rs->sc_flags |= RAIDF_INITED;
   1680 
   1681 	/* XXX doesn't check bounds. */
   1682 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1683 
   1684 	rs->sc_dkdev.dk_name = rs->sc_xname;
   1685 
   1686 	/* attach the pseudo device */
   1687 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1688 	cf->cf_name = raid_cd.cd_name;
   1689 	cf->cf_atname = raid_cd.cd_name;
   1690 	cf->cf_unit = unit;
   1691 	cf->cf_fstate = FSTATE_STAR;
   1692 
   1693 	rs->sc_dev = config_attach_pseudo(cf);
   1694 
   1695 	if (rs->sc_dev==NULL) {
   1696 		printf("raid%d: config_attach_pseudo failed\n",
   1697 		       raidPtr->raidid);
   1698 	}
   1699 
   1700 	/* disk_attach actually creates space for the CPU disklabel, among
   1701 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1702 	 * with disklabels. */
   1703 
   1704 	disk_attach(&rs->sc_dkdev);
   1705 
   1706 	/* XXX There may be a weird interaction here between this, and
   1707 	 * protectedSectors, as used in RAIDframe.  */
   1708 
   1709 	rs->sc_size = raidPtr->totalSectors;
   1710 }
   1711 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1712 /* wake up the daemon & tell it to get us a spare table
   1713  * XXX
   1714  * the entries in the queues should be tagged with the raidPtr
   1715  * so that in the extremely rare case that two recons happen at once,
    1716  * we know for which device we're requesting a spare table
   1717  * XXX
   1718  *
   1719  * XXX This code is not currently used. GO
   1720  */
   1721 int
   1722 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1723 {
   1724 	int     retcode;
   1725 
   1726 	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1727 	req->next = rf_sparet_wait_queue;
   1728 	rf_sparet_wait_queue = req;
   1729 	wakeup(&rf_sparet_wait_queue);
   1730 
    1731 	/* XXX stale: the old mpsleep() dropped the mutex; tsleep() below does not */
   1732 	while (!rf_sparet_resp_queue) {
   1733 		tsleep(&rf_sparet_resp_queue, PRIBIO,
   1734 		    "raidframe getsparetable", 0);
   1735 	}
   1736 	req = rf_sparet_resp_queue;
   1737 	rf_sparet_resp_queue = req->next;
   1738 	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1739 
   1740 	retcode = req->fcol;
   1741 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1742 					 * alloc'd */
   1743 	return (retcode);
   1744 }
   1745 #endif
   1746 
    1747 /* A wrapper around rf_DoAccess that extracts the appropriate info from the
    1748  * bp & passes it down.
    1749  * Any calls originating in the kernel must use non-blocking I/O.
    1750  * Do some extra sanity checking to return "appropriate" error values for
    1751  * certain conditions (to make some standard utilities work).
   1752  *
   1753  * Formerly known as: rf_DoAccessKernel
   1754  */
   1755 void
   1756 raidstart(RF_Raid_t *raidPtr)
   1757 {
   1758 	RF_SectorCount_t num_blocks, pb, sum;
   1759 	RF_RaidAddr_t raid_addr;
   1760 	struct partition *pp;
   1761 	daddr_t blocknum;
   1762 	int     unit;
   1763 	struct raid_softc *rs;
   1764 	int     do_async;
   1765 	struct buf *bp;
   1766 	int rc;
   1767 
   1768 	unit = raidPtr->raidid;
   1769 	rs = &raid_softc[unit];
   1770 
   1771 	/* quick check to see if anything has died recently */
   1772 	RF_LOCK_MUTEX(raidPtr->mutex);
   1773 	if (raidPtr->numNewFailures > 0) {
   1774 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1775 		rf_update_component_labels(raidPtr,
   1776 					   RF_NORMAL_COMPONENT_UPDATE);
   1777 		RF_LOCK_MUTEX(raidPtr->mutex);
   1778 		raidPtr->numNewFailures--;
   1779 	}
   1780 
   1781 	/* Check to see if we're at the limit... */
   1782 	while (raidPtr->openings > 0) {
   1783 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1784 
   1785 		/* get the next item, if any, from the queue */
   1786 		if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
   1787 			/* nothing more to do */
   1788 			return;
   1789 		}
   1790 
   1791 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   1792 		 * partition.. Need to make it absolute to the underlying
   1793 		 * device.. */
   1794 
   1795 		blocknum = bp->b_blkno;
   1796 		if (DISKPART(bp->b_dev) != RAW_PART) {
   1797 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   1798 			blocknum += pp->p_offset;
   1799 		}
   1800 
   1801 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1802 			    (int) blocknum));
   1803 
   1804 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1805 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1806 
   1807 		/* *THIS* is where we adjust what block we're going to...
   1808 		 * but DO NOT TOUCH bp->b_blkno!!! */
   1809 		raid_addr = blocknum;
   1810 
   1811 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1812 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1813 		sum = raid_addr + num_blocks + pb;
   1814 		if (1 || rf_debugKernelAccess) {
   1815 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1816 				    (int) raid_addr, (int) sum, (int) num_blocks,
   1817 				    (int) pb, (int) bp->b_resid));
   1818 		}
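         		/* reject any access that runs past the end of the array,
         		   or whose block arithmetic wrapped around */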
   1819 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1820 		    || (sum < num_blocks) || (sum < pb)) {
   1821 			bp->b_error = ENOSPC;
   1822 			bp->b_flags |= B_ERROR;
   1823 			bp->b_resid = bp->b_bcount;
   1824 			biodone(bp);
   1825 			RF_LOCK_MUTEX(raidPtr->mutex);
   1826 			continue;
   1827 		}
   1828 		/*
   1829 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1830 		 */
   1831 
   1832 		if (bp->b_bcount & raidPtr->sectorMask) {
   1833 			bp->b_error = EINVAL;
   1834 			bp->b_flags |= B_ERROR;
   1835 			bp->b_resid = bp->b_bcount;
   1836 			biodone(bp);
   1837 			RF_LOCK_MUTEX(raidPtr->mutex);
   1838 			continue;
   1839 
   1840 		}
   1841 		db1_printf(("Calling DoAccess..\n"));
   1842 
   1843 
   1844 		RF_LOCK_MUTEX(raidPtr->mutex);
   1845 		raidPtr->openings--;
   1846 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1847 
   1848 		/*
   1849 		 * Everything is async.
   1850 		 */
   1851 		do_async = 1;
   1852 
   1853 		disk_busy(&rs->sc_dkdev);
   1854 
   1855 		/* XXX we're still at splbio() here... do we *really*
   1856 		   need to be? */
   1857 
   1858 		/* don't ever condition on bp->b_flags & B_WRITE.
   1859 		 * always condition on B_READ instead */
   1860 
   1861 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1862 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1863 				 do_async, raid_addr, num_blocks,
   1864 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1865 
   1866 		if (rc) {
   1867 			bp->b_error = rc;
   1868 			bp->b_flags |= B_ERROR;
   1869 			bp->b_resid = bp->b_bcount;
   1870 			biodone(bp);
   1871 			/* continue loop */
   1872 		}
   1873 
   1874 		RF_LOCK_MUTEX(raidPtr->mutex);
   1875 	}
   1876 	RF_UNLOCK_MUTEX(raidPtr->mutex);
   1877 }
   1878 
   1879 
   1880 
   1881 
   1882 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1883 
   1884 int
   1885 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1886 {
   1887 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   1888 	struct buf *bp;
   1889 
   1890 	req->queue = queue;
   1891 
   1892 #if DIAGNOSTIC
   1893 	if (queue->raidPtr->raidid >= numraid) {
   1894 		printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
   1895 		    numraid);
   1896 		panic("Invalid Unit number in rf_DispatchKernelIO");
   1897 	}
   1898 #endif
   1899 
   1900 	bp = req->bp;
   1901 
   1902 	switch (req->type) {
   1903 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   1904 		/* XXX need to do something extra here.. */
   1905 		/* I'm leaving this in, as I've never actually seen it used,
   1906 		 * and I'd like folks to report it... GO */
    1907 		printf("WAKEUP CALLED\n");
   1908 		queue->numOutstanding++;
   1909 
   1910 		bp->b_flags = 0;
   1911 		bp->b_private = req;
   1912 
   1913 		KernelWakeupFunc(bp);
   1914 		break;
   1915 
   1916 	case RF_IO_TYPE_READ:
   1917 	case RF_IO_TYPE_WRITE:
   1918 #if RF_ACC_TRACE > 0
   1919 		if (req->tracerec) {
   1920 			RF_ETIMER_START(req->tracerec->timer);
   1921 		}
   1922 #endif
   1923 		InitBP(bp, queue->rf_cinfo->ci_vp,
   1924 		    op, queue->rf_cinfo->ci_dev,
   1925 		    req->sectorOffset, req->numSector,
   1926 		    req->buf, KernelWakeupFunc, (void *) req,
   1927 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   1928 
   1929 		if (rf_debugKernelAccess) {
   1930 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   1931 				(long) bp->b_blkno));
   1932 		}
   1933 		queue->numOutstanding++;
   1934 		queue->last_deq_sector = req->sectorOffset;
   1935 		/* acc wouldn't have been let in if there were any pending
   1936 		 * reqs at any other priority */
   1937 		queue->curPriority = req->priority;
   1938 
   1939 		db1_printf(("Going for %c to unit %d col %d\n",
   1940 			    req->type, queue->raidPtr->raidid,
   1941 			    queue->col));
   1942 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   1943 			(int) req->sectorOffset, (int) req->numSector,
   1944 			(int) (req->numSector <<
   1945 			    queue->raidPtr->logBytesPerSector),
   1946 			(int) queue->raidPtr->logBytesPerSector));
   1947 		VOP_STRATEGY(bp->b_vp, bp);
   1948 
   1949 		break;
   1950 
   1951 	default:
   1952 		panic("bad req->type in rf_DispatchKernelIO");
   1953 	}
   1954 	db1_printf(("Exiting from DispatchKernelIO\n"));
   1955 
   1956 	return (0);
   1957 }
    1958 /* this is the callback function associated with an I/O invoked from
   1959    kernel code.
   1960  */
   1961 static void
   1962 KernelWakeupFunc(struct buf *bp)
   1963 {
   1964 	RF_DiskQueueData_t *req = NULL;
   1965 	RF_DiskQueue_t *queue;
   1966 	int s;
   1967 
   1968 	s = splbio();
   1969 	db1_printf(("recovering the request queue:\n"));
   1970 	req = bp->b_private;
   1971 
   1972 	queue = (RF_DiskQueue_t *) req->queue;
   1973 
   1974 #if RF_ACC_TRACE > 0
   1975 	if (req->tracerec) {
   1976 		RF_ETIMER_STOP(req->tracerec->timer);
   1977 		RF_ETIMER_EVAL(req->tracerec->timer);
   1978 		RF_LOCK_MUTEX(rf_tracing_mutex);
   1979 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   1980 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   1981 		req->tracerec->num_phys_ios++;
   1982 		RF_UNLOCK_MUTEX(rf_tracing_mutex);
   1983 	}
   1984 #endif
   1985 
   1986 	/* XXX Ok, let's get aggressive... If B_ERROR is set, let's go
   1987 	 * ballistic, and mark the component as hosed... */
   1988 
   1989 	if (bp->b_flags & B_ERROR) {
   1990 		/* Mark the disk as dead */
   1991 		/* but only mark it once... */
   1992 		/* and only if it wouldn't leave this RAID set
   1993 		   completely broken */
   1994 		if (((queue->raidPtr->Disks[queue->col].status ==
   1995 		      rf_ds_optimal) ||
   1996 		     (queue->raidPtr->Disks[queue->col].status ==
   1997 		      rf_ds_used_spare)) &&
   1998 		     (queue->raidPtr->numFailures <
   1999 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2000 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2001 			       queue->raidPtr->raidid,
   2002 			       queue->raidPtr->Disks[queue->col].devname);
   2003 			queue->raidPtr->Disks[queue->col].status =
   2004 			    rf_ds_failed;
   2005 			queue->raidPtr->status = rf_rs_degraded;
   2006 			queue->raidPtr->numFailures++;
   2007 			queue->raidPtr->numNewFailures++;
   2008 		} else {	/* Disk is already dead... */
   2009 			/* printf("Disk already marked as dead!\n"); */
   2010 		}
   2011 
   2012 	}
   2013 
   2014 	/* Fill in the error value */
   2015 
   2016 	req->error = (bp->b_flags & B_ERROR) ? bp->b_error : 0;
   2017 
   2018 	simple_lock(&queue->raidPtr->iodone_lock);
   2019 
   2020 	/* Drop this one on the "finished" queue... */
   2021 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2022 
   2023 	/* Let the raidio thread know there is work to be done. */
   2024 	wakeup(&(queue->raidPtr->iodone));
   2025 
   2026 	simple_unlock(&queue->raidPtr->iodone_lock);
   2027 
   2028 	splx(s);
   2029 }
   2030 
   2031 
   2032 
   2033 /*
   2034  * initialize a buf structure for doing an I/O in the kernel.
   2035  */
   2036 static void
   2037 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2038        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2039        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2040        struct proc *b_proc)
   2041 {
   2042 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2043 	bp->b_flags = B_CALL | rw_flag;	/* XXX need B_PHYS here too??? */
   2044 	bp->b_bcount = numSect << logBytesPerSector;
   2045 	bp->b_bufsize = bp->b_bcount;
   2046 	bp->b_error = 0;
   2047 	bp->b_dev = dev;
   2048 	bp->b_data = bf;
   2049 	bp->b_blkno = startSect;
   2050 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2051 	if (bp->b_bcount == 0) {
   2052 		panic("bp->b_bcount is zero in InitBP!!");
   2053 	}
   2054 	bp->b_proc = b_proc;
   2055 	bp->b_iodone = cbFunc;
   2056 	bp->b_private = cbArg;
   2057 	bp->b_vp = b_vp;
   2058 	if ((bp->b_flags & B_READ) == 0) {
   2059 		bp->b_vp->v_numoutput++;
   2060 	}
   2061 
   2062 }
   2063 
   2064 static void
   2065 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2066 		    struct disklabel *lp)
   2067 {
   2068 	memset(lp, 0, sizeof(*lp));
   2069 
   2070 	/* fabricate a label... */
   2071 	lp->d_secperunit = raidPtr->totalSectors;
   2072 	lp->d_secsize = raidPtr->bytesPerSector;
   2073 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2074 	lp->d_ntracks = 4 * raidPtr->numCol;
   2075 	lp->d_ncylinders = raidPtr->totalSectors /
   2076 		(lp->d_nsectors * lp->d_ntracks);
   2077 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2078 
   2079 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2080 	lp->d_type = DTYPE_RAID;
   2081 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2082 	lp->d_rpm = 3600;
   2083 	lp->d_interleave = 1;
   2084 	lp->d_flags = 0;
   2085 
   2086 	lp->d_partitions[RAW_PART].p_offset = 0;
   2087 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2088 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2089 	lp->d_npartitions = RAW_PART + 1;
   2090 
   2091 	lp->d_magic = DISKMAGIC;
   2092 	lp->d_magic2 = DISKMAGIC;
    2093 	lp->d_checksum = dkcksum(lp);
   2094 
   2095 }
   2096 /*
   2097  * Read the disklabel from the raid device.  If one is not present, fake one
   2098  * up.
   2099  */
   2100 static void
   2101 raidgetdisklabel(dev_t dev)
   2102 {
   2103 	int     unit = raidunit(dev);
   2104 	struct raid_softc *rs = &raid_softc[unit];
   2105 	const char   *errstring;
   2106 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2107 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2108 	RF_Raid_t *raidPtr;
   2109 
   2110 	db1_printf(("Getting the disklabel...\n"));
   2111 
   2112 	memset(clp, 0, sizeof(*clp));
   2113 
   2114 	raidPtr = raidPtrs[unit];
   2115 
   2116 	raidgetdefaultlabel(raidPtr, rs, lp);
   2117 
   2118 	/*
   2119 	 * Call the generic disklabel extraction routine.
   2120 	 */
   2121 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2122 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2123 	if (errstring)
   2124 		raidmakedisklabel(rs);
   2125 	else {
   2126 		int     i;
   2127 		struct partition *pp;
   2128 
   2129 		/*
   2130 		 * Sanity check whether the found disklabel is valid.
   2131 		 *
    2132 		 * This is necessary since the total size of the raid device
    2133 		 * may vary when the interleave is changed even though exactly
    2134 		 * the same components are used, and an old disklabel may be
    2135 		 * used if one is found.
   2136 		 */
   2137 		if (lp->d_secperunit != rs->sc_size)
   2138 			printf("raid%d: WARNING: %s: "
   2139 			    "total sector size in disklabel (%d) != "
   2140 			    "the size of raid (%ld)\n", unit, rs->sc_xname,
   2141 			    lp->d_secperunit, (long) rs->sc_size);
   2142 		for (i = 0; i < lp->d_npartitions; i++) {
   2143 			pp = &lp->d_partitions[i];
   2144 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2145 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2146 				       "exceeds the size of raid (%ld)\n",
   2147 				       unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
   2148 		}
   2149 	}
   2150 
   2151 }
   2152 /*
   2153  * Take care of things one might want to take care of in the event
   2154  * that a disklabel isn't present.
   2155  */
   2156 static void
   2157 raidmakedisklabel(struct raid_softc *rs)
   2158 {
   2159 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2160 	db1_printf(("Making a label..\n"));
   2161 
   2162 	/*
   2163 	 * For historical reasons, if there's no disklabel present
   2164 	 * the raw partition must be marked FS_BSDFFS.
   2165 	 */
   2166 
   2167 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2168 
   2169 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2170 
   2171 	lp->d_checksum = dkcksum(lp);
   2172 }
   2173 /*
   2174  * Wait interruptibly for an exclusive lock.
   2175  *
   2176  * XXX
   2177  * Several drivers do this; it should be abstracted and made MP-safe.
   2178  * (Hmm... where have we seen this warning before :->  GO )
   2179  */
   2180 static int
   2181 raidlock(struct raid_softc *rs)
   2182 {
   2183 	int     error;
   2184 
   2185 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2186 		rs->sc_flags |= RAIDF_WANTED;
   2187 		if ((error =
   2188 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2189 			return (error);
   2190 	}
   2191 	rs->sc_flags |= RAIDF_LOCKED;
   2192 	return (0);
   2193 }
   2194 /*
   2195  * Unlock and wake up any waiters.
   2196  */
   2197 static void
   2198 raidunlock(struct raid_softc *rs)
   2199 {
   2200 
   2201 	rs->sc_flags &= ~RAIDF_LOCKED;
   2202 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2203 		rs->sc_flags &= ~RAIDF_WANTED;
   2204 		wakeup(rs);
   2205 	}
   2206 }
   2207 
   2208 
   2209 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2210 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
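         /* The component label lives RF_COMPONENT_INFO_OFFSET bytes into each
            component and occupies RF_COMPONENT_INFO_SIZE bytes;
            raidread_component_label() and raidwrite_component_label() below do
            raw block I/O against that region. */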
   2211 
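         /* raidmarkclean() and raidmarkdirty() rewrite a component's label with
            the given mod_counter and with the clean flag set accordingly. */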
   2212 int
   2213 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
   2214 {
   2215 	RF_ComponentLabel_t clabel;
   2216 	raidread_component_label(dev, b_vp, &clabel);
   2217 	clabel.mod_counter = mod_counter;
   2218 	clabel.clean = RF_RAID_CLEAN;
   2219 	raidwrite_component_label(dev, b_vp, &clabel);
   2220 	return(0);
   2221 }
   2222 
   2223 
   2224 int
   2225 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
   2226 {
   2227 	RF_ComponentLabel_t clabel;
   2228 	raidread_component_label(dev, b_vp, &clabel);
   2229 	clabel.mod_counter = mod_counter;
   2230 	clabel.clean = RF_RAID_DIRTY;
   2231 	raidwrite_component_label(dev, b_vp, &clabel);
   2232 	return(0);
   2233 }
   2234 
   2235 /* ARGSUSED */
   2236 int
   2237 raidread_component_label(dev_t dev, struct vnode *b_vp,
   2238 			 RF_ComponentLabel_t *clabel)
   2239 {
   2240 	struct buf *bp;
   2241 	const struct bdevsw *bdev;
   2242 	int error;
   2243 
   2244 	/* XXX should probably ensure that we don't try to do this if
   2245 	   someone has changed rf_protected_sectors. */
   2246 
   2247 	if (b_vp == NULL) {
   2248 		/* For whatever reason, this component is not valid.
   2249 		   Don't try to read a component label from it. */
   2250 		return(EINVAL);
   2251 	}
   2252 
   2253 	/* get a block of the appropriate size... */
   2254 	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
   2255 	bp->b_dev = dev;
   2256 
   2257 	/* get our ducks in a row for the read */
   2258 	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
   2259 	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
   2260 	bp->b_flags |= B_READ;
   2261  	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
   2262 
   2263 	bdev = bdevsw_lookup(bp->b_dev);
   2264 	if (bdev == NULL)
   2265 		return (ENXIO);
   2266 	(*bdev->d_strategy)(bp);
   2267 
   2268 	error = biowait(bp);
   2269 
   2270 	if (!error) {
   2271 		memcpy(clabel, bp->b_data,
   2272 		       sizeof(RF_ComponentLabel_t));
   2273 	}
   2274 
   2275 	brelse(bp);
   2276 	return(error);
   2277 }
   2278 /* ARGSUSED */
   2279 int
   2280 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
   2281 			  RF_ComponentLabel_t *clabel)
   2282 {
   2283 	struct buf *bp;
   2284 	const struct bdevsw *bdev;
   2285 	int error;
   2286 
   2287 	/* get a block of the appropriate size... */
   2288 	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
   2289 	bp->b_dev = dev;
   2290 
   2291 	/* get our ducks in a row for the write */
   2292 	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
   2293 	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
   2294 	bp->b_flags |= B_WRITE;
   2295  	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
   2296 
   2297 	memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
   2298 
   2299 	memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
   2300 
   2301 	bdev = bdevsw_lookup(bp->b_dev);
   2302 	if (bdev == NULL)
   2303 		return (ENXIO);
   2304 	(*bdev->d_strategy)(bp);
   2305 	error = biowait(bp);
   2306 	brelse(bp);
   2307 	if (error) {
   2308 #if 1
   2309 		printf("Failed to write RAID component info!\n");
   2310 #endif
   2311 	}
   2312 
   2313 	return(error);
   2314 }
   2315 
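         /* Bump the mod counter and mark the component labels of all live
            components (and any in-use spares) dirty.  Spared components are
            deliberately left untouched. */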
   2316 void
   2317 rf_markalldirty(RF_Raid_t *raidPtr)
   2318 {
   2319 	RF_ComponentLabel_t clabel;
   2320 	int sparecol;
   2321 	int c;
   2322 	int j;
   2323 	int scol = -1;
   2324 
   2325 	raidPtr->mod_counter++;
   2326 	for (c = 0; c < raidPtr->numCol; c++) {
   2327 		/* we don't want to touch (at all) a disk that has
   2328 		   failed */
   2329 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2330 			raidread_component_label(
   2331 						 raidPtr->Disks[c].dev,
   2332 						 raidPtr->raid_cinfo[c].ci_vp,
   2333 						 &clabel);
   2334 			if (clabel.status == rf_ds_spared) {
   2335 				/* XXX do something special...
   2336 				   but whatever you do, don't
   2337 				   try to access it!! */
   2338 			} else {
   2339 				raidmarkdirty(
   2340 					      raidPtr->Disks[c].dev,
   2341 					      raidPtr->raid_cinfo[c].ci_vp,
   2342 					      raidPtr->mod_counter);
   2343 			}
   2344 		}
   2345 	}
   2346 
   2347 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2348 		sparecol = raidPtr->numCol + c;
   2349 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2350 			/*
   2351 
   2352 			   we claim this disk is "optimal" if it's
   2353 			   rf_ds_used_spare, as that means it should be
   2354 			   directly substitutable for the disk it replaced.
   2355 			   We note that too...
   2356 
   2357 			 */
   2358 
   2359 			for(j=0;j<raidPtr->numCol;j++) {
   2360 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2361 					scol = j;
   2362 					break;
   2363 				}
   2364 			}
   2365 
   2366 			raidread_component_label(
   2367 				 raidPtr->Disks[sparecol].dev,
   2368 				 raidPtr->raid_cinfo[sparecol].ci_vp,
   2369 				 &clabel);
   2370 			/* make sure status is noted */
   2371 
   2372 			raid_init_component_label(raidPtr, &clabel);
   2373 
   2374 			clabel.row = 0;
   2375 			clabel.column = scol;
   2376 			/* Note: we *don't* change status from rf_ds_used_spare
   2377 			   to rf_ds_optimal */
   2378 			/* clabel.status = rf_ds_optimal; */
   2379 
   2380 			raidmarkdirty(raidPtr->Disks[sparecol].dev,
   2381 				      raidPtr->raid_cinfo[sparecol].ci_vp,
   2382 				      raidPtr->mod_counter);
   2383 		}
   2384 	}
   2385 }
   2386 
   2387 
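         /* Rewrite the component labels on all optimal components and in-use
            spares with the current mod counter, status and unit number.  If this
            is the final update (RF_FINAL_COMPONENT_UPDATE) and parity is known
            good, the labels are also marked clean. */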
   2388 void
   2389 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2390 {
   2391 	RF_ComponentLabel_t clabel;
   2392 	int sparecol;
   2393 	int c;
   2394 	int j;
   2395 	int scol;
   2396 
   2397 	scol = -1;
   2398 
   2399 	/* XXX should do extra checks to make sure things really are clean,
   2400 	   rather than blindly setting the clean bit... */
   2401 
   2402 	raidPtr->mod_counter++;
   2403 
   2404 	for (c = 0; c < raidPtr->numCol; c++) {
   2405 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2406 			raidread_component_label(
   2407 						 raidPtr->Disks[c].dev,
   2408 						 raidPtr->raid_cinfo[c].ci_vp,
   2409 						 &clabel);
   2410 			/* make sure status is noted */
   2411 			clabel.status = rf_ds_optimal;
   2412 
   2413 			/* bump the counter */
   2414 			clabel.mod_counter = raidPtr->mod_counter;
   2415 
   2416 			/* note what unit we are configured as */
   2417 			clabel.last_unit = raidPtr->raidid;
   2418 
   2419 			raidwrite_component_label(
   2420 						  raidPtr->Disks[c].dev,
   2421 						  raidPtr->raid_cinfo[c].ci_vp,
   2422 						  &clabel);
   2423 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2424 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2425 					raidmarkclean(
   2426 						      raidPtr->Disks[c].dev,
   2427 						      raidPtr->raid_cinfo[c].ci_vp,
   2428 						      raidPtr->mod_counter);
   2429 				}
   2430 			}
   2431 		}
   2432 		/* else we don't touch it.. */
   2433 	}
   2434 
   2435 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2436 		sparecol = raidPtr->numCol + c;
   2437 		/* Need to ensure that the reconstruct actually completed! */
   2438 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2439 			/*
   2440 
   2441 			   we claim this disk is "optimal" if it's
   2442 			   rf_ds_used_spare, as that means it should be
   2443 			   directly substitutable for the disk it replaced.
   2444 			   We note that too...
   2445 
   2446 			 */
   2447 
   2448 			for(j=0;j<raidPtr->numCol;j++) {
   2449 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2450 					scol = j;
   2451 					break;
   2452 				}
   2453 			}
   2454 
   2455 			/* XXX shouldn't *really* need this... */
   2456 			raidread_component_label(
   2457 				      raidPtr->Disks[sparecol].dev,
   2458 				      raidPtr->raid_cinfo[sparecol].ci_vp,
   2459 				      &clabel);
   2460 			/* make sure status is noted */
   2461 
   2462 			raid_init_component_label(raidPtr, &clabel);
   2463 
   2464 			clabel.mod_counter = raidPtr->mod_counter;
   2465 			clabel.column = scol;
   2466 			clabel.status = rf_ds_optimal;
   2467 			clabel.last_unit = raidPtr->raidid;
   2468 
   2469 			raidwrite_component_label(
   2470 				      raidPtr->Disks[sparecol].dev,
   2471 				      raidPtr->raid_cinfo[sparecol].ci_vp,
   2472 				      &clabel);
   2473 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2474 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2475 					raidmarkclean( raidPtr->Disks[sparecol].dev,
   2476 						       raidPtr->raid_cinfo[sparecol].ci_vp,
   2477 						       raidPtr->mod_counter);
   2478 				}
   2479 			}
   2480 		}
   2481 	}
   2482 }
   2483 
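         /* Close a component's vnode.  Autoconfigured components are closed with
            VOP_CLOSE()/vput(); others go through vn_close() using the engine
            thread's credentials. */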
   2484 void
   2485 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2486 {
   2487 	struct proc *p;
   2488 	struct lwp *l;
   2489 
   2490 	p = raidPtr->engine_thread;
   2491 	l = LIST_FIRST(&p->p_lwps);
   2492 
   2493 	if (vp != NULL) {
   2494 		if (auto_configured == 1) {
   2495 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2496 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
   2497 			vput(vp);
   2498 
   2499 		} else {
   2500 			(void) vn_close(vp, FREAD | FWRITE, p->p_cred, l);
   2501 		}
   2502 	}
   2503 }
   2504 
   2505 
   2506 void
   2507 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2508 {
   2509 	int r,c;
   2510 	struct vnode *vp;
   2511 	int acd;
   2512 
   2513 
   2514 	/* We take this opportunity to close the vnodes like we should.. */
   2515 
   2516 	for (c = 0; c < raidPtr->numCol; c++) {
   2517 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2518 		acd = raidPtr->Disks[c].auto_configured;
   2519 		rf_close_component(raidPtr, vp, acd);
   2520 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2521 		raidPtr->Disks[c].auto_configured = 0;
   2522 	}
   2523 
   2524 	for (r = 0; r < raidPtr->numSpare; r++) {
   2525 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2526 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2527 		rf_close_component(raidPtr, vp, acd);
   2528 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2529 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2530 	}
   2531 }
   2532 
   2533 
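         /* The following functions are the bodies of the kernel threads spawned
            for the long-running operations (reconstruction, parity rewrite,
            copyback, in-place reconstruction).  Each sets the corresponding
            *_in_progress flag for its duration and finishes via kthread_exit(). */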
   2534 void
   2535 rf_ReconThread(struct rf_recon_req *req)
   2536 {
   2537 	int     s;
   2538 	RF_Raid_t *raidPtr;
   2539 
   2540 	s = splbio();
   2541 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2542 	raidPtr->recon_in_progress = 1;
   2543 
   2544 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2545 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2546 
   2547 	RF_Free(req, sizeof(*req));
   2548 
   2549 	raidPtr->recon_in_progress = 0;
   2550 	splx(s);
   2551 
   2552 	/* That's all... */
   2553 	kthread_exit(0);	/* does not return */
   2554 }
   2555 
   2556 void
   2557 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2558 {
   2559 	int retcode;
   2560 	int s;
   2561 
   2562 	raidPtr->parity_rewrite_stripes_done = 0;
   2563 	raidPtr->parity_rewrite_in_progress = 1;
   2564 	s = splbio();
   2565 	retcode = rf_RewriteParity(raidPtr);
   2566 	splx(s);
   2567 	if (retcode) {
   2568 		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
   2569 	} else {
   2570 		/* set the clean bit!  If we shutdown correctly,
   2571 		   the clean bit on each component label will get
   2572 		   set */
   2573 		raidPtr->parity_good = RF_RAID_CLEAN;
   2574 	}
   2575 	raidPtr->parity_rewrite_in_progress = 0;
   2576 
   2577 	/* Anyone waiting for us to stop?  If so, inform them... */
   2578 	if (raidPtr->waitShutdown) {
   2579 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2580 	}
   2581 
   2582 	/* That's all... */
   2583 	kthread_exit(0);	/* does not return */
   2584 }
   2585 
   2586 
   2587 void
   2588 rf_CopybackThread(RF_Raid_t *raidPtr)
   2589 {
   2590 	int s;
   2591 
   2592 	raidPtr->copyback_in_progress = 1;
   2593 	s = splbio();
   2594 	rf_CopybackReconstructedData(raidPtr);
   2595 	splx(s);
   2596 	raidPtr->copyback_in_progress = 0;
   2597 
   2598 	/* That's all... */
   2599 	kthread_exit(0);	/* does not return */
   2600 }
   2601 
   2602 
   2603 void
   2604 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2605 {
   2606 	int s;
   2607 	RF_Raid_t *raidPtr;
   2608 
   2609 	s = splbio();
   2610 	raidPtr = req->raidPtr;
   2611 	raidPtr->recon_in_progress = 1;
   2612 	rf_ReconstructInPlace(raidPtr, req->col);
   2613 	RF_Free(req, sizeof(*req));
   2614 	raidPtr->recon_in_progress = 0;
   2615 	splx(s);
   2616 
   2617 	/* That's all... */
   2618 	kthread_exit(0);	/* does not return */
   2619 }
   2620 
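         /* Read the component label from the given device/vnode and, if it looks
            like a reasonable RAIDframe component, prepend an RF_AutoConfig_t
            entry to ac_list; otherwise close the vnode.  Returns the (possibly
            updated) list head, or NULL if we run out of memory. */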
   2621 static RF_AutoConfig_t *
   2622 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2623     const char *cname, RF_SectorCount_t size)
   2624 {
   2625 	int good_one = 0;
   2626 	RF_ComponentLabel_t *clabel;
   2627 	RF_AutoConfig_t *ac;
   2628 
   2629 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2630 	if (clabel == NULL) {
   2631 oomem:
   2632 		    while(ac_list) {
   2633 			    ac = ac_list;
   2634 			    if (ac->clabel)
   2635 				    free(ac->clabel, M_RAIDFRAME);
   2636 			    ac_list = ac_list->next;
   2637 			    free(ac, M_RAIDFRAME);
   2638 		    }
   2639 		    printf("RAID auto config: out of memory!\n");
   2640 		    return NULL; /* XXX probably should panic? */
   2641 	}
   2642 
   2643 	if (!raidread_component_label(dev, vp, clabel)) {
   2644 		    /* Got the label.  Does it look reasonable? */
   2645 		    if (rf_reasonable_label(clabel) &&
   2646 			(clabel->partitionSize <= size)) {
   2647 #ifdef DEBUG
   2648 			    printf("Component on: %s: %llu\n",
   2649 				cname, (unsigned long long)size);
   2650 			    rf_print_component_label(clabel);
   2651 #endif
   2652 			    /* if it's reasonable, add it, else ignore it. */
   2653 			    ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2654 				M_NOWAIT);
   2655 			    if (ac == NULL) {
   2656 				    free(clabel, M_RAIDFRAME);
   2657 				    goto oomem;
   2658 			    }
   2659 			    strlcpy(ac->devname, cname, sizeof(ac->devname));
   2660 			    ac->dev = dev;
   2661 			    ac->vp = vp;
   2662 			    ac->clabel = clabel;
   2663 			    ac->next = ac_list;
   2664 			    ac_list = ac;
   2665 			    good_one = 1;
   2666 		    }
   2667 	}
   2668 	if (!good_one) {
   2669 		/* cleanup */
   2670 		free(clabel, M_RAIDFRAME);
   2671 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2672 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
   2673 		vput(vp);
   2674 	}
   2675 	return ac_list;
   2676 }
   2677 
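         /* Walk all disk-class devices in the system (skipping floppies and CDs)
            and collect candidate RAID components: wedges of type
            DKW_PTYPE_RAIDFRAME and disklabel partitions marked FS_RAID. */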
   2678 RF_AutoConfig_t *
   2679 rf_find_raid_components()
   2680 {
   2681 	struct vnode *vp;
   2682 	struct disklabel label;
   2683 	struct device *dv;
   2684 	dev_t dev;
   2685 	int bmajor, bminor, wedge;
   2686 	int error;
   2687 	int i;
   2688 	RF_AutoConfig_t *ac_list;
   2689 
   2690 
   2691 	/* initialize the AutoConfig list */
   2692 	ac_list = NULL;
   2693 
   2694 	/* we begin by trolling through *all* the devices on the system */
   2695 
   2696 	for (dv = alldevs.tqh_first; dv != NULL;
   2697 	     dv = dv->dv_list.tqe_next) {
   2698 
   2699 		/* we are only interested in disks... */
   2700 		if (device_class(dv) != DV_DISK)
   2701 			continue;
   2702 
   2703 		/* we don't care about floppies... */
   2704 		if (device_is_a(dv, "fd")) {
   2705 			continue;
   2706 		}
   2707 
   2708 		/* we don't care about CD's... */
   2709 		if (device_is_a(dv, "cd")) {
   2710 			continue;
   2711 		}
   2712 
   2713 		/* hdfd is the Atari/Hades floppy driver */
   2714 		if (device_is_a(dv, "hdfd")) {
   2715 			continue;
   2716 		}
   2717 
   2718 		/* fdisa is the Atari/Milan floppy driver */
   2719 		if (device_is_a(dv, "fdisa")) {
   2720 			continue;
   2721 		}
   2722 
   2723 		/* need to find the device_name_to_block_device_major stuff */
   2724 		bmajor = devsw_name2blk(dv->dv_xname, NULL, 0);
   2725 
   2726 		/* get a vnode for the raw partition of this disk */
   2727 
   2728 		wedge = device_is_a(dv, "dk");
   2729 		bminor = minor(device_unit(dv));
   2730 		dev = wedge ? makedev(bmajor, bminor) :
   2731 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2732 		if (bdevvp(dev, &vp))
   2733 			panic("RAID can't alloc vnode");
   2734 
   2735 		error = VOP_OPEN(vp, FREAD, NOCRED, 0);
   2736 
   2737 		if (error) {
   2738 			/* "Who cares."  Continue looking
    2739 			   for something that exists */
   2740 			vput(vp);
   2741 			continue;
   2742 		}
   2743 
   2744 		if (wedge) {
   2745 			struct dkwedge_info dkw;
   2746 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2747 			    NOCRED, 0);
   2748 			if (error) {
   2749 				printf("RAIDframe: can't get wedge info for "
   2750 				    "dev %s (%d)\n", dv->dv_xname, error);
   2751 out:
   2752 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2753 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
   2754 				vput(vp);
   2755 				continue;
   2756 			}
   2757 
   2758 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0)
   2759 				goto out;
   2760 
   2761 			ac_list = rf_get_component(ac_list, dev, vp,
   2762 			    dv->dv_xname, dkw.dkw_size);
   2763 			continue;
   2764 		}
   2765 
   2766 		/* Ok, the disk exists.  Go get the disklabel. */
   2767 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED, 0);
   2768 		if (error) {
   2769 			/*
   2770 			 * XXX can't happen - open() would
   2771 			 * have errored out (or faked up one)
   2772 			 */
   2773 			if (error != ENOTTY)
   2774 				printf("RAIDframe: can't get label for dev "
   2775 				    "%s (%d)\n", dv->dv_xname, error);
   2776 		}
   2777 
   2778 		/* don't need this any more.  We'll allocate it again
   2779 		   a little later if we really do... */
   2780 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2781 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED, 0);
   2782 		vput(vp);
   2783 
   2784 		if (error)
   2785 			continue;
   2786 
   2787 		for (i = 0; i < label.d_npartitions; i++) {
   2788 			char cname[sizeof(ac_list->devname)];
   2789 
   2790 			/* We only support partitions marked as RAID */
   2791 			if (label.d_partitions[i].p_fstype != FS_RAID)
   2792 				continue;
   2793 
   2794 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2795 			if (bdevvp(dev, &vp))
   2796 				panic("RAID can't alloc vnode");
   2797 
   2798 			error = VOP_OPEN(vp, FREAD, NOCRED, 0);
   2799 			if (error) {
   2800 				/* Whatever... */
   2801 				vput(vp);
   2802 				continue;
   2803 			}
   2804 			snprintf(cname, sizeof(cname), "%s%c",
   2805 			    dv->dv_xname, 'a' + i);
   2806 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   2807 				label.d_partitions[i].p_size);
   2808 		}
   2809 	}
   2810 	return ac_list;
   2811 }
   2812 
   2813 
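         /* Sanity-check a component label: known version, a valid clean/dirty
            flag, and self-consistent row/column/size fields. */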
   2814 static int
   2815 rf_reasonable_label(RF_ComponentLabel_t *clabel)
   2816 {
   2817 
   2818 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2819 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2820 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2821 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2822 	    clabel->row >=0 &&
   2823 	    clabel->column >= 0 &&
   2824 	    clabel->num_rows > 0 &&
   2825 	    clabel->num_columns > 0 &&
   2826 	    clabel->row < clabel->num_rows &&
   2827 	    clabel->column < clabel->num_columns &&
   2828 	    clabel->blockSize > 0 &&
   2829 	    clabel->numBlocks > 0) {
   2830 		/* label looks reasonable enough... */
   2831 		return(1);
   2832 	}
   2833 	return(0);
   2834 }
   2835 
   2836 
   2837 #ifdef DEBUG
   2838 void
   2839 rf_print_component_label(RF_ComponentLabel_t *clabel)
   2840 {
   2841 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   2842 	       clabel->row, clabel->column,
   2843 	       clabel->num_rows, clabel->num_columns);
   2844 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   2845 	       clabel->version, clabel->serial_number,
   2846 	       clabel->mod_counter);
   2847 	printf("   Clean: %s Status: %d\n",
   2848 	       clabel->clean ? "Yes" : "No", clabel->status );
   2849 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   2850 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   2851 	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
   2852 	       (char) clabel->parityConfig, clabel->blockSize,
   2853 	       clabel->numBlocks);
   2854 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
   2855 	printf("   Contains root partition: %s\n",
   2856 	       clabel->root_partition ? "Yes" : "No" );
   2857 	printf("   Last configured as: raid%d\n", clabel->last_unit );
   2858 #if 0
   2859 	   printf("   Config order: %d\n", clabel->config_order);
   2860 #endif
   2861 
   2862 }
   2863 #endif
   2864 
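         /* Partition the flat AutoConfig list into config sets: components whose
            labels match (as judged by rf_does_it_fit()) end up in the same
            RF_ConfigSet_t. */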
   2865 RF_ConfigSet_t *
   2866 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   2867 {
   2868 	RF_AutoConfig_t *ac;
   2869 	RF_ConfigSet_t *config_sets;
   2870 	RF_ConfigSet_t *cset;
   2871 	RF_AutoConfig_t *ac_next;
   2872 
   2873 
   2874 	config_sets = NULL;
   2875 
   2876 	/* Go through the AutoConfig list, and figure out which components
   2877 	   belong to what sets.  */
   2878 	ac = ac_list;
   2879 	while(ac!=NULL) {
   2880 		/* we're going to putz with ac->next, so save it here
   2881 		   for use at the end of the loop */
   2882 		ac_next = ac->next;
   2883 
   2884 		if (config_sets == NULL) {
   2885 			/* will need at least this one... */
   2886 			config_sets = (RF_ConfigSet_t *)
   2887 				malloc(sizeof(RF_ConfigSet_t),
   2888 				       M_RAIDFRAME, M_NOWAIT);
   2889 			if (config_sets == NULL) {
   2890 				panic("rf_create_auto_sets: No memory!");
   2891 			}
   2892 			/* this one is easy :) */
   2893 			config_sets->ac = ac;
   2894 			config_sets->next = NULL;
   2895 			config_sets->rootable = 0;
   2896 			ac->next = NULL;
   2897 		} else {
   2898 			/* which set does this component fit into? */
   2899 			cset = config_sets;
   2900 			while(cset!=NULL) {
   2901 				if (rf_does_it_fit(cset, ac)) {
   2902 					/* looks like it matches... */
   2903 					ac->next = cset->ac;
   2904 					cset->ac = ac;
   2905 					break;
   2906 				}
   2907 				cset = cset->next;
   2908 			}
   2909 			if (cset==NULL) {
   2910 				/* didn't find a match above... new set..*/
   2911 				cset = (RF_ConfigSet_t *)
   2912 					malloc(sizeof(RF_ConfigSet_t),
   2913 					       M_RAIDFRAME, M_NOWAIT);
   2914 				if (cset == NULL) {
   2915 					panic("rf_create_auto_sets: No memory!");
   2916 				}
   2917 				cset->ac = ac;
   2918 				ac->next = NULL;
   2919 				cset->next = config_sets;
   2920 				cset->rootable = 0;
   2921 				config_sets = cset;
   2922 			}
   2923 		}
   2924 		ac = ac_next;
   2925 	}
   2926 
   2927 
   2928 	return(config_sets);
   2929 }
   2930 
   2931 static int
   2932 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   2933 {
   2934 	RF_ComponentLabel_t *clabel1, *clabel2;
   2935 
   2936 	/* If this one matches the *first* one in the set, that's good
   2937 	   enough, since the other members of the set would have been
   2938 	   through here too... */
   2939 	/* note that we are not checking partitionSize here..
   2940 
   2941 	   Note that we are also not checking the mod_counters here.
    2942 	   If everything else matches except the mod_counter, that's
   2943 	   good enough for this test.  We will deal with the mod_counters
   2944 	   a little later in the autoconfiguration process.
   2945 
   2946 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   2947 
   2948 	   The reason we don't check for this is that failed disks
   2949 	   will have lower modification counts.  If those disks are
   2950 	   not added to the set they used to belong to, then they will
   2951 	   form their own set, which may result in 2 different sets,
   2952 	   for example, competing to be configured at raid0, and
   2953 	   perhaps competing to be the root filesystem set.  If the
   2954 	   wrong ones get configured, or both attempt to become /,
   2955 	   weird behaviour and or serious lossage will occur.  Thus we
    2956 	   weird behaviour and/or serious lossage will occur.  Thus we
   2957 	   a later point.
   2958 
   2959 	*/
   2960 
   2961 	clabel1 = cset->ac->clabel;
   2962 	clabel2 = ac->clabel;
   2963 	if ((clabel1->version == clabel2->version) &&
   2964 	    (clabel1->serial_number == clabel2->serial_number) &&
   2965 	    (clabel1->num_rows == clabel2->num_rows) &&
   2966 	    (clabel1->num_columns == clabel2->num_columns) &&
   2967 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   2968 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   2969 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   2970 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   2971 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   2972 	    (clabel1->blockSize == clabel2->blockSize) &&
   2973 	    (clabel1->numBlocks == clabel2->numBlocks) &&
   2974 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   2975 	    (clabel1->root_partition == clabel2->root_partition) &&
   2976 	    (clabel1->last_unit == clabel2->last_unit) &&
   2977 	    (clabel1->config_order == clabel2->config_order)) {
    2978 		/* if it gets here, it almost *has* to be a match */
   2979 	} else {
   2980 		/* it's not consistent with somebody in the set..
   2981 		   punt */
   2982 		return(0);
   2983 	}
   2984 	/* all was fine.. it must fit... */
   2985 	return(1);
   2986 }
   2987 
   2988 int
   2989 rf_have_enough_components(RF_ConfigSet_t *cset)
   2990 {
   2991 	RF_AutoConfig_t *ac;
   2992 	RF_AutoConfig_t *auto_config;
   2993 	RF_ComponentLabel_t *clabel;
   2994 	int c;
   2995 	int num_cols;
   2996 	int num_missing;
   2997 	int mod_counter;
   2998 	int mod_counter_found;
   2999 	int even_pair_failed;
   3000 	char parity_type;
   3001 
   3002 
   3003 	/* check to see that we have enough 'live' components
   3004 	   of this set.  If so, we can configure it if necessary */
   3005 
   3006 	num_cols = cset->ac->clabel->num_columns;
   3007 	parity_type = cset->ac->clabel->parityConfig;
   3008 
   3009 	/* XXX Check for duplicate components!?!?!? */
   3010 
   3011 	/* Determine what the mod_counter is supposed to be for this set. */
   3012 
   3013 	mod_counter_found = 0;
   3014 	mod_counter = 0;
   3015 	ac = cset->ac;
   3016 	while(ac!=NULL) {
   3017 		if (mod_counter_found==0) {
   3018 			mod_counter = ac->clabel->mod_counter;
   3019 			mod_counter_found = 1;
   3020 		} else {
   3021 			if (ac->clabel->mod_counter > mod_counter) {
   3022 				mod_counter = ac->clabel->mod_counter;
   3023 			}
   3024 		}
   3025 		ac = ac->next;
   3026 	}
   3027 
   3028 	num_missing = 0;
   3029 	auto_config = cset->ac;
   3030 
   3031 	even_pair_failed = 0;
   3032 	for(c=0; c<num_cols; c++) {
   3033 		ac = auto_config;
   3034 		while(ac!=NULL) {
   3035 			if ((ac->clabel->column == c) &&
   3036 			    (ac->clabel->mod_counter == mod_counter)) {
   3037 				/* it's this one... */
   3038 #ifdef DEBUG
   3039 				printf("Found: %s at %d\n",
   3040 				       ac->devname,c);
   3041 #endif
   3042 				break;
   3043 			}
   3044 			ac=ac->next;
   3045 		}
   3046 		if (ac==NULL) {
   3047 				/* Didn't find one here! */
   3048 				/* special case for RAID 1, especially
   3049 				   where there are more than 2
   3050 				   components (where RAIDframe treats
   3051 				   things a little differently :( ) */
   3052 			if (parity_type == '1') {
   3053 				if (c%2 == 0) { /* even component */
   3054 					even_pair_failed = 1;
   3055 				} else { /* odd component.  If
   3056 					    we're failed, and
   3057 					    so is the even
   3058 					    component, it's
   3059 					    "Good Night, Charlie" */
   3060 					if (even_pair_failed == 1) {
   3061 						return(0);
   3062 					}
   3063 				}
   3064 			} else {
   3065 				/* normal accounting */
   3066 				num_missing++;
   3067 			}
   3068 		}
   3069 		if ((parity_type == '1') && (c%2 == 1)) {
   3070 				/* Just did an even component, and we didn't
   3071 				   bail.. reset the even_pair_failed flag,
   3072 				   and go on to the next component.... */
   3073 			even_pair_failed = 0;
   3074 		}
   3075 	}
   3076 
   3077 	clabel = cset->ac->clabel;
   3078 
   3079 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3080 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3081 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3082 		/* XXX this needs to be made *much* more general */
   3083 		/* Too many failures */
   3084 		return(0);
   3085 	}
   3086 	/* otherwise, all is well, and we've got enough to take a kick
   3087 	   at autoconfiguring this set */
   3088 	return(1);
   3089 }
   3090 
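         /* Build an RF_Config_t from the component labels of an autoconfig set so
            that the set can be handed to rf_Configure(). */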
   3091 void
   3092 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3093 			RF_Raid_t *raidPtr)
   3094 {
   3095 	RF_ComponentLabel_t *clabel;
   3096 	int i;
   3097 
   3098 	clabel = ac->clabel;
   3099 
   3100 	/* 1. Fill in the common stuff */
   3101 	config->numRow = clabel->num_rows = 1;
   3102 	config->numCol = clabel->num_columns;
   3103 	config->numSpare = 0; /* XXX should this be set here? */
   3104 	config->sectPerSU = clabel->sectPerSU;
   3105 	config->SUsPerPU = clabel->SUsPerPU;
   3106 	config->SUsPerRU = clabel->SUsPerRU;
   3107 	config->parityConfig = clabel->parityConfig;
   3108 	/* XXX... */
   3109 	strcpy(config->diskQueueType,"fifo");
   3110 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3111 	config->layoutSpecificSize = 0; /* XXX ?? */
   3112 
   3113 	while(ac!=NULL) {
   3114 		/* row/col values will be in range due to the checks
   3115 		   in reasonable_label() */
   3116 		strcpy(config->devnames[0][ac->clabel->column],
   3117 		       ac->devname);
   3118 		ac = ac->next;
   3119 	}
   3120 
   3121 	for(i=0;i<RF_MAXDBGV;i++) {
   3122 		config->debugVars[i][0] = 0;
   3123 	}
   3124 }
   3125 
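         /* Set the autoconfigure flag for the array and push the new value out to
            the component label of every optimal component and in-use spare. */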
   3126 int
   3127 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3128 {
   3129 	RF_ComponentLabel_t clabel;
   3130 	struct vnode *vp;
   3131 	dev_t dev;
   3132 	int column;
   3133 	int sparecol;
   3134 
   3135 	raidPtr->autoconfigure = new_value;
   3136 
   3137 	for(column=0; column<raidPtr->numCol; column++) {
   3138 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3139 			dev = raidPtr->Disks[column].dev;
   3140 			vp = raidPtr->raid_cinfo[column].ci_vp;
   3141 			raidread_component_label(dev, vp, &clabel);
   3142 			clabel.autoconfigure = new_value;
   3143 			raidwrite_component_label(dev, vp, &clabel);
   3144 		}
   3145 	}
   3146 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3147 		sparecol = raidPtr->numCol + column;
   3148 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3149 			dev = raidPtr->Disks[sparecol].dev;
   3150 			vp = raidPtr->raid_cinfo[sparecol].ci_vp;
   3151 			raidread_component_label(dev, vp, &clabel);
   3152 			clabel.autoconfigure = new_value;
   3153 			raidwrite_component_label(dev, vp, &clabel);
   3154 		}
   3155 	}
   3156 	return(new_value);
   3157 }
   3158 
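         /* As rf_set_autoconfig() above, but for the root_partition flag. */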
   3159 int
   3160 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3161 {
   3162 	RF_ComponentLabel_t clabel;
   3163 	struct vnode *vp;
   3164 	dev_t dev;
   3165 	int column;
   3166 	int sparecol;
   3167 
   3168 	raidPtr->root_partition = new_value;
   3169 	for(column=0; column<raidPtr->numCol; column++) {
   3170 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3171 			dev = raidPtr->Disks[column].dev;
   3172 			vp = raidPtr->raid_cinfo[column].ci_vp;
   3173 			raidread_component_label(dev, vp, &clabel);
   3174 			clabel.root_partition = new_value;
   3175 			raidwrite_component_label(dev, vp, &clabel);
   3176 		}
   3177 	}
   3178 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3179 		sparecol = raidPtr->numCol + column;
   3180 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3181 			dev = raidPtr->Disks[sparecol].dev;
   3182 			vp = raidPtr->raid_cinfo[sparecol].ci_vp;
   3183 			raidread_component_label(dev, vp, &clabel);
   3184 			clabel.root_partition = new_value;
   3185 			raidwrite_component_label(dev, vp, &clabel);
   3186 		}
   3187 	}
   3188 	return(new_value);
   3189 }
   3190 
   3191 void
   3192 rf_release_all_vps(RF_ConfigSet_t *cset)
   3193 {
   3194 	RF_AutoConfig_t *ac;
   3195 
   3196 	ac = cset->ac;
   3197 	while(ac!=NULL) {
   3198 		/* Close the vp, and give it back */
   3199 		if (ac->vp) {
   3200 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3201 			VOP_CLOSE(ac->vp, FREAD, NOCRED, 0);
   3202 			vput(ac->vp);
   3203 			ac->vp = NULL;
   3204 		}
   3205 		ac = ac->next;
   3206 	}
   3207 }
   3208 
   3209 
   3210 void
   3211 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3212 {
   3213 	RF_AutoConfig_t *ac;
   3214 	RF_AutoConfig_t *next_ac;
   3215 
   3216 	ac = cset->ac;
   3217 	while(ac!=NULL) {
   3218 		next_ac = ac->next;
   3219 		/* nuke the label */
   3220 		free(ac->clabel, M_RAIDFRAME);
   3221 		/* cleanup the config structure */
   3222 		free(ac, M_RAIDFRAME);
   3223 		/* "next.." */
   3224 		ac = next_ac;
   3225 	}
   3226 	/* and, finally, nuke the config set */
   3227 	free(cset, M_RAIDFRAME);
   3228 }
   3229 
   3230 
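         /* Fill in a component label from the array's current configuration.
            Per-component fields such as row and column are set by the callers. */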
   3231 void
   3232 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3233 {
   3234 	/* current version number */
   3235 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3236 	clabel->serial_number = raidPtr->serial_number;
   3237 	clabel->mod_counter = raidPtr->mod_counter;
   3238 	clabel->num_rows = 1;
   3239 	clabel->num_columns = raidPtr->numCol;
   3240 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3241 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3242 
   3243 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3244 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3245 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3246 
   3247 	clabel->blockSize = raidPtr->bytesPerSector;
   3248 	clabel->numBlocks = raidPtr->sectorsPerDisk;
   3249 
   3250 	/* XXX not portable */
   3251 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3252 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3253 	clabel->autoconfigure = raidPtr->autoconfigure;
   3254 	clabel->root_partition = raidPtr->root_partition;
   3255 	clabel->last_unit = raidPtr->raidid;
   3256 	clabel->config_order = raidPtr->config_order;
   3257 }
   3258 
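         /*
          * rf_auto_config_set() -- bring up one auto-detected RAID set.
          * Builds an RF_Config_t from the saved component labels, picks a
          * free unit number (preferring the unit the set last lived on),
          * runs the normal configuration path, and hands the unit number
          * back via *unit.  Returns zero on success, non-zero on failure.
          */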
   3259 int
   3260 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3261 {
   3262 	RF_Raid_t *raidPtr;
   3263 	RF_Config_t *config;
   3264 	int raidID;
   3265 	int retcode;
   3266 
   3267 #ifdef DEBUG
   3268 	printf("RAID autoconfigure\n");
   3269 #endif
   3270 
   3271 	retcode = 0;
   3272 	*unit = -1;
   3273 
   3274 	/* 1. Create a config structure */
   3275 
   3276 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3277 				       M_RAIDFRAME,
   3278 				       M_NOWAIT);
    3279 	if (config == NULL) {
    3280 		printf("Out of mem!?!?\n");
    3281 		/* XXX do something more intelligent here. */
    3282 		return(1);
    3283 	}
   3284 
   3285 	memset(config, 0, sizeof(RF_Config_t));
   3286 
    3287 	/*
    3288 	   2. Figure out which RAID ID this set is supposed to live at.
    3289 	   See if we can get the same RAID device it was configured
    3290 	   on last time.
    3291 	*/
   3292 
   3293 	raidID = cset->ac->clabel->last_unit;
   3294 	if ((raidID < 0) || (raidID >= numraid)) {
   3295 		/* let's not wander off into lala land. */
   3296 		raidID = numraid - 1;
   3297 	}
   3298 	if (raidPtrs[raidID]->valid != 0) {
   3299 
    3300 		/*
    3301 		   Nope... Go looking for an alternative.
    3302 		   Start the search at the high end so we don't grab raid0
    3303 		   just because it happens to be free.
    3304 		*/
   3305 
   3306 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3307 			if (raidPtrs[raidID]->valid == 0) {
   3308 				/* can use this one! */
   3309 				break;
   3310 			}
   3311 		}
   3312 	}
   3313 
   3314 	if (raidID < 0) {
   3315 		/* punt... */
   3316 		printf("Unable to auto configure this set!\n");
   3317 		printf("(Out of RAID devs!)\n");
   3318 		free(config, M_RAIDFRAME);
   3319 		return(1);
   3320 	}
   3321 
   3322 #ifdef DEBUG
   3323 	printf("Configuring raid%d:\n",raidID);
   3324 #endif
   3325 
   3326 	raidPtr = raidPtrs[raidID];
   3327 
   3328 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3329 	raidPtr->raidid = raidID;
   3330 	raidPtr->openings = RAIDOUTSTANDING;
   3331 
   3332 	/* 3. Build the configuration structure */
   3333 	rf_create_configuration(cset->ac, config, raidPtr);
   3334 
   3335 	/* 4. Do the configuration */
   3336 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3337 
   3338 	if (retcode == 0) {
   3339 
   3340 		raidinit(raidPtrs[raidID]);
   3341 
   3342 		rf_markalldirty(raidPtrs[raidID]);
   3343 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3344 		if (cset->ac->clabel->root_partition==1) {
   3345 			/* everything configured just fine.  Make a note
   3346 			   that this set is eligible to be root. */
   3347 			cset->rootable = 1;
   3348 			/* XXX do this here? */
   3349 			raidPtrs[raidID]->root_partition = 1;
   3350 		}
   3351 	}
   3352 
   3353 	/* 5. Cleanup */
   3354 	free(config, M_RAIDFRAME);
   3355 
   3356 	*unit = raidID;
   3357 	return(retcode);
   3358 }
   3359 
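         /*
          * rf_disk_unbusy() -- credit a completed transfer to the disk
          * statistics of the RAID unit the access descriptor belongs to.
          */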
   3360 void
   3361 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3362 {
   3363 	struct buf *bp;
   3364 
   3365 	bp = (struct buf *)desc->bp;
   3366 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3367 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3368 }
   3369 
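         /*
          * rf_pool_init() -- set up one of the RAIDframe resource pools,
          * priming it with xmin items and setting the low and high water
          * marks to xmin and xmax respectively.
          */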
   3370 void
   3371 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3372 	     size_t xmin, size_t xmax)
   3373 {
   3374 	pool_init(p, size, 0, 0, 0, w_chan, NULL);
   3375 	pool_sethiwat(p, xmax);
   3376 	pool_prime(p, xmin);
   3377 	pool_setlowat(p, xmin);
   3378 }
   3379 
   3380 /*
   3381  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3382  * if there is IO pending and if that IO could possibly be done for a
   3383  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3384  * otherwise.
   3385  *
   3386  */
   3387 
   3388 int
   3389 rf_buf_queue_check(int raidid)
   3390 {
   3391 	if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
   3392 	    raidPtrs[raidid]->openings > 0) {
   3393 		/* there is work to do */
   3394 		return 0;
   3395 	}
   3396 	/* default is nothing to do */
   3397 	return 1;
   3398 }
   3399 
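         /*
          * rf_getdisksize() -- determine the sector size and usable size of
          * a component.  Try DIOCGPART (disklabel partition) first and fall
          * back to DIOCGWEDGEINFO for wedges, which are assumed to have
          * 512-byte sectors; in both cases rf_protectedSectors are reserved
          * out of the partition size.
          */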
   3400 int
   3401 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
   3402 {
   3403 	struct partinfo dpart;
   3404 	struct dkwedge_info dkw;
   3405 	int error;
   3406 
   3407 	error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred, l);
   3408 	if (error == 0) {
   3409 		diskPtr->blockSize = dpart.disklab->d_secsize;
   3410 		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
   3411 		diskPtr->partitionSize = dpart.part->p_size;
   3412 		return 0;
   3413 	}
   3414 
   3415 	error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred, l);
   3416 	if (error == 0) {
   3417 		diskPtr->blockSize = 512;	/* XXX */
   3418 		diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
   3419 		diskPtr->partitionSize = dkw.dkw_size;
   3420 		return 0;
   3421 	}
   3422 	return error;
   3423 }
   3424 
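         /*
          * Autoconfiguration glue for the raid pseudo-device: raid_match()
          * always matches, raid_attach() has nothing to do, and
          * raid_detach() refuses to detach a unit that is still configured.
          */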
   3425 static int
   3426 raid_match(struct device *self, struct cfdata *cfdata,
   3427     void *aux)
   3428 {
   3429 	return 1;
   3430 }
   3431 
   3432 static void
   3433 raid_attach(struct device *parent, struct device *self,
   3434     void *aux)
   3435 {
   3436 
   3437 }
   3438 
   3439 
   3440 static int
   3441 raid_detach(struct device *self, int flags)
   3442 {
   3443 	struct raid_softc *rs = (struct raid_softc *)self;
   3444 
   3445 	if (rs->sc_flags & RAIDF_INITED)
   3446 		return EBUSY;
   3447 
   3448 	return 0;
   3449 }
   3450 
   3451 
   3452