Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.333
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.333 2016/01/02 16:10:06 mlelstv Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.333 2016/01/02 16:10:06 mlelstv Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 struct raid_softc;
    183 static void raidinit(struct raid_softc *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 static dev_type_open(raidopen);
    201 static dev_type_close(raidclose);
    202 static dev_type_read(raidread);
    203 static dev_type_write(raidwrite);
    204 static dev_type_ioctl(raidioctl);
    205 static dev_type_strategy(raidstrategy);
    206 static dev_type_dump(raiddump);
    207 static dev_type_size(raidsize);
    208 
/*
 * Block device switch entry points; raid is a disk-class device
 * (D_DISK), so it supports dumping and partition sizing.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/*
 * Character (raw) device switch.  Operations a disk does not support
 * are wired to the standard no-op stubs (nostop, notty, nopoll, ...).
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Glue for the generic disk(9) layer: strategy routine and minphys. */
static struct dkdriver rf_dkdriver = {
	.d_strategy = raidstrategy,
	.d_minphys = minphys
};
    239 
/* Per-unit software state for one RAID pseudo-device (raidN). */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle (NULL if not attached) */
	int	sc_unit;	/* unit number N of raidN */
	int     sc_flags;	/* flags (RAIDF_*, below) */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global `raids' list */
};
/* sc_flags values (note: bits 0x10 and 0x20 are currently unused) */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */
    261 
    262 #define	raidunit(x)	DISKUNIT(x)
    263 
    264 extern struct cfdriver raid_cd;
    265 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    266     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    267     DVF_DETACH_SHUTDOWN);
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
    278  * Even it if is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    296 				     struct disklabel *);
    297 static void raidgetdisklabel(dev_t);
    298 static void raidmakedisklabel(struct raid_softc *);
    299 
    300 static int raidlock(struct raid_softc *);
    301 static void raidunlock(struct raid_softc *);
    302 
    303 static int raid_detach_unlocked(struct raid_softc *);
    304 
    305 static void rf_markalldirty(RF_Raid_t *);
    306 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    307 
    308 void rf_ReconThread(struct rf_recon_req *);
    309 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    310 void rf_CopybackThread(RF_Raid_t *raidPtr);
    311 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    312 int rf_autoconfig(device_t);
    313 void rf_buildroothack(RF_ConfigSet_t *);
    314 
    315 RF_AutoConfig_t *rf_find_raid_components(void);
    316 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    317 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    318 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    319 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    320 int rf_set_autoconfig(RF_Raid_t *, int);
    321 int rf_set_rootpartition(RF_Raid_t *, int);
    322 void rf_release_all_vps(RF_ConfigSet_t *);
    323 void rf_cleanup_config_set(RF_ConfigSet_t *);
    324 int rf_have_enough_components(RF_ConfigSet_t *);
    325 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    326 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    327 
    328 /*
    329  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    330  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    331  * in the kernel config file.
    332  */
    333 #ifdef RAID_AUTOCONFIG
    334 int raidautoconfig = 1;
    335 #else
    336 int raidautoconfig = 0;
    337 #endif
    338 static bool raidautoconfigdone = false;
    339 
    340 struct RF_Pools_s rf_pools;
    341 
    342 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    343 static kmutex_t raid_lock;
    344 
    345 static struct raid_softc *
    346 raidcreate(int unit) {
    347 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    348 	if (sc == NULL) {
    349 #ifdef DIAGNOSTIC
    350 		printf("%s: out of memory\n", __func__);
    351 #endif
    352 		return NULL;
    353 	}
    354 	sc->sc_unit = unit;
    355 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    356 	cv_init(&sc->sc_cv, "raidunit");
    357 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    358 	return sc;
    359 }
    360 
    361 static void
    362 raiddestroy(struct raid_softc *sc) {
    363 	cv_destroy(&sc->sc_cv);
    364 	mutex_destroy(&sc->sc_mutex);
    365 	bufq_free(sc->buf_queue);
    366 	kmem_free(sc, sizeof(*sc));
    367 }
    368 
    369 static struct raid_softc *
    370 raidget(int unit, bool create) {
    371 	struct raid_softc *sc;
    372 	if (unit < 0) {
    373 #ifdef DIAGNOSTIC
    374 		panic("%s: unit %d!", __func__, unit);
    375 #endif
    376 		return NULL;
    377 	}
    378 	mutex_enter(&raid_lock);
    379 	LIST_FOREACH(sc, &raids, sc_link) {
    380 		if (sc->sc_unit == unit) {
    381 			mutex_exit(&raid_lock);
    382 			return sc;
    383 		}
    384 	}
    385 	mutex_exit(&raid_lock);
    386 	if (!create)
    387 		return NULL;
    388 	if ((sc = raidcreate(unit)) == NULL)
    389 		return NULL;
    390 	mutex_enter(&raid_lock);
    391 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    392 	mutex_exit(&raid_lock);
    393 	return sc;
    394 }
    395 
    396 static void
    397 raidput(struct raid_softc *sc) {
    398 	mutex_enter(&raid_lock);
    399 	LIST_REMOVE(sc, sc_link);
    400 	mutex_exit(&raid_lock);
    401 	raiddestroy(sc);
    402 }
    403 
/*
 * Legacy pseudo-device attach hook; `num' (the unit count from the
 * kernel config) is intentionally ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    413 
    414 int
    415 rf_autoconfig(device_t self)
    416 {
    417 	RF_AutoConfig_t *ac_list;
    418 	RF_ConfigSet_t *config_sets;
    419 
    420 	if (!raidautoconfig || raidautoconfigdone == true)
    421 		return (0);
    422 
    423 	/* XXX This code can only be run once. */
    424 	raidautoconfigdone = true;
    425 
    426 #ifdef __HAVE_CPU_BOOTCONF
    427 	/*
    428 	 * 0. find the boot device if needed first so we can use it later
    429 	 * this needs to be done before we autoconfigure any raid sets,
    430 	 * because if we use wedges we are not going to be able to open
    431 	 * the boot device later
    432 	 */
    433 	if (booted_device == NULL)
    434 		cpu_bootconf();
    435 #endif
    436 	/* 1. locate all RAID components on the system */
    437 	aprint_debug("Searching for RAID components...\n");
    438 	ac_list = rf_find_raid_components();
    439 
    440 	/* 2. Sort them into their respective sets. */
    441 	config_sets = rf_create_auto_sets(ac_list);
    442 
    443 	/*
    444 	 * 3. Evaluate each set and configure the valid ones.
    445 	 * This gets done in rf_buildroothack().
    446 	 */
    447 	rf_buildroothack(config_sets);
    448 
    449 	return 1;
    450 }
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname = device_xname(bdv);
    455 	size_t len = strlen(bootname);
    456 
    457 	for (int col = 0; col < r->numCol; col++) {
    458 		const char *devname = r->Disks[col].devname;
    459 		devname += sizeof("/dev/") - 1;
    460 		if (strncmp(devname, "dk", 2) == 0) {
    461 			const char *parent =
    462 			    dkwedge_get_parent_name(r->Disks[col].dev);
    463 			if (parent != NULL)
    464 				devname = parent;
    465 		}
    466 		if (strncmp(devname, bootname, len) == 0) {
    467 			struct raid_softc *sc = r->softc;
    468 			aprint_debug("raid%d includes boot device %s\n",
    469 			    sc->sc_unit, devname);
    470 			return 1;
    471 		}
    472 	}
    473 	return 0;
    474 }
    475 
/*
 * Walk the list of autoconfigured component sets: configure each set
 * that is complete and marked autoconfigure, then (unless the user
 * hardwired a root spec) decide whether one of the configured arrays
 * should supply the root file system.  May modify the globals
 * booted_device, booted_partition and boothowto.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure every eligible set, counting rootable ones. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					/* remember the last rootable set */
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* Pass 2: decide whether a configured array becomes root. */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		/* Exactly one rootable set: use it (or its first wedge). */
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		/* Override root when forced (root_partition == 1) or the
		   array contains the device we booted from. */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only arrays that contain the boot
		   device; if exactly one remains, it wins. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    586 
    587 static int
    588 raidsize(dev_t dev)
    589 {
    590 	struct raid_softc *rs;
    591 	struct disklabel *lp;
    592 	int     part, unit, omask, size;
    593 
    594 	unit = raidunit(dev);
    595 	if ((rs = raidget(unit, false)) == NULL)
    596 		return -1;
    597 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    598 		return (-1);
    599 
    600 	part = DISKPART(dev);
    601 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    602 	lp = rs->sc_dkdev.dk_label;
    603 
    604 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    605 		return (-1);
    606 
    607 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    608 		size = -1;
    609 	else
    610 		size = lp->d_partitions[part].p_size *
    611 		    (lp->d_secsize / DEV_BSIZE);
    612 
    613 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    614 		return (-1);
    615 
    616 	return (size);
    617 
    618 }
    619 
/*
 * bdevsw d_dump entry point: write a kernel crash dump to the array.
 * Only RAID 1 sets (1 data + 1 parity column) are supported: a single
 * live or spared component is selected and the dump is passed straight
 * through to that component's own d_dump routine.  Returns 0 or errno.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps are always in whole DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse a dump that would run past the end of the array */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which original column
			   (if any) this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
    755 
/*
 * Open entry point (both bdevsw and cdevsw).  Creates the softc on
 * first reference, validates the requested partition, reads the
 * disklabel on the first open of a configured unit, and records the
 * open in the per-format open masks so the unit cannot be
 * unconfigured while in use.  Returns 0 or errno.
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	/* create == true: opening allocates the unit if it doesn't exist */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens of a unit that is being torn down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of a configured, unwedged unit: (re)read the label */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_nwedges == 0) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	/*
	 * dk_openmask still holds its pre-open value here (it is only
	 * recomputed below), so this detects the very first opener.
	 */
	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
    841 
/*
 * Close entry point (both bdevsw and cdevsw).  Clears the partition
 * from the per-format open masks; on the last close of a configured
 * unit the component labels are marked clean, and on the last close
 * of a unit flagged for shutdown the pseudo-device is detached and
 * its resources freed.  Returns 0 or errno from config_detach().
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);
	}
	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)) {
		/*
		 * Detach this raid unit
		 */
		cfdata_t cf = NULL;
		int retcode = 0;

		if (rs->sc_dev != NULL) {
			/* grab the cfdata before detach so it can be freed */
			cf = device_cfdata(rs->sc_dev);

			/* drop our lock before calling into autoconf */
			raidunlock(rs);
			retcode = config_detach(rs->sc_dev, DETACH_QUIET);
			if (retcode == 0)
				/* free the pseudo device attach bits */
				free(cf, M_RAIDFRAME);
		} else {
			/* never attached: just unlink and free the softc */
			raidput(rs);
		}
		return retcode;
	}

	raidunlock(rs);
	return (0);
}
    907 
    908 static void
    909 raidstrategy(struct buf *bp)
    910 {
    911 	unsigned int unit = raidunit(bp->b_dev);
    912 	RF_Raid_t *raidPtr;
    913 	int     wlabel;
    914 	struct raid_softc *rs;
    915 
    916 	if ((rs = raidget(unit, false)) == NULL) {
    917 		bp->b_error = ENXIO;
    918 		goto done;
    919 	}
    920 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    921 		bp->b_error = ENXIO;
    922 		goto done;
    923 	}
    924 	raidPtr = &rs->sc_r;
    925 	if (!raidPtr->valid) {
    926 		bp->b_error = ENODEV;
    927 		goto done;
    928 	}
    929 	if (bp->b_bcount == 0) {
    930 		db1_printf(("b_bcount is zero..\n"));
    931 		goto done;
    932 	}
    933 
    934 	/*
    935 	 * Do bounds checking and adjust transfer.  If there's an
    936 	 * error, the bounds check will flag that for us.
    937 	 */
    938 
    939 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    940 	if (DISKPART(bp->b_dev) == RAW_PART) {
    941 		uint64_t size; /* device size in DEV_BSIZE unit */
    942 
    943 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    944 			size = raidPtr->totalSectors <<
    945 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    946 		} else {
    947 			size = raidPtr->totalSectors >>
    948 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    949 		}
    950 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    951 			goto done;
    952 		}
    953 	} else {
    954 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    955 			db1_printf(("Bounds check failed!!:%d %d\n",
    956 				(int) bp->b_blkno, (int) wlabel));
    957 			goto done;
    958 		}
    959 	}
    960 
    961 	rf_lock_mutex2(raidPtr->iodone_lock);
    962 
    963 	bp->b_resid = 0;
    964 
    965 	/* stuff it onto our queue */
    966 	bufq_put(rs->buf_queue, bp);
    967 
    968 	/* scheduled the IO to happen at the next convenient time */
    969 	rf_signal_cond2(raidPtr->iodone_cv);
    970 	rf_unlock_mutex2(raidPtr->iodone_lock);
    971 
    972 	return;
    973 
    974 done:
    975 	bp->b_resid = bp->b_bcount;
    976 	biodone(bp);
    977 }
    978 
    979 /* ARGSUSED */
    980 static int
    981 raidread(dev_t dev, struct uio *uio, int flags)
    982 {
    983 	int     unit = raidunit(dev);
    984 	struct raid_softc *rs;
    985 
    986 	if ((rs = raidget(unit, false)) == NULL)
    987 		return ENXIO;
    988 
    989 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    990 		return (ENXIO);
    991 
    992 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    993 
    994 }
    995 
    996 /* ARGSUSED */
    997 static int
    998 raidwrite(dev_t dev, struct uio *uio, int flags)
    999 {
   1000 	int     unit = raidunit(dev);
   1001 	struct raid_softc *rs;
   1002 
   1003 	if ((rs = raidget(unit, false)) == NULL)
   1004 		return ENXIO;
   1005 
   1006 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1007 		return (ENXIO);
   1008 
   1009 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1010 
   1011 }
   1012 
   1013 static int
   1014 raid_detach_unlocked(struct raid_softc *rs)
   1015 {
   1016 	int error;
   1017 	RF_Raid_t *raidPtr;
   1018 
   1019 	raidPtr = &rs->sc_r;
   1020 
   1021 	/*
   1022 	 * If somebody has a partition mounted, we shouldn't
   1023 	 * shutdown.
   1024 	 */
   1025 	if (rs->sc_dkdev.dk_openmask != 0)
   1026 		return EBUSY;
   1027 
   1028 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1029 		return 0;
   1030 
   1031 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1032 
   1033 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1034 		return error;
   1035 
   1036 	/* Detach the disk. */
   1037 	dkwedge_delall(&rs->sc_dkdev);
   1038 	disk_detach(&rs->sc_dkdev);
   1039 	disk_destroy(&rs->sc_dkdev);
   1040 
   1041 	rs->sc_flags &= ~RAIDF_INITED;
   1042 
   1043 	/* Free the softc */
   1044 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1045 
   1046 	return 0;
   1047 }
   1048 
   1049 static int
   1050 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1051 {
   1052 	int     unit = raidunit(dev);
   1053 	int     error = 0;
   1054 	int     part, pmask, s;
   1055 	cfdata_t cf;
   1056 	struct raid_softc *rs;
   1057 	RF_Config_t *k_cfg, *u_cfg;
   1058 	RF_Raid_t *raidPtr;
   1059 	RF_RaidDisk_t *diskPtr;
   1060 	RF_AccTotals_t *totals;
   1061 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1062 	u_char *specific_buf;
   1063 	int retcode = 0;
   1064 	int column;
   1065 /*	int raidid; */
   1066 	struct rf_recon_req *rrcopy, *rr;
   1067 	RF_ComponentLabel_t *clabel;
   1068 	RF_ComponentLabel_t *ci_label;
   1069 	RF_ComponentLabel_t **clabel_ptr;
   1070 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1071 	RF_SingleComponent_t component;
   1072 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1073 	int i, j, d;
   1074 #ifdef __HAVE_OLD_DISKLABEL
   1075 	struct disklabel newlabel;
   1076 #endif
   1077 
   1078 	if ((rs = raidget(unit, false)) == NULL)
   1079 		return ENXIO;
   1080 	raidPtr = &rs->sc_r;
   1081 
   1082 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1083 		(int) DISKPART(dev), (int) unit, cmd));
   1084 
   1085 	/* Must be open for writes for these commands... */
   1086 	switch (cmd) {
   1087 #ifdef DIOCGSECTORSIZE
   1088 	case DIOCGSECTORSIZE:
   1089 		*(u_int *)data = raidPtr->bytesPerSector;
   1090 		return 0;
   1091 	case DIOCGMEDIASIZE:
   1092 		*(off_t *)data =
   1093 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1094 		return 0;
   1095 #endif
   1096 	case DIOCSDINFO:
   1097 	case DIOCWDINFO:
   1098 #ifdef __HAVE_OLD_DISKLABEL
   1099 	case ODIOCWDINFO:
   1100 	case ODIOCSDINFO:
   1101 #endif
   1102 	case DIOCWLABEL:
   1103 	case DIOCAWEDGE:
   1104 	case DIOCDWEDGE:
   1105 	case DIOCMWEDGES:
   1106 	case DIOCSSTRATEGY:
   1107 		if ((flag & FWRITE) == 0)
   1108 			return (EBADF);
   1109 	}
   1110 
   1111 	/* Must be initialized for these... */
   1112 	switch (cmd) {
   1113 	case DIOCGDINFO:
   1114 	case DIOCSDINFO:
   1115 	case DIOCWDINFO:
   1116 #ifdef __HAVE_OLD_DISKLABEL
   1117 	case ODIOCGDINFO:
   1118 	case ODIOCWDINFO:
   1119 	case ODIOCSDINFO:
   1120 	case ODIOCGDEFLABEL:
   1121 #endif
   1122 	case DIOCGPARTINFO:
   1123 	case DIOCWLABEL:
   1124 	case DIOCGDEFLABEL:
   1125 	case DIOCAWEDGE:
   1126 	case DIOCDWEDGE:
   1127 	case DIOCLWEDGES:
   1128 	case DIOCMWEDGES:
   1129 	case DIOCCACHESYNC:
   1130 	case RAIDFRAME_SHUTDOWN:
   1131 	case RAIDFRAME_REWRITEPARITY:
   1132 	case RAIDFRAME_GET_INFO:
   1133 	case RAIDFRAME_RESET_ACCTOTALS:
   1134 	case RAIDFRAME_GET_ACCTOTALS:
   1135 	case RAIDFRAME_KEEP_ACCTOTALS:
   1136 	case RAIDFRAME_GET_SIZE:
   1137 	case RAIDFRAME_FAIL_DISK:
   1138 	case RAIDFRAME_COPYBACK:
   1139 	case RAIDFRAME_CHECK_RECON_STATUS:
   1140 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1141 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1142 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1143 	case RAIDFRAME_ADD_HOT_SPARE:
   1144 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1145 	case RAIDFRAME_INIT_LABELS:
   1146 	case RAIDFRAME_REBUILD_IN_PLACE:
   1147 	case RAIDFRAME_CHECK_PARITY:
   1148 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1149 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1150 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1151 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1152 	case RAIDFRAME_SET_AUTOCONFIG:
   1153 	case RAIDFRAME_SET_ROOT:
   1154 	case RAIDFRAME_DELETE_COMPONENT:
   1155 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1156 	case RAIDFRAME_PARITYMAP_STATUS:
   1157 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1158 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1159 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1160 	case DIOCGSTRATEGY:
   1161 	case DIOCSSTRATEGY:
   1162 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1163 			return (ENXIO);
   1164 	}
   1165 
   1166 	switch (cmd) {
   1167 #ifdef COMPAT_50
   1168 	case RAIDFRAME_GET_INFO50:
   1169 		return rf_get_info50(raidPtr, data);
   1170 
   1171 	case RAIDFRAME_CONFIGURE50:
   1172 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1173 			return retcode;
   1174 		goto config;
   1175 #endif
   1176 		/* configure the system */
   1177 	case RAIDFRAME_CONFIGURE:
   1178 
   1179 		if (raidPtr->valid) {
   1180 			/* There is a valid RAID set running on this unit! */
   1181 			printf("raid%d: Device already configured!\n",unit);
   1182 			return(EINVAL);
   1183 		}
   1184 
   1185 		/* copy-in the configuration information */
   1186 		/* data points to a pointer to the configuration structure */
   1187 
   1188 		u_cfg = *((RF_Config_t **) data);
   1189 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1190 		if (k_cfg == NULL) {
   1191 			return (ENOMEM);
   1192 		}
   1193 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1194 		if (retcode) {
   1195 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1196 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1197 				retcode));
   1198 			goto no_config;
   1199 		}
   1200 		goto config;
   1201 	config:
   1202 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1203 
   1204 		/* allocate a buffer for the layout-specific data, and copy it
   1205 		 * in */
   1206 		if (k_cfg->layoutSpecificSize) {
   1207 			if (k_cfg->layoutSpecificSize > 10000) {
   1208 				/* sanity check */
   1209 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1210 				retcode = EINVAL;
   1211 				goto no_config;
   1212 			}
   1213 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1214 			    (u_char *));
   1215 			if (specific_buf == NULL) {
   1216 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1217 				retcode = ENOMEM;
   1218 				goto no_config;
   1219 			}
   1220 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1221 			    k_cfg->layoutSpecificSize);
   1222 			if (retcode) {
   1223 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1224 				RF_Free(specific_buf,
   1225 					k_cfg->layoutSpecificSize);
   1226 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1227 					retcode));
   1228 				goto no_config;
   1229 			}
   1230 		} else
   1231 			specific_buf = NULL;
   1232 		k_cfg->layoutSpecific = specific_buf;
   1233 
   1234 		/* should do some kind of sanity check on the configuration.
   1235 		 * Store the sum of all the bytes in the last byte? */
   1236 
   1237 		/* configure the system */
   1238 
   1239 		/*
   1240 		 * Clear the entire RAID descriptor, just to make sure
   1241 		 *  there is no stale data left in the case of a
   1242 		 *  reconfiguration
   1243 		 */
   1244 		memset(raidPtr, 0, sizeof(*raidPtr));
   1245 		raidPtr->softc = rs;
   1246 		raidPtr->raidid = unit;
   1247 
   1248 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1249 
   1250 		if (retcode == 0) {
   1251 
   1252 			/* allow this many simultaneous IO's to
   1253 			   this RAID device */
   1254 			raidPtr->openings = RAIDOUTSTANDING;
   1255 
   1256 			raidinit(rs);
   1257 			rf_markalldirty(raidPtr);
   1258 		}
   1259 		/* free the buffers.  No return code here. */
   1260 		if (k_cfg->layoutSpecificSize) {
   1261 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1262 		}
   1263 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1264 
   1265 	no_config:
   1266 		/*
   1267 		 * If configuration failed, set sc_flags so that we
   1268 		 * will detach the device when we close it.
   1269 		 */
   1270 		if (retcode != 0)
   1271 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1272 		return (retcode);
   1273 
   1274 		/* shutdown the system */
   1275 	case RAIDFRAME_SHUTDOWN:
   1276 
   1277 		part = DISKPART(dev);
   1278 		pmask = (1 << part);
   1279 
   1280 		if ((error = raidlock(rs)) != 0)
   1281 			return (error);
   1282 
   1283 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1284 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1285 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1286 			retcode = EBUSY;
   1287 		else {
   1288 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1289 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1290 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1291 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1292 			retcode = 0;
   1293 		}
   1294 
   1295 		raidunlock(rs);
   1296 
   1297 		if (retcode != 0)
   1298 			return retcode;
   1299 
   1300 		/* free the pseudo device attach bits */
   1301 
   1302 		cf = device_cfdata(rs->sc_dev);
   1303 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1304 			free(cf, M_RAIDFRAME);
   1305 
   1306 		return (retcode);
   1307 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1308 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1309 		/* need to read the component label for the disk indicated
   1310 		   by row,column in clabel */
   1311 
   1312 		/*
   1313 		 * Perhaps there should be an option to skip the in-core
   1314 		 * copy and hit the disk, as with disklabel(8).
   1315 		 */
   1316 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1317 
   1318 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1319 
   1320 		if (retcode) {
   1321 			RF_Free(clabel, sizeof(*clabel));
   1322 			return retcode;
   1323 		}
   1324 
   1325 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1326 
   1327 		column = clabel->column;
   1328 
   1329 		if ((column < 0) || (column >= raidPtr->numCol +
   1330 		    raidPtr->numSpare)) {
   1331 			RF_Free(clabel, sizeof(*clabel));
   1332 			return EINVAL;
   1333 		}
   1334 
   1335 		RF_Free(clabel, sizeof(*clabel));
   1336 
   1337 		clabel = raidget_component_label(raidPtr, column);
   1338 
   1339 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1340 
   1341 #if 0
   1342 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1343 		clabel = (RF_ComponentLabel_t *) data;
   1344 
   1345 		/* XXX check the label for valid stuff... */
   1346 		/* Note that some things *should not* get modified --
   1347 		   the user should be re-initing the labels instead of
   1348 		   trying to patch things.
   1349 		   */
   1350 
   1351 		raidid = raidPtr->raidid;
   1352 #ifdef DEBUG
   1353 		printf("raid%d: Got component label:\n", raidid);
   1354 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1355 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1356 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1357 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1358 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1359 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1360 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1361 #endif
   1362 		clabel->row = 0;
   1363 		column = clabel->column;
   1364 
   1365 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1366 			return(EINVAL);
   1367 		}
   1368 
   1369 		/* XXX this isn't allowed to do anything for now :-) */
   1370 
   1371 		/* XXX and before it is, we need to fill in the rest
   1372 		   of the fields!?!?!?! */
   1373 		memcpy(raidget_component_label(raidPtr, column),
   1374 		    clabel, sizeof(*clabel));
   1375 		raidflush_component_label(raidPtr, column);
   1376 		return (0);
   1377 #endif
   1378 
   1379 	case RAIDFRAME_INIT_LABELS:
   1380 		clabel = (RF_ComponentLabel_t *) data;
   1381 		/*
   1382 		   we only want the serial number from
   1383 		   the above.  We get all the rest of the information
   1384 		   from the config that was used to create this RAID
   1385 		   set.
   1386 		   */
   1387 
   1388 		raidPtr->serial_number = clabel->serial_number;
   1389 
   1390 		for(column=0;column<raidPtr->numCol;column++) {
   1391 			diskPtr = &raidPtr->Disks[column];
   1392 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1393 				ci_label = raidget_component_label(raidPtr,
   1394 				    column);
   1395 				/* Zeroing this is important. */
   1396 				memset(ci_label, 0, sizeof(*ci_label));
   1397 				raid_init_component_label(raidPtr, ci_label);
   1398 				ci_label->serial_number =
   1399 				    raidPtr->serial_number;
   1400 				ci_label->row = 0; /* we dont' pretend to support more */
   1401 				rf_component_label_set_partitionsize(ci_label,
   1402 				    diskPtr->partitionSize);
   1403 				ci_label->column = column;
   1404 				raidflush_component_label(raidPtr, column);
   1405 			}
   1406 			/* XXXjld what about the spares? */
   1407 		}
   1408 
   1409 		return (retcode);
   1410 	case RAIDFRAME_SET_AUTOCONFIG:
   1411 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1412 		printf("raid%d: New autoconfig value is: %d\n",
   1413 		       raidPtr->raidid, d);
   1414 		*(int *) data = d;
   1415 		return (retcode);
   1416 
   1417 	case RAIDFRAME_SET_ROOT:
   1418 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1419 		printf("raid%d: New rootpartition value is: %d\n",
   1420 		       raidPtr->raidid, d);
   1421 		*(int *) data = d;
   1422 		return (retcode);
   1423 
   1424 		/* initialize all parity */
   1425 	case RAIDFRAME_REWRITEPARITY:
   1426 
   1427 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1428 			/* Parity for RAID 0 is trivially correct */
   1429 			raidPtr->parity_good = RF_RAID_CLEAN;
   1430 			return(0);
   1431 		}
   1432 
   1433 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1434 			/* Re-write is already in progress! */
   1435 			return(EINVAL);
   1436 		}
   1437 
   1438 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1439 					   rf_RewriteParityThread,
   1440 					   raidPtr,"raid_parity");
   1441 		return (retcode);
   1442 
   1443 
   1444 	case RAIDFRAME_ADD_HOT_SPARE:
   1445 		sparePtr = (RF_SingleComponent_t *) data;
   1446 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1447 		retcode = rf_add_hot_spare(raidPtr, &component);
   1448 		return(retcode);
   1449 
   1450 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1451 		return(retcode);
   1452 
   1453 	case RAIDFRAME_DELETE_COMPONENT:
   1454 		componentPtr = (RF_SingleComponent_t *)data;
   1455 		memcpy( &component, componentPtr,
   1456 			sizeof(RF_SingleComponent_t));
   1457 		retcode = rf_delete_component(raidPtr, &component);
   1458 		return(retcode);
   1459 
   1460 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1461 		componentPtr = (RF_SingleComponent_t *)data;
   1462 		memcpy( &component, componentPtr,
   1463 			sizeof(RF_SingleComponent_t));
   1464 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1465 		return(retcode);
   1466 
   1467 	case RAIDFRAME_REBUILD_IN_PLACE:
   1468 
   1469 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1470 			/* Can't do this on a RAID 0!! */
   1471 			return(EINVAL);
   1472 		}
   1473 
   1474 		if (raidPtr->recon_in_progress == 1) {
   1475 			/* a reconstruct is already in progress! */
   1476 			return(EINVAL);
   1477 		}
   1478 
   1479 		componentPtr = (RF_SingleComponent_t *) data;
   1480 		memcpy( &component, componentPtr,
   1481 			sizeof(RF_SingleComponent_t));
   1482 		component.row = 0; /* we don't support any more */
   1483 		column = component.column;
   1484 
   1485 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1486 			return(EINVAL);
   1487 		}
   1488 
   1489 		rf_lock_mutex2(raidPtr->mutex);
   1490 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1491 		    (raidPtr->numFailures > 0)) {
   1492 			/* XXX 0 above shouldn't be constant!!! */
   1493 			/* some component other than this has failed.
   1494 			   Let's not make things worse than they already
   1495 			   are... */
   1496 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1497 			       raidPtr->raidid);
   1498 			printf("raid%d:     Col: %d   Too many failures.\n",
   1499 			       raidPtr->raidid, column);
   1500 			rf_unlock_mutex2(raidPtr->mutex);
   1501 			return (EINVAL);
   1502 		}
   1503 		if (raidPtr->Disks[column].status ==
   1504 		    rf_ds_reconstructing) {
   1505 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1506 			       raidPtr->raidid);
   1507 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1508 
   1509 			rf_unlock_mutex2(raidPtr->mutex);
   1510 			return (EINVAL);
   1511 		}
   1512 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1513 			rf_unlock_mutex2(raidPtr->mutex);
   1514 			return (EINVAL);
   1515 		}
   1516 		rf_unlock_mutex2(raidPtr->mutex);
   1517 
   1518 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1519 		if (rrcopy == NULL)
   1520 			return(ENOMEM);
   1521 
   1522 		rrcopy->raidPtr = (void *) raidPtr;
   1523 		rrcopy->col = column;
   1524 
   1525 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1526 					   rf_ReconstructInPlaceThread,
   1527 					   rrcopy,"raid_reconip");
   1528 		return(retcode);
   1529 
   1530 	case RAIDFRAME_GET_INFO:
   1531 		if (!raidPtr->valid)
   1532 			return (ENODEV);
   1533 		ucfgp = (RF_DeviceConfig_t **) data;
   1534 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1535 			  (RF_DeviceConfig_t *));
   1536 		if (d_cfg == NULL)
   1537 			return (ENOMEM);
   1538 		d_cfg->rows = 1; /* there is only 1 row now */
   1539 		d_cfg->cols = raidPtr->numCol;
   1540 		d_cfg->ndevs = raidPtr->numCol;
   1541 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1542 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1543 			return (ENOMEM);
   1544 		}
   1545 		d_cfg->nspares = raidPtr->numSpare;
   1546 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1547 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1548 			return (ENOMEM);
   1549 		}
   1550 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1551 		d = 0;
   1552 		for (j = 0; j < d_cfg->cols; j++) {
   1553 			d_cfg->devs[d] = raidPtr->Disks[j];
   1554 			d++;
   1555 		}
   1556 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1557 			d_cfg->spares[i] = raidPtr->Disks[j];
   1558 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1559 				/* XXX: raidctl(8) expects to see this as a used spare */
   1560 				d_cfg->spares[i].status = rf_ds_used_spare;
   1561 			}
   1562 		}
   1563 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1564 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1565 
   1566 		return (retcode);
   1567 
   1568 	case RAIDFRAME_CHECK_PARITY:
   1569 		*(int *) data = raidPtr->parity_good;
   1570 		return (0);
   1571 
   1572 	case RAIDFRAME_PARITYMAP_STATUS:
   1573 		if (rf_paritymap_ineligible(raidPtr))
   1574 			return EINVAL;
   1575 		rf_paritymap_status(raidPtr->parity_map,
   1576 		    (struct rf_pmstat *)data);
   1577 		return 0;
   1578 
   1579 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1580 		if (rf_paritymap_ineligible(raidPtr))
   1581 			return EINVAL;
   1582 		if (raidPtr->parity_map == NULL)
   1583 			return ENOENT; /* ??? */
   1584 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1585 			(struct rf_pmparams *)data, 1))
   1586 			return EINVAL;
   1587 		return 0;
   1588 
   1589 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1590 		if (rf_paritymap_ineligible(raidPtr))
   1591 			return EINVAL;
   1592 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1593 		return 0;
   1594 
   1595 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1596 		if (rf_paritymap_ineligible(raidPtr))
   1597 			return EINVAL;
   1598 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1599 		/* XXX should errors be passed up? */
   1600 		return 0;
   1601 
   1602 	case RAIDFRAME_RESET_ACCTOTALS:
   1603 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1604 		return (0);
   1605 
   1606 	case RAIDFRAME_GET_ACCTOTALS:
   1607 		totals = (RF_AccTotals_t *) data;
   1608 		*totals = raidPtr->acc_totals;
   1609 		return (0);
   1610 
   1611 	case RAIDFRAME_KEEP_ACCTOTALS:
   1612 		raidPtr->keep_acc_totals = *(int *)data;
   1613 		return (0);
   1614 
   1615 	case RAIDFRAME_GET_SIZE:
   1616 		*(int *) data = raidPtr->totalSectors;
   1617 		return (0);
   1618 
   1619 		/* fail a disk & optionally start reconstruction */
   1620 	case RAIDFRAME_FAIL_DISK:
   1621 
   1622 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1623 			/* Can't do this on a RAID 0!! */
   1624 			return(EINVAL);
   1625 		}
   1626 
   1627 		rr = (struct rf_recon_req *) data;
   1628 		rr->row = 0;
   1629 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1630 			return (EINVAL);
   1631 
   1632 
   1633 		rf_lock_mutex2(raidPtr->mutex);
   1634 		if (raidPtr->status == rf_rs_reconstructing) {
   1635 			/* you can't fail a disk while we're reconstructing! */
   1636 			/* XXX wrong for RAID6 */
   1637 			rf_unlock_mutex2(raidPtr->mutex);
   1638 			return (EINVAL);
   1639 		}
   1640 		if ((raidPtr->Disks[rr->col].status ==
   1641 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1642 			/* some other component has failed.  Let's not make
   1643 			   things worse. XXX wrong for RAID6 */
   1644 			rf_unlock_mutex2(raidPtr->mutex);
   1645 			return (EINVAL);
   1646 		}
   1647 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1648 			/* Can't fail a spared disk! */
   1649 			rf_unlock_mutex2(raidPtr->mutex);
   1650 			return (EINVAL);
   1651 		}
   1652 		rf_unlock_mutex2(raidPtr->mutex);
   1653 
   1654 		/* make a copy of the recon request so that we don't rely on
   1655 		 * the user's buffer */
   1656 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1657 		if (rrcopy == NULL)
   1658 			return(ENOMEM);
   1659 		memcpy(rrcopy, rr, sizeof(*rr));
   1660 		rrcopy->raidPtr = (void *) raidPtr;
   1661 
   1662 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1663 					   rf_ReconThread,
   1664 					   rrcopy,"raid_recon");
   1665 		return (0);
   1666 
   1667 		/* invoke a copyback operation after recon on whatever disk
   1668 		 * needs it, if any */
   1669 	case RAIDFRAME_COPYBACK:
   1670 
   1671 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1672 			/* This makes no sense on a RAID 0!! */
   1673 			return(EINVAL);
   1674 		}
   1675 
   1676 		if (raidPtr->copyback_in_progress == 1) {
   1677 			/* Copyback is already in progress! */
   1678 			return(EINVAL);
   1679 		}
   1680 
   1681 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1682 					   rf_CopybackThread,
   1683 					   raidPtr,"raid_copyback");
   1684 		return (retcode);
   1685 
   1686 		/* return the percentage completion of reconstruction */
   1687 	case RAIDFRAME_CHECK_RECON_STATUS:
   1688 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1689 			/* This makes no sense on a RAID 0, so tell the
   1690 			   user it's done. */
   1691 			*(int *) data = 100;
   1692 			return(0);
   1693 		}
   1694 		if (raidPtr->status != rf_rs_reconstructing)
   1695 			*(int *) data = 100;
   1696 		else {
   1697 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1698 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1699 			} else {
   1700 				*(int *) data = 0;
   1701 			}
   1702 		}
   1703 		return (0);
   1704 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1705 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1706 		if (raidPtr->status != rf_rs_reconstructing) {
   1707 			progressInfo.remaining = 0;
   1708 			progressInfo.completed = 100;
   1709 			progressInfo.total = 100;
   1710 		} else {
   1711 			progressInfo.total =
   1712 				raidPtr->reconControl->numRUsTotal;
   1713 			progressInfo.completed =
   1714 				raidPtr->reconControl->numRUsComplete;
   1715 			progressInfo.remaining = progressInfo.total -
   1716 				progressInfo.completed;
   1717 		}
   1718 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1719 				  sizeof(RF_ProgressInfo_t));
   1720 		return (retcode);
   1721 
   1722 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1723 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1724 			/* This makes no sense on a RAID 0, so tell the
   1725 			   user it's done. */
   1726 			*(int *) data = 100;
   1727 			return(0);
   1728 		}
   1729 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1730 			*(int *) data = 100 *
   1731 				raidPtr->parity_rewrite_stripes_done /
   1732 				raidPtr->Layout.numStripe;
   1733 		} else {
   1734 			*(int *) data = 100;
   1735 		}
   1736 		return (0);
   1737 
   1738 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1739 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1740 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1741 			progressInfo.total = raidPtr->Layout.numStripe;
   1742 			progressInfo.completed =
   1743 				raidPtr->parity_rewrite_stripes_done;
   1744 			progressInfo.remaining = progressInfo.total -
   1745 				progressInfo.completed;
   1746 		} else {
   1747 			progressInfo.remaining = 0;
   1748 			progressInfo.completed = 100;
   1749 			progressInfo.total = 100;
   1750 		}
   1751 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1752 				  sizeof(RF_ProgressInfo_t));
   1753 		return (retcode);
   1754 
   1755 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1756 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1757 			/* This makes no sense on a RAID 0 */
   1758 			*(int *) data = 100;
   1759 			return(0);
   1760 		}
   1761 		if (raidPtr->copyback_in_progress == 1) {
   1762 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1763 				raidPtr->Layout.numStripe;
   1764 		} else {
   1765 			*(int *) data = 100;
   1766 		}
   1767 		return (0);
   1768 
   1769 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1770 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1771 		if (raidPtr->copyback_in_progress == 1) {
   1772 			progressInfo.total = raidPtr->Layout.numStripe;
   1773 			progressInfo.completed =
   1774 				raidPtr->copyback_stripes_done;
   1775 			progressInfo.remaining = progressInfo.total -
   1776 				progressInfo.completed;
   1777 		} else {
   1778 			progressInfo.remaining = 0;
   1779 			progressInfo.completed = 100;
   1780 			progressInfo.total = 100;
   1781 		}
   1782 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1783 				  sizeof(RF_ProgressInfo_t));
   1784 		return (retcode);
   1785 
   1786 		/* the sparetable daemon calls this to wait for the kernel to
   1787 		 * need a spare table. this ioctl does not return until a
   1788 		 * spare table is needed. XXX -- calling mpsleep here in the
   1789 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1790 		 * -- I should either compute the spare table in the kernel,
   1791 		 * or have a different -- XXX XXX -- interface (a different
   1792 		 * character device) for delivering the table     -- XXX */
   1793 #if 0
   1794 	case RAIDFRAME_SPARET_WAIT:
   1795 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1796 		while (!rf_sparet_wait_queue)
   1797 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1798 		waitreq = rf_sparet_wait_queue;
   1799 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1800 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1801 
   1802 		/* structure assignment */
   1803 		*((RF_SparetWait_t *) data) = *waitreq;
   1804 
   1805 		RF_Free(waitreq, sizeof(*waitreq));
   1806 		return (0);
   1807 
   1808 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1809 		 * code in it that will cause the dameon to exit */
   1810 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1811 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1812 		waitreq->fcol = -1;
   1813 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1814 		waitreq->next = rf_sparet_wait_queue;
   1815 		rf_sparet_wait_queue = waitreq;
   1816 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1817 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1818 		return (0);
   1819 
   1820 		/* used by the spare table daemon to deliver a spare table
   1821 		 * into the kernel */
   1822 	case RAIDFRAME_SEND_SPARET:
   1823 
   1824 		/* install the spare table */
   1825 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1826 
   1827 		/* respond to the requestor.  the return status of the spare
   1828 		 * table installation is passed in the "fcol" field */
   1829 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1830 		waitreq->fcol = retcode;
   1831 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1832 		waitreq->next = rf_sparet_resp_queue;
   1833 		rf_sparet_resp_queue = waitreq;
   1834 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1835 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1836 
   1837 		return (retcode);
   1838 #endif
   1839 
   1840 	default:
   1841 		break; /* fall through to the os-specific code below */
   1842 
   1843 	}
   1844 
   1845 	if (!raidPtr->valid)
   1846 		return (EINVAL);
   1847 
   1848 	/*
   1849 	 * Add support for "regular" device ioctls here.
   1850 	 */
   1851 
   1852 	error = disk_ioctl(&rs->sc_dkdev, dev, cmd, data, flag, l);
   1853 	if (error != EPASSTHROUGH)
   1854 		return (error);
   1855 
   1856 	switch (cmd) {
   1857 	case DIOCWDINFO:
   1858 	case DIOCSDINFO:
   1859 #ifdef __HAVE_OLD_DISKLABEL
   1860 	case ODIOCWDINFO:
   1861 	case ODIOCSDINFO:
   1862 #endif
   1863 	{
   1864 		struct disklabel *lp;
   1865 #ifdef __HAVE_OLD_DISKLABEL
   1866 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1867 			memset(&newlabel, 0, sizeof newlabel);
   1868 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1869 			lp = &newlabel;
   1870 		} else
   1871 #endif
   1872 		lp = (struct disklabel *)data;
   1873 
   1874 		if ((error = raidlock(rs)) != 0)
   1875 			return (error);
   1876 
   1877 		rs->sc_flags |= RAIDF_LABELLING;
   1878 
   1879 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1880 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1881 		if (error == 0) {
   1882 			if (cmd == DIOCWDINFO
   1883 #ifdef __HAVE_OLD_DISKLABEL
   1884 			    || cmd == ODIOCWDINFO
   1885 #endif
   1886 			   )
   1887 				error = writedisklabel(RAIDLABELDEV(dev),
   1888 				    raidstrategy, rs->sc_dkdev.dk_label,
   1889 				    rs->sc_dkdev.dk_cpulabel);
   1890 		}
   1891 		rs->sc_flags &= ~RAIDF_LABELLING;
   1892 
   1893 		raidunlock(rs);
   1894 
   1895 		if (error)
   1896 			return (error);
   1897 		break;
   1898 	}
   1899 
   1900 	case DIOCWLABEL:
   1901 		if (*(int *) data != 0)
   1902 			rs->sc_flags |= RAIDF_WLABEL;
   1903 		else
   1904 			rs->sc_flags &= ~RAIDF_WLABEL;
   1905 		break;
   1906 
   1907 	case DIOCGDEFLABEL:
   1908 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1909 		break;
   1910 
   1911 #ifdef __HAVE_OLD_DISKLABEL
   1912 	case ODIOCGDEFLABEL:
   1913 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1914 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1915 			return ENOTTY;
   1916 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1917 		break;
   1918 #endif
   1919 
   1920 	case DIOCCACHESYNC:
   1921 		return rf_sync_component_caches(raidPtr);
   1922 
   1923 	case DIOCGSTRATEGY:
   1924 	    {
   1925 		struct disk_strategy *dks = (void *)data;
   1926 
   1927 		s = splbio();
   1928 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1929 		    sizeof(dks->dks_name));
   1930 		splx(s);
   1931 		dks->dks_paramlen = 0;
   1932 
   1933 		return 0;
   1934 	    }
   1935 
   1936 	case DIOCSSTRATEGY:
   1937 	    {
   1938 		struct disk_strategy *dks = (void *)data;
   1939 		struct bufq_state *new;
   1940 		struct bufq_state *old;
   1941 
   1942 		if (dks->dks_param != NULL) {
   1943 			return EINVAL;
   1944 		}
   1945 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1946 		error = bufq_alloc(&new, dks->dks_name,
   1947 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1948 		if (error) {
   1949 			return error;
   1950 		}
   1951 		s = splbio();
   1952 		old = rs->buf_queue;
   1953 		bufq_move(new, old);
   1954 		rs->buf_queue = new;
   1955 		splx(s);
   1956 		bufq_free(old);
   1957 
   1958 		return 0;
   1959 	    }
   1960 
   1961 	default:
   1962 		retcode = ENOTTY;
   1963 	}
   1964 	return (retcode);
   1965 
   1966 }
   1967 
   1968 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, set up the disk(9)
   structures, and probe for wedges.  Assumes rs->sc_r has already
   been configured by the caller.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device; build a cfdata record by hand since
	 * there is no config(5)-generated one for pseudo-devices */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* attach failed: undo the INITED flag and release cfdata */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	rf_set_geometry(rs, raidPtr);

	/* look for wedges (e.g. GPT partitions) on the new device */
	dkwedge_discover(&rs->sc_dkdev);

}
   2023 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2024 /* wake up the daemon & tell it to get us a spare table
   2025  * XXX
   2026  * the entries in the queues should be tagged with the raidPtr
   2027  * so that in the extremely rare case that two recons happen at once,
   2028  * we know for which device were requesting a spare table
   2029  * XXX
   2030  *
   2031  * XXX This code is not currently used. GO
   2032  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* post the request on the wait queue and wake the daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 drops the mutex while asleep and reacquires it */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* dequeue the daemon's response */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* fcol carries the daemon's status/result code */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2056 #endif
   2057 
   2058 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2059  * bp & passes it down.
   2060  * any calls originating in the kernel must use non-blocking I/O
   2061  * do some extra sanity checking to return "appropriate" error values for
   2062  * certain conditions (to make some standard utilities work)
   2063  *
   2064  * Formerly known as: rf_DoAccessKernel
   2065  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex: rf_update_component_labels does its own
		 * locking and component-label I/O */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* loop invariant: raidPtr->mutex is held when the `openings'
	 * condition is tested, and released while a buffer is processed */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do (mutex already released) */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert DEV_BSIZE units to RAID sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* reject requests past the end of the set; the `sum <'
		 * comparisons also catch arithmetic wrap-around */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that aren't a multiple of the sector size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this access */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* the access was never started; fail the buffer here */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2183 
   2184 
   2185 
   2186 
   2187 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2188 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* map RAIDframe I/O type to buf(9) direction flag */
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately via the normal callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp for the transfer; KernelWakeupFunc will run as
		 * the biodone callback when the component I/O completes */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2262 /* this is the callback function associated with a I/O invoked from
   2263    kernel code.
   2264  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the originating request was stashed in b_private by InitBP */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the physical I/O time against this access's trace record */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is noticed by raidstart, which
			 * triggers a component-label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2331 
   2332 
   2333 /*
   2334  * initialize a buf structure for doing an I/O in the kernel.
   2335  */
   2336 static void
   2337 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2338        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2339        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2340        struct proc *b_proc)
   2341 {
   2342 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2343 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2344 	bp->b_oflags = 0;
   2345 	bp->b_cflags = 0;
   2346 	bp->b_bcount = numSect << logBytesPerSector;
   2347 	bp->b_bufsize = bp->b_bcount;
   2348 	bp->b_error = 0;
   2349 	bp->b_dev = dev;
   2350 	bp->b_data = bf;
   2351 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2352 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2353 	if (bp->b_bcount == 0) {
   2354 		panic("bp->b_bcount is zero in InitBP!!");
   2355 	}
   2356 	bp->b_proc = b_proc;
   2357 	bp->b_iodone = cbFunc;
   2358 	bp->b_private = cbArg;
   2359 }
   2360 
   2361 static void
   2362 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2363 		    struct disklabel *lp)
   2364 {
   2365 	memset(lp, 0, sizeof(*lp));
   2366 
   2367 	/* fabricate a label... */
   2368 	if (raidPtr->totalSectors > UINT32_MAX)
   2369 		lp->d_secperunit = UINT32_MAX;
   2370 	else
   2371 		lp->d_secperunit = raidPtr->totalSectors;
   2372 	lp->d_secsize = raidPtr->bytesPerSector;
   2373 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2374 	lp->d_ntracks = 4 * raidPtr->numCol;
   2375 	lp->d_ncylinders = raidPtr->totalSectors /
   2376 		(lp->d_nsectors * lp->d_ntracks);
   2377 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2378 
   2379 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2380 	lp->d_type = DKTYPE_RAID;
   2381 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2382 	lp->d_rpm = 3600;
   2383 	lp->d_interleave = 1;
   2384 	lp->d_flags = 0;
   2385 
   2386 	lp->d_partitions[RAW_PART].p_offset = 0;
   2387 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2388 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2389 	lp->d_npartitions = RAW_PART + 1;
   2390 
   2391 	lp->d_magic = DISKMAGIC;
   2392 	lp->d_magic2 = DISKMAGIC;
   2393 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2394 
   2395 }
   2396 /*
   2397  * Read the disklabel from the raid device.  If one is not present, fake one
   2398  * up.
   2399  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	/* look up the softc; don't create one if it doesn't exist */
	if ((rs = raidget(unit, false)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default in case the read fails */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		/* d_secperunit saturates at UINT32_MAX, hence the two
		 * different comparisons against the true size */
		if (lp->d_secperunit < UINT32_MAX ?
		    lp->d_secperunit != rs->sc_size :
		    lp->d_secperunit > rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%ju) != "
			    "the size of raid (%ju)\n", unit, rs->sc_xname,
			    (uintmax_t)lp->d_secperunit,
			    (uintmax_t)rs->sc_size);
		/* warn about any partition extending past the device */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ju)\n",
				       unit, rs->sc_xname, 'a' + i,
				       (uintmax_t)rs->sc_size);
		}
	}

}
   2462 /*
   2463  * Take care of things one might want to take care of in the event
   2464  * that a disklabel isn't present.
   2465  */
   2466 static void
   2467 raidmakedisklabel(struct raid_softc *rs)
   2468 {
   2469 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2470 	db1_printf(("Making a label..\n"));
   2471 
   2472 	/*
   2473 	 * For historical reasons, if there's no disklabel present
   2474 	 * the raw partition must be marked FS_BSDFFS.
   2475 	 */
   2476 
   2477 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2478 
   2479 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2480 
   2481 	lp->d_checksum = dkcksum(lp);
   2482 }
   2483 /*
   2484  * Wait interruptibly for an exclusive lock.
   2485  *
   2486  * XXX
   2487  * Several drivers do this; it should be abstracted and made MP-safe.
   2488  * (Hmm... where have we seen this warning before :->  GO )
   2489  */
   2490 static int
   2491 raidlock(struct raid_softc *rs)
   2492 {
   2493 	int     error;
   2494 
   2495 	mutex_enter(&rs->sc_mutex);
   2496 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2497 		rs->sc_flags |= RAIDF_WANTED;
   2498 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2499 		if (error != 0)
   2500 			return (error);
   2501 	}
   2502 	rs->sc_flags |= RAIDF_LOCKED;
   2503 	mutex_exit(&rs->sc_mutex);
   2504 	return (0);
   2505 }
   2506 /*
   2507  * Unlock and wake up any waiters.
   2508  */
   2509 static void
   2510 raidunlock(struct raid_softc *rs)
   2511 {
   2512 
   2513 	mutex_enter(&rs->sc_mutex);
   2514 	rs->sc_flags &= ~RAIDF_LOCKED;
   2515 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2516 		rs->sc_flags &= ~RAIDF_WANTED;
   2517 		cv_broadcast(&rs->sc_cv);
   2518 	}
   2519 	mutex_exit(&rs->sc_mutex);
   2520 }
   2521 
   2522 
   2523 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2524 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2525 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2526 
static daddr_t
rf_component_info_offset(void)
{

	/* byte offset of the component info area on each component */
	return RF_COMPONENT_INFO_OFFSET;
}
   2533 
   2534 static daddr_t
   2535 rf_component_info_size(unsigned secsize)
   2536 {
   2537 	daddr_t info_size;
   2538 
   2539 	KASSERT(secsize);
   2540 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2541 		info_size = secsize;
   2542 	else
   2543 		info_size = RF_COMPONENT_INFO_SIZE;
   2544 
   2545 	return info_size;
   2546 }
   2547 
   2548 static daddr_t
   2549 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2550 {
   2551 	daddr_t map_offset;
   2552 
   2553 	KASSERT(raidPtr->bytesPerSector);
   2554 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2555 		map_offset = raidPtr->bytesPerSector;
   2556 	else
   2557 		map_offset = RF_COMPONENT_INFO_SIZE;
   2558 	map_offset += rf_component_info_offset();
   2559 
   2560 	return map_offset;
   2561 }
   2562 
   2563 static daddr_t
   2564 rf_parity_map_size(RF_Raid_t *raidPtr)
   2565 {
   2566 	daddr_t map_size;
   2567 
   2568 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2569 		map_size = raidPtr->bytesPerSector;
   2570 	else
   2571 		map_size = RF_PARITY_MAP_SIZE;
   2572 
   2573 	return map_size;
   2574 }
   2575 
   2576 int
   2577 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2578 {
   2579 	RF_ComponentLabel_t *clabel;
   2580 
   2581 	clabel = raidget_component_label(raidPtr, col);
   2582 	clabel->clean = RF_RAID_CLEAN;
   2583 	raidflush_component_label(raidPtr, col);
   2584 	return(0);
   2585 }
   2586 
   2587 
   2588 int
   2589 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2590 {
   2591 	RF_ComponentLabel_t *clabel;
   2592 
   2593 	clabel = raidget_component_label(raidPtr, col);
   2594 	clabel->clean = RF_RAID_DIRTY;
   2595 	raidflush_component_label(raidPtr, col);
   2596 	return(0);
   2597 }
   2598 
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/*
	 * Read the on-disk component label for column `col' into the
	 * in-core copy (raid_cinfo[col].ci_label).  Returns 0 or an
	 * errno from the underlying read.
	 */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2608 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return a pointer to the in-core component label for `col'. */
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2614 
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	/*
	 * Write the in-core component label for `col' back to disk,
	 * stamping it with the set's current modification counter
	 * first.  Returns 0 or an errno from the write.
	 */
	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod counter in sync with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2629 
   2630 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Read the component label area of `dev' into *clabel. */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2640 
   2641 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/*
	 * Read `dsize' bytes starting at byte `offset' of the raw
	 * component device and copy the first `msize' of them into
	 * *data.  Returns 0 on success or an errno from the I/O.
	 */

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read: issue and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2678 
   2679 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Write *clabel to the component label area of `dev' (sync). */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2689 
   2690 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/*
	 * Write `msize' bytes from *data (zero-padded to `dsize') at
	 * byte `offset' of the raw component device.  If `asyncp' is
	 * nonzero the write is issued B_ASYNC and 0 is returned
	 * immediately; otherwise we wait and return the I/O status.
	 */

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): no biowait/brelse here for the async case;
		   presumably the buffer is released on completion via the
		   B_ASYNC biodone path -- confirm */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2724 
   2725 void
   2726 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2727 {
   2728 	int c;
   2729 
   2730 	for (c = 0; c < raidPtr->numCol; c++) {
   2731 		/* Skip dead disks. */
   2732 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2733 			continue;
   2734 		/* XXXjld: what if an error occurs here? */
   2735 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2736 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2737 		    RF_PARITYMAP_NBYTE,
   2738 		    rf_parity_map_offset(raidPtr),
   2739 		    rf_parity_map_size(raidPtr), 0);
   2740 	}
   2741 }
   2742 
   2743 void
   2744 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2745 {
   2746 	struct rf_paritymap_ondisk tmp;
   2747 	int c,first;
   2748 
   2749 	first=1;
   2750 	for (c = 0; c < raidPtr->numCol; c++) {
   2751 		/* Skip dead disks. */
   2752 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2753 			continue;
   2754 		raidread_component_area(raidPtr->Disks[c].dev,
   2755 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2756 		    RF_PARITYMAP_NBYTE,
   2757 		    rf_parity_map_offset(raidPtr),
   2758 		    rf_parity_map_size(raidPtr));
   2759 		if (first) {
   2760 			memcpy(map, &tmp, sizeof(*map));
   2761 			first = 0;
   2762 		} else {
   2763 			rf_paritymap_merge(map, &tmp);
   2764 		}
   2765 	}
   2766 }
   2767 
/*
 * rf_markalldirty: bump the mod counter and mark every live component
 * (and every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* spares live in columns numCol..numCol+numSpare-1 */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2827 
   2828 
/*
 * Flush up-to-date component labels for every optimal component and
 * used spare in the set.
 *
 * Bumps the modification counter, stamps each optimal component's
 * label with rf_ds_optimal and the current unit number, and flushes
 * it.  Used spares get a freshly initialized label recording the
 * column they replaced, rf_ds_optimal status, and the unit number.
 * When final == RF_FINAL_COMPONENT_UPDATE and parity is known clean,
 * each flushed component is additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2903 
   2904 void
   2905 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2906 {
   2907 
   2908 	if (vp != NULL) {
   2909 		if (auto_configured == 1) {
   2910 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2911 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2912 			vput(vp);
   2913 
   2914 		} else {
   2915 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2916 		}
   2917 	}
   2918 }
   2919 
   2920 
   2921 void
   2922 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2923 {
   2924 	int r,c;
   2925 	struct vnode *vp;
   2926 	int acd;
   2927 
   2928 
   2929 	/* We take this opportunity to close the vnodes like we should.. */
   2930 
   2931 	for (c = 0; c < raidPtr->numCol; c++) {
   2932 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2933 		acd = raidPtr->Disks[c].auto_configured;
   2934 		rf_close_component(raidPtr, vp, acd);
   2935 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2936 		raidPtr->Disks[c].auto_configured = 0;
   2937 	}
   2938 
   2939 	for (r = 0; r < raidPtr->numSpare; r++) {
   2940 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2941 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2942 		rf_close_component(raidPtr, vp, acd);
   2943 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2944 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2945 	}
   2946 }
   2947 
   2948 
/*
 * Kernel thread body: fail the component named in the request,
 * optionally initiating reconstruction (RF_FDFLAGS_RECON).
 *
 * Consumes and frees the request.  recon_in_progress brackets the
 * work so other code can tell a reconstruction is underway.  Runs
 * at splbio for the duration; exits via kthread_exit().
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2970 
/*
 * Kernel thread body: rewrite all parity for the set.
 *
 * On success, parity_good is set to RF_RAID_CLEAN so the clean bit
 * is written to the component labels on shutdown; on failure the
 * error is logged and parity remains dirty.  Before exiting, wakes
 * anyone sleeping on parity_rewrite_in_progress (e.g. a pending
 * shutdown waiting for us to finish).
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3001 
   3002 
/*
 * Kernel thread body: copy reconstructed data back onto its original
 * component.  copyback_in_progress brackets the work; runs at splbio
 * and exits via kthread_exit().
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3017 
   3018 
/*
 * Kernel thread body: reconstruct the requested column "in place"
 * (rebuild onto the same component).  Consumes and frees the request.
 * recon_in_progress brackets the work; exits via kthread_exit().
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3036 
/*
 * Try to read a RAIDframe component label from the open vnode vp.
 *
 * If a reasonable label is found (and its recorded partition size
 * fits within 'size'), a new RF_AutoConfig_t is prepended to ac_list
 * and ownership of vp passes to that entry.  Otherwise the vnode is
 * closed and released here.  On allocation failure the entire
 * ac_list is torn down and NULL is returned.
 *
 * Returns the (possibly updated) head of ac_list.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: free the whole list and give up. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so release the vnode too */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3094 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * For each candidate disk: wedges (dk) with ptype RAIDFRAME are probed
 * directly; otherwise each disklabel partition of type FS_RAID is
 * probed; and if neither yields a component, the raw partition is
 * probed as a last resort.  Every component found is collected (via
 * rf_get_component()) into an RF_AutoConfig_t list, which is returned.
 * Floppy, CD and md devices are skipped outright.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedge: probe it directly if its partition
			   type says RAIDframe, then move on. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* Probe each FS_RAID partition in the disklabel. */
		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3276 
   3277 
   3278 int
   3279 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3280 {
   3281 
   3282 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3283 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3284 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3285 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3286 	    clabel->row >=0 &&
   3287 	    clabel->column >= 0 &&
   3288 	    clabel->num_rows > 0 &&
   3289 	    clabel->num_columns > 0 &&
   3290 	    clabel->row < clabel->num_rows &&
   3291 	    clabel->column < clabel->num_columns &&
   3292 	    clabel->blockSize > 0 &&
   3293 	    /*
   3294 	     * numBlocksHi may contain garbage, but it is ok since
   3295 	     * the type is unsigned.  If it is really garbage,
   3296 	     * rf_fix_old_label_size() will fix it.
   3297 	     */
   3298 	    rf_component_label_numblocks(clabel) > 0) {
   3299 		/*
   3300 		 * label looks reasonable enough...
   3301 		 * let's make sure it has no old garbage.
   3302 		 */
   3303 		if (numsecs)
   3304 			rf_fix_old_label_size(clabel, numsecs);
   3305 		return(1);
   3306 	}
   3307 	return(0);
   3308 }
   3309 
   3310 
   3311 /*
   3312  * For reasons yet unknown, some old component labels have garbage in
   3313  * the newer numBlocksHi region, and this causes lossage.  Since those
   3314  * disks will also have numsecs set to less than 32 bits of sectors,
   3315  * we can determine when this corruption has occurred, and fix it.
   3316  *
   3317  * The exact same problem, with the same unknown reason, happens to
   3318  * the partitionSizeHi member as well.
   3319  */
   3320 static void
   3321 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3322 {
   3323 
   3324 	if (numsecs < ((uint64_t)1 << 32)) {
   3325 		if (clabel->numBlocksHi) {
   3326 			printf("WARNING: total sectors < 32 bits, yet "
   3327 			       "numBlocksHi set\n"
   3328 			       "WARNING: resetting numBlocksHi to zero.\n");
   3329 			clabel->numBlocksHi = 0;
   3330 		}
   3331 
   3332 		if (clabel->partitionSizeHi) {
   3333 			printf("WARNING: total sectors < 32 bits, yet "
   3334 			       "partitionSizeHi set\n"
   3335 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3336 			clabel->partitionSizeHi = 0;
   3337 		}
   3338 	}
   3339 }
   3340 
   3341 
   3342 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console.
 * Compiled in only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3374 #endif
   3375 
   3376 RF_ConfigSet_t *
   3377 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3378 {
   3379 	RF_AutoConfig_t *ac;
   3380 	RF_ConfigSet_t *config_sets;
   3381 	RF_ConfigSet_t *cset;
   3382 	RF_AutoConfig_t *ac_next;
   3383 
   3384 
   3385 	config_sets = NULL;
   3386 
   3387 	/* Go through the AutoConfig list, and figure out which components
   3388 	   belong to what sets.  */
   3389 	ac = ac_list;
   3390 	while(ac!=NULL) {
   3391 		/* we're going to putz with ac->next, so save it here
   3392 		   for use at the end of the loop */
   3393 		ac_next = ac->next;
   3394 
   3395 		if (config_sets == NULL) {
   3396 			/* will need at least this one... */
   3397 			config_sets = (RF_ConfigSet_t *)
   3398 				malloc(sizeof(RF_ConfigSet_t),
   3399 				       M_RAIDFRAME, M_NOWAIT);
   3400 			if (config_sets == NULL) {
   3401 				panic("rf_create_auto_sets: No memory!");
   3402 			}
   3403 			/* this one is easy :) */
   3404 			config_sets->ac = ac;
   3405 			config_sets->next = NULL;
   3406 			config_sets->rootable = 0;
   3407 			ac->next = NULL;
   3408 		} else {
   3409 			/* which set does this component fit into? */
   3410 			cset = config_sets;
   3411 			while(cset!=NULL) {
   3412 				if (rf_does_it_fit(cset, ac)) {
   3413 					/* looks like it matches... */
   3414 					ac->next = cset->ac;
   3415 					cset->ac = ac;
   3416 					break;
   3417 				}
   3418 				cset = cset->next;
   3419 			}
   3420 			if (cset==NULL) {
   3421 				/* didn't find a match above... new set..*/
   3422 				cset = (RF_ConfigSet_t *)
   3423 					malloc(sizeof(RF_ConfigSet_t),
   3424 					       M_RAIDFRAME, M_NOWAIT);
   3425 				if (cset == NULL) {
   3426 					panic("rf_create_auto_sets: No memory!");
   3427 				}
   3428 				cset->ac = ac;
   3429 				ac->next = NULL;
   3430 				cset->next = config_sets;
   3431 				cset->rootable = 0;
   3432 				config_sets = cset;
   3433 			}
   3434 		}
   3435 		ac = ac_next;
   3436 	}
   3437 
   3438 
   3439 	return(config_sets);
   3440 }
   3441 
   3442 static int
   3443 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3444 {
   3445 	RF_ComponentLabel_t *clabel1, *clabel2;
   3446 
   3447 	/* If this one matches the *first* one in the set, that's good
   3448 	   enough, since the other members of the set would have been
   3449 	   through here too... */
   3450 	/* note that we are not checking partitionSize here..
   3451 
   3452 	   Note that we are also not checking the mod_counters here.
   3453 	   If everything else matches except the mod_counter, that's
   3454 	   good enough for this test.  We will deal with the mod_counters
   3455 	   a little later in the autoconfiguration process.
   3456 
   3457 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3458 
   3459 	   The reason we don't check for this is that failed disks
   3460 	   will have lower modification counts.  If those disks are
   3461 	   not added to the set they used to belong to, then they will
   3462 	   form their own set, which may result in 2 different sets,
   3463 	   for example, competing to be configured at raid0, and
   3464 	   perhaps competing to be the root filesystem set.  If the
   3465 	   wrong ones get configured, or both attempt to become /,
   3466 	   weird behaviour and or serious lossage will occur.  Thus we
   3467 	   need to bring them into the fold here, and kick them out at
   3468 	   a later point.
   3469 
   3470 	*/
   3471 
   3472 	clabel1 = cset->ac->clabel;
   3473 	clabel2 = ac->clabel;
   3474 	if ((clabel1->version == clabel2->version) &&
   3475 	    (clabel1->serial_number == clabel2->serial_number) &&
   3476 	    (clabel1->num_rows == clabel2->num_rows) &&
   3477 	    (clabel1->num_columns == clabel2->num_columns) &&
   3478 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3479 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3480 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3481 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3482 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3483 	    (clabel1->blockSize == clabel2->blockSize) &&
   3484 	    rf_component_label_numblocks(clabel1) ==
   3485 	    rf_component_label_numblocks(clabel2) &&
   3486 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3487 	    (clabel1->root_partition == clabel2->root_partition) &&
   3488 	    (clabel1->last_unit == clabel2->last_unit) &&
   3489 	    (clabel1->config_order == clabel2->config_order)) {
   3490 		/* if it get's here, it almost *has* to be a match */
   3491 	} else {
   3492 		/* it's not consistent with somebody in the set..
   3493 		   punt */
   3494 		return(0);
   3495 	}
   3496 	/* all was fine.. it must fit... */
   3497 	return(1);
   3498 }
   3499 
/*
 * Decide whether a configuration set has enough live components to
 * be auto-configured.
 *
 * The highest mod_counter present in the set is taken as the
 * authoritative one; a component with a stale counter does not count
 * as present for its column.  RAID 1 is special-cased: components
 * pair up (even column, odd column), and only the loss of *both*
 * members of a pair is fatal.  For parityConfig '0' (RAID 0) any
 * missing component is fatal; '4' and '5' tolerate one missing.
 *
 * Returns 1 if the set can be configured, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   for column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3602 
/*
 * Build an RF_Config_t from the component labels of an auto-detected
 * set, suitable for handing to the normal configuration path.
 *
 * Geometry and tuning parameters come from the first component's
 * label; each component's device name is placed in the column its
 * own label records (known to be in range because the labels passed
 * rf_reasonable_label()).
 *
 * NOTE(review): the raidPtr argument is not used here; presumably
 * kept for interface compatibility.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Fill in the device name for each column */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3637 
   3638 int
   3639 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3640 {
   3641 	RF_ComponentLabel_t *clabel;
   3642 	int column;
   3643 	int sparecol;
   3644 
   3645 	raidPtr->autoconfigure = new_value;
   3646 
   3647 	for(column=0; column<raidPtr->numCol; column++) {
   3648 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3649 			clabel = raidget_component_label(raidPtr, column);
   3650 			clabel->autoconfigure = new_value;
   3651 			raidflush_component_label(raidPtr, column);
   3652 		}
   3653 	}
   3654 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3655 		sparecol = raidPtr->numCol + column;
   3656 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3657 			clabel = raidget_component_label(raidPtr, sparecol);
   3658 			clabel->autoconfigure = new_value;
   3659 			raidflush_component_label(raidPtr, sparecol);
   3660 		}
   3661 	}
   3662 	return(new_value);
   3663 }
   3664 
   3665 int
   3666 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3667 {
   3668 	RF_ComponentLabel_t *clabel;
   3669 	int column;
   3670 	int sparecol;
   3671 
   3672 	raidPtr->root_partition = new_value;
   3673 	for(column=0; column<raidPtr->numCol; column++) {
   3674 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3675 			clabel = raidget_component_label(raidPtr, column);
   3676 			clabel->root_partition = new_value;
   3677 			raidflush_component_label(raidPtr, column);
   3678 		}
   3679 	}
   3680 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3681 		sparecol = raidPtr->numCol + column;
   3682 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3683 			clabel = raidget_component_label(raidPtr, sparecol);
   3684 			clabel->root_partition = new_value;
   3685 			raidflush_component_label(raidPtr, sparecol);
   3686 		}
   3687 	}
   3688 	return(new_value);
   3689 }
   3690 
   3691 void
   3692 rf_release_all_vps(RF_ConfigSet_t *cset)
   3693 {
   3694 	RF_AutoConfig_t *ac;
   3695 
   3696 	ac = cset->ac;
   3697 	while(ac!=NULL) {
   3698 		/* Close the vp, and give it back */
   3699 		if (ac->vp) {
   3700 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3701 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3702 			vput(ac->vp);
   3703 			ac->vp = NULL;
   3704 		}
   3705 		ac = ac->next;
   3706 	}
   3707 }
   3708 
   3709 
   3710 void
   3711 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3712 {
   3713 	RF_AutoConfig_t *ac;
   3714 	RF_AutoConfig_t *next_ac;
   3715 
   3716 	ac = cset->ac;
   3717 	while(ac!=NULL) {
   3718 		next_ac = ac->next;
   3719 		/* nuke the label */
   3720 		free(ac->clabel, M_RAIDFRAME);
   3721 		/* cleanup the config structure */
   3722 		free(ac, M_RAIDFRAME);
   3723 		/* "next.." */
   3724 		ac = next_ac;
   3725 	}
   3726 	/* and, finally, nuke the config set */
   3727 	free(cset, M_RAIDFRAME);
   3728 }
   3729 
   3730 
/*
 * Initialize a component label from the current in-core state of the
 * set: version, serial number, mod counter, geometry, layout
 * parameters and configuration flags.  Per-component fields such as
 * row and column are left for the caller to fill in.  The label is
 * initialized dirty; status is set to rf_ds_optimal.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3763 
/*
 * Bring an auto-discovered configuration set on-line: pick a unit
 * number, build an RF_Config_t from the gathered component labels,
 * and configure the RAID set.  Returns the softc on success, or NULL
 * on failure (any unit claimed along the way is released).
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Scan upward from last_unit for the first unit not already valid. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No usable softc found by the scan; allocate a fresh one. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the unit we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3847 
   3848 void
   3849 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3850 {
   3851 	struct buf *bp;
   3852 	struct raid_softc *rs;
   3853 
   3854 	bp = (struct buf *)desc->bp;
   3855 	rs = desc->raidPtr->softc;
   3856 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3857 	    (bp->b_flags & B_READ));
   3858 }
   3859 
/*
 * Initialize a pool for RAIDframe use: create it at IPL_BIO, cap it
 * at xmax items, pre-allocate xmin items, and set the low watermark
 * so the pool tries to stay primed with at least xmin items.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3869 
   3870 /*
   3871  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3872  * if there is IO pending and if that IO could possibly be done for a
   3873  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3874  * otherwise.
   3875  *
   3876  */
   3877 
   3878 int
   3879 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3880 {
   3881 	struct raid_softc *rs = raidPtr->softc;
   3882 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3883 		/* there is work to do */
   3884 		return 0;
   3885 	}
   3886 	/* default is nothing to do */
   3887 	return 1;
   3888 }
   3889 
   3890 int
   3891 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3892 {
   3893 	uint64_t numsecs;
   3894 	unsigned secsize;
   3895 	int error;
   3896 
   3897 	error = getdisksize(vp, &numsecs, &secsize);
   3898 	if (error == 0) {
   3899 		diskPtr->blockSize = secsize;
   3900 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3901 		diskPtr->partitionSize = numsecs;
   3902 		return 0;
   3903 	}
   3904 	return error;
   3905 }
   3906 
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3912 
/*
 * Autoconfiguration attach function.  Intentionally empty: the softc
 * is managed separately via raidget()/raidput(), so there is nothing
 * to initialize at attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3918 
   3919 
   3920 static int
   3921 raid_detach(device_t self, int flags)
   3922 {
   3923 	int error;
   3924 	struct raid_softc *rs = raidget(device_unit(self), false);
   3925 
   3926 	if (rs == NULL)
   3927 		return ENXIO;
   3928 
   3929 	if ((error = raidlock(rs)) != 0)
   3930 		return (error);
   3931 
   3932 	error = raid_detach_unlocked(rs);
   3933 
   3934 	raidunlock(rs);
   3935 
   3936 	/* XXX raid can be referenced here */
   3937 
   3938 	if (error)
   3939 		return error;
   3940 
   3941 	/* Free the softc */
   3942 	raidput(rs);
   3943 
   3944 	return 0;
   3945 }
   3946 
/*
 * Set up a synthetic disk geometry for the RAID device and push it to
 * the disk(9) layer.  Only secperunit and secsize are real; sectors
 * per track and track count are fabricated values.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): 4 tracks per column looks arbitrary -- confirm. */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
   3961 
   3962 /*
   3963  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3964  * We end up returning whatever error was returned by the first cache flush
   3965  * that fails.
   3966  */
   3967 
   3968 int
   3969 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3970 {
   3971 	int c, sparecol;
   3972 	int e,error;
   3973 	int force = 1;
   3974 
   3975 	error = 0;
   3976 	for (c = 0; c < raidPtr->numCol; c++) {
   3977 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3978 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3979 					  &force, FWRITE, NOCRED);
   3980 			if (e) {
   3981 				if (e != ENODEV)
   3982 					printf("raid%d: cache flush to component %s failed.\n",
   3983 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3984 				if (error == 0) {
   3985 					error = e;
   3986 				}
   3987 			}
   3988 		}
   3989 	}
   3990 
   3991 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3992 		sparecol = raidPtr->numCol + c;
   3993 		/* Need to ensure that the reconstruct actually completed! */
   3994 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3995 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3996 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3997 			if (e) {
   3998 				if (e != ENODEV)
   3999 					printf("raid%d: cache flush to component %s failed.\n",
   4000 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   4001 				if (error == 0) {
   4002 					error = e;
   4003 				}
   4004 			}
   4005 		}
   4006 	}
   4007 	return error;
   4008 }
   4009 
   4010 /*
   4011  * Module interface
   4012  */
   4013 
   4014 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   4015 
   4016 #ifdef _MODULE
   4017 CFDRIVER_DECL(raid, DV_DISK, NULL);
   4018 #endif
   4019 
   4020 static int raid_modcmd(modcmd_t, void *);
   4021 static int raid_modcmd_init(void);
   4022 static int raid_modcmd_fini(void);
   4023 
   4024 static int
   4025 raid_modcmd(modcmd_t cmd, void *data)
   4026 {
   4027 	int error;
   4028 
   4029 	error = 0;
   4030 	switch (cmd) {
   4031 	case MODULE_CMD_INIT:
   4032 		error = raid_modcmd_init();
   4033 		break;
   4034 	case MODULE_CMD_FINI:
   4035 		error = raid_modcmd_fini();
   4036 		break;
   4037 	default:
   4038 		error = ENOTTY;
   4039 		break;
   4040 	}
   4041 	return error;
   4042 }
   4043 
/*
 * Module initialization: attach the device switch, cfdriver (module
 * build only) and cfattach, boot the RAIDframe core, and register a
 * config finalizer that auto-configures RAID sets once all real
 * hardware has been found.  Each attach step is rolled back on
 * failure of a later one.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach() pick the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (e.g. built-in). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: auto-configuration just won't happen. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   4114 
/*
 * Module finalization: refuse to unload while any raid device exists,
 * then detach the cfattach, cfdriver (module build only) and devsw,
 * re-attaching the already-detached pieces if a later step fails, and
 * finally shut down the RAIDframe core.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		/* Roll back: re-attach the cfattach we just detached. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		/* Roll back the cfdriver and cfattach detaches. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4161