/* rf_netbsdkintf.c, revision 1.316.2.8 (NetBSD source cross-reference) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.316.2.8 2017/08/28 17:52:26 skrll Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.316.2.8 2017/08/28 17:52:26 skrll Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #ifdef DEBUG_ROOT
    165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    166 #else
    167 #define DPRINTF(a, ...)
    168 #endif
    169 
    170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    171 static rf_declare_mutex2(rf_sparet_wait_mutex);
    172 static rf_declare_cond2(rf_sparet_wait_cv);
    173 static rf_declare_cond2(rf_sparet_resp_cv);
    174 
    175 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    176 						 * spare table */
    177 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    178 						 * installation process */
    179 #endif
    180 
    181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    182 
    183 /* prototypes */
    184 static void KernelWakeupFunc(struct buf *);
    185 static void InitBP(struct buf *, struct vnode *, unsigned,
    186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    187     void *, int, struct proc *);
    188 struct raid_softc;
    189 static void raidinit(struct raid_softc *);
    190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
    220 const struct bdevsw raid_bdevsw = {
    221 	.d_open = raidopen,
    222 	.d_close = raidclose,
    223 	.d_strategy = raidstrategy,
    224 	.d_ioctl = raidioctl,
    225 	.d_dump = raiddump,
    226 	.d_psize = raidsize,
    227 	.d_discard = nodiscard,
    228 	.d_flag = D_DISK
    229 };
    230 
    231 const struct cdevsw raid_cdevsw = {
    232 	.d_open = raidopen,
    233 	.d_close = raidclose,
    234 	.d_read = raidread,
    235 	.d_write = raidwrite,
    236 	.d_ioctl = raidioctl,
    237 	.d_stop = nostop,
    238 	.d_tty = notty,
    239 	.d_poll = nopoll,
    240 	.d_mmap = nommap,
    241 	.d_kqfilter = nokqfilter,
    242 	.d_discard = nodiscard,
    243 	.d_flag = D_DISK
    244 };
    245 
    246 static struct dkdriver rf_dkdriver = {
    247 	.d_open = raidopen,
    248 	.d_close = raidclose,
    249 	.d_strategy = raidstrategy,
    250 	.d_diskstart = raid_diskstart,
    251 	.d_dumpblocks = raid_dumpblocks,
    252 	.d_lastclose = raid_lastclose,
    253 	.d_minphys = minphys
    254 };
    255 
    256 struct raid_softc {
    257 	struct dk_softc sc_dksc;
    258 	int	sc_unit;
    259 	int     sc_flags;	/* flags */
    260 	int     sc_cflags;	/* configuration flags */
    261 	kmutex_t sc_mutex;	/* interlock mutex */
    262 	kcondvar_t sc_cv;	/* and the condvar */
    263 	uint64_t sc_size;	/* size of the raid device */
    264 	char    sc_xname[20];	/* XXX external name */
    265 	RF_Raid_t sc_r;
    266 	LIST_ENTRY(raid_softc) sc_link;
    267 };
    268 /* sc_flags */
    269 #define RAIDF_INITED		0x01	/* unit has been initialized */
    270 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    271 #define RAIDF_DETACH  		0x04	/* detach after final close */
    272 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    273 #define RAIDF_LOCKED		0x10	/* unit is locked */
    274 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    275 
    276 #define	raidunit(x)	DISKUNIT(x)
    277 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    278 
    279 extern struct cfdriver raid_cd;
    280 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    281     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    282     DVF_DETACH_SHUTDOWN);
    283 
    284 /*
    285  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    286  * Be aware that large numbers can allow the driver to consume a lot of
    287  * kernel memory, especially on writes, and in degraded mode reads.
    288  *
    289  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    290  * a single 64K write will typically require 64K for the old data,
    291  * 64K for the old parity, and 64K for the new parity, for a total
    292  * of 192K (if the parity buffer is not re-used immediately).
    293  * Even it if is used immediately, that's still 128K, which when multiplied
    294  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    295  *
    296  * Now in degraded mode, for example, a 64K read on the above setup may
    297  * require data reconstruction, which will require *all* of the 4 remaining
    298  * disks to participate -- 4 * 32K/disk == 128K again.
    299  */
    300 
    301 #ifndef RAIDOUTSTANDING
    302 #define RAIDOUTSTANDING   6
    303 #endif
    304 
    305 #define RAIDLABELDEV(dev)	\
    306 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    307 
    308 /* declared here, and made public, for the benefit of KVM stuff.. */
    309 
    310 static int raidlock(struct raid_softc *);
    311 static void raidunlock(struct raid_softc *);
    312 
    313 static int raid_detach_unlocked(struct raid_softc *);
    314 
    315 static void rf_markalldirty(RF_Raid_t *);
    316 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    317 
    318 void rf_ReconThread(struct rf_recon_req *);
    319 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    320 void rf_CopybackThread(RF_Raid_t *raidPtr);
    321 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    322 int rf_autoconfig(device_t);
    323 void rf_buildroothack(RF_ConfigSet_t *);
    324 
    325 RF_AutoConfig_t *rf_find_raid_components(void);
    326 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    327 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    328 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    329 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    330 int rf_set_autoconfig(RF_Raid_t *, int);
    331 int rf_set_rootpartition(RF_Raid_t *, int);
    332 void rf_release_all_vps(RF_ConfigSet_t *);
    333 void rf_cleanup_config_set(RF_ConfigSet_t *);
    334 int rf_have_enough_components(RF_ConfigSet_t *);
    335 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    336 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    337 
    338 /*
    339  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    340  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    341  * in the kernel config file.
    342  */
    343 #ifdef RAID_AUTOCONFIG
    344 int raidautoconfig = 1;
    345 #else
    346 int raidautoconfig = 0;
    347 #endif
    348 static bool raidautoconfigdone = false;
    349 
    350 struct RF_Pools_s rf_pools;
    351 
    352 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    353 static kmutex_t raid_lock;
    354 
    355 static struct raid_softc *
    356 raidcreate(int unit) {
    357 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    358 	sc->sc_unit = unit;
    359 	cv_init(&sc->sc_cv, "raidunit");
    360 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    361 	return sc;
    362 }
    363 
    364 static void
    365 raiddestroy(struct raid_softc *sc) {
    366 	cv_destroy(&sc->sc_cv);
    367 	mutex_destroy(&sc->sc_mutex);
    368 	kmem_free(sc, sizeof(*sc));
    369 }
    370 
    371 static struct raid_softc *
    372 raidget(int unit, bool create) {
    373 	struct raid_softc *sc;
    374 	if (unit < 0) {
    375 #ifdef DIAGNOSTIC
    376 		panic("%s: unit %d!", __func__, unit);
    377 #endif
    378 		return NULL;
    379 	}
    380 	mutex_enter(&raid_lock);
    381 	LIST_FOREACH(sc, &raids, sc_link) {
    382 		if (sc->sc_unit == unit) {
    383 			mutex_exit(&raid_lock);
    384 			return sc;
    385 		}
    386 	}
    387 	mutex_exit(&raid_lock);
    388 	if (!create)
    389 		return NULL;
    390 	if ((sc = raidcreate(unit)) == NULL)
    391 		return NULL;
    392 	mutex_enter(&raid_lock);
    393 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    394 	mutex_exit(&raid_lock);
    395 	return sc;
    396 }
    397 
/*
 * Unlink a raid_softc from the global "raids" list (under raid_lock)
 * and free it.  Counterpart of the raidcreate()+LIST_INSERT_HEAD done
 * in raidget().
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    405 
/*
 * Historic pseudo-device attach entry point.  Kept as a no-op stub so
 * the pseudo-device interface still resolves; the "num" argument is
 * ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    415 
    416 int
    417 rf_autoconfig(device_t self)
    418 {
    419 	RF_AutoConfig_t *ac_list;
    420 	RF_ConfigSet_t *config_sets;
    421 
    422 	if (!raidautoconfig || raidautoconfigdone == true)
    423 		return (0);
    424 
    425 	/* XXX This code can only be run once. */
    426 	raidautoconfigdone = true;
    427 
    428 #ifdef __HAVE_CPU_BOOTCONF
    429 	/*
    430 	 * 0. find the boot device if needed first so we can use it later
    431 	 * this needs to be done before we autoconfigure any raid sets,
    432 	 * because if we use wedges we are not going to be able to open
    433 	 * the boot device later
    434 	 */
    435 	if (booted_device == NULL)
    436 		cpu_bootconf();
    437 #endif
    438 	/* 1. locate all RAID components on the system */
    439 	aprint_debug("Searching for RAID components...\n");
    440 	ac_list = rf_find_raid_components();
    441 
    442 	/* 2. Sort them into their respective sets. */
    443 	config_sets = rf_create_auto_sets(ac_list);
    444 
    445 	/*
    446 	 * 3. Evaluate each set and configure the valid ones.
    447 	 * This gets done in rf_buildroothack().
    448 	 */
    449 	rf_buildroothack(config_sets);
    450 
    451 	return 1;
    452 }
    453 
    454 static int
    455 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    456 	const char *bootname = device_xname(bdv);
    457 	size_t len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
/*
 * Configure each eligible autoconfig set, then decide whether a
 * configured RAID set should become the root device.  If exactly one
 * rootable set was found, point booted_device at it (or at one of its
 * wedges); if several candidates exist, narrow them down by which set
 * contains the original boot device, and fall back to RB_ASKNAME when
 * no unique choice remains.  Honors an explicit "rootspec" by leaving
 * booted_device alone.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* count of rootable, configured sets */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: try to configure every set; release the ones we skip. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	/*
	 * NOTE(review): rsc is NULL here when no rootable set was found;
	 * this only computes an address (no dereference), and dksc is
	 * subsequently used only on the num_root >= 1 paths where rsc is
	 * non-NULL -- confirm this invariant holds if the code is changed.
	 */
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		   "contains_boot=%d\n", __func__, booted_device,
		   rsc->sc_r.root_partition,
		   rf_containsboot(&rsc->sc_r, booted_device));
		/*
		 * Override the root device when we never learned the boot
		 * device, when the set is forced root (root_partition == 1),
		 * or when the set contains the device we booted from.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Pass 2: re-count, keeping only sets holding the boot disk. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    613 
    614 static int
    615 raidsize(dev_t dev)
    616 {
    617 	struct raid_softc *rs;
    618 	struct dk_softc *dksc;
    619 	unsigned int unit;
    620 
    621 	unit = raidunit(dev);
    622 	if ((rs = raidget(unit, false)) == NULL)
    623 		return -1;
    624 	dksc = &rs->sc_dksc;
    625 
    626 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    627 		return -1;
    628 
    629 	return dk_size(dksc, dev);
    630 }
    631 
    632 static int
    633 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    634 {
    635 	unsigned int unit;
    636 	struct raid_softc *rs;
    637 	struct dk_softc *dksc;
    638 
    639 	unit = raidunit(dev);
    640 	if ((rs = raidget(unit, false)) == NULL)
    641 		return ENXIO;
    642 	dksc = &rs->sc_dksc;
    643 
    644 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    645 		return ENODEV;
    646 
    647         /*
    648            Note that blkno is relative to this particular partition.
    649            By adding adding RF_PROTECTED_SECTORS, we get a value that
    650 	   is relative to the partition used for the underlying component.
    651         */
    652 	blkno += RF_PROTECTED_SECTORS;
    653 
    654 	return dk_dump(dksc, dev, blkno, va, size);
    655 }
    656 
/*
 * Dump "nblk" blocks at "blkno" to a single live component of a RAID 1
 * set.  Only RAID 1 (one data + one parity column) is supported.
 * Component preference: live master, spared master, live slave, spared
 * slave.  Returns 0 on success, EINVAL when the layout is unsupported
 * or no live component exists, ENXIO when the chosen component has no
 * block device switch, or the raidlock()/d_dump error.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First choice: the lowest-numbered optimal (live) column. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Dump straight through the component's block device. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    762 
    763 /* ARGSUSED */
    764 static int
    765 raidopen(dev_t dev, int flags, int fmt,
    766     struct lwp *l)
    767 {
    768 	int     unit = raidunit(dev);
    769 	struct raid_softc *rs;
    770 	struct dk_softc *dksc;
    771 	int     error = 0;
    772 	int     part, pmask;
    773 
    774 	if ((rs = raidget(unit, true)) == NULL)
    775 		return ENXIO;
    776 	if ((error = raidlock(rs)) != 0)
    777 		return (error);
    778 
    779 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    780 		error = EBUSY;
    781 		goto bad;
    782 	}
    783 
    784 	dksc = &rs->sc_dksc;
    785 
    786 	part = DISKPART(dev);
    787 	pmask = (1 << part);
    788 
    789 	if (!DK_BUSY(dksc, pmask) &&
    790 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    791 		/* First one... mark things as dirty... Note that we *MUST*
    792 		 have done a configure before this.  I DO NOT WANT TO BE
    793 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    794 		 THAT THEY BELONG TOGETHER!!!!! */
    795 		/* XXX should check to see if we're only open for reading
    796 		   here... If so, we needn't do this, but then need some
    797 		   other way of keeping track of what's happened.. */
    798 
    799 		rf_markalldirty(&rs->sc_r);
    800 	}
    801 
    802 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    803 		error = dk_open(dksc, dev, flags, fmt, l);
    804 
    805 bad:
    806 	raidunlock(rs);
    807 
    808 	return (error);
    809 
    810 
    811 }
    812 
    813 static int
    814 raid_lastclose(device_t self)
    815 {
    816 	struct raid_softc *rs = raidsoftc(self);
    817 
    818 	/* Last one... device is not unconfigured yet.
    819 	   Device shutdown has taken care of setting the
    820 	   clean bits if RAIDF_INITED is not set
    821 	   mark things as clean... */
    822 
    823 	rf_update_component_labels(&rs->sc_r,
    824 	    RF_FINAL_COMPONENT_UPDATE);
    825 
    826 	/* pass to unlocked code */
    827 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    828 		rs->sc_flags |= RAIDF_DETACH;
    829 
    830 	return 0;
    831 }
    832 
    833 /* ARGSUSED */
    834 static int
    835 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    836 {
    837 	int     unit = raidunit(dev);
    838 	struct raid_softc *rs;
    839 	struct dk_softc *dksc;
    840 	cfdata_t cf;
    841 	int     error = 0, do_detach = 0, do_put = 0;
    842 
    843 	if ((rs = raidget(unit, false)) == NULL)
    844 		return ENXIO;
    845 	dksc = &rs->sc_dksc;
    846 
    847 	if ((error = raidlock(rs)) != 0)
    848 		return (error);
    849 
    850 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    851 		error = dk_close(dksc, dev, flags, fmt, l);
    852 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    853 			do_detach = 1;
    854 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    855 		do_put = 1;
    856 
    857 	raidunlock(rs);
    858 
    859 	if (do_detach) {
    860 		/* free the pseudo device attach bits */
    861 		cf = device_cfdata(dksc->sc_dev);
    862 		error = config_detach(dksc->sc_dev, 0);
    863 		if (error == 0)
    864 			free(cf, M_RAIDFRAME);
    865 	} else if (do_put) {
    866 		raidput(rs);
    867 	}
    868 
    869 	return (error);
    870 
    871 }
    872 
/*
 * Poke the RAID I/O completion thread: signal iodone_cv (under its
 * lock) so queued work is (re)considered at the next convenient time.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    880 
    881 static void
    882 raidstrategy(struct buf *bp)
    883 {
    884 	unsigned int unit;
    885 	struct raid_softc *rs;
    886 	struct dk_softc *dksc;
    887 	RF_Raid_t *raidPtr;
    888 
    889 	unit = raidunit(bp->b_dev);
    890 	if ((rs = raidget(unit, false)) == NULL) {
    891 		bp->b_error = ENXIO;
    892 		goto fail;
    893 	}
    894 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    895 		bp->b_error = ENXIO;
    896 		goto fail;
    897 	}
    898 	dksc = &rs->sc_dksc;
    899 	raidPtr = &rs->sc_r;
    900 
    901 	/* Queue IO only */
    902 	if (dk_strategy_defer(dksc, bp))
    903 		goto done;
    904 
    905 	/* schedule the IO to happen at the next convenient time */
    906 	raid_wakeup(raidPtr);
    907 
    908 done:
    909 	return;
    910 
    911 fail:
    912 	bp->b_resid = bp->b_bcount;
    913 	biodone(bp);
    914 }
    915 
    916 static int
    917 raid_diskstart(device_t dev, struct buf *bp)
    918 {
    919 	struct raid_softc *rs = raidsoftc(dev);
    920 	RF_Raid_t *raidPtr;
    921 
    922 	raidPtr = &rs->sc_r;
    923 	if (!raidPtr->valid) {
    924 		db1_printf(("raid is not valid..\n"));
    925 		return ENODEV;
    926 	}
    927 
    928 	/* XXX */
    929 	bp->b_resid = 0;
    930 
    931 	return raiddoaccess(raidPtr, bp);
    932 }
    933 
    934 void
    935 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    936 {
    937 	struct raid_softc *rs;
    938 	struct dk_softc *dksc;
    939 
    940 	rs = raidPtr->softc;
    941 	dksc = &rs->sc_dksc;
    942 
    943 	dk_done(dksc, bp);
    944 
    945 	rf_lock_mutex2(raidPtr->mutex);
    946 	raidPtr->openings++;
    947 	rf_unlock_mutex2(raidPtr->mutex);
    948 
    949 	/* schedule more IO */
    950 	raid_wakeup(raidPtr);
    951 }
    952 
    953 /* ARGSUSED */
    954 static int
    955 raidread(dev_t dev, struct uio *uio, int flags)
    956 {
    957 	int     unit = raidunit(dev);
    958 	struct raid_softc *rs;
    959 
    960 	if ((rs = raidget(unit, false)) == NULL)
    961 		return ENXIO;
    962 
    963 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    964 		return (ENXIO);
    965 
    966 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    967 
    968 }
    969 
    970 /* ARGSUSED */
    971 static int
    972 raidwrite(dev_t dev, struct uio *uio, int flags)
    973 {
    974 	int     unit = raidunit(dev);
    975 	struct raid_softc *rs;
    976 
    977 	if ((rs = raidget(unit, false)) == NULL)
    978 		return ENXIO;
    979 
    980 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    981 		return (ENXIO);
    982 
    983 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    984 
    985 }
    986 
/*
 * Tear down a configured RAID set and detach its disk.  Returns
 * EBUSY if the unit is still open or a background operation is
 * running, 0 on success or if nothing was configured.
 * NOTE(review): the name suggests the raid softc lock is not held
 * here and locking is the caller's job -- confirm at call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/*
	 * Refuse while the device is open or while reconstruction,
	 * parity rewrite or copyback is still in progress.
	 */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1024 
   1025 static int
   1026 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1027 {
   1028 	int     unit = raidunit(dev);
   1029 	int     error = 0;
   1030 	int     part, pmask;
   1031 	struct raid_softc *rs;
   1032 	struct dk_softc *dksc;
   1033 	RF_Config_t *k_cfg, *u_cfg;
   1034 	RF_Raid_t *raidPtr;
   1035 	RF_RaidDisk_t *diskPtr;
   1036 	RF_AccTotals_t *totals;
   1037 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1038 	u_char *specific_buf;
   1039 	int retcode = 0;
   1040 	int column;
   1041 /*	int raidid; */
   1042 	struct rf_recon_req *rrcopy, *rr;
   1043 	RF_ComponentLabel_t *clabel;
   1044 	RF_ComponentLabel_t *ci_label;
   1045 	RF_ComponentLabel_t **clabel_ptr;
   1046 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1047 	RF_SingleComponent_t component;
   1048 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1049 	int i, j, d;
   1050 
   1051 	if ((rs = raidget(unit, false)) == NULL)
   1052 		return ENXIO;
   1053 	dksc = &rs->sc_dksc;
   1054 	raidPtr = &rs->sc_r;
   1055 
   1056 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1057 		(int) DISKPART(dev), (int) unit, cmd));
   1058 
   1059 	/* Must be initialized for these... */
   1060 	switch (cmd) {
   1061 	case RAIDFRAME_REWRITEPARITY:
   1062 	case RAIDFRAME_GET_INFO:
   1063 	case RAIDFRAME_RESET_ACCTOTALS:
   1064 	case RAIDFRAME_GET_ACCTOTALS:
   1065 	case RAIDFRAME_KEEP_ACCTOTALS:
   1066 	case RAIDFRAME_GET_SIZE:
   1067 	case RAIDFRAME_FAIL_DISK:
   1068 	case RAIDFRAME_COPYBACK:
   1069 	case RAIDFRAME_CHECK_RECON_STATUS:
   1070 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1071 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1072 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1073 	case RAIDFRAME_ADD_HOT_SPARE:
   1074 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1075 	case RAIDFRAME_INIT_LABELS:
   1076 	case RAIDFRAME_REBUILD_IN_PLACE:
   1077 	case RAIDFRAME_CHECK_PARITY:
   1078 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1079 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1080 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1081 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1082 	case RAIDFRAME_SET_AUTOCONFIG:
   1083 	case RAIDFRAME_SET_ROOT:
   1084 	case RAIDFRAME_DELETE_COMPONENT:
   1085 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1086 	case RAIDFRAME_PARITYMAP_STATUS:
   1087 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1088 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1089 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1090 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1091 			return (ENXIO);
   1092 	}
   1093 
   1094 	switch (cmd) {
   1095 #ifdef COMPAT_50
   1096 	case RAIDFRAME_GET_INFO50:
   1097 		return rf_get_info50(raidPtr, data);
   1098 
   1099 	case RAIDFRAME_CONFIGURE50:
   1100 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1101 			return retcode;
   1102 		goto config;
   1103 #endif
   1104 		/* configure the system */
   1105 	case RAIDFRAME_CONFIGURE:
   1106 
   1107 		if (raidPtr->valid) {
   1108 			/* There is a valid RAID set running on this unit! */
   1109 			printf("raid%d: Device already configured!\n",unit);
   1110 			return(EINVAL);
   1111 		}
   1112 
   1113 		/* copy-in the configuration information */
   1114 		/* data points to a pointer to the configuration structure */
   1115 
   1116 		u_cfg = *((RF_Config_t **) data);
   1117 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1118 		if (k_cfg == NULL) {
   1119 			return (ENOMEM);
   1120 		}
   1121 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1122 		if (retcode) {
   1123 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1124 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1125 				retcode));
   1126 			goto no_config;
   1127 		}
   1128 		goto config;
   1129 	config:
   1130 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1131 
   1132 		/* allocate a buffer for the layout-specific data, and copy it
   1133 		 * in */
   1134 		if (k_cfg->layoutSpecificSize) {
   1135 			if (k_cfg->layoutSpecificSize > 10000) {
   1136 				/* sanity check */
   1137 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1138 				retcode = EINVAL;
   1139 				goto no_config;
   1140 			}
   1141 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1142 			    (u_char *));
   1143 			if (specific_buf == NULL) {
   1144 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1145 				retcode = ENOMEM;
   1146 				goto no_config;
   1147 			}
   1148 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1149 			    k_cfg->layoutSpecificSize);
   1150 			if (retcode) {
   1151 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1152 				RF_Free(specific_buf,
   1153 					k_cfg->layoutSpecificSize);
   1154 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1155 					retcode));
   1156 				goto no_config;
   1157 			}
   1158 		} else
   1159 			specific_buf = NULL;
   1160 		k_cfg->layoutSpecific = specific_buf;
   1161 
   1162 		/* should do some kind of sanity check on the configuration.
   1163 		 * Store the sum of all the bytes in the last byte? */
   1164 
   1165 		/* configure the system */
   1166 
   1167 		/*
   1168 		 * Clear the entire RAID descriptor, just to make sure
   1169 		 *  there is no stale data left in the case of a
   1170 		 *  reconfiguration
   1171 		 */
   1172 		memset(raidPtr, 0, sizeof(*raidPtr));
   1173 		raidPtr->softc = rs;
   1174 		raidPtr->raidid = unit;
   1175 
   1176 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1177 
   1178 		if (retcode == 0) {
   1179 
   1180 			/* allow this many simultaneous IO's to
   1181 			   this RAID device */
   1182 			raidPtr->openings = RAIDOUTSTANDING;
   1183 
   1184 			raidinit(rs);
   1185 			raid_wakeup(raidPtr);
   1186 			rf_markalldirty(raidPtr);
   1187 		}
   1188 		/* free the buffers.  No return code here. */
   1189 		if (k_cfg->layoutSpecificSize) {
   1190 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1191 		}
   1192 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1193 
   1194 	no_config:
   1195 		/*
   1196 		 * If configuration failed, set sc_flags so that we
   1197 		 * will detach the device when we close it.
   1198 		 */
   1199 		if (retcode != 0)
   1200 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1201 		return (retcode);
   1202 
   1203 		/* shutdown the system */
   1204 	case RAIDFRAME_SHUTDOWN:
   1205 
   1206 		part = DISKPART(dev);
   1207 		pmask = (1 << part);
   1208 
   1209 		if ((error = raidlock(rs)) != 0)
   1210 			return (error);
   1211 
   1212 		if (DK_BUSY(dksc, pmask) ||
   1213 		    raidPtr->recon_in_progress != 0 ||
   1214 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1215 		    raidPtr->copyback_in_progress != 0)
   1216 			retcode = EBUSY;
   1217 		else {
   1218 			/* detach and free on close */
   1219 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1220 			retcode = 0;
   1221 		}
   1222 
   1223 		raidunlock(rs);
   1224 
   1225 		return (retcode);
   1226 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1227 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1228 		/* need to read the component label for the disk indicated
   1229 		   by row,column in clabel */
   1230 
   1231 		/*
   1232 		 * Perhaps there should be an option to skip the in-core
   1233 		 * copy and hit the disk, as with disklabel(8).
   1234 		 */
   1235 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1236 
   1237 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1238 
   1239 		if (retcode) {
   1240 			RF_Free(clabel, sizeof(*clabel));
   1241 			return retcode;
   1242 		}
   1243 
   1244 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1245 
   1246 		column = clabel->column;
   1247 
   1248 		if ((column < 0) || (column >= raidPtr->numCol +
   1249 		    raidPtr->numSpare)) {
   1250 			RF_Free(clabel, sizeof(*clabel));
   1251 			return EINVAL;
   1252 		}
   1253 
   1254 		RF_Free(clabel, sizeof(*clabel));
   1255 
   1256 		clabel = raidget_component_label(raidPtr, column);
   1257 
   1258 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1259 
   1260 #if 0
   1261 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1262 		clabel = (RF_ComponentLabel_t *) data;
   1263 
   1264 		/* XXX check the label for valid stuff... */
   1265 		/* Note that some things *should not* get modified --
   1266 		   the user should be re-initing the labels instead of
   1267 		   trying to patch things.
   1268 		   */
   1269 
   1270 		raidid = raidPtr->raidid;
   1271 #ifdef DEBUG
   1272 		printf("raid%d: Got component label:\n", raidid);
   1273 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1274 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1275 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1276 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1277 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1278 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1279 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1280 #endif
   1281 		clabel->row = 0;
   1282 		column = clabel->column;
   1283 
   1284 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1285 			return(EINVAL);
   1286 		}
   1287 
   1288 		/* XXX this isn't allowed to do anything for now :-) */
   1289 
   1290 		/* XXX and before it is, we need to fill in the rest
   1291 		   of the fields!?!?!?! */
   1292 		memcpy(raidget_component_label(raidPtr, column),
   1293 		    clabel, sizeof(*clabel));
   1294 		raidflush_component_label(raidPtr, column);
   1295 		return (0);
   1296 #endif
   1297 
   1298 	case RAIDFRAME_INIT_LABELS:
   1299 		clabel = (RF_ComponentLabel_t *) data;
   1300 		/*
   1301 		   we only want the serial number from
   1302 		   the above.  We get all the rest of the information
   1303 		   from the config that was used to create this RAID
   1304 		   set.
   1305 		   */
   1306 
   1307 		raidPtr->serial_number = clabel->serial_number;
   1308 
   1309 		for(column=0;column<raidPtr->numCol;column++) {
   1310 			diskPtr = &raidPtr->Disks[column];
   1311 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1312 				ci_label = raidget_component_label(raidPtr,
   1313 				    column);
   1314 				/* Zeroing this is important. */
   1315 				memset(ci_label, 0, sizeof(*ci_label));
   1316 				raid_init_component_label(raidPtr, ci_label);
   1317 				ci_label->serial_number =
   1318 				    raidPtr->serial_number;
   1319 				ci_label->row = 0; /* we dont' pretend to support more */
   1320 				rf_component_label_set_partitionsize(ci_label,
   1321 				    diskPtr->partitionSize);
   1322 				ci_label->column = column;
   1323 				raidflush_component_label(raidPtr, column);
   1324 			}
   1325 			/* XXXjld what about the spares? */
   1326 		}
   1327 
   1328 		return (retcode);
   1329 	case RAIDFRAME_SET_AUTOCONFIG:
   1330 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1331 		printf("raid%d: New autoconfig value is: %d\n",
   1332 		       raidPtr->raidid, d);
   1333 		*(int *) data = d;
   1334 		return (retcode);
   1335 
   1336 	case RAIDFRAME_SET_ROOT:
   1337 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1338 		printf("raid%d: New rootpartition value is: %d\n",
   1339 		       raidPtr->raidid, d);
   1340 		*(int *) data = d;
   1341 		return (retcode);
   1342 
   1343 		/* initialize all parity */
   1344 	case RAIDFRAME_REWRITEPARITY:
   1345 
   1346 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1347 			/* Parity for RAID 0 is trivially correct */
   1348 			raidPtr->parity_good = RF_RAID_CLEAN;
   1349 			return(0);
   1350 		}
   1351 
   1352 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1353 			/* Re-write is already in progress! */
   1354 			return(EINVAL);
   1355 		}
   1356 
   1357 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1358 					   rf_RewriteParityThread,
   1359 					   raidPtr,"raid_parity");
   1360 		return (retcode);
   1361 
   1362 
   1363 	case RAIDFRAME_ADD_HOT_SPARE:
   1364 		sparePtr = (RF_SingleComponent_t *) data;
   1365 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1366 		retcode = rf_add_hot_spare(raidPtr, &component);
   1367 		return(retcode);
   1368 
   1369 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1370 		return(retcode);
   1371 
   1372 	case RAIDFRAME_DELETE_COMPONENT:
   1373 		componentPtr = (RF_SingleComponent_t *)data;
   1374 		memcpy( &component, componentPtr,
   1375 			sizeof(RF_SingleComponent_t));
   1376 		retcode = rf_delete_component(raidPtr, &component);
   1377 		return(retcode);
   1378 
   1379 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1380 		componentPtr = (RF_SingleComponent_t *)data;
   1381 		memcpy( &component, componentPtr,
   1382 			sizeof(RF_SingleComponent_t));
   1383 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1384 		return(retcode);
   1385 
   1386 	case RAIDFRAME_REBUILD_IN_PLACE:
   1387 
   1388 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1389 			/* Can't do this on a RAID 0!! */
   1390 			return(EINVAL);
   1391 		}
   1392 
   1393 		if (raidPtr->recon_in_progress == 1) {
   1394 			/* a reconstruct is already in progress! */
   1395 			return(EINVAL);
   1396 		}
   1397 
   1398 		componentPtr = (RF_SingleComponent_t *) data;
   1399 		memcpy( &component, componentPtr,
   1400 			sizeof(RF_SingleComponent_t));
   1401 		component.row = 0; /* we don't support any more */
   1402 		column = component.column;
   1403 
   1404 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1405 			return(EINVAL);
   1406 		}
   1407 
   1408 		rf_lock_mutex2(raidPtr->mutex);
   1409 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1410 		    (raidPtr->numFailures > 0)) {
   1411 			/* XXX 0 above shouldn't be constant!!! */
   1412 			/* some component other than this has failed.
   1413 			   Let's not make things worse than they already
   1414 			   are... */
   1415 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1416 			       raidPtr->raidid);
   1417 			printf("raid%d:     Col: %d   Too many failures.\n",
   1418 			       raidPtr->raidid, column);
   1419 			rf_unlock_mutex2(raidPtr->mutex);
   1420 			return (EINVAL);
   1421 		}
   1422 		if (raidPtr->Disks[column].status ==
   1423 		    rf_ds_reconstructing) {
   1424 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1425 			       raidPtr->raidid);
   1426 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1427 
   1428 			rf_unlock_mutex2(raidPtr->mutex);
   1429 			return (EINVAL);
   1430 		}
   1431 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1432 			rf_unlock_mutex2(raidPtr->mutex);
   1433 			return (EINVAL);
   1434 		}
   1435 		rf_unlock_mutex2(raidPtr->mutex);
   1436 
   1437 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1438 		if (rrcopy == NULL)
   1439 			return(ENOMEM);
   1440 
   1441 		rrcopy->raidPtr = (void *) raidPtr;
   1442 		rrcopy->col = column;
   1443 
   1444 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1445 					   rf_ReconstructInPlaceThread,
   1446 					   rrcopy,"raid_reconip");
   1447 		return(retcode);
   1448 
   1449 	case RAIDFRAME_GET_INFO:
   1450 		if (!raidPtr->valid)
   1451 			return (ENODEV);
   1452 		ucfgp = (RF_DeviceConfig_t **) data;
   1453 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1454 			  (RF_DeviceConfig_t *));
   1455 		if (d_cfg == NULL)
   1456 			return (ENOMEM);
   1457 		d_cfg->rows = 1; /* there is only 1 row now */
   1458 		d_cfg->cols = raidPtr->numCol;
   1459 		d_cfg->ndevs = raidPtr->numCol;
   1460 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1461 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1462 			return (ENOMEM);
   1463 		}
   1464 		d_cfg->nspares = raidPtr->numSpare;
   1465 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1466 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1467 			return (ENOMEM);
   1468 		}
   1469 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1470 		d = 0;
   1471 		for (j = 0; j < d_cfg->cols; j++) {
   1472 			d_cfg->devs[d] = raidPtr->Disks[j];
   1473 			d++;
   1474 		}
   1475 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1476 			d_cfg->spares[i] = raidPtr->Disks[j];
   1477 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1478 				/* XXX: raidctl(8) expects to see this as a used spare */
   1479 				d_cfg->spares[i].status = rf_ds_used_spare;
   1480 			}
   1481 		}
   1482 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1483 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1484 
   1485 		return (retcode);
   1486 
   1487 	case RAIDFRAME_CHECK_PARITY:
   1488 		*(int *) data = raidPtr->parity_good;
   1489 		return (0);
   1490 
   1491 	case RAIDFRAME_PARITYMAP_STATUS:
   1492 		if (rf_paritymap_ineligible(raidPtr))
   1493 			return EINVAL;
   1494 		rf_paritymap_status(raidPtr->parity_map,
   1495 		    (struct rf_pmstat *)data);
   1496 		return 0;
   1497 
   1498 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1499 		if (rf_paritymap_ineligible(raidPtr))
   1500 			return EINVAL;
   1501 		if (raidPtr->parity_map == NULL)
   1502 			return ENOENT; /* ??? */
   1503 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1504 			(struct rf_pmparams *)data, 1))
   1505 			return EINVAL;
   1506 		return 0;
   1507 
   1508 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1509 		if (rf_paritymap_ineligible(raidPtr))
   1510 			return EINVAL;
   1511 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1512 		return 0;
   1513 
   1514 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1515 		if (rf_paritymap_ineligible(raidPtr))
   1516 			return EINVAL;
   1517 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1518 		/* XXX should errors be passed up? */
   1519 		return 0;
   1520 
   1521 	case RAIDFRAME_RESET_ACCTOTALS:
   1522 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1523 		return (0);
   1524 
   1525 	case RAIDFRAME_GET_ACCTOTALS:
   1526 		totals = (RF_AccTotals_t *) data;
   1527 		*totals = raidPtr->acc_totals;
   1528 		return (0);
   1529 
   1530 	case RAIDFRAME_KEEP_ACCTOTALS:
   1531 		raidPtr->keep_acc_totals = *(int *)data;
   1532 		return (0);
   1533 
   1534 	case RAIDFRAME_GET_SIZE:
   1535 		*(int *) data = raidPtr->totalSectors;
   1536 		return (0);
   1537 
   1538 		/* fail a disk & optionally start reconstruction */
   1539 	case RAIDFRAME_FAIL_DISK:
   1540 
   1541 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1542 			/* Can't do this on a RAID 0!! */
   1543 			return(EINVAL);
   1544 		}
   1545 
   1546 		rr = (struct rf_recon_req *) data;
   1547 		rr->row = 0;
   1548 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1549 			return (EINVAL);
   1550 
   1551 
   1552 		rf_lock_mutex2(raidPtr->mutex);
   1553 		if (raidPtr->status == rf_rs_reconstructing) {
   1554 			/* you can't fail a disk while we're reconstructing! */
   1555 			/* XXX wrong for RAID6 */
   1556 			rf_unlock_mutex2(raidPtr->mutex);
   1557 			return (EINVAL);
   1558 		}
   1559 		if ((raidPtr->Disks[rr->col].status ==
   1560 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1561 			/* some other component has failed.  Let's not make
   1562 			   things worse. XXX wrong for RAID6 */
   1563 			rf_unlock_mutex2(raidPtr->mutex);
   1564 			return (EINVAL);
   1565 		}
   1566 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1567 			/* Can't fail a spared disk! */
   1568 			rf_unlock_mutex2(raidPtr->mutex);
   1569 			return (EINVAL);
   1570 		}
   1571 		rf_unlock_mutex2(raidPtr->mutex);
   1572 
   1573 		/* make a copy of the recon request so that we don't rely on
   1574 		 * the user's buffer */
   1575 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1576 		if (rrcopy == NULL)
   1577 			return(ENOMEM);
   1578 		memcpy(rrcopy, rr, sizeof(*rr));
   1579 		rrcopy->raidPtr = (void *) raidPtr;
   1580 
   1581 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1582 					   rf_ReconThread,
   1583 					   rrcopy,"raid_recon");
   1584 		return (0);
   1585 
   1586 		/* invoke a copyback operation after recon on whatever disk
   1587 		 * needs it, if any */
   1588 	case RAIDFRAME_COPYBACK:
   1589 
   1590 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1591 			/* This makes no sense on a RAID 0!! */
   1592 			return(EINVAL);
   1593 		}
   1594 
   1595 		if (raidPtr->copyback_in_progress == 1) {
   1596 			/* Copyback is already in progress! */
   1597 			return(EINVAL);
   1598 		}
   1599 
   1600 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1601 					   rf_CopybackThread,
   1602 					   raidPtr,"raid_copyback");
   1603 		return (retcode);
   1604 
   1605 		/* return the percentage completion of reconstruction */
   1606 	case RAIDFRAME_CHECK_RECON_STATUS:
   1607 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1608 			/* This makes no sense on a RAID 0, so tell the
   1609 			   user it's done. */
   1610 			*(int *) data = 100;
   1611 			return(0);
   1612 		}
   1613 		if (raidPtr->status != rf_rs_reconstructing)
   1614 			*(int *) data = 100;
   1615 		else {
   1616 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1617 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1618 			} else {
   1619 				*(int *) data = 0;
   1620 			}
   1621 		}
   1622 		return (0);
   1623 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1624 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1625 		if (raidPtr->status != rf_rs_reconstructing) {
   1626 			progressInfo.remaining = 0;
   1627 			progressInfo.completed = 100;
   1628 			progressInfo.total = 100;
   1629 		} else {
   1630 			progressInfo.total =
   1631 				raidPtr->reconControl->numRUsTotal;
   1632 			progressInfo.completed =
   1633 				raidPtr->reconControl->numRUsComplete;
   1634 			progressInfo.remaining = progressInfo.total -
   1635 				progressInfo.completed;
   1636 		}
   1637 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1638 				  sizeof(RF_ProgressInfo_t));
   1639 		return (retcode);
   1640 
   1641 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1642 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1643 			/* This makes no sense on a RAID 0, so tell the
   1644 			   user it's done. */
   1645 			*(int *) data = 100;
   1646 			return(0);
   1647 		}
   1648 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1649 			*(int *) data = 100 *
   1650 				raidPtr->parity_rewrite_stripes_done /
   1651 				raidPtr->Layout.numStripe;
   1652 		} else {
   1653 			*(int *) data = 100;
   1654 		}
   1655 		return (0);
   1656 
   1657 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1658 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1659 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1660 			progressInfo.total = raidPtr->Layout.numStripe;
   1661 			progressInfo.completed =
   1662 				raidPtr->parity_rewrite_stripes_done;
   1663 			progressInfo.remaining = progressInfo.total -
   1664 				progressInfo.completed;
   1665 		} else {
   1666 			progressInfo.remaining = 0;
   1667 			progressInfo.completed = 100;
   1668 			progressInfo.total = 100;
   1669 		}
   1670 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1671 				  sizeof(RF_ProgressInfo_t));
   1672 		return (retcode);
   1673 
   1674 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1675 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1676 			/* This makes no sense on a RAID 0 */
   1677 			*(int *) data = 100;
   1678 			return(0);
   1679 		}
   1680 		if (raidPtr->copyback_in_progress == 1) {
   1681 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1682 				raidPtr->Layout.numStripe;
   1683 		} else {
   1684 			*(int *) data = 100;
   1685 		}
   1686 		return (0);
   1687 
   1688 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1689 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1690 		if (raidPtr->copyback_in_progress == 1) {
   1691 			progressInfo.total = raidPtr->Layout.numStripe;
   1692 			progressInfo.completed =
   1693 				raidPtr->copyback_stripes_done;
   1694 			progressInfo.remaining = progressInfo.total -
   1695 				progressInfo.completed;
   1696 		} else {
   1697 			progressInfo.remaining = 0;
   1698 			progressInfo.completed = 100;
   1699 			progressInfo.total = 100;
   1700 		}
   1701 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1702 				  sizeof(RF_ProgressInfo_t));
   1703 		return (retcode);
   1704 
   1705 	case RAIDFRAME_SET_LAST_UNIT:
   1706 		for (column = 0; column < raidPtr->numCol; column++)
   1707 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1708 				return EBUSY;
   1709 
   1710 		for (column = 0; column < raidPtr->numCol; column++) {
   1711 			clabel = raidget_component_label(raidPtr, column);
   1712 			clabel->last_unit = *(int *)data;
   1713 			raidflush_component_label(raidPtr, column);
   1714 		}
   1715 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1716 		return 0;
   1717 
   1718 		/* the sparetable daemon calls this to wait for the kernel to
   1719 		 * need a spare table. this ioctl does not return until a
   1720 		 * spare table is needed. XXX -- calling mpsleep here in the
   1721 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1722 		 * -- I should either compute the spare table in the kernel,
   1723 		 * or have a different -- XXX XXX -- interface (a different
   1724 		 * character device) for delivering the table     -- XXX */
   1725 #if 0
   1726 	case RAIDFRAME_SPARET_WAIT:
   1727 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1728 		while (!rf_sparet_wait_queue)
   1729 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1730 		waitreq = rf_sparet_wait_queue;
   1731 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1732 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1733 
   1734 		/* structure assignment */
   1735 		*((RF_SparetWait_t *) data) = *waitreq;
   1736 
   1737 		RF_Free(waitreq, sizeof(*waitreq));
   1738 		return (0);
   1739 
   1740 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1741 		 * code in it that will cause the dameon to exit */
   1742 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1743 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1744 		waitreq->fcol = -1;
   1745 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1746 		waitreq->next = rf_sparet_wait_queue;
   1747 		rf_sparet_wait_queue = waitreq;
   1748 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1749 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1750 		return (0);
   1751 
   1752 		/* used by the spare table daemon to deliver a spare table
   1753 		 * into the kernel */
   1754 	case RAIDFRAME_SEND_SPARET:
   1755 
   1756 		/* install the spare table */
   1757 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1758 
   1759 		/* respond to the requestor.  the return status of the spare
   1760 		 * table installation is passed in the "fcol" field */
   1761 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1762 		waitreq->fcol = retcode;
   1763 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1764 		waitreq->next = rf_sparet_resp_queue;
   1765 		rf_sparet_resp_queue = waitreq;
   1766 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1767 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1768 
   1769 		return (retcode);
   1770 #endif
   1771 
   1772 	default:
   1773 		break; /* fall through to the os-specific code below */
   1774 
   1775 	}
   1776 
   1777 	if (!raidPtr->valid)
   1778 		return (EINVAL);
   1779 
   1780 	/*
   1781 	 * Add support for "regular" device ioctls here.
   1782 	 */
   1783 
   1784 	switch (cmd) {
   1785 	case DIOCGCACHE:
   1786 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1787 		break;
   1788 
   1789 	case DIOCCACHESYNC:
   1790 		retcode = rf_sync_component_caches(raidPtr);
   1791 		break;
   1792 
   1793 	default:
   1794 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1795 		break;
   1796 	}
   1797 
   1798 	return (retcode);
   1799 
   1800 }
   1801 
   1802 
   1803 /* raidinit -- complete the rest of the initialization for the
   1804    RAIDframe device.  */
   1805 
   1806 
   1807 static void
   1808 raidinit(struct raid_softc *rs)
   1809 {
   1810 	cfdata_t cf;
   1811 	unsigned int unit;
   1812 	struct dk_softc *dksc = &rs->sc_dksc;
   1813 	RF_Raid_t *raidPtr = &rs->sc_r;
   1814 	device_t dev;
   1815 
   1816 	unit = raidPtr->raidid;
   1817 
   1818 	/* XXX doesn't check bounds. */
   1819 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1820 
   1821 	/* attach the pseudo device */
   1822 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1823 	cf->cf_name = raid_cd.cd_name;
   1824 	cf->cf_atname = raid_cd.cd_name;
   1825 	cf->cf_unit = unit;
   1826 	cf->cf_fstate = FSTATE_STAR;
   1827 
   1828 	dev = config_attach_pseudo(cf);
   1829 	if (dev == NULL) {
   1830 		printf("raid%d: config_attach_pseudo failed\n",
   1831 		    raidPtr->raidid);
   1832 		free(cf, M_RAIDFRAME);
   1833 		return;
   1834 	}
   1835 
   1836 	/* provide a backpointer to the real softc */
   1837 	raidsoftc(dev) = rs;
   1838 
   1839 	/* disk_attach actually creates space for the CPU disklabel, among
   1840 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1841 	 * with disklabels. */
   1842 	dk_init(dksc, dev, DKTYPE_RAID);
   1843 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1844 
   1845 	/* XXX There may be a weird interaction here between this, and
   1846 	 * protectedSectors, as used in RAIDframe.  */
   1847 
   1848 	rs->sc_size = raidPtr->totalSectors;
   1849 
   1850 	/* Attach dk and disk subsystems */
   1851 	dk_attach(dksc);
   1852 	disk_attach(&dksc->sc_dkdev);
   1853 	rf_set_geometry(rs, raidPtr);
   1854 
   1855 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1856 
   1857 	/* mark unit as usuable */
   1858 	rs->sc_flags |= RAIDF_INITED;
   1859 
   1860 	dkwedge_discover(&dksc->sc_dkdev);
   1861 }
   1862 
   1863 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1864 /* wake up the daemon & tell it to get us a spare table
   1865  * XXX
   1866  * the entries in the queues should be tagged with the raidPtr
   1867  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1869  * XXX
   1870  *
   1871  * XXX This code is not currently used. GO
   1872  */
/*
 * Hand "req" to the spare-table daemon by prepending it to
 * rf_sparet_wait_queue, then block until a response shows up on
 * rf_sparet_resp_queue.  Returns the status the daemon delivered in
 * the response's "fcol" field (set from the RAIDFRAME_SEND_SPARET
 * ioctl's rf_SetSpareTable() return value).
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue the request and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 drops rf_sparet_wait_mutex while sleeping and
	 * reacquires it before returning -- presumably standard condvar
	 * semantics; confirm against the rf_wait_cond2 implementation. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1896 #endif
   1897 
   1898 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1899  * bp & passes it down.
   1900  * any calls originating in the kernel must use non-blocking I/O
   1901  * do some extra sanity checking to return "appropriate" error values for
   1902  * certain conditions (to make some standard utilities work)
   1903  *
   1904  * Formerly known as: rf_DoAccessKernel
   1905  */
   1906 void
   1907 raidstart(RF_Raid_t *raidPtr)
   1908 {
   1909 	struct raid_softc *rs;
   1910 	struct dk_softc *dksc;
   1911 
   1912 	rs = raidPtr->softc;
   1913 	dksc = &rs->sc_dksc;
   1914 	/* quick check to see if anything has died recently */
   1915 	rf_lock_mutex2(raidPtr->mutex);
   1916 	if (raidPtr->numNewFailures > 0) {
   1917 		rf_unlock_mutex2(raidPtr->mutex);
   1918 		rf_update_component_labels(raidPtr,
   1919 					   RF_NORMAL_COMPONENT_UPDATE);
   1920 		rf_lock_mutex2(raidPtr->mutex);
   1921 		raidPtr->numNewFailures--;
   1922 	}
   1923 	rf_unlock_mutex2(raidPtr->mutex);
   1924 
   1925 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1926 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1927 		return;
   1928 	}
   1929 
   1930 	dk_start(dksc, NULL);
   1931 }
   1932 
/*
 * Translate a struct buf into a RAIDframe access and submit it via
 * rf_DoAccess().  Returns 0 on successful (asynchronous) submission,
 * EAGAIN when the array has no openings left, ENOSPC when the request
 * falls outside the array or is not sector-aligned, or whatever
 * rf_DoAccess() returns.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* Bail out early if the array has no openings.  NOTE(review):
	 * the check here and the decrement further down are separate
	 * critical sections, so openings could change in between --
	 * presumably serialized by the caller; confirm. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Convert the byte count into whole sectors; pb flags a
	 * partial trailing sector. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" makes this block unconditional --
	 * looks like debug leftover; db1_printf has its own gating. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject accesses past the end of the array; the "sum <" terms
	 * catch wraparound of the unsigned addition. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject requests that are not a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2005 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one RAIDframe disk-queue request to the underlying
 * component: NOPs are completed immediately via KernelWakeupFunc(),
 * reads and writes are turned into a struct buf and handed to
 * bdev_strategy().  Always returns 0; unknown request types panic.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal
		 * completion path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Point bp at the component and arrange for
		 * KernelWakeupFunc() to run on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.
 */
/*
 * Completion handler for component I/O issued by rf_DispatchKernelIO().
 * Records the error (failing the component if it is still usable and
 * the set can tolerate another failure), queues the request on the
 * raid's iodone list, and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* bp->b_private was set to the originating request in
	 * rf_DispatchKernelIO()/InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the physical-I/O time against this access's trace
	 * record. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is consumed by raidstart() to
			 * trigger a component-label update. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2150 
   2151 
   2152 /*
   2153  * initialize a buf structure for doing an I/O in the kernel.
   2154  */
   2155 static void
   2156 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2157        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2158        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2159        struct proc *b_proc)
   2160 {
   2161 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2162 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2163 	bp->b_oflags = 0;
   2164 	bp->b_cflags = 0;
   2165 	bp->b_bcount = numSect << logBytesPerSector;
   2166 	bp->b_bufsize = bp->b_bcount;
   2167 	bp->b_error = 0;
   2168 	bp->b_dev = dev;
   2169 	bp->b_data = bf;
   2170 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2171 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2172 	if (bp->b_bcount == 0) {
   2173 		panic("bp->b_bcount is zero in InitBP!!");
   2174 	}
   2175 	bp->b_proc = b_proc;
   2176 	bp->b_iodone = cbFunc;
   2177 	bp->b_private = cbArg;
   2178 }
   2179 
   2180 /*
   2181  * Wait interruptibly for an exclusive lock.
   2182  *
   2183  * XXX
   2184  * Several drivers do this; it should be abstracted and made MP-safe.
   2185  * (Hmm... where have we seen this warning before :->  GO )
   2186  */
   2187 static int
   2188 raidlock(struct raid_softc *rs)
   2189 {
   2190 	int     error;
   2191 
   2192 	error = 0;
   2193 	mutex_enter(&rs->sc_mutex);
   2194 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2195 		rs->sc_flags |= RAIDF_WANTED;
   2196 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2197 		if (error != 0)
   2198 			goto done;
   2199 	}
   2200 	rs->sc_flags |= RAIDF_LOCKED;
   2201 done:
   2202 	mutex_exit(&rs->sc_mutex);
   2203 	return (error);
   2204 }
   2205 /*
   2206  * Unlock and wake up any waiters.
   2207  */
   2208 static void
   2209 raidunlock(struct raid_softc *rs)
   2210 {
   2211 
   2212 	mutex_enter(&rs->sc_mutex);
   2213 	rs->sc_flags &= ~RAIDF_LOCKED;
   2214 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2215 		rs->sc_flags &= ~RAIDF_WANTED;
   2216 		cv_broadcast(&rs->sc_cv);
   2217 	}
   2218 	mutex_exit(&rs->sc_mutex);
   2219 }
   2220 
   2221 
   2222 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2223 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2224 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2225 
/* Byte offset of the component info (label) area from the start of
 * the component; currently a fixed RF_COMPONENT_INFO_OFFSET. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2232 
   2233 static daddr_t
   2234 rf_component_info_size(unsigned secsize)
   2235 {
   2236 	daddr_t info_size;
   2237 
   2238 	KASSERT(secsize);
   2239 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2240 		info_size = secsize;
   2241 	else
   2242 		info_size = RF_COMPONENT_INFO_SIZE;
   2243 
   2244 	return info_size;
   2245 }
   2246 
   2247 static daddr_t
   2248 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2249 {
   2250 	daddr_t map_offset;
   2251 
   2252 	KASSERT(raidPtr->bytesPerSector);
   2253 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2254 		map_offset = raidPtr->bytesPerSector;
   2255 	else
   2256 		map_offset = RF_COMPONENT_INFO_SIZE;
   2257 	map_offset += rf_component_info_offset();
   2258 
   2259 	return map_offset;
   2260 }
   2261 
   2262 static daddr_t
   2263 rf_parity_map_size(RF_Raid_t *raidPtr)
   2264 {
   2265 	daddr_t map_size;
   2266 
   2267 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2268 		map_size = raidPtr->bytesPerSector;
   2269 	else
   2270 		map_size = RF_PARITY_MAP_SIZE;
   2271 
   2272 	return map_size;
   2273 }
   2274 
   2275 int
   2276 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2277 {
   2278 	RF_ComponentLabel_t *clabel;
   2279 
   2280 	clabel = raidget_component_label(raidPtr, col);
   2281 	clabel->clean = RF_RAID_CLEAN;
   2282 	raidflush_component_label(raidPtr, col);
   2283 	return(0);
   2284 }
   2285 
   2286 
   2287 int
   2288 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2289 {
   2290 	RF_ComponentLabel_t *clabel;
   2291 
   2292 	clabel = raidget_component_label(raidPtr, col);
   2293 	clabel->clean = RF_RAID_DIRTY;
   2294 	raidflush_component_label(raidPtr, col);
   2295 	return(0);
   2296 }
   2297 
/*
 * Read the on-disk component label for column "col" into the in-core
 * copy at raid_cinfo[col].ci_label.  Returns 0 on success or the
 * error from raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2307 
/*
 * Return a pointer to the in-core component label for column "col";
 * callers modify it and push it to disk via
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2313 
   2314 int
   2315 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2316 {
   2317 	RF_ComponentLabel_t *label;
   2318 
   2319 	label = &raidPtr->raid_cinfo[col].ci_label;
   2320 	label->mod_counter = raidPtr->mod_counter;
   2321 #ifndef RF_NO_PARITY_MAP
   2322 	label->parity_map_modcount = label->mod_counter;
   2323 #endif
   2324 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2325 	    raidPtr->Disks[col].dev,
   2326 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2327 }
   2328 
   2329 
/*
 * Read a component label from (dev, b_vp) into *clabel.  The label
 * lives in the component info area; its on-disk size comes from
 * rf_component_info_size(secsize).
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2339 
   2340 /* ARGSUSED */
   2341 static int
   2342 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2343     size_t msize, daddr_t offset, daddr_t dsize)
   2344 {
   2345 	struct buf *bp;
   2346 	int error;
   2347 
   2348 	/* XXX should probably ensure that we don't try to do this if
   2349 	   someone has changed rf_protected_sectors. */
   2350 
   2351 	if (b_vp == NULL) {
   2352 		/* For whatever reason, this component is not valid.
   2353 		   Don't try to read a component label from it. */
   2354 		return(EINVAL);
   2355 	}
   2356 
   2357 	/* get a block of the appropriate size... */
   2358 	bp = geteblk((int)dsize);
   2359 	bp->b_dev = dev;
   2360 
   2361 	/* get our ducks in a row for the read */
   2362 	bp->b_blkno = offset / DEV_BSIZE;
   2363 	bp->b_bcount = dsize;
   2364 	bp->b_flags |= B_READ;
   2365  	bp->b_resid = dsize;
   2366 
   2367 	bdev_strategy(bp);
   2368 	error = biowait(bp);
   2369 
   2370 	if (!error) {
   2371 		memcpy(data, bp->b_data, msize);
   2372 	}
   2373 
   2374 	brelse(bp, 0);
   2375 	return(error);
   2376 }
   2377 
   2378 
   2379 static int
   2380 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2381     RF_ComponentLabel_t *clabel)
   2382 {
   2383 	return raidwrite_component_area(dev, b_vp, clabel,
   2384 	    sizeof(RF_ComponentLabel_t),
   2385 	    rf_component_info_offset(),
   2386 	    rf_component_info_size(secsize), 0);
   2387 }
   2388 
   2389 /* ARGSUSED */
   2390 static int
   2391 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2392     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2393 {
   2394 	struct buf *bp;
   2395 	int error;
   2396 
   2397 	/* get a block of the appropriate size... */
   2398 	bp = geteblk((int)dsize);
   2399 	bp->b_dev = dev;
   2400 
   2401 	/* get our ducks in a row for the write */
   2402 	bp->b_blkno = offset / DEV_BSIZE;
   2403 	bp->b_bcount = dsize;
   2404 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2405  	bp->b_resid = dsize;
   2406 
   2407 	memset(bp->b_data, 0, dsize);
   2408 	memcpy(bp->b_data, data, msize);
   2409 
   2410 	bdev_strategy(bp);
   2411 	if (asyncp)
   2412 		return 0;
   2413 	error = biowait(bp);
   2414 	brelse(bp, 0);
   2415 	if (error) {
   2416 #if 1
   2417 		printf("Failed to write RAID component info!\n");
   2418 #endif
   2419 	}
   2420 
   2421 	return(error);
   2422 }
   2423 
   2424 void
   2425 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2426 {
   2427 	int c;
   2428 
   2429 	for (c = 0; c < raidPtr->numCol; c++) {
   2430 		/* Skip dead disks. */
   2431 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2432 			continue;
   2433 		/* XXXjld: what if an error occurs here? */
   2434 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2435 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2436 		    RF_PARITYMAP_NBYTE,
   2437 		    rf_parity_map_offset(raidPtr),
   2438 		    rf_parity_map_size(raidPtr), 0);
   2439 	}
   2440 }
   2441 
   2442 void
   2443 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2444 {
   2445 	struct rf_paritymap_ondisk tmp;
   2446 	int c,first;
   2447 
   2448 	first=1;
   2449 	for (c = 0; c < raidPtr->numCol; c++) {
   2450 		/* Skip dead disks. */
   2451 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2452 			continue;
   2453 		raidread_component_area(raidPtr->Disks[c].dev,
   2454 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2455 		    RF_PARITYMAP_NBYTE,
   2456 		    rf_parity_map_offset(raidPtr),
   2457 		    rf_parity_map_size(raidPtr));
   2458 		if (first) {
   2459 			memcpy(map, &tmp, sizeof(*map));
   2460 			first = 0;
   2461 		} else {
   2462 			rf_paritymap_merge(map, &tmp);
   2463 		}
   2464 	}
   2465 }
   2466 
/*
 * Bump the modification counter and mark the component label of every
 * non-failed component, and every in-use spare, dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Spares live after the data columns in the Disks array. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for.
			 * NOTE(review): if no column matches, scol keeps
			 * its previous value (initially -1) -- confirm
			 * this can't happen for an rf_ds_used_spare. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2526 
   2527 
/*
 * Write fresh component labels (with a bumped mod_counter) to all
 * optimal components and in-use spares.  When "final" is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark the
 * labels clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Spares live after the data columns in the Disks array. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for.
			 * NOTE(review): if no column matches, scol keeps
			 * its previous value (initially -1) -- confirm
			 * this can't happen for an rf_ds_used_spare. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2605 
   2606 void
   2607 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2608 {
   2609 
   2610 	if (vp != NULL) {
   2611 		if (auto_configured == 1) {
   2612 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2613 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2614 			vput(vp);
   2615 
   2616 		} else {
   2617 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2618 		}
   2619 	}
   2620 }
   2621 
   2622 
   2623 void
   2624 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2625 {
   2626 	int r,c;
   2627 	struct vnode *vp;
   2628 	int acd;
   2629 
   2630 
   2631 	/* We take this opportunity to close the vnodes like we should.. */
   2632 
   2633 	for (c = 0; c < raidPtr->numCol; c++) {
   2634 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2635 		acd = raidPtr->Disks[c].auto_configured;
   2636 		rf_close_component(raidPtr, vp, acd);
   2637 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2638 		raidPtr->Disks[c].auto_configured = 0;
   2639 	}
   2640 
   2641 	for (r = 0; r < raidPtr->numSpare; r++) {
   2642 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2643 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2644 		rf_close_component(raidPtr, vp, acd);
   2645 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2646 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2647 	}
   2648 }
   2649 
   2650 
   2651 void
   2652 rf_ReconThread(struct rf_recon_req *req)
   2653 {
   2654 	int     s;
   2655 	RF_Raid_t *raidPtr;
   2656 
   2657 	s = splbio();
   2658 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2659 	raidPtr->recon_in_progress = 1;
   2660 
   2661 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2662 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2663 
   2664 	RF_Free(req, sizeof(*req));
   2665 
   2666 	raidPtr->recon_in_progress = 0;
   2667 	splx(s);
   2668 
   2669 	/* That's all... */
   2670 	kthread_exit(0);	/* does not return */
   2671 }
   2672 
   2673 void
   2674 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2675 {
   2676 	int retcode;
   2677 	int s;
   2678 
   2679 	raidPtr->parity_rewrite_stripes_done = 0;
   2680 	raidPtr->parity_rewrite_in_progress = 1;
   2681 	s = splbio();
   2682 	retcode = rf_RewriteParity(raidPtr);
   2683 	splx(s);
   2684 	if (retcode) {
   2685 		printf("raid%d: Error re-writing parity (%d)!\n",
   2686 		    raidPtr->raidid, retcode);
   2687 	} else {
   2688 		/* set the clean bit!  If we shutdown correctly,
   2689 		   the clean bit on each component label will get
   2690 		   set */
   2691 		raidPtr->parity_good = RF_RAID_CLEAN;
   2692 	}
   2693 	raidPtr->parity_rewrite_in_progress = 0;
   2694 
   2695 	/* Anyone waiting for us to stop?  If so, inform them... */
   2696 	if (raidPtr->waitShutdown) {
   2697 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2698 	}
   2699 
   2700 	/* That's all... */
   2701 	kthread_exit(0);	/* does not return */
   2702 }
   2703 
   2704 
   2705 void
   2706 rf_CopybackThread(RF_Raid_t *raidPtr)
   2707 {
   2708 	int s;
   2709 
   2710 	raidPtr->copyback_in_progress = 1;
   2711 	s = splbio();
   2712 	rf_CopybackReconstructedData(raidPtr);
   2713 	splx(s);
   2714 	raidPtr->copyback_in_progress = 0;
   2715 
   2716 	/* That's all... */
   2717 	kthread_exit(0);	/* does not return */
   2718 }
   2719 
   2720 
   2721 void
   2722 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2723 {
   2724 	int s;
   2725 	RF_Raid_t *raidPtr;
   2726 
   2727 	s = splbio();
   2728 	raidPtr = req->raidPtr;
   2729 	raidPtr->recon_in_progress = 1;
   2730 	rf_ReconstructInPlace(raidPtr, req->col);
   2731 	RF_Free(req, sizeof(*req));
   2732 	raidPtr->recon_in_progress = 0;
   2733 	splx(s);
   2734 
   2735 	/* That's all... */
   2736 	kthread_exit(0);	/* does not return */
   2737 }
   2738 
/*
 * Read and validate the component label on (dev, vp).  If the label
 * looks reasonable, prepend a new RF_AutoConfig_t describing it to
 * ac_list; otherwise close and release the vnode.  Returns the
 * (possibly extended) list, or NULL after freeing the entire list on
 * allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
		/* Out of memory: tear down everything collected so far.
		 * Also jumped to from below if the RF_AutoConfig_t
		 * allocation fails. */
oomem:
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* The new entry takes ownership of clabel and vp. */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and release the vnode we were
		 * handed, since no list entry took ownership of them. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2796 
/*
 * rf_find_raid_components:
 *	Scan every disk-class device in the system looking for RAIDframe
 *	components, and return a linked list of RF_AutoConfig_t entries
 *	describing every component label found (list built by
 *	rf_get_component()).
 *
 *	Each returned entry holds an open, referenced vnode for its
 *	component; the caller owns the list and is responsible for
 *	eventually closing those vnodes (see rf_release_all_vps()).
 *	Returns NULL if nothing was found (or on allocation failure
 *	inside rf_get_component()).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* NOTE(review): wedges are not partitioned, so the
			   plain minor is used for them instead of RAW_PART */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: the wedge is a RAID component
				   iff its partition type says so. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of vp
				   (or closes it if the label is bad) */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* Check each FS_RAID partition in the disklabel. */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3000 
   3001 
   3002 int
   3003 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3004 {
   3005 
   3006 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3007 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3008 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3009 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3010 	    clabel->row >=0 &&
   3011 	    clabel->column >= 0 &&
   3012 	    clabel->num_rows > 0 &&
   3013 	    clabel->num_columns > 0 &&
   3014 	    clabel->row < clabel->num_rows &&
   3015 	    clabel->column < clabel->num_columns &&
   3016 	    clabel->blockSize > 0 &&
   3017 	    /*
   3018 	     * numBlocksHi may contain garbage, but it is ok since
   3019 	     * the type is unsigned.  If it is really garbage,
   3020 	     * rf_fix_old_label_size() will fix it.
   3021 	     */
   3022 	    rf_component_label_numblocks(clabel) > 0) {
   3023 		/*
   3024 		 * label looks reasonable enough...
   3025 		 * let's make sure it has no old garbage.
   3026 		 */
   3027 		if (numsecs)
   3028 			rf_fix_old_label_size(clabel, numsecs);
   3029 		return(1);
   3030 	}
   3031 	return(0);
   3032 }
   3033 
   3034 
   3035 /*
   3036  * For reasons yet unknown, some old component labels have garbage in
   3037  * the newer numBlocksHi region, and this causes lossage.  Since those
   3038  * disks will also have numsecs set to less than 32 bits of sectors,
   3039  * we can determine when this corruption has occurred, and fix it.
   3040  *
   3041  * The exact same problem, with the same unknown reason, happens to
   3042  * the partitionSizeHi member as well.
   3043  */
   3044 static void
   3045 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3046 {
   3047 
   3048 	if (numsecs < ((uint64_t)1 << 32)) {
   3049 		if (clabel->numBlocksHi) {
   3050 			printf("WARNING: total sectors < 32 bits, yet "
   3051 			       "numBlocksHi set\n"
   3052 			       "WARNING: resetting numBlocksHi to zero.\n");
   3053 			clabel->numBlocksHi = 0;
   3054 		}
   3055 
   3056 		if (clabel->partitionSizeHi) {
   3057 			printf("WARNING: total sectors < 32 bits, yet "
   3058 			       "partitionSizeHi set\n"
   3059 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3060 			clabel->partitionSizeHi = 0;
   3061 		}
   3062 	}
   3063 }
   3064 
   3065 
   3066 #ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* names for root_partition values 0..2; index 3 is out of range */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so an out-of-range value prints "*invalid*" */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3098 #endif
   3099 
   3100 RF_ConfigSet_t *
   3101 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3102 {
   3103 	RF_AutoConfig_t *ac;
   3104 	RF_ConfigSet_t *config_sets;
   3105 	RF_ConfigSet_t *cset;
   3106 	RF_AutoConfig_t *ac_next;
   3107 
   3108 
   3109 	config_sets = NULL;
   3110 
   3111 	/* Go through the AutoConfig list, and figure out which components
   3112 	   belong to what sets.  */
   3113 	ac = ac_list;
   3114 	while(ac!=NULL) {
   3115 		/* we're going to putz with ac->next, so save it here
   3116 		   for use at the end of the loop */
   3117 		ac_next = ac->next;
   3118 
   3119 		if (config_sets == NULL) {
   3120 			/* will need at least this one... */
   3121 			config_sets = (RF_ConfigSet_t *)
   3122 				malloc(sizeof(RF_ConfigSet_t),
   3123 				       M_RAIDFRAME, M_NOWAIT);
   3124 			if (config_sets == NULL) {
   3125 				panic("rf_create_auto_sets: No memory!");
   3126 			}
   3127 			/* this one is easy :) */
   3128 			config_sets->ac = ac;
   3129 			config_sets->next = NULL;
   3130 			config_sets->rootable = 0;
   3131 			ac->next = NULL;
   3132 		} else {
   3133 			/* which set does this component fit into? */
   3134 			cset = config_sets;
   3135 			while(cset!=NULL) {
   3136 				if (rf_does_it_fit(cset, ac)) {
   3137 					/* looks like it matches... */
   3138 					ac->next = cset->ac;
   3139 					cset->ac = ac;
   3140 					break;
   3141 				}
   3142 				cset = cset->next;
   3143 			}
   3144 			if (cset==NULL) {
   3145 				/* didn't find a match above... new set..*/
   3146 				cset = (RF_ConfigSet_t *)
   3147 					malloc(sizeof(RF_ConfigSet_t),
   3148 					       M_RAIDFRAME, M_NOWAIT);
   3149 				if (cset == NULL) {
   3150 					panic("rf_create_auto_sets: No memory!");
   3151 				}
   3152 				cset->ac = ac;
   3153 				ac->next = NULL;
   3154 				cset->next = config_sets;
   3155 				cset->rootable = 0;
   3156 				config_sets = cset;
   3157 			}
   3158 		}
   3159 		ac = ac_next;
   3160 	}
   3161 
   3162 
   3163 	return(config_sets);
   3164 }
   3165 
   3166 static int
   3167 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3168 {
   3169 	RF_ComponentLabel_t *clabel1, *clabel2;
   3170 
   3171 	/* If this one matches the *first* one in the set, that's good
   3172 	   enough, since the other members of the set would have been
   3173 	   through here too... */
   3174 	/* note that we are not checking partitionSize here..
   3175 
   3176 	   Note that we are also not checking the mod_counters here.
   3177 	   If everything else matches except the mod_counter, that's
   3178 	   good enough for this test.  We will deal with the mod_counters
   3179 	   a little later in the autoconfiguration process.
   3180 
   3181 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3182 
   3183 	   The reason we don't check for this is that failed disks
   3184 	   will have lower modification counts.  If those disks are
   3185 	   not added to the set they used to belong to, then they will
   3186 	   form their own set, which may result in 2 different sets,
   3187 	   for example, competing to be configured at raid0, and
   3188 	   perhaps competing to be the root filesystem set.  If the
   3189 	   wrong ones get configured, or both attempt to become /,
   3190 	   weird behaviour and or serious lossage will occur.  Thus we
   3191 	   need to bring them into the fold here, and kick them out at
   3192 	   a later point.
   3193 
   3194 	*/
   3195 
   3196 	clabel1 = cset->ac->clabel;
   3197 	clabel2 = ac->clabel;
   3198 	if ((clabel1->version == clabel2->version) &&
   3199 	    (clabel1->serial_number == clabel2->serial_number) &&
   3200 	    (clabel1->num_rows == clabel2->num_rows) &&
   3201 	    (clabel1->num_columns == clabel2->num_columns) &&
   3202 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3203 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3204 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3205 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3206 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3207 	    (clabel1->blockSize == clabel2->blockSize) &&
   3208 	    rf_component_label_numblocks(clabel1) ==
   3209 	    rf_component_label_numblocks(clabel2) &&
   3210 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3211 	    (clabel1->root_partition == clabel2->root_partition) &&
   3212 	    (clabel1->last_unit == clabel2->last_unit) &&
   3213 	    (clabel1->config_order == clabel2->config_order)) {
   3214 		/* if it get's here, it almost *has* to be a match */
   3215 	} else {
   3216 		/* it's not consistent with somebody in the set..
   3217 		   punt */
   3218 		return(0);
   3219 	}
   3220 	/* all was fine.. it must fit... */
   3221 	return(1);
   3222 }
   3223 
/*
 * Determine whether config set 'cset' has enough live components
 * (those whose mod_counter matches the set's highest mod_counter)
 * to be configured.  Returns 1 if the set is viable, 0 otherwise.
 *
 * RAID 1 gets special treatment: components are considered in
 * even/odd pairs, and only the loss of BOTH members of a pair is
 * fatal.  For levels 0/4/5 a simple missing-component count is used.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum over all members; stale members have lower values) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, search for a current (mod_counter matches)
	   component occupying it */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; 4 and 5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3326 
   3327 void
   3328 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3329 			RF_Raid_t *raidPtr)
   3330 {
   3331 	RF_ComponentLabel_t *clabel;
   3332 	int i;
   3333 
   3334 	clabel = ac->clabel;
   3335 
   3336 	/* 1. Fill in the common stuff */
   3337 	config->numRow = clabel->num_rows = 1;
   3338 	config->numCol = clabel->num_columns;
   3339 	config->numSpare = 0; /* XXX should this be set here? */
   3340 	config->sectPerSU = clabel->sectPerSU;
   3341 	config->SUsPerPU = clabel->SUsPerPU;
   3342 	config->SUsPerRU = clabel->SUsPerRU;
   3343 	config->parityConfig = clabel->parityConfig;
   3344 	/* XXX... */
   3345 	strcpy(config->diskQueueType,"fifo");
   3346 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3347 	config->layoutSpecificSize = 0; /* XXX ?? */
   3348 
   3349 	while(ac!=NULL) {
   3350 		/* row/col values will be in range due to the checks
   3351 		   in reasonable_label() */
   3352 		strcpy(config->devnames[0][ac->clabel->column],
   3353 		       ac->devname);
   3354 		ac = ac->next;
   3355 	}
   3356 
   3357 	for(i=0;i<RF_MAXDBGV;i++) {
   3358 		config->debugVars[i][0] = 0;
   3359 	}
   3360 }
   3361 
   3362 int
   3363 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3364 {
   3365 	RF_ComponentLabel_t *clabel;
   3366 	int column;
   3367 	int sparecol;
   3368 
   3369 	raidPtr->autoconfigure = new_value;
   3370 
   3371 	for(column=0; column<raidPtr->numCol; column++) {
   3372 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3373 			clabel = raidget_component_label(raidPtr, column);
   3374 			clabel->autoconfigure = new_value;
   3375 			raidflush_component_label(raidPtr, column);
   3376 		}
   3377 	}
   3378 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3379 		sparecol = raidPtr->numCol + column;
   3380 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3381 			clabel = raidget_component_label(raidPtr, sparecol);
   3382 			clabel->autoconfigure = new_value;
   3383 			raidflush_component_label(raidPtr, sparecol);
   3384 		}
   3385 	}
   3386 	return(new_value);
   3387 }
   3388 
   3389 int
   3390 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3391 {
   3392 	RF_ComponentLabel_t *clabel;
   3393 	int column;
   3394 	int sparecol;
   3395 
   3396 	raidPtr->root_partition = new_value;
   3397 	for(column=0; column<raidPtr->numCol; column++) {
   3398 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3399 			clabel = raidget_component_label(raidPtr, column);
   3400 			clabel->root_partition = new_value;
   3401 			raidflush_component_label(raidPtr, column);
   3402 		}
   3403 	}
   3404 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3405 		sparecol = raidPtr->numCol + column;
   3406 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3407 			clabel = raidget_component_label(raidPtr, sparecol);
   3408 			clabel->root_partition = new_value;
   3409 			raidflush_component_label(raidPtr, sparecol);
   3410 		}
   3411 	}
   3412 	return(new_value);
   3413 }
   3414 
   3415 void
   3416 rf_release_all_vps(RF_ConfigSet_t *cset)
   3417 {
   3418 	RF_AutoConfig_t *ac;
   3419 
   3420 	ac = cset->ac;
   3421 	while(ac!=NULL) {
   3422 		/* Close the vp, and give it back */
   3423 		if (ac->vp) {
   3424 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3425 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3426 			vput(ac->vp);
   3427 			ac->vp = NULL;
   3428 		}
   3429 		ac = ac->next;
   3430 	}
   3431 }
   3432 
   3433 
   3434 void
   3435 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3436 {
   3437 	RF_AutoConfig_t *ac;
   3438 	RF_AutoConfig_t *next_ac;
   3439 
   3440 	ac = cset->ac;
   3441 	while(ac!=NULL) {
   3442 		next_ac = ac->next;
   3443 		/* nuke the label */
   3444 		free(ac->clabel, M_RAIDFRAME);
   3445 		/* cleanup the config structure */
   3446 		free(ac, M_RAIDFRAME);
   3447 		/* "next.." */
   3448 		ac = next_ac;
   3449 	}
   3450 	/* and, finally, nuke the config set */
   3451 	free(cset, M_RAIDFRAME);
   3452 }
   3453 
   3454 
/*
 * Initialize a component label from the current in-core state of the
 * raid set.  The caller is responsible for writing the label out
 * (e.g. via raidflush_component_label()).
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3487 
/*
 * Configure one autoconfigured set: pick a unit number (preferring
 * the one recorded in the component labels), build an RF_Config_t
 * from the labels, and run rf_Configure().  On success, returns the
 * (initialized) softc and notes whether the set is root-eligible in
 * cset->rootable; on failure returns NULL.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* scan upward from the label's preferred unit until a free
	   (not already valid) softc slot is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc slot back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3571 
/*
 * Initialize a pool for RAIDframe use at IPL_BIO: pre-allocate xmin
 * items and set the low/high watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3581 
   3582 /*
   3583  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3584  * to see if there is IO pending and if that IO could possibly be done
   3585  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3586  * otherwise.
   3587  *
   3588  */
   3589 int
   3590 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3591 {
   3592 	struct raid_softc *rs;
   3593 	struct dk_softc *dksc;
   3594 
   3595 	rs = raidPtr->softc;
   3596 	dksc = &rs->sc_dksc;
   3597 
   3598 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3599 		return 1;
   3600 
   3601 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3602 		/* there is work to do */
   3603 		return 0;
   3604 	}
   3605 	/* default is nothing to do */
   3606 	return 1;
   3607 }
   3608 
   3609 int
   3610 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3611 {
   3612 	uint64_t numsecs;
   3613 	unsigned secsize;
   3614 	int error;
   3615 
   3616 	error = getdisksize(vp, &numsecs, &secsize);
   3617 	if (error == 0) {
   3618 		diskPtr->blockSize = secsize;
   3619 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3620 		diskPtr->partitionSize = numsecs;
   3621 		return 0;
   3622 	}
   3623 	return error;
   3624 }
   3625 
/* Autoconf match function: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3631 
/* Autoconf attach function: nothing to do; real setup happens at
 * configuration time (raidinit()). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3636 
   3637 
/*
 * Autoconf detach function: tear down the raid set (under the softc
 * lock) and release the softc.  Returns 0 on success or an errno.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3664 
/*
 * Publish the raid set's (synthetic) disk geometry to the disk(9)
 * layer.  The track/sector numbers are fabricated from the layout;
 * only total size and sector size are meaningful.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	/* zero first: fields not set below stay 0 */
	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3680 
   3681 /*
   3682  * Get cache info for all the components (including spares).
   3683  * Returns intersection of all the cache flags of all disks, or first
   3684  * error if any encountered.
   3685  * XXXfua feature flags can change as spares are added - lock down somehow
   3686  */
   3687 static int
   3688 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3689 {
   3690 	int c;
   3691 	int error;
   3692 	int dkwhole = 0, dkpart;
   3693 
   3694 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3695 		/*
   3696 		 * Check any non-dead disk, even when currently being
   3697 		 * reconstructed.
   3698 		 */
   3699 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3700 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3701 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3702 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3703 			if (error) {
   3704 				if (error != ENODEV) {
   3705 					printf("raid%d: get cache for component %s failed\n",
   3706 					    raidPtr->raidid,
   3707 					    raidPtr->Disks[c].devname);
   3708 				}
   3709 
   3710 				return error;
   3711 			}
   3712 
   3713 			if (c == 0)
   3714 				dkwhole = dkpart;
   3715 			else
   3716 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3717 		}
   3718 	}
   3719 
   3720 	*data = dkwhole;
   3721 
   3722 	return 0;
   3723 }
   3724 
   3725 /*
   3726  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3727  * We end up returning whatever error was returned by the first cache flush
   3728  * that fails.
   3729  */
   3730 
   3731 int
   3732 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3733 {
   3734 	int c, sparecol;
   3735 	int e,error;
   3736 	int force = 1;
   3737 
   3738 	error = 0;
   3739 	for (c = 0; c < raidPtr->numCol; c++) {
   3740 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3741 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3742 					  &force, FWRITE, NOCRED);
   3743 			if (e) {
   3744 				if (e != ENODEV)
   3745 					printf("raid%d: cache flush to component %s failed.\n",
   3746 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3747 				if (error == 0) {
   3748 					error = e;
   3749 				}
   3750 			}
   3751 		}
   3752 	}
   3753 
   3754 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3755 		sparecol = raidPtr->numCol + c;
   3756 		/* Need to ensure that the reconstruct actually completed! */
   3757 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3758 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3759 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3760 			if (e) {
   3761 				if (e != ENODEV)
   3762 					printf("raid%d: cache flush to component %s failed.\n",
   3763 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3764 				if (error == 0) {
   3765 					error = e;
   3766 				}
   3767 			}
   3768 		}
   3769 	}
   3770 	return error;
   3771 }
   3772 
   3773 /*
   3774  * Module interface
   3775  */
   3776 
/* Declare the raid module; it depends on the dk_subr module. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* When built as a loadable module, declare the cfdriver here too. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module command dispatcher and its init/fini workers (defined below). */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3786 
   3787 static int
   3788 raid_modcmd(modcmd_t cmd, void *data)
   3789 {
   3790 	int error;
   3791 
   3792 	error = 0;
   3793 	switch (cmd) {
   3794 	case MODULE_CMD_INIT:
   3795 		error = raid_modcmd_init();
   3796 		break;
   3797 	case MODULE_CMD_FINI:
   3798 		error = raid_modcmd_fini();
   3799 		break;
   3800 	default:
   3801 		error = ENOTTY;
   3802 		break;
   3803 	}
   3804 	return error;
   3805 }
   3806 
/*
 * Module initialization: attach the raid block/character devsw, the
 * cfdriver (module build only) and cfattach, boot the RAIDframe core,
 * and register an autoconf finalizer that auto-configures RAID sets
 * once all real hardware devices have been found.  Each failure path
 * unwinds the steps already taken.  Returns 0 or an errno.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state used by the spare-table request path. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the major numbers dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be attached (built-in). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind: detach the devsw attached above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind in reverse order of attachment. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is necessarily 0 here -- every failure above
	 * returned early -- so this check is redundant but harmless.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without auto-config. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3877 
/*
 * Module finalization: refuse to unload while any raid device exists,
 * then detach the cfattach, cfdriver (module build only) and devsw in
 * reverse order of attachment, shut down the RAIDframe core, and tear
 * down the module lock.  On a partial failure the already-detached
 * pieces are re-attached so the module remains usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detaches. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (counterpart of init's boot). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Destroy the spare-table synchronization state created in init. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3927