/* rf_netbsdkintf.c, revision 1.339 (source-browser header residue removed) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.339 2016/01/05 17:06:34 mlelstv Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.339 2016/01/05 17:06:34 mlelstv Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 struct raid_softc;
    183 static void raidinit(struct raid_softc *);
    184 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    185 
    186 static int raid_match(device_t, cfdata_t, void *);
    187 static void raid_attach(device_t, device_t, void *);
    188 static int raid_detach(device_t, int);
    189 
    190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    191     daddr_t, daddr_t);
    192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    193     daddr_t, daddr_t, int);
    194 
    195 static int raidwrite_component_label(unsigned,
    196     dev_t, struct vnode *, RF_ComponentLabel_t *);
    197 static int raidread_component_label(unsigned,
    198     dev_t, struct vnode *, RF_ComponentLabel_t *);
    199 
    200 static int raid_diskstart(device_t, struct buf *bp);
    201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    202 static int raid_lastclose(device_t);
    203 
    204 static dev_type_open(raidopen);
    205 static dev_type_close(raidclose);
    206 static dev_type_read(raidread);
    207 static dev_type_write(raidwrite);
    208 static dev_type_ioctl(raidioctl);
    209 static dev_type_strategy(raidstrategy);
    210 static dev_type_dump(raiddump);
    211 static dev_type_size(raidsize);
    212 
    213 const struct bdevsw raid_bdevsw = {
    214 	.d_open = raidopen,
    215 	.d_close = raidclose,
    216 	.d_strategy = raidstrategy,
    217 	.d_ioctl = raidioctl,
    218 	.d_dump = raiddump,
    219 	.d_psize = raidsize,
    220 	.d_discard = nodiscard,
    221 	.d_flag = D_DISK
    222 };
    223 
    224 const struct cdevsw raid_cdevsw = {
    225 	.d_open = raidopen,
    226 	.d_close = raidclose,
    227 	.d_read = raidread,
    228 	.d_write = raidwrite,
    229 	.d_ioctl = raidioctl,
    230 	.d_stop = nostop,
    231 	.d_tty = notty,
    232 	.d_poll = nopoll,
    233 	.d_mmap = nommap,
    234 	.d_kqfilter = nokqfilter,
    235 	.d_discard = nodiscard,
    236 	.d_flag = D_DISK
    237 };
    238 
    239 static struct dkdriver rf_dkdriver = {
    240 	.d_open = raidopen,
    241 	.d_close = raidclose,
    242 	.d_strategy = raidstrategy,
    243 	.d_diskstart = raid_diskstart,
    244 	.d_dumpblocks = raid_dumpblocks,
    245 	.d_lastclose = raid_lastclose,
    246 	.d_minphys = minphys
    247 };
    248 
    249 struct raid_softc {
    250 	struct dk_softc sc_dksc;
    251 	int	sc_unit;
    252 	int     sc_flags;	/* flags */
    253 	int     sc_cflags;	/* configuration flags */
    254 	kmutex_t sc_mutex;	/* interlock mutex */
    255 	kcondvar_t sc_cv;	/* and the condvar */
    256 	uint64_t sc_size;	/* size of the raid device */
    257 	char    sc_xname[20];	/* XXX external name */
    258 	RF_Raid_t sc_r;
    259 	LIST_ENTRY(raid_softc) sc_link;
    260 };
    261 /* sc_flags */
    262 #define RAIDF_INITED	0x01	/* unit has been initialized */
    263 #define RAIDF_WLABEL	0x02	/* label area is writable */
    264 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    265 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    266 #define RAIDF_DETACH  	0x10	/* detach after final close */
    267 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    268 #define RAIDF_LOCKED	0x80	/* unit is locked */
    269 
    270 #define	raidunit(x)	DISKUNIT(x)
    271 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    272 
    273 extern struct cfdriver raid_cd;
    274 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    275     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    276     DVF_DETACH_SHUTDOWN);
    277 
    278 /*
    279  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    280  * Be aware that large numbers can allow the driver to consume a lot of
    281  * kernel memory, especially on writes, and in degraded mode reads.
    282  *
    283  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    284  * a single 64K write will typically require 64K for the old data,
    285  * 64K for the old parity, and 64K for the new parity, for a total
    286  * of 192K (if the parity buffer is not re-used immediately).
    287  * Even it if is used immediately, that's still 128K, which when multiplied
    288  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    289  *
    290  * Now in degraded mode, for example, a 64K read on the above setup may
    291  * require data reconstruction, which will require *all* of the 4 remaining
    292  * disks to participate -- 4 * 32K/disk == 128K again.
    293  */
    294 
    295 #ifndef RAIDOUTSTANDING
    296 #define RAIDOUTSTANDING   6
    297 #endif
    298 
    299 #define RAIDLABELDEV(dev)	\
    300 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    301 
    302 /* declared here, and made public, for the benefit of KVM stuff.. */
    303 
    304 static int raidlock(struct raid_softc *);
    305 static void raidunlock(struct raid_softc *);
    306 
    307 static int raid_detach_unlocked(struct raid_softc *);
    308 
    309 static void rf_markalldirty(RF_Raid_t *);
    310 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    311 
    312 void rf_ReconThread(struct rf_recon_req *);
    313 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    314 void rf_CopybackThread(RF_Raid_t *raidPtr);
    315 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    316 int rf_autoconfig(device_t);
    317 void rf_buildroothack(RF_ConfigSet_t *);
    318 
    319 RF_AutoConfig_t *rf_find_raid_components(void);
    320 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    321 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    322 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    323 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    324 int rf_set_autoconfig(RF_Raid_t *, int);
    325 int rf_set_rootpartition(RF_Raid_t *, int);
    326 void rf_release_all_vps(RF_ConfigSet_t *);
    327 void rf_cleanup_config_set(RF_ConfigSet_t *);
    328 int rf_have_enough_components(RF_ConfigSet_t *);
    329 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    330 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    331 
    332 /*
    333  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    334  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    335  * in the kernel config file.
    336  */
    337 #ifdef RAID_AUTOCONFIG
    338 int raidautoconfig = 1;
    339 #else
    340 int raidautoconfig = 0;
    341 #endif
    342 static bool raidautoconfigdone = false;
    343 
    344 struct RF_Pools_s rf_pools;
    345 
    346 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    347 static kmutex_t raid_lock;
    348 
    349 static struct raid_softc *
    350 raidcreate(int unit) {
    351 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    352 	if (sc == NULL) {
    353 #ifdef DIAGNOSTIC
    354 		printf("%s: out of memory\n", __func__);
    355 #endif
    356 		return NULL;
    357 	}
    358 	sc->sc_unit = unit;
    359 	cv_init(&sc->sc_cv, "raidunit");
    360 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    361 	return sc;
    362 }
    363 
    364 static void
    365 raiddestroy(struct raid_softc *sc) {
    366 	cv_destroy(&sc->sc_cv);
    367 	mutex_destroy(&sc->sc_mutex);
    368 	kmem_free(sc, sizeof(*sc));
    369 }
    370 
    371 static struct raid_softc *
    372 raidget(int unit, bool create) {
    373 	struct raid_softc *sc;
    374 	if (unit < 0) {
    375 #ifdef DIAGNOSTIC
    376 		panic("%s: unit %d!", __func__, unit);
    377 #endif
    378 		return NULL;
    379 	}
    380 	mutex_enter(&raid_lock);
    381 	LIST_FOREACH(sc, &raids, sc_link) {
    382 		if (sc->sc_unit == unit) {
    383 			mutex_exit(&raid_lock);
    384 			return sc;
    385 		}
    386 	}
    387 	mutex_exit(&raid_lock);
    388 	if (!create)
    389 		return NULL;
    390 	if ((sc = raidcreate(unit)) == NULL)
    391 		return NULL;
    392 	mutex_enter(&raid_lock);
    393 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    394 	mutex_exit(&raid_lock);
    395 	return sc;
    396 }
    397 
    398 static void
    399 raidput(struct raid_softc *sc) {
    400 	mutex_enter(&raid_lock);
    401 	LIST_REMOVE(sc, sc_link);
    402 	mutex_exit(&raid_lock);
    403 	raiddestroy(sc);
    404 }
    405 
/*
 * Legacy pseudo-device attach hook, invoked with the unit count from
 * the kernel configuration.  Intentionally a no-op here.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    415 
    416 int
    417 rf_autoconfig(device_t self)
    418 {
    419 	RF_AutoConfig_t *ac_list;
    420 	RF_ConfigSet_t *config_sets;
    421 
    422 	if (!raidautoconfig || raidautoconfigdone == true)
    423 		return (0);
    424 
    425 	/* XXX This code can only be run once. */
    426 	raidautoconfigdone = true;
    427 
    428 #ifdef __HAVE_CPU_BOOTCONF
    429 	/*
    430 	 * 0. find the boot device if needed first so we can use it later
    431 	 * this needs to be done before we autoconfigure any raid sets,
    432 	 * because if we use wedges we are not going to be able to open
    433 	 * the boot device later
    434 	 */
    435 	if (booted_device == NULL)
    436 		cpu_bootconf();
    437 #endif
    438 	/* 1. locate all RAID components on the system */
    439 	aprint_debug("Searching for RAID components...\n");
    440 	ac_list = rf_find_raid_components();
    441 
    442 	/* 2. Sort them into their respective sets. */
    443 	config_sets = rf_create_auto_sets(ac_list);
    444 
    445 	/*
    446 	 * 3. Evaluate each set and configure the valid ones.
    447 	 * This gets done in rf_buildroothack().
    448 	 */
    449 	rf_buildroothack(config_sets);
    450 
    451 	return 1;
    452 }
    453 
/*
 * Return non-zero if any component of RAID set 'r' resides on the
 * boot device 'bdv', either directly or via a wedge whose parent disk
 * is the boot device.
 *
 * NOTE(review): the comparison uses strncmp() bounded by the length
 * of the boot device's name, i.e. a prefix match -- a boot device
 * named "wd1" would also match a component on "wd10".  Presumably
 * harmless in practice, but verify if exact matching is required.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" of the component path */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge: compare against the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    477 
    478 void
    479 rf_buildroothack(RF_ConfigSet_t *config_sets)
    480 {
    481 	RF_ConfigSet_t *cset;
    482 	RF_ConfigSet_t *next_cset;
    483 	int num_root;
    484 	struct raid_softc *sc, *rsc;
    485 	struct dk_softc *dksc;
    486 
    487 	sc = rsc = NULL;
    488 	num_root = 0;
    489 	cset = config_sets;
    490 	while (cset != NULL) {
    491 		next_cset = cset->next;
    492 		if (rf_have_enough_components(cset) &&
    493 		    cset->ac->clabel->autoconfigure == 1) {
    494 			sc = rf_auto_config_set(cset);
    495 			if (sc != NULL) {
    496 				aprint_debug("raid%d: configured ok\n",
    497 				    sc->sc_unit);
    498 				if (cset->rootable) {
    499 					rsc = sc;
    500 					num_root++;
    501 				}
    502 			} else {
    503 				/* The autoconfig didn't work :( */
    504 				aprint_debug("Autoconfig failed\n");
    505 				rf_release_all_vps(cset);
    506 			}
    507 		} else {
    508 			/* we're not autoconfiguring this set...
    509 			   release the associated resources */
    510 			rf_release_all_vps(cset);
    511 		}
    512 		/* cleanup */
    513 		rf_cleanup_config_set(cset);
    514 		cset = next_cset;
    515 	}
    516 	dksc = &rsc->sc_dksc;
    517 
    518 	/* if the user has specified what the root device should be
    519 	   then we don't touch booted_device or boothowto... */
    520 
    521 	if (rootspec != NULL)
    522 		return;
    523 
    524 	/* we found something bootable... */
    525 
    526 	/*
    527 	 * XXX: The following code assumes that the root raid
    528 	 * is the first ('a') partition. This is about the best
    529 	 * we can do with a BSD disklabel, but we might be able
    530 	 * to do better with a GPT label, by setting a specified
    531 	 * attribute to indicate the root partition. We can then
    532 	 * stash the partition number in the r->root_partition
    533 	 * high bits (the bottom 2 bits are already used). For
    534 	 * now we just set booted_partition to 0 when we override
    535 	 * root.
    536 	 */
    537 	if (num_root == 1) {
    538 		device_t candidate_root;
    539 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    540 			char cname[sizeof(cset->ac->devname)];
    541 			/* XXX: assume 'a' */
    542 			snprintf(cname, sizeof(cname), "%s%c",
    543 			    device_xname(dksc->sc_dev), 'a');
    544 			candidate_root = dkwedge_find_by_wname(cname);
    545 		} else
    546 			candidate_root = dksc->sc_dev;
    547 		if (booted_device == NULL ||
    548 		    rsc->sc_r.root_partition == 1 ||
    549 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    550 			booted_device = candidate_root;
    551 			booted_partition = 0;	/* XXX assume 'a' */
    552 		}
    553 	} else if (num_root > 1) {
    554 
    555 		/*
    556 		 * Maybe the MD code can help. If it cannot, then
    557 		 * setroot() will discover that we have no
    558 		 * booted_device and will ask the user if nothing was
    559 		 * hardwired in the kernel config file
    560 		 */
    561 		if (booted_device == NULL)
    562 			return;
    563 
    564 		num_root = 0;
    565 		mutex_enter(&raid_lock);
    566 		LIST_FOREACH(sc, &raids, sc_link) {
    567 			RF_Raid_t *r = &sc->sc_r;
    568 			if (r->valid == 0)
    569 				continue;
    570 
    571 			if (r->root_partition == 0)
    572 				continue;
    573 
    574 			if (rf_containsboot(r, booted_device)) {
    575 				num_root++;
    576 				rsc = sc;
    577 				dksc = &rsc->sc_dksc;
    578 			}
    579 		}
    580 		mutex_exit(&raid_lock);
    581 
    582 		if (num_root == 1) {
    583 			booted_device = dksc->sc_dev;
    584 			booted_partition = 0;	/* XXX assume 'a' */
    585 		} else {
    586 			/* we can't guess.. require the user to answer... */
    587 			boothowto |= RB_ASKNAME;
    588 		}
    589 	}
    590 }
    591 
    592 static int
    593 raidsize(dev_t dev)
    594 {
    595 	struct raid_softc *rs;
    596 	struct dk_softc *dksc;
    597 	unsigned int unit;
    598 
    599 	unit = raidunit(dev);
    600 	if ((rs = raidget(unit, false)) == NULL)
    601 		return -1;
    602 	dksc = &rs->sc_dksc;
    603 
    604 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    605 		return -1;
    606 
    607 	return dk_size(dksc, dev);
    608 }
    609 
    610 static int
    611 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    612 {
    613 	unsigned int unit;
    614 	struct raid_softc *rs;
    615 	struct dk_softc *dksc;
    616 
    617 	unit = raidunit(dev);
    618 	if ((rs = raidget(unit, false)) == NULL)
    619 		return ENXIO;
    620 	dksc = &rs->sc_dksc;
    621 
    622 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    623 		return ENODEV;
    624 
    625         /*
    626            Note that blkno is relative to this particular partition.
    627            By adding adding RF_PROTECTED_SECTORS, we get a value that
    628 	   is relative to the partition used for the underlying component.
    629         */
    630 	blkno += RF_PROTECTED_SECTORS;
    631 
    632 	return dk_dump(dksc, dev, blkno, va, size);
    633 }
    634 
/*
 * Write 'nblk' blocks from 'va' at block 'blkno' directly to one live
 * component of the set, for crash dumps.  Only RAID 1 sets (one data
 * column, one parity column) are supported; returns EINVAL otherwise,
 * or when no live component can be found.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* pass 1: the first optimal component wins (master before slave) */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* dispatch the dump straight to the chosen component's driver */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    736 
    737 /* ARGSUSED */
    738 static int
    739 raidopen(dev_t dev, int flags, int fmt,
    740     struct lwp *l)
    741 {
    742 	int     unit = raidunit(dev);
    743 	struct raid_softc *rs;
    744 	struct dk_softc *dksc;
    745 	int     error = 0;
    746 	int     part, pmask;
    747 
    748 	if ((rs = raidget(unit, true)) == NULL)
    749 		return ENXIO;
    750 	if ((error = raidlock(rs)) != 0)
    751 		return (error);
    752 
    753 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    754 		error = EBUSY;
    755 		goto bad;
    756 	}
    757 
    758 	dksc = &rs->sc_dksc;
    759 
    760 	part = DISKPART(dev);
    761 	pmask = (1 << part);
    762 
    763 	if (!DK_BUSY(dksc, pmask) &&
    764 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    765 		/* First one... mark things as dirty... Note that we *MUST*
    766 		 have done a configure before this.  I DO NOT WANT TO BE
    767 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    768 		 THAT THEY BELONG TOGETHER!!!!! */
    769 		/* XXX should check to see if we're only open for reading
    770 		   here... If so, we needn't do this, but then need some
    771 		   other way of keeping track of what's happened.. */
    772 
    773 		rf_markalldirty(&rs->sc_r);
    774 	}
    775 
    776 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    777 		error = dk_open(dksc, dev, flags, fmt, l);
    778 
    779 bad:
    780 	raidunlock(rs);
    781 
    782 	return (error);
    783 
    784 
    785 }
    786 
    787 static int
    788 raid_lastclose(device_t self)
    789 {
    790 	struct raid_softc *rs = raidsoftc(self);
    791 
    792 	/* Last one... device is not unconfigured yet.
    793 	   Device shutdown has taken care of setting the
    794 	   clean bits if RAIDF_INITED is not set
    795 	   mark things as clean... */
    796 
    797 	rf_update_component_labels(&rs->sc_r,
    798 	    RF_FINAL_COMPONENT_UPDATE);
    799 
    800 	/* pass to unlocked code */
    801 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    802 		rs->sc_flags |= RAIDF_DETACH;
    803 
    804 	return 0;
    805 }
    806 
    807 /* ARGSUSED */
    808 static int
    809 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    810 {
    811 	int     unit = raidunit(dev);
    812 	struct raid_softc *rs;
    813 	struct dk_softc *dksc;
    814 	cfdata_t cf;
    815 	int     error = 0, do_detach = 0, do_put = 0;
    816 
    817 	if ((rs = raidget(unit, false)) == NULL)
    818 		return ENXIO;
    819 	dksc = &rs->sc_dksc;
    820 
    821 	if ((error = raidlock(rs)) != 0)
    822 		return (error);
    823 
    824 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    825 		error = dk_close(dksc, dev, flags, fmt, l);
    826 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    827 			do_detach = 1;
    828 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    829 		do_put = 1;
    830 
    831 	raidunlock(rs);
    832 
    833 	if (do_detach) {
    834 		/* free the pseudo device attach bits */
    835 		cf = device_cfdata(dksc->sc_dev);
    836 		error = config_detach(dksc->sc_dev, 0);
    837 		if (error == 0)
    838 			free(cf, M_RAIDFRAME);
    839 	} else if (do_put) {
    840 		raidput(rs);
    841 	}
    842 
    843 	return (error);
    844 
    845 }
    846 
/*
 * Wake the RAID I/O thread: signal iodone_cv under iodone_lock so a
 * sleeper in the I/O loop re-checks for queued work.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    854 
    855 static void
    856 raidstrategy(struct buf *bp)
    857 {
    858 	unsigned int unit;
    859 	struct raid_softc *rs;
    860 	struct dk_softc *dksc;
    861 	RF_Raid_t *raidPtr;
    862 
    863 	unit = raidunit(bp->b_dev);
    864 	if ((rs = raidget(unit, false)) == NULL) {
    865 		bp->b_error = ENXIO;
    866 		goto fail;
    867 	}
    868 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    869 		bp->b_error = ENXIO;
    870 		goto fail;
    871 	}
    872 	dksc = &rs->sc_dksc;
    873 	raidPtr = &rs->sc_r;
    874 
    875 	/* Queue IO only */
    876 	if (dk_strategy_defer(dksc, bp))
    877 		goto done;
    878 
    879 	/* schedule the IO to happen at the next convenient time */
    880 	raid_wakeup(raidPtr);
    881 
    882 done:
    883 	return;
    884 
    885 fail:
    886 	bp->b_resid = bp->b_bcount;
    887 	biodone(bp);
    888 }
    889 
    890 static int
    891 raid_diskstart(device_t dev, struct buf *bp)
    892 {
    893 	struct raid_softc *rs = raidsoftc(dev);
    894 	RF_Raid_t *raidPtr;
    895 
    896 	raidPtr = &rs->sc_r;
    897 	if (!raidPtr->valid) {
    898 		db1_printf(("raid is not valid..\n"));
    899 		return ENODEV;
    900 	}
    901 
    902 	/* XXX */
    903 	bp->b_resid = 0;
    904 
    905 	return raiddoaccess(raidPtr, bp);
    906 }
    907 
    908 void
    909 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    910 {
    911 	struct raid_softc *rs;
    912 	struct dk_softc *dksc;
    913 
    914 	rs = raidPtr->softc;
    915 	dksc = &rs->sc_dksc;
    916 
    917 	dk_done(dksc, bp);
    918 
    919 	rf_lock_mutex2(raidPtr->mutex);
    920 	raidPtr->openings++;
    921 	rf_unlock_mutex2(raidPtr->mutex);
    922 
    923 	/* schedule more IO */
    924 	raid_wakeup(raidPtr);
    925 }
    926 
    927 /* ARGSUSED */
    928 static int
    929 raidread(dev_t dev, struct uio *uio, int flags)
    930 {
    931 	int     unit = raidunit(dev);
    932 	struct raid_softc *rs;
    933 
    934 	if ((rs = raidget(unit, false)) == NULL)
    935 		return ENXIO;
    936 
    937 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    938 		return (ENXIO);
    939 
    940 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    941 
    942 }
    943 
    944 /* ARGSUSED */
    945 static int
    946 raidwrite(dev_t dev, struct uio *uio, int flags)
    947 {
    948 	int     unit = raidunit(dev);
    949 	struct raid_softc *rs;
    950 
    951 	if ((rs = raidget(unit, false)) == NULL)
    952 		return ENXIO;
    953 
    954 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    955 		return (ENXIO);
    956 
    957 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    958 
    959 }
    960 
/*
 * Tear down an initialized RAID set: shut down RAIDframe, drain and
 * free the buffer queue, then detach the dk(4)/disk(9) machinery.
 * Returns EBUSY if the unit is open or a background operation is in
 * progress, 0 on success or if the unit was never initialized.
 *
 * NOTE(review): the "_unlocked" suffix suggests the caller already
 * holds the softc lock (raidlock) -- confirm at call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while any partition is open or recon/rewrite/copyback runs. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the set was never brought up. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Stop the RAIDframe engine first; abort teardown on failure. */
	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
    998 
/*
 * Character-device ioctl entry point for raid(4).
 *
 * Dispatches the RAIDFRAME_* control requests (configure/shutdown,
 * component-label access, spare management, rebuild/copyback/parity
 * operations and their status queries) and finally falls through to
 * dk_ioctl() for the generic disk ioctls, handling DIOCCACHESYNC
 * itself.  Commands in the first switch require the unit to be
 * RAIDF_INITED.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		/* Shared configuration path; COMPAT_50 also jumps here with
		 * a kernel copy of the config in k_cfg. */
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Only mark for shutdown when idle; actual detach happens
		 * on last close. */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		/* The user's label is copied in only to learn which column
		 * is wanted; the temporary is freed before the in-core
		 * label is copied back out. */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		/* Write a fresh label to every non-dead component. */
		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* Parity rewrite runs asynchronously in its own thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* NOTE(review): unimplemented -- returns success (0)
		 * without doing anything. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Validate component state under the raid mutex before
		 * kicking off the rebuild thread. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The recon thread owns and frees rrcopy. */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares follow the data columns in Disks[]. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/*
		 * NOTE(review): the RF_CREATE_THREAD status stored in
		 * retcode is discarded here -- thread-creation failure is
		 * not reported to the caller.  Confirm this is intentional
		 * (compare RAIDFRAME_REBUILD_IN_PLACE, which returns it).
		 */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): rf_broadcast_conf2 looks like a typo for
		 * rf_broadcast_cond2; harmless today because this code is
		 * compiled out under #if 0. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	switch (cmd) {
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
   1759 
   1760 
   1761 /* raidinit -- complete the rest of the initialization for the
   1762    RAIDframe device.  */
   1763 
   1764 
/*
 * Complete device initialization after a successful rf_Configure():
 * attach the pseudo-device instance, wire up the dk(4)/disk(9)
 * machinery, allocate the buffer queue, mark the unit usable and
 * discover wedges.  On config_attach_pseudo() failure the error is
 * only logged and RAIDF_INITED is never set.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* First-come-first-served queue for pending bufs. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1820 
   1821 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1822 /* wake up the daemon & tell it to get us a spare table
   1823  * XXX
   1824  * the entries in the queues should be tagged with the raidPtr
   1825  * so that in the extremely rare case that two recons happen at once,
   1826  * we know for which device were requesting a spare table
   1827  * XXX
   1828  *
   1829  * XXX This code is not currently used. GO
   1830  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the wait queue and wake any daemon
	 * sleeping on rf_sparet_wait_cv. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until a response appears; the condvar wait drops and
	 * re-takes rf_sparet_wait_mutex. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response; note that 'req' is reused to point at it. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1854 #endif
   1855 
   1856 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1857  * bp & passes it down.
   1858  * any calls originating in the kernel must use non-blocking I/O
   1859  * do some extra sanity checking to return "appropriate" error values for
   1860  * certain conditions (to make some standard utilities work)
   1861  *
   1862  * Formerly known as: rf_DoAccessKernel
   1863  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* NOTE(review): the mutex is deliberately dropped across
		 * the label update — presumably it can sleep or re-take
		 * this lock; confirm before restructuring. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to start I/O until raidinit() has completed */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* kick dk(9) to start draining the buffer queue */
	dk_start(dksc, NULL);
}
   1890 
/*
 * Translate one struct buf into a RAIDframe access and submit it
 * (non-blocking).  Returns EAGAIN when no openings are available,
 * ENOSPC for out-of-range or non-sector-multiple requests, otherwise
 * the return of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* back off if RAIDframe has no openings left */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" makes this unconditionally true, so
	 * rf_debugKernelAccess is never consulted here; db1_printf is
	 * the only payload.  Looks like debug leftover — confirm. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* range check; the "sum < x" comparisons catch wraparound */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject transfers that are not a multiple of the sector size */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening; KernelWakeupFunc/raidio gives it back */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1963 
   1964 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1965 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately via the normal iodone path */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp to describe this component I/O; completion is
		 * delivered through KernelWakeupFunc via b_iodone */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* always "succeeds"; real I/O errors arrive via the callback */
	return (0);
}
   2039 /* this is the callback function associated with a I/O invoked from
   2040    kernel code.
   2041  */
/*
 * Completion callback (b_iodone) for component I/O dispatched by
 * rf_DispatchKernelIO().  Records any error, possibly fails the
 * component, and hands the request to the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP stashed the request in b_private */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is consumed by raidstart(), which
			 * triggers a component-label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2108 
   2109 
   2110 /*
   2111  * initialize a buf structure for doing an I/O in the kernel.
   2112  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert the sector number to DEV_BSIZE units: bytes first,
	 * then shift down by DEV_BSHIFT */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	/* cbFunc fires at biodone time with cbArg in b_private */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2137 
   2138 /*
   2139  * Wait interruptibly for an exclusive lock.
   2140  *
   2141  * XXX
   2142  * Several drivers do this; it should be abstracted and made MP-safe.
   2143  * (Hmm... where have we seen this warning before :->  GO )
   2144  */
   2145 static int
   2146 raidlock(struct raid_softc *rs)
   2147 {
   2148 	int     error;
   2149 
   2150 	error = 0;
   2151 	mutex_enter(&rs->sc_mutex);
   2152 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2153 		rs->sc_flags |= RAIDF_WANTED;
   2154 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2155 		if (error != 0)
   2156 			goto done;
   2157 	}
   2158 	rs->sc_flags |= RAIDF_LOCKED;
   2159 done:
   2160 	mutex_exit(&rs->sc_mutex);
   2161 	return (error);
   2162 }
   2163 /*
   2164  * Unlock and wake up any waiters.
   2165  */
   2166 static void
   2167 raidunlock(struct raid_softc *rs)
   2168 {
   2169 
   2170 	mutex_enter(&rs->sc_mutex);
   2171 	rs->sc_flags &= ~RAIDF_LOCKED;
   2172 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2173 		rs->sc_flags &= ~RAIDF_WANTED;
   2174 		cv_broadcast(&rs->sc_cv);
   2175 	}
   2176 	mutex_exit(&rs->sc_mutex);
   2177 }
   2178 
   2179 
   2180 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2181 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2182 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2183 
   2184 static daddr_t
   2185 rf_component_info_offset(void)
   2186 {
   2187 
   2188 	return RF_COMPONENT_INFO_OFFSET;
   2189 }
   2190 
   2191 static daddr_t
   2192 rf_component_info_size(unsigned secsize)
   2193 {
   2194 	daddr_t info_size;
   2195 
   2196 	KASSERT(secsize);
   2197 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2198 		info_size = secsize;
   2199 	else
   2200 		info_size = RF_COMPONENT_INFO_SIZE;
   2201 
   2202 	return info_size;
   2203 }
   2204 
   2205 static daddr_t
   2206 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2207 {
   2208 	daddr_t map_offset;
   2209 
   2210 	KASSERT(raidPtr->bytesPerSector);
   2211 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2212 		map_offset = raidPtr->bytesPerSector;
   2213 	else
   2214 		map_offset = RF_COMPONENT_INFO_SIZE;
   2215 	map_offset += rf_component_info_offset();
   2216 
   2217 	return map_offset;
   2218 }
   2219 
   2220 static daddr_t
   2221 rf_parity_map_size(RF_Raid_t *raidPtr)
   2222 {
   2223 	daddr_t map_size;
   2224 
   2225 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2226 		map_size = raidPtr->bytesPerSector;
   2227 	else
   2228 		map_size = RF_PARITY_MAP_SIZE;
   2229 
   2230 	return map_size;
   2231 }
   2232 
   2233 int
   2234 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2235 {
   2236 	RF_ComponentLabel_t *clabel;
   2237 
   2238 	clabel = raidget_component_label(raidPtr, col);
   2239 	clabel->clean = RF_RAID_CLEAN;
   2240 	raidflush_component_label(raidPtr, col);
   2241 	return(0);
   2242 }
   2243 
   2244 
   2245 int
   2246 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2247 {
   2248 	RF_ComponentLabel_t *clabel;
   2249 
   2250 	clabel = raidget_component_label(raidPtr, col);
   2251 	clabel->clean = RF_RAID_DIRTY;
   2252 	raidflush_component_label(raidPtr, col);
   2253 	return(0);
   2254 }
   2255 
/*
 * Read column 'col's component label from disk into the in-core
 * copy (raid_cinfo[col].ci_label).  Returns the read status.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2265 
/* Return a pointer to the in-core component label for column 'col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2271 
/*
 * Write the in-core component label of column 'col' back to disk,
 * stamping it with the current mod_counter first.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod counter in sync with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2286 
   2287 
/* Read a component label from the fixed label area of a component. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2297 
   2298 /* ARGSUSED */
/*
 * Read 'dsize' bytes from byte offset 'offset' of a component and
 * copy the first 'msize' bytes into 'data'.  Returns 0 on success
 * or an errno.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read: dispatch, then wait for completion */
	bdev_strategy(bp);
	error = bp->b_error;
	if (!error)
		error = biowait(bp);

	if (!error) {
		/* caller only wants the first msize bytes */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2337 
   2338 
/* Write a component label to the fixed label area of a component
 * (synchronously — asyncp is 0). */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2348 
   2349 /* ARGSUSED */
/*
 * Write 'msize' bytes from 'data' (zero-padded to 'dsize') at byte
 * offset 'offset' of a component.  If 'asyncp' is set the write is
 * fire-and-forget and 0 is returned immediately; otherwise the
 * result of the write is returned.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* pad the area with zeros so stale bytes aren't written */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): no brelse here — presumably the async
		 * completion path releases the buffer; confirm. */
		return 0;
	error = bp->b_error;
	if (!error)
		error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2385 
   2386 void
   2387 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2388 {
   2389 	int c;
   2390 
   2391 	for (c = 0; c < raidPtr->numCol; c++) {
   2392 		/* Skip dead disks. */
   2393 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2394 			continue;
   2395 		/* XXXjld: what if an error occurs here? */
   2396 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2397 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2398 		    RF_PARITYMAP_NBYTE,
   2399 		    rf_parity_map_offset(raidPtr),
   2400 		    rf_parity_map_size(raidPtr), 0);
   2401 	}
   2402 }
   2403 
   2404 void
   2405 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2406 {
   2407 	struct rf_paritymap_ondisk tmp;
   2408 	int c,first;
   2409 
   2410 	first=1;
   2411 	for (c = 0; c < raidPtr->numCol; c++) {
   2412 		/* Skip dead disks. */
   2413 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2414 			continue;
   2415 		raidread_component_area(raidPtr->Disks[c].dev,
   2416 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2417 		    RF_PARITYMAP_NBYTE,
   2418 		    rf_parity_map_offset(raidPtr),
   2419 		    rf_parity_map_size(raidPtr));
   2420 		if (first) {
   2421 			memcpy(map, &tmp, sizeof(*map));
   2422 			first = 0;
   2423 		} else {
   2424 			rf_paritymap_merge(map, &tmp);
   2425 		}
   2426 	}
   2427 }
   2428 
/*
 * Bump the mod counter and mark every usable component (and every
 * in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* spares are stored after the data columns in Disks[] */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for.
			 * NOTE(review): if no column matches, scol keeps
			 * its previous value (initially -1) and is still
			 * written into the label below — confirm this
			 * cannot happen for an rf_ds_used_spare disk. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2488 
   2489 
/*
 * Bump the mod counter and rewrite the component labels of all
 * optimal components and in-use spares.  When 'final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark the
 * components clean (normal shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* spares are stored after the data columns in Disks[] */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for.
			 * NOTE(review): if nothing matches, scol stays -1
			 * and is written to the label below — confirm this
			 * cannot happen for an rf_ds_used_spare disk. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2564 
   2565 void
   2566 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2567 {
   2568 
   2569 	if (vp != NULL) {
   2570 		if (auto_configured == 1) {
   2571 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2572 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2573 			vput(vp);
   2574 
   2575 		} else {
   2576 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2577 		}
   2578 	}
   2579 }
   2580 
   2581 
   2582 void
   2583 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2584 {
   2585 	int r,c;
   2586 	struct vnode *vp;
   2587 	int acd;
   2588 
   2589 
   2590 	/* We take this opportunity to close the vnodes like we should.. */
   2591 
   2592 	for (c = 0; c < raidPtr->numCol; c++) {
   2593 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2594 		acd = raidPtr->Disks[c].auto_configured;
   2595 		rf_close_component(raidPtr, vp, acd);
   2596 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2597 		raidPtr->Disks[c].auto_configured = 0;
   2598 	}
   2599 
   2600 	for (r = 0; r < raidPtr->numSpare; r++) {
   2601 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2602 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2603 		rf_close_component(raidPtr, vp, acd);
   2604 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2605 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2606 	}
   2607 }
   2608 
   2609 
/*
 * Kernel-thread body: fail the requested component (optionally
 * starting reconstruction onto a spare), then exit.  'req' is freed
 * here.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	/* flag so other paths can tell a recon is underway */
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON selects "fail and reconstruct" vs just fail */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* done with the request; release it */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2631 
/*
 * Kernel-thread body: rewrite all parity, then mark the set clean on
 * success.  Wakes anyone blocked in shutdown waiting for the rewrite
 * to finish.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* progress counter consumed by status reporting */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2662 
   2663 
/*
 * Kernel-thread body: copy reconstructed data back onto a replaced
 * component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	/* flag so other paths can tell a copyback is underway */
	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2678 
   2679 
/*
 * Kernel-thread body: reconstruct a component in place (back onto
 * the same column), then exit.  'req' is freed here.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	/* flag so other paths can tell a recon is underway */
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* done with the request; release it */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2697 
/*
 * Probe one candidate component during auto-configuration: read its
 * component label and, if it looks reasonable, prepend a new
 * RF_AutoConfig_t to 'ac_list'.  On rejection the vnode is closed and
 * released.  Returns the (possibly extended) list, or NULL after
 * freeing the entire list on memory exhaustion.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				/* free this label before purging the list */
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			/* ac now owns clabel and the vnode */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2755 
/*
 * Scan all disk-class devices in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t records describing each one
 * found (NULL if none).  Floppy (fd/hdfd/fdisa), CD and memory-disk
 * devices are skipped.  Components are recognized in wedges, in
 * disklabel partitions of type FS_RAID, and on the raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedges pass: the wedge is a candidate
				   component iff its partition type says so */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* vp is handed off to rf_get_component();
				   it is not closed here */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2959 
   2960 
   2961 int
   2962 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2963 {
   2964 
   2965 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2966 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2967 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2968 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2969 	    clabel->row >=0 &&
   2970 	    clabel->column >= 0 &&
   2971 	    clabel->num_rows > 0 &&
   2972 	    clabel->num_columns > 0 &&
   2973 	    clabel->row < clabel->num_rows &&
   2974 	    clabel->column < clabel->num_columns &&
   2975 	    clabel->blockSize > 0 &&
   2976 	    /*
   2977 	     * numBlocksHi may contain garbage, but it is ok since
   2978 	     * the type is unsigned.  If it is really garbage,
   2979 	     * rf_fix_old_label_size() will fix it.
   2980 	     */
   2981 	    rf_component_label_numblocks(clabel) > 0) {
   2982 		/*
   2983 		 * label looks reasonable enough...
   2984 		 * let's make sure it has no old garbage.
   2985 		 */
   2986 		if (numsecs)
   2987 			rf_fix_old_label_size(clabel, numsecs);
   2988 		return(1);
   2989 	}
   2990 	return(0);
   2991 }
   2992 
   2993 
   2994 /*
   2995  * For reasons yet unknown, some old component labels have garbage in
   2996  * the newer numBlocksHi region, and this causes lossage.  Since those
   2997  * disks will also have numsecs set to less than 32 bits of sectors,
   2998  * we can determine when this corruption has occurred, and fix it.
   2999  *
   3000  * The exact same problem, with the same unknown reason, happens to
   3001  * the partitionSizeHi member as well.
   3002  */
   3003 static void
   3004 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3005 {
   3006 
   3007 	if (numsecs < ((uint64_t)1 << 32)) {
   3008 		if (clabel->numBlocksHi) {
   3009 			printf("WARNING: total sectors < 32 bits, yet "
   3010 			       "numBlocksHi set\n"
   3011 			       "WARNING: resetting numBlocksHi to zero.\n");
   3012 			clabel->numBlocksHi = 0;
   3013 		}
   3014 
   3015 		if (clabel->partitionSizeHi) {
   3016 			printf("WARNING: total sectors < 32 bits, yet "
   3017 			       "partitionSizeHi set\n"
   3018 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3019 			clabel->partitionSizeHi = 0;
   3020 		}
   3021 	}
   3022 }
   3023 
   3024 
   3025 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console.
 * Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* names for root_partition codes; the index is masked to 0..3 */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3057 #endif
   3058 
   3059 RF_ConfigSet_t *
   3060 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3061 {
   3062 	RF_AutoConfig_t *ac;
   3063 	RF_ConfigSet_t *config_sets;
   3064 	RF_ConfigSet_t *cset;
   3065 	RF_AutoConfig_t *ac_next;
   3066 
   3067 
   3068 	config_sets = NULL;
   3069 
   3070 	/* Go through the AutoConfig list, and figure out which components
   3071 	   belong to what sets.  */
   3072 	ac = ac_list;
   3073 	while(ac!=NULL) {
   3074 		/* we're going to putz with ac->next, so save it here
   3075 		   for use at the end of the loop */
   3076 		ac_next = ac->next;
   3077 
   3078 		if (config_sets == NULL) {
   3079 			/* will need at least this one... */
   3080 			config_sets = (RF_ConfigSet_t *)
   3081 				malloc(sizeof(RF_ConfigSet_t),
   3082 				       M_RAIDFRAME, M_NOWAIT);
   3083 			if (config_sets == NULL) {
   3084 				panic("rf_create_auto_sets: No memory!");
   3085 			}
   3086 			/* this one is easy :) */
   3087 			config_sets->ac = ac;
   3088 			config_sets->next = NULL;
   3089 			config_sets->rootable = 0;
   3090 			ac->next = NULL;
   3091 		} else {
   3092 			/* which set does this component fit into? */
   3093 			cset = config_sets;
   3094 			while(cset!=NULL) {
   3095 				if (rf_does_it_fit(cset, ac)) {
   3096 					/* looks like it matches... */
   3097 					ac->next = cset->ac;
   3098 					cset->ac = ac;
   3099 					break;
   3100 				}
   3101 				cset = cset->next;
   3102 			}
   3103 			if (cset==NULL) {
   3104 				/* didn't find a match above... new set..*/
   3105 				cset = (RF_ConfigSet_t *)
   3106 					malloc(sizeof(RF_ConfigSet_t),
   3107 					       M_RAIDFRAME, M_NOWAIT);
   3108 				if (cset == NULL) {
   3109 					panic("rf_create_auto_sets: No memory!");
   3110 				}
   3111 				cset->ac = ac;
   3112 				ac->next = NULL;
   3113 				cset->next = config_sets;
   3114 				cset->rootable = 0;
   3115 				config_sets = cset;
   3116 			}
   3117 		}
   3118 		ac = ac_next;
   3119 	}
   3120 
   3121 
   3122 	return(config_sets);
   3123 }
   3124 
   3125 static int
   3126 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3127 {
   3128 	RF_ComponentLabel_t *clabel1, *clabel2;
   3129 
   3130 	/* If this one matches the *first* one in the set, that's good
   3131 	   enough, since the other members of the set would have been
   3132 	   through here too... */
   3133 	/* note that we are not checking partitionSize here..
   3134 
   3135 	   Note that we are also not checking the mod_counters here.
   3136 	   If everything else matches except the mod_counter, that's
   3137 	   good enough for this test.  We will deal with the mod_counters
   3138 	   a little later in the autoconfiguration process.
   3139 
   3140 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3141 
   3142 	   The reason we don't check for this is that failed disks
   3143 	   will have lower modification counts.  If those disks are
   3144 	   not added to the set they used to belong to, then they will
   3145 	   form their own set, which may result in 2 different sets,
   3146 	   for example, competing to be configured at raid0, and
   3147 	   perhaps competing to be the root filesystem set.  If the
   3148 	   wrong ones get configured, or both attempt to become /,
   3149 	   weird behaviour and or serious lossage will occur.  Thus we
   3150 	   need to bring them into the fold here, and kick them out at
   3151 	   a later point.
   3152 
   3153 	*/
   3154 
   3155 	clabel1 = cset->ac->clabel;
   3156 	clabel2 = ac->clabel;
   3157 	if ((clabel1->version == clabel2->version) &&
   3158 	    (clabel1->serial_number == clabel2->serial_number) &&
   3159 	    (clabel1->num_rows == clabel2->num_rows) &&
   3160 	    (clabel1->num_columns == clabel2->num_columns) &&
   3161 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3162 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3163 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3164 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3165 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3166 	    (clabel1->blockSize == clabel2->blockSize) &&
   3167 	    rf_component_label_numblocks(clabel1) ==
   3168 	    rf_component_label_numblocks(clabel2) &&
   3169 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3170 	    (clabel1->root_partition == clabel2->root_partition) &&
   3171 	    (clabel1->last_unit == clabel2->last_unit) &&
   3172 	    (clabel1->config_order == clabel2->config_order)) {
   3173 		/* if it get's here, it almost *has* to be a match */
   3174 	} else {
   3175 		/* it's not consistent with somebody in the set..
   3176 		   punt */
   3177 		return(0);
   3178 	}
   3179 	/* all was fine.. it must fit... */
   3180 	return(1);
   3181 }
   3182 
/*
 * Decide whether config set 'cset' has enough live components to be
 * configured.  A component only counts as present if its mod_counter
 * equals the newest mod_counter found in the set.  Returns 1 if the
 * set is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* take the maximum mod_counter over all members */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* find a current (mod_counter matches) component
		   for column 'c' */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one.
	   The RAID 1 pair rule was already enforced in the loop above. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3285 
/*
 * Build an RF_Config_t from the component labels of autoconfig list
 * 'ac'.  The first component's label supplies the set-wide geometry;
 * each component contributes its device name, indexed by column.
 * Note: raidPtr is not referenced by this function.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1; /* also rewrites the label */
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Fill in the device name for each column */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. Clear the debug variables */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3320 
   3321 int
   3322 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3323 {
   3324 	RF_ComponentLabel_t *clabel;
   3325 	int column;
   3326 	int sparecol;
   3327 
   3328 	raidPtr->autoconfigure = new_value;
   3329 
   3330 	for(column=0; column<raidPtr->numCol; column++) {
   3331 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3332 			clabel = raidget_component_label(raidPtr, column);
   3333 			clabel->autoconfigure = new_value;
   3334 			raidflush_component_label(raidPtr, column);
   3335 		}
   3336 	}
   3337 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3338 		sparecol = raidPtr->numCol + column;
   3339 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3340 			clabel = raidget_component_label(raidPtr, sparecol);
   3341 			clabel->autoconfigure = new_value;
   3342 			raidflush_component_label(raidPtr, sparecol);
   3343 		}
   3344 	}
   3345 	return(new_value);
   3346 }
   3347 
   3348 int
   3349 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3350 {
   3351 	RF_ComponentLabel_t *clabel;
   3352 	int column;
   3353 	int sparecol;
   3354 
   3355 	raidPtr->root_partition = new_value;
   3356 	for(column=0; column<raidPtr->numCol; column++) {
   3357 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3358 			clabel = raidget_component_label(raidPtr, column);
   3359 			clabel->root_partition = new_value;
   3360 			raidflush_component_label(raidPtr, column);
   3361 		}
   3362 	}
   3363 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3364 		sparecol = raidPtr->numCol + column;
   3365 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3366 			clabel = raidget_component_label(raidPtr, sparecol);
   3367 			clabel->root_partition = new_value;
   3368 			raidflush_component_label(raidPtr, sparecol);
   3369 		}
   3370 	}
   3371 	return(new_value);
   3372 }
   3373 
   3374 void
   3375 rf_release_all_vps(RF_ConfigSet_t *cset)
   3376 {
   3377 	RF_AutoConfig_t *ac;
   3378 
   3379 	ac = cset->ac;
   3380 	while(ac!=NULL) {
   3381 		/* Close the vp, and give it back */
   3382 		if (ac->vp) {
   3383 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3384 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3385 			vput(ac->vp);
   3386 			ac->vp = NULL;
   3387 		}
   3388 		ac = ac->next;
   3389 	}
   3390 }
   3391 
   3392 
   3393 void
   3394 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3395 {
   3396 	RF_AutoConfig_t *ac;
   3397 	RF_AutoConfig_t *next_ac;
   3398 
   3399 	ac = cset->ac;
   3400 	while(ac!=NULL) {
   3401 		next_ac = ac->next;
   3402 		/* nuke the label */
   3403 		free(ac->clabel, M_RAIDFRAME);
   3404 		/* cleanup the config structure */
   3405 		free(ac, M_RAIDFRAME);
   3406 		/* "next.." */
   3407 		ac = next_ac;
   3408 	}
   3409 	/* and, finally, nuke the config set */
   3410 	free(cset, M_RAIDFRAME);
   3411 }
   3412 
   3413 
/*
 * Initialize a component label from the current state of 'raidPtr'.
 * Only the set-wide fields are filled in; per-component fields such
 * as the column number are not set here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3446 
/*
 * Bring up the RAID set described by config set 'cset'.  Picks a raid
 * unit (preferring the unit recorded in the component labels), builds
 * a configuration from the labels, and configures the set.  Returns
 * the softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* skip over units that already hold a valid configuration */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit again */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3530 
/*
 * Initialize pool 'p' for objects of 'size' bytes: pre-allocate
 * 'xmin' items and set the low/high watermarks to 'xmin'/'xmax'.
 * 'w_chan' is the wait-channel name passed to pool_init().
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3540 
   3541 /*
   3542  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3543  * to see if there is IO pending and if that IO could possibly be done
   3544  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3545  * otherwise.
   3546  *
   3547  */
   3548 int
   3549 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3550 {
   3551 	struct raid_softc *rs;
   3552 	struct dk_softc *dksc;
   3553 
   3554 	rs = raidPtr->softc;
   3555 	dksc = &rs->sc_dksc;
   3556 
   3557 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3558 		return 1;
   3559 
   3560 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3561 		/* there is work to do */
   3562 		return 0;
   3563 	}
   3564 	/* default is nothing to do */
   3565 	return 1;
   3566 }
   3567 
   3568 int
   3569 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3570 {
   3571 	uint64_t numsecs;
   3572 	unsigned secsize;
   3573 	int error;
   3574 
   3575 	error = getdisksize(vp, &numsecs, &secsize);
   3576 	if (error == 0) {
   3577 		diskPtr->blockSize = secsize;
   3578 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3579 		diskPtr->partitionSize = numsecs;
   3580 		return 0;
   3581 	}
   3582 	return error;
   3583 }
   3584 
/* Autoconf match function: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3590 
/* Autoconf attach function: intentionally empty; the real setup
   happens when the RAID set is configured. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3595 
   3596 
   3597 static int
   3598 raid_detach(device_t self, int flags)
   3599 {
   3600 	int error;
   3601 	struct raid_softc *rs = raidsoftc(self);
   3602 
   3603 	if (rs == NULL)
   3604 		return ENXIO;
   3605 
   3606 	if ((error = raidlock(rs)) != 0)
   3607 		return (error);
   3608 
   3609 	error = raid_detach_unlocked(rs);
   3610 
   3611 	raidunlock(rs);
   3612 
   3613 	/* XXX raid can be referenced here */
   3614 
   3615 	if (error)
   3616 		return error;
   3617 
   3618 	/* Free the softc */
   3619 	raidput(rs);
   3620 
   3621 	return 0;
   3622 }
   3623 
/*
 * Fill in the disk geometry for the RAID set from its parameters and
 * push it to the disk(9) layer.  The sectors/tracks values are
 * synthetic -- a RAID set has no physical geometry.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol; /* arbitrary fake value */

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3639 
   3640 /*
   3641  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3642  * We end up returning whatever error was returned by the first cache flush
   3643  * that fails.
   3644  */
   3645 
   3646 int
   3647 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3648 {
   3649 	int c, sparecol;
   3650 	int e,error;
   3651 	int force = 1;
   3652 
   3653 	error = 0;
   3654 	for (c = 0; c < raidPtr->numCol; c++) {
   3655 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3656 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3657 					  &force, FWRITE, NOCRED);
   3658 			if (e) {
   3659 				if (e != ENODEV)
   3660 					printf("raid%d: cache flush to component %s failed.\n",
   3661 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3662 				if (error == 0) {
   3663 					error = e;
   3664 				}
   3665 			}
   3666 		}
   3667 	}
   3668 
   3669 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3670 		sparecol = raidPtr->numCol + c;
   3671 		/* Need to ensure that the reconstruct actually completed! */
   3672 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3673 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3674 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3675 			if (e) {
   3676 				if (e != ENODEV)
   3677 					printf("raid%d: cache flush to component %s failed.\n",
   3678 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3679 				if (error == 0) {
   3680 					error = e;
   3681 				}
   3682 			}
   3683 		}
   3684 	}
   3685 	return error;
   3686 }
   3687 
   3688 /*
   3689  * Module interface
   3690  */
   3691 
/* module(9) declaration; "dk_subr" names the required module */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* cfdriver definition, needed only when built as a loadable module */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* module command handlers, defined below */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3701 
   3702 static int
   3703 raid_modcmd(modcmd_t cmd, void *data)
   3704 {
   3705 	int error;
   3706 
   3707 	error = 0;
   3708 	switch (cmd) {
   3709 	case MODULE_CMD_INIT:
   3710 		error = raid_modcmd_init();
   3711 		break;
   3712 	case MODULE_CMD_FINI:
   3713 		error = raid_modcmd_fini();
   3714 		break;
   3715 	default:
   3716 		error = ENOTTY;
   3717 		break;
   3718 	}
   3719 	return error;
   3720 }
   3721 
/*
 * Module initialization: attach the block/character device switch and
 * the autoconf glue, boot the RAIDframe core, and register a
 * finalizer that autoconfigures RAID sets once all real hardware has
 * been found.  Each attachment step unwinds the earlier ones on
 * failure.  Returns 0 on success or the error from an attach step.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* NOTE(review): raid_lock is held across all the attach calls
	   below until the state is fully set up */
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* EEXIST (devsw already attached) is tolerated below */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error may still be EEXIST from devsw_attach at
	   this point, in which case RAIDframe is not re-booted --
	   confirm this is the intended already-attached behaviour */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3792 
/*
 * Tear down the RAIDframe module: detach the cfattach, the cfdriver
 * (modular case) and the devsw entries, shut down the RAIDframe core
 * and destroy the global synchronization primitives.
 *
 * Returns 0 on success, EBUSY if any raid device still exists, or an
 * errno from a detach step.  On partial failure the already-detached
 * pieces are re-attached so the module remains in a consistent,
 * usable state.
 *
 * NOTE(review): the rollback re-attach calls ignore their return
 * values; if a re-attach itself failed the module would be left
 * half-detached — confirm this is an accepted limitation.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: restore cfdriver and cfattach detached above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (mirrors rf_BootRaidframe(true)). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	/* error is 0 here: all failure cases returned above. */
	return error;
}
   3842