Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.341
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.341 2016/01/06 17:40:50 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.341 2016/01/06 17:40:50 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 struct raid_softc;
    183 static void raidinit(struct raid_softc *);
    184 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    185 
    186 static int raid_match(device_t, cfdata_t, void *);
    187 static void raid_attach(device_t, device_t, void *);
    188 static int raid_detach(device_t, int);
    189 
    190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    191     daddr_t, daddr_t);
    192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    193     daddr_t, daddr_t, int);
    194 
    195 static int raidwrite_component_label(unsigned,
    196     dev_t, struct vnode *, RF_ComponentLabel_t *);
    197 static int raidread_component_label(unsigned,
    198     dev_t, struct vnode *, RF_ComponentLabel_t *);
    199 
    200 static int raid_diskstart(device_t, struct buf *bp);
    201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    202 static int raid_lastclose(device_t);
    203 
    204 static dev_type_open(raidopen);
    205 static dev_type_close(raidclose);
    206 static dev_type_read(raidread);
    207 static dev_type_write(raidwrite);
    208 static dev_type_ioctl(raidioctl);
    209 static dev_type_strategy(raidstrategy);
    210 static dev_type_dump(raiddump);
    211 static dev_type_size(raidsize);
    212 
/*
 * Block device switch: entry points used when a raid unit is accessed
 * through its block device node (mounts, dumps, disklabel ioctls).
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    223 
/*
 * Character (raw) device switch: entry points used for raw reads and
 * writes of a raid unit; unsupported operations use the no-op stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    238 
/*
 * Hooks handed to the common disk (dk) layer; it calls back into the
 * driver for I/O start, dump and final-close processing.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    248 
/*
 * Per-unit software state for a RAIDframe device.  Units live on the
 * global "raids" list and are looked up by unit number via raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* common disk-interface state */
	int	sc_unit;		/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED		0x001	/* unit has been initialized */
#define RAIDF_WLABEL		0x002	/* label area is writable */
#define RAIDF_LABELLING		0x004	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN		0x008	/* unit is being shutdown */
#define RAIDF_DETACH  		0x010	/* detach after final close */
#define RAIDF_WANTED		0x040	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED		0x080	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x100	/* unit is being changed */
    270 
    271 #define	raidunit(x)	DISKUNIT(x)
    272 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    273 
    274 extern struct cfdriver raid_cd;
    275 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    276     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    277     DVF_DETACH_SHUTDOWN);
    278 
    279 /*
    280  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    281  * Be aware that large numbers can allow the driver to consume a lot of
    282  * kernel memory, especially on writes, and in degraded mode reads.
    283  *
    284  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    285  * a single 64K write will typically require 64K for the old data,
    286  * 64K for the old parity, and 64K for the new parity, for a total
    287  * of 192K (if the parity buffer is not re-used immediately).
    288  * Even it if is used immediately, that's still 128K, which when multiplied
    289  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    290  *
    291  * Now in degraded mode, for example, a 64K read on the above setup may
    292  * require data reconstruction, which will require *all* of the 4 remaining
    293  * disks to participate -- 4 * 32K/disk == 128K again.
    294  */
    295 
    296 #ifndef RAIDOUTSTANDING
    297 #define RAIDOUTSTANDING   6
    298 #endif
    299 
    300 #define RAIDLABELDEV(dev)	\
    301 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    302 
    303 /* declared here, and made public, for the benefit of KVM stuff.. */
    304 
    305 static int raidlock(struct raid_softc *);
    306 static void raidunlock(struct raid_softc *);
    307 
    308 static int raid_detach_unlocked(struct raid_softc *);
    309 
    310 static void rf_markalldirty(RF_Raid_t *);
    311 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    312 
    313 void rf_ReconThread(struct rf_recon_req *);
    314 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    315 void rf_CopybackThread(RF_Raid_t *raidPtr);
    316 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    317 int rf_autoconfig(device_t);
    318 void rf_buildroothack(RF_ConfigSet_t *);
    319 
    320 RF_AutoConfig_t *rf_find_raid_components(void);
    321 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    322 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    323 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    324 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    325 int rf_set_autoconfig(RF_Raid_t *, int);
    326 int rf_set_rootpartition(RF_Raid_t *, int);
    327 void rf_release_all_vps(RF_ConfigSet_t *);
    328 void rf_cleanup_config_set(RF_ConfigSet_t *);
    329 int rf_have_enough_components(RF_ConfigSet_t *);
    330 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    331 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    332 
    333 /*
    334  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    335  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    336  * in the kernel config file.
    337  */
    338 #ifdef RAID_AUTOCONFIG
    339 int raidautoconfig = 1;
    340 #else
    341 int raidautoconfig = 0;
    342 #endif
    343 static bool raidautoconfigdone = false;
    344 
    345 struct RF_Pools_s rf_pools;
    346 
    347 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    348 static kmutex_t raid_lock;
    349 
    350 static struct raid_softc *
    351 raidcreate(int unit) {
    352 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    353 	if (sc == NULL) {
    354 #ifdef DIAGNOSTIC
    355 		printf("%s: out of memory\n", __func__);
    356 #endif
    357 		return NULL;
    358 	}
    359 	sc->sc_unit = unit;
    360 	cv_init(&sc->sc_cv, "raidunit");
    361 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    362 	return sc;
    363 }
    364 
    365 static void
    366 raiddestroy(struct raid_softc *sc) {
    367 	cv_destroy(&sc->sc_cv);
    368 	mutex_destroy(&sc->sc_mutex);
    369 	kmem_free(sc, sizeof(*sc));
    370 }
    371 
    372 static struct raid_softc *
    373 raidget(int unit, bool create) {
    374 	struct raid_softc *sc;
    375 	if (unit < 0) {
    376 #ifdef DIAGNOSTIC
    377 		panic("%s: unit %d!", __func__, unit);
    378 #endif
    379 		return NULL;
    380 	}
    381 	mutex_enter(&raid_lock);
    382 	LIST_FOREACH(sc, &raids, sc_link) {
    383 		if (sc->sc_unit == unit) {
    384 			mutex_exit(&raid_lock);
    385 			return sc;
    386 		}
    387 	}
    388 	mutex_exit(&raid_lock);
    389 	if (!create)
    390 		return NULL;
    391 	if ((sc = raidcreate(unit)) == NULL)
    392 		return NULL;
    393 	mutex_enter(&raid_lock);
    394 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    395 	mutex_exit(&raid_lock);
    396 	return sc;
    397 }
    398 
/*
 * Remove a softc from the global list (under the list lock) and free
 * it.  Counterpart of the create-and-insert path in raidget().
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    406 
/*
 * Legacy pseudo-device attach hook; intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    416 
    417 int
    418 rf_autoconfig(device_t self)
    419 {
    420 	RF_AutoConfig_t *ac_list;
    421 	RF_ConfigSet_t *config_sets;
    422 
    423 	if (!raidautoconfig || raidautoconfigdone == true)
    424 		return (0);
    425 
    426 	/* XXX This code can only be run once. */
    427 	raidautoconfigdone = true;
    428 
    429 #ifdef __HAVE_CPU_BOOTCONF
    430 	/*
    431 	 * 0. find the boot device if needed first so we can use it later
    432 	 * this needs to be done before we autoconfigure any raid sets,
    433 	 * because if we use wedges we are not going to be able to open
    434 	 * the boot device later
    435 	 */
    436 	if (booted_device == NULL)
    437 		cpu_bootconf();
    438 #endif
    439 	/* 1. locate all RAID components on the system */
    440 	aprint_debug("Searching for RAID components...\n");
    441 	ac_list = rf_find_raid_components();
    442 
    443 	/* 2. Sort them into their respective sets. */
    444 	config_sets = rf_create_auto_sets(ac_list);
    445 
    446 	/*
    447 	 * 3. Evaluate each set and configure the valid ones.
    448 	 * This gets done in rf_buildroothack().
    449 	 */
    450 	rf_buildroothack(config_sets);
    451 
    452 	return 1;
    453 }
    454 
    455 static int
    456 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    457 	const char *bootname = device_xname(bdv);
    458 	size_t len = strlen(bootname);
    459 
    460 	for (int col = 0; col < r->numCol; col++) {
    461 		const char *devname = r->Disks[col].devname;
    462 		devname += sizeof("/dev/") - 1;
    463 		if (strncmp(devname, "dk", 2) == 0) {
    464 			const char *parent =
    465 			    dkwedge_get_parent_name(r->Disks[col].dev);
    466 			if (parent != NULL)
    467 				devname = parent;
    468 		}
    469 		if (strncmp(devname, bootname, len) == 0) {
    470 			struct raid_softc *sc = r->softc;
    471 			aprint_debug("raid%d includes boot device %s\n",
    472 			    sc->sc_unit, devname);
    473 			return 1;
    474 		}
    475 	}
    476 	return 0;
    477 }
    478 
    479 void
    480 rf_buildroothack(RF_ConfigSet_t *config_sets)
    481 {
    482 	RF_ConfigSet_t *cset;
    483 	RF_ConfigSet_t *next_cset;
    484 	int num_root;
    485 	struct raid_softc *sc, *rsc;
    486 	struct dk_softc *dksc;
    487 
    488 	sc = rsc = NULL;
    489 	num_root = 0;
    490 	cset = config_sets;
    491 	while (cset != NULL) {
    492 		next_cset = cset->next;
    493 		if (rf_have_enough_components(cset) &&
    494 		    cset->ac->clabel->autoconfigure == 1) {
    495 			sc = rf_auto_config_set(cset);
    496 			if (sc != NULL) {
    497 				aprint_debug("raid%d: configured ok\n",
    498 				    sc->sc_unit);
    499 				if (cset->rootable) {
    500 					rsc = sc;
    501 					num_root++;
    502 				}
    503 			} else {
    504 				/* The autoconfig didn't work :( */
    505 				aprint_debug("Autoconfig failed\n");
    506 				rf_release_all_vps(cset);
    507 			}
    508 		} else {
    509 			/* we're not autoconfiguring this set...
    510 			   release the associated resources */
    511 			rf_release_all_vps(cset);
    512 		}
    513 		/* cleanup */
    514 		rf_cleanup_config_set(cset);
    515 		cset = next_cset;
    516 	}
    517 	dksc = &rsc->sc_dksc;
    518 
    519 	/* if the user has specified what the root device should be
    520 	   then we don't touch booted_device or boothowto... */
    521 
    522 	if (rootspec != NULL)
    523 		return;
    524 
    525 	/* we found something bootable... */
    526 
    527 	/*
    528 	 * XXX: The following code assumes that the root raid
    529 	 * is the first ('a') partition. This is about the best
    530 	 * we can do with a BSD disklabel, but we might be able
    531 	 * to do better with a GPT label, by setting a specified
    532 	 * attribute to indicate the root partition. We can then
    533 	 * stash the partition number in the r->root_partition
    534 	 * high bits (the bottom 2 bits are already used). For
    535 	 * now we just set booted_partition to 0 when we override
    536 	 * root.
    537 	 */
    538 	if (num_root == 1) {
    539 		device_t candidate_root;
    540 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    541 			char cname[sizeof(cset->ac->devname)];
    542 			/* XXX: assume 'a' */
    543 			snprintf(cname, sizeof(cname), "%s%c",
    544 			    device_xname(dksc->sc_dev), 'a');
    545 			candidate_root = dkwedge_find_by_wname(cname);
    546 		} else
    547 			candidate_root = dksc->sc_dev;
    548 		if (booted_device == NULL ||
    549 		    rsc->sc_r.root_partition == 1 ||
    550 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    551 			booted_device = candidate_root;
    552 			booted_partition = 0;	/* XXX assume 'a' */
    553 		}
    554 	} else if (num_root > 1) {
    555 
    556 		/*
    557 		 * Maybe the MD code can help. If it cannot, then
    558 		 * setroot() will discover that we have no
    559 		 * booted_device and will ask the user if nothing was
    560 		 * hardwired in the kernel config file
    561 		 */
    562 		if (booted_device == NULL)
    563 			return;
    564 
    565 		num_root = 0;
    566 		mutex_enter(&raid_lock);
    567 		LIST_FOREACH(sc, &raids, sc_link) {
    568 			RF_Raid_t *r = &sc->sc_r;
    569 			if (r->valid == 0)
    570 				continue;
    571 
    572 			if (r->root_partition == 0)
    573 				continue;
    574 
    575 			if (rf_containsboot(r, booted_device)) {
    576 				num_root++;
    577 				rsc = sc;
    578 				dksc = &rsc->sc_dksc;
    579 			}
    580 		}
    581 		mutex_exit(&raid_lock);
    582 
    583 		if (num_root == 1) {
    584 			booted_device = dksc->sc_dev;
    585 			booted_partition = 0;	/* XXX assume 'a' */
    586 		} else {
    587 			/* we can't guess.. require the user to answer... */
    588 			boothowto |= RB_ASKNAME;
    589 		}
    590 	}
    591 }
    592 
    593 static int
    594 raidsize(dev_t dev)
    595 {
    596 	struct raid_softc *rs;
    597 	struct dk_softc *dksc;
    598 	unsigned int unit;
    599 
    600 	unit = raidunit(dev);
    601 	if ((rs = raidget(unit, false)) == NULL)
    602 		return -1;
    603 	dksc = &rs->sc_dksc;
    604 
    605 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    606 		return -1;
    607 
    608 	return dk_size(dksc, dev);
    609 }
    610 
    611 static int
    612 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    613 {
    614 	unsigned int unit;
    615 	struct raid_softc *rs;
    616 	struct dk_softc *dksc;
    617 
    618 	unit = raidunit(dev);
    619 	if ((rs = raidget(unit, false)) == NULL)
    620 		return ENXIO;
    621 	dksc = &rs->sc_dksc;
    622 
    623 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    624 		return ENODEV;
    625 
    626         /*
    627            Note that blkno is relative to this particular partition.
    628            By adding adding RF_PROTECTED_SECTORS, we get a value that
    629 	   is relative to the partition used for the underlying component.
    630         */
    631 	blkno += RF_PROTECTED_SECTORS;
    632 
    633 	return dk_dump(dksc, dev, blkno, va, size);
    634 }
    635 
    636 static int
    637 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    638 {
    639 	struct raid_softc *rs = raidsoftc(dev);
    640 	const struct bdevsw *bdev;
    641 	RF_Raid_t *raidPtr;
    642 	int     c, sparecol, j, scol, dumpto;
    643 	int     error = 0;
    644 
    645 	raidPtr = &rs->sc_r;
    646 
    647 	/* we only support dumping to RAID 1 sets */
    648 	if (raidPtr->Layout.numDataCol != 1 ||
    649 	    raidPtr->Layout.numParityCol != 1)
    650 		return EINVAL;
    651 
    652 	if ((error = raidlock(rs)) != 0)
    653 		return error;
    654 
    655 	/* figure out what device is alive.. */
    656 
    657 	/*
    658 	   Look for a component to dump to.  The preference for the
    659 	   component to dump to is as follows:
    660 	   1) the master
    661 	   2) a used_spare of the master
    662 	   3) the slave
    663 	   4) a used_spare of the slave
    664 	*/
    665 
    666 	dumpto = -1;
    667 	for (c = 0; c < raidPtr->numCol; c++) {
    668 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    669 			/* this might be the one */
    670 			dumpto = c;
    671 			break;
    672 		}
    673 	}
    674 
    675 	/*
    676 	   At this point we have possibly selected a live master or a
    677 	   live slave.  We now check to see if there is a spared
    678 	   master (or a spared slave), if we didn't find a live master
    679 	   or a live slave.
    680 	*/
    681 
    682 	for (c = 0; c < raidPtr->numSpare; c++) {
    683 		sparecol = raidPtr->numCol + c;
    684 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    685 			/* How about this one? */
    686 			scol = -1;
    687 			for(j=0;j<raidPtr->numCol;j++) {
    688 				if (raidPtr->Disks[j].spareCol == sparecol) {
    689 					scol = j;
    690 					break;
    691 				}
    692 			}
    693 			if (scol == 0) {
    694 				/*
    695 				   We must have found a spared master!
    696 				   We'll take that over anything else
    697 				   found so far.  (We couldn't have
    698 				   found a real master before, since
    699 				   this is a used spare, and it's
    700 				   saying that it's replacing the
    701 				   master.)  On reboot (with
    702 				   autoconfiguration turned on)
    703 				   sparecol will become the 1st
    704 				   component (component0) of this set.
    705 				*/
    706 				dumpto = sparecol;
    707 				break;
    708 			} else if (scol != -1) {
    709 				/*
    710 				   Must be a spared slave.  We'll dump
    711 				   to that if we havn't found anything
    712 				   else so far.
    713 				*/
    714 				if (dumpto == -1)
    715 					dumpto = sparecol;
    716 			}
    717 		}
    718 	}
    719 
    720 	if (dumpto == -1) {
    721 		/* we couldn't find any live components to dump to!?!?
    722 		 */
    723 		error = EINVAL;
    724 		goto out;
    725 	}
    726 
    727 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    728 
    729 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    730 				blkno, va, nblk * raidPtr->bytesPerSector);
    731 
    732 out:
    733 	raidunlock(rs);
    734 
    735 	return error;
    736 }
    737 
    738 /* ARGSUSED */
    739 static int
    740 raidopen(dev_t dev, int flags, int fmt,
    741     struct lwp *l)
    742 {
    743 	int     unit = raidunit(dev);
    744 	struct raid_softc *rs;
    745 	struct dk_softc *dksc;
    746 	int     error = 0;
    747 	int     part, pmask;
    748 
    749 	if ((rs = raidget(unit, true)) == NULL)
    750 		return ENXIO;
    751 	if ((error = raidlock(rs)) != 0)
    752 		return (error);
    753 
    754 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    755 		error = EBUSY;
    756 		goto bad;
    757 	}
    758 
    759 	dksc = &rs->sc_dksc;
    760 
    761 	part = DISKPART(dev);
    762 	pmask = (1 << part);
    763 
    764 	if (!DK_BUSY(dksc, pmask) &&
    765 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    766 		/* First one... mark things as dirty... Note that we *MUST*
    767 		 have done a configure before this.  I DO NOT WANT TO BE
    768 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    769 		 THAT THEY BELONG TOGETHER!!!!! */
    770 		/* XXX should check to see if we're only open for reading
    771 		   here... If so, we needn't do this, but then need some
    772 		   other way of keeping track of what's happened.. */
    773 
    774 		rf_markalldirty(&rs->sc_r);
    775 	}
    776 
    777 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    778 		error = dk_open(dksc, dev, flags, fmt, l);
    779 
    780 bad:
    781 	raidunlock(rs);
    782 
    783 	return (error);
    784 
    785 
    786 }
    787 
    788 static int
    789 raid_lastclose(device_t self)
    790 {
    791 	struct raid_softc *rs = raidsoftc(self);
    792 
    793 	/* Last one... device is not unconfigured yet.
    794 	   Device shutdown has taken care of setting the
    795 	   clean bits if RAIDF_INITED is not set
    796 	   mark things as clean... */
    797 
    798 	rf_update_component_labels(&rs->sc_r,
    799 	    RF_FINAL_COMPONENT_UPDATE);
    800 
    801 	/* pass to unlocked code */
    802 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    803 		rs->sc_flags |= RAIDF_DETACH;
    804 
    805 	return 0;
    806 }
    807 
    808 /* ARGSUSED */
    809 static int
    810 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    811 {
    812 	int     unit = raidunit(dev);
    813 	struct raid_softc *rs;
    814 	struct dk_softc *dksc;
    815 	cfdata_t cf;
    816 	int     error = 0, do_detach = 0, do_put = 0;
    817 
    818 	if ((rs = raidget(unit, false)) == NULL)
    819 		return ENXIO;
    820 	dksc = &rs->sc_dksc;
    821 
    822 	if ((error = raidlock(rs)) != 0)
    823 		return (error);
    824 
    825 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    826 		error = dk_close(dksc, dev, flags, fmt, l);
    827 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    828 			do_detach = 1;
    829 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    830 		do_put = 1;
    831 
    832 	raidunlock(rs);
    833 
    834 	if (do_detach) {
    835 		/* free the pseudo device attach bits */
    836 		cf = device_cfdata(dksc->sc_dev);
    837 		error = config_detach(dksc->sc_dev, 0);
    838 		if (error == 0)
    839 			free(cf, M_RAIDFRAME);
    840 	} else if (do_put) {
    841 		raidput(rs);
    842 	}
    843 
    844 	return (error);
    845 
    846 }
    847 
/*
 * Wake the per-array I/O thread waiting on iodone_cv so it processes
 * newly queued or newly completed requests.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    855 
    856 static void
    857 raidstrategy(struct buf *bp)
    858 {
    859 	unsigned int unit;
    860 	struct raid_softc *rs;
    861 	struct dk_softc *dksc;
    862 	RF_Raid_t *raidPtr;
    863 
    864 	unit = raidunit(bp->b_dev);
    865 	if ((rs = raidget(unit, false)) == NULL) {
    866 		bp->b_error = ENXIO;
    867 		goto fail;
    868 	}
    869 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    870 		bp->b_error = ENXIO;
    871 		goto fail;
    872 	}
    873 	dksc = &rs->sc_dksc;
    874 	raidPtr = &rs->sc_r;
    875 
    876 	/* Queue IO only */
    877 	if (dk_strategy_defer(dksc, bp))
    878 		goto done;
    879 
    880 	/* schedule the IO to happen at the next convenient time */
    881 	raid_wakeup(raidPtr);
    882 
    883 done:
    884 	return;
    885 
    886 fail:
    887 	bp->b_resid = bp->b_bcount;
    888 	biodone(bp);
    889 }
    890 
    891 static int
    892 raid_diskstart(device_t dev, struct buf *bp)
    893 {
    894 	struct raid_softc *rs = raidsoftc(dev);
    895 	RF_Raid_t *raidPtr;
    896 
    897 	raidPtr = &rs->sc_r;
    898 	if (!raidPtr->valid) {
    899 		db1_printf(("raid is not valid..\n"));
    900 		return ENODEV;
    901 	}
    902 
    903 	/* XXX */
    904 	bp->b_resid = 0;
    905 
    906 	return raiddoaccess(raidPtr, bp);
    907 }
    908 
    909 void
    910 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    911 {
    912 	struct raid_softc *rs;
    913 	struct dk_softc *dksc;
    914 
    915 	rs = raidPtr->softc;
    916 	dksc = &rs->sc_dksc;
    917 
    918 	dk_done(dksc, bp);
    919 
    920 	rf_lock_mutex2(raidPtr->mutex);
    921 	raidPtr->openings++;
    922 	rf_unlock_mutex2(raidPtr->mutex);
    923 
    924 	/* schedule more IO */
    925 	raid_wakeup(raidPtr);
    926 }
    927 
    928 /* ARGSUSED */
    929 static int
    930 raidread(dev_t dev, struct uio *uio, int flags)
    931 {
    932 	int     unit = raidunit(dev);
    933 	struct raid_softc *rs;
    934 
    935 	if ((rs = raidget(unit, false)) == NULL)
    936 		return ENXIO;
    937 
    938 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    939 		return (ENXIO);
    940 
    941 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    942 
    943 }
    944 
    945 /* ARGSUSED */
    946 static int
    947 raidwrite(dev_t dev, struct uio *uio, int flags)
    948 {
    949 	int     unit = raidunit(dev);
    950 	struct raid_softc *rs;
    951 
    952 	if ((rs = raidget(unit, false)) == NULL)
    953 		return ENXIO;
    954 
    955 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    956 		return (ENXIO);
    957 
    958 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    959 
    960 }
    961 
/*
 * Tear down a configured RAID set.  "Unlocked" means this routine does
 * no locking of its own; the caller is expected to hold the softc lock
 * (raidlock) already.
 *
 * Returns EBUSY while the device is open or a recon / parity-rewrite /
 * copyback thread is running, 0 if the set was never initialized or on
 * success, otherwise the error from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to tear down anything that is still in use. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Clear the deferred-shutdown request; we are doing it now. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	/* Mark unconfigured before dismantling the dk/disk state below. */
	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
    999 
   1000 static int
   1001 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1002 {
   1003 	int     unit = raidunit(dev);
   1004 	int     error = 0;
   1005 	int     part, pmask;
   1006 	struct raid_softc *rs;
   1007 	struct dk_softc *dksc;
   1008 	RF_Config_t *k_cfg, *u_cfg;
   1009 	RF_Raid_t *raidPtr;
   1010 	RF_RaidDisk_t *diskPtr;
   1011 	RF_AccTotals_t *totals;
   1012 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1013 	u_char *specific_buf;
   1014 	int retcode = 0;
   1015 	int column;
   1016 /*	int raidid; */
   1017 	struct rf_recon_req *rrcopy, *rr;
   1018 	RF_ComponentLabel_t *clabel;
   1019 	RF_ComponentLabel_t *ci_label;
   1020 	RF_ComponentLabel_t **clabel_ptr;
   1021 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1022 	RF_SingleComponent_t component;
   1023 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1024 	int i, j, d;
   1025 
   1026 	if ((rs = raidget(unit, false)) == NULL)
   1027 		return ENXIO;
   1028 	dksc = &rs->sc_dksc;
   1029 	raidPtr = &rs->sc_r;
   1030 
   1031 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1032 		(int) DISKPART(dev), (int) unit, cmd));
   1033 
   1034 	/* Must be initialized for these... */
   1035 	switch (cmd) {
   1036 	case RAIDFRAME_REWRITEPARITY:
   1037 	case RAIDFRAME_GET_INFO:
   1038 	case RAIDFRAME_RESET_ACCTOTALS:
   1039 	case RAIDFRAME_GET_ACCTOTALS:
   1040 	case RAIDFRAME_KEEP_ACCTOTALS:
   1041 	case RAIDFRAME_GET_SIZE:
   1042 	case RAIDFRAME_FAIL_DISK:
   1043 	case RAIDFRAME_COPYBACK:
   1044 	case RAIDFRAME_CHECK_RECON_STATUS:
   1045 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1046 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1047 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1048 	case RAIDFRAME_ADD_HOT_SPARE:
   1049 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1050 	case RAIDFRAME_INIT_LABELS:
   1051 	case RAIDFRAME_REBUILD_IN_PLACE:
   1052 	case RAIDFRAME_CHECK_PARITY:
   1053 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1054 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1055 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1056 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1057 	case RAIDFRAME_SET_AUTOCONFIG:
   1058 	case RAIDFRAME_SET_ROOT:
   1059 	case RAIDFRAME_DELETE_COMPONENT:
   1060 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1061 	case RAIDFRAME_PARITYMAP_STATUS:
   1062 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1063 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1064 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1065 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1066 			return (ENXIO);
   1067 	}
   1068 
   1069 	switch (cmd) {
   1070 #ifdef COMPAT_50
   1071 	case RAIDFRAME_GET_INFO50:
   1072 		return rf_get_info50(raidPtr, data);
   1073 
   1074 	case RAIDFRAME_CONFIGURE50:
   1075 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1076 			return retcode;
   1077 		goto config;
   1078 #endif
   1079 		/* configure the system */
   1080 	case RAIDFRAME_CONFIGURE:
   1081 
   1082 		if (raidPtr->valid) {
   1083 			/* There is a valid RAID set running on this unit! */
   1084 			printf("raid%d: Device already configured!\n",unit);
   1085 			return(EINVAL);
   1086 		}
   1087 
   1088 		/* copy-in the configuration information */
   1089 		/* data points to a pointer to the configuration structure */
   1090 
   1091 		u_cfg = *((RF_Config_t **) data);
   1092 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1093 		if (k_cfg == NULL) {
   1094 			return (ENOMEM);
   1095 		}
   1096 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1097 		if (retcode) {
   1098 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1099 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1100 				retcode));
   1101 			goto no_config;
   1102 		}
   1103 		goto config;
   1104 	config:
   1105 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1106 
   1107 		/* allocate a buffer for the layout-specific data, and copy it
   1108 		 * in */
   1109 		if (k_cfg->layoutSpecificSize) {
   1110 			if (k_cfg->layoutSpecificSize > 10000) {
   1111 				/* sanity check */
   1112 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1113 				retcode = EINVAL;
   1114 				goto no_config;
   1115 			}
   1116 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1117 			    (u_char *));
   1118 			if (specific_buf == NULL) {
   1119 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1120 				retcode = ENOMEM;
   1121 				goto no_config;
   1122 			}
   1123 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1124 			    k_cfg->layoutSpecificSize);
   1125 			if (retcode) {
   1126 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1127 				RF_Free(specific_buf,
   1128 					k_cfg->layoutSpecificSize);
   1129 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1130 					retcode));
   1131 				goto no_config;
   1132 			}
   1133 		} else
   1134 			specific_buf = NULL;
   1135 		k_cfg->layoutSpecific = specific_buf;
   1136 
   1137 		/* should do some kind of sanity check on the configuration.
   1138 		 * Store the sum of all the bytes in the last byte? */
   1139 
   1140 		/* configure the system */
   1141 
   1142 		/*
   1143 		 * Clear the entire RAID descriptor, just to make sure
   1144 		 *  there is no stale data left in the case of a
   1145 		 *  reconfiguration
   1146 		 */
   1147 		memset(raidPtr, 0, sizeof(*raidPtr));
   1148 		raidPtr->softc = rs;
   1149 		raidPtr->raidid = unit;
   1150 
   1151 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1152 
   1153 		if (retcode == 0) {
   1154 
   1155 			/* allow this many simultaneous IO's to
   1156 			   this RAID device */
   1157 			raidPtr->openings = RAIDOUTSTANDING;
   1158 
   1159 			raidinit(rs);
   1160 			raid_wakeup(raidPtr);
   1161 			rf_markalldirty(raidPtr);
   1162 		}
   1163 		/* free the buffers.  No return code here. */
   1164 		if (k_cfg->layoutSpecificSize) {
   1165 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1166 		}
   1167 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1168 
   1169 	no_config:
   1170 		/*
   1171 		 * If configuration failed, set sc_flags so that we
   1172 		 * will detach the device when we close it.
   1173 		 */
   1174 		if (retcode != 0)
   1175 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1176 		return (retcode);
   1177 
   1178 		/* shutdown the system */
   1179 	case RAIDFRAME_SHUTDOWN:
   1180 
   1181 		part = DISKPART(dev);
   1182 		pmask = (1 << part);
   1183 
   1184 		if ((error = raidlock(rs)) != 0)
   1185 			return (error);
   1186 
   1187 		if (DK_BUSY(dksc, pmask) ||
   1188 		    raidPtr->recon_in_progress != 0 ||
   1189 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1190 		    raidPtr->copyback_in_progress != 0)
   1191 			retcode = EBUSY;
   1192 		else {
   1193 			/* detach and free on close */
   1194 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1195 			retcode = 0;
   1196 		}
   1197 
   1198 		raidunlock(rs);
   1199 
   1200 		return (retcode);
   1201 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1202 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1203 		/* need to read the component label for the disk indicated
   1204 		   by row,column in clabel */
   1205 
   1206 		/*
   1207 		 * Perhaps there should be an option to skip the in-core
   1208 		 * copy and hit the disk, as with disklabel(8).
   1209 		 */
   1210 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1211 
   1212 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1213 
   1214 		if (retcode) {
   1215 			RF_Free(clabel, sizeof(*clabel));
   1216 			return retcode;
   1217 		}
   1218 
   1219 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1220 
   1221 		column = clabel->column;
   1222 
   1223 		if ((column < 0) || (column >= raidPtr->numCol +
   1224 		    raidPtr->numSpare)) {
   1225 			RF_Free(clabel, sizeof(*clabel));
   1226 			return EINVAL;
   1227 		}
   1228 
   1229 		RF_Free(clabel, sizeof(*clabel));
   1230 
   1231 		clabel = raidget_component_label(raidPtr, column);
   1232 
   1233 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1234 
   1235 #if 0
   1236 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1237 		clabel = (RF_ComponentLabel_t *) data;
   1238 
   1239 		/* XXX check the label for valid stuff... */
   1240 		/* Note that some things *should not* get modified --
   1241 		   the user should be re-initing the labels instead of
   1242 		   trying to patch things.
   1243 		   */
   1244 
   1245 		raidid = raidPtr->raidid;
   1246 #ifdef DEBUG
   1247 		printf("raid%d: Got component label:\n", raidid);
   1248 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1249 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1250 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1251 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1252 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1253 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1254 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1255 #endif
   1256 		clabel->row = 0;
   1257 		column = clabel->column;
   1258 
   1259 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1260 			return(EINVAL);
   1261 		}
   1262 
   1263 		/* XXX this isn't allowed to do anything for now :-) */
   1264 
   1265 		/* XXX and before it is, we need to fill in the rest
   1266 		   of the fields!?!?!?! */
   1267 		memcpy(raidget_component_label(raidPtr, column),
   1268 		    clabel, sizeof(*clabel));
   1269 		raidflush_component_label(raidPtr, column);
   1270 		return (0);
   1271 #endif
   1272 
   1273 	case RAIDFRAME_INIT_LABELS:
   1274 		clabel = (RF_ComponentLabel_t *) data;
   1275 		/*
   1276 		   we only want the serial number from
   1277 		   the above.  We get all the rest of the information
   1278 		   from the config that was used to create this RAID
   1279 		   set.
   1280 		   */
   1281 
   1282 		raidPtr->serial_number = clabel->serial_number;
   1283 
   1284 		for(column=0;column<raidPtr->numCol;column++) {
   1285 			diskPtr = &raidPtr->Disks[column];
   1286 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1287 				ci_label = raidget_component_label(raidPtr,
   1288 				    column);
   1289 				/* Zeroing this is important. */
   1290 				memset(ci_label, 0, sizeof(*ci_label));
   1291 				raid_init_component_label(raidPtr, ci_label);
   1292 				ci_label->serial_number =
   1293 				    raidPtr->serial_number;
   1294 				ci_label->row = 0; /* we dont' pretend to support more */
   1295 				rf_component_label_set_partitionsize(ci_label,
   1296 				    diskPtr->partitionSize);
   1297 				ci_label->column = column;
   1298 				raidflush_component_label(raidPtr, column);
   1299 			}
   1300 			/* XXXjld what about the spares? */
   1301 		}
   1302 
   1303 		return (retcode);
   1304 	case RAIDFRAME_SET_AUTOCONFIG:
   1305 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1306 		printf("raid%d: New autoconfig value is: %d\n",
   1307 		       raidPtr->raidid, d);
   1308 		*(int *) data = d;
   1309 		return (retcode);
   1310 
   1311 	case RAIDFRAME_SET_ROOT:
   1312 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1313 		printf("raid%d: New rootpartition value is: %d\n",
   1314 		       raidPtr->raidid, d);
   1315 		*(int *) data = d;
   1316 		return (retcode);
   1317 
   1318 		/* initialize all parity */
   1319 	case RAIDFRAME_REWRITEPARITY:
   1320 
   1321 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1322 			/* Parity for RAID 0 is trivially correct */
   1323 			raidPtr->parity_good = RF_RAID_CLEAN;
   1324 			return(0);
   1325 		}
   1326 
   1327 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1328 			/* Re-write is already in progress! */
   1329 			return(EINVAL);
   1330 		}
   1331 
   1332 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1333 					   rf_RewriteParityThread,
   1334 					   raidPtr,"raid_parity");
   1335 		return (retcode);
   1336 
   1337 
   1338 	case RAIDFRAME_ADD_HOT_SPARE:
   1339 		sparePtr = (RF_SingleComponent_t *) data;
   1340 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1341 		retcode = rf_add_hot_spare(raidPtr, &component);
   1342 		return(retcode);
   1343 
   1344 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1345 		return(retcode);
   1346 
   1347 	case RAIDFRAME_DELETE_COMPONENT:
   1348 		componentPtr = (RF_SingleComponent_t *)data;
   1349 		memcpy( &component, componentPtr,
   1350 			sizeof(RF_SingleComponent_t));
   1351 		retcode = rf_delete_component(raidPtr, &component);
   1352 		return(retcode);
   1353 
   1354 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1355 		componentPtr = (RF_SingleComponent_t *)data;
   1356 		memcpy( &component, componentPtr,
   1357 			sizeof(RF_SingleComponent_t));
   1358 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1359 		return(retcode);
   1360 
   1361 	case RAIDFRAME_REBUILD_IN_PLACE:
   1362 
   1363 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1364 			/* Can't do this on a RAID 0!! */
   1365 			return(EINVAL);
   1366 		}
   1367 
   1368 		if (raidPtr->recon_in_progress == 1) {
   1369 			/* a reconstruct is already in progress! */
   1370 			return(EINVAL);
   1371 		}
   1372 
   1373 		componentPtr = (RF_SingleComponent_t *) data;
   1374 		memcpy( &component, componentPtr,
   1375 			sizeof(RF_SingleComponent_t));
   1376 		component.row = 0; /* we don't support any more */
   1377 		column = component.column;
   1378 
   1379 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1380 			return(EINVAL);
   1381 		}
   1382 
   1383 		rf_lock_mutex2(raidPtr->mutex);
   1384 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1385 		    (raidPtr->numFailures > 0)) {
   1386 			/* XXX 0 above shouldn't be constant!!! */
   1387 			/* some component other than this has failed.
   1388 			   Let's not make things worse than they already
   1389 			   are... */
   1390 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1391 			       raidPtr->raidid);
   1392 			printf("raid%d:     Col: %d   Too many failures.\n",
   1393 			       raidPtr->raidid, column);
   1394 			rf_unlock_mutex2(raidPtr->mutex);
   1395 			return (EINVAL);
   1396 		}
   1397 		if (raidPtr->Disks[column].status ==
   1398 		    rf_ds_reconstructing) {
   1399 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1400 			       raidPtr->raidid);
   1401 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1402 
   1403 			rf_unlock_mutex2(raidPtr->mutex);
   1404 			return (EINVAL);
   1405 		}
   1406 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1407 			rf_unlock_mutex2(raidPtr->mutex);
   1408 			return (EINVAL);
   1409 		}
   1410 		rf_unlock_mutex2(raidPtr->mutex);
   1411 
   1412 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1413 		if (rrcopy == NULL)
   1414 			return(ENOMEM);
   1415 
   1416 		rrcopy->raidPtr = (void *) raidPtr;
   1417 		rrcopy->col = column;
   1418 
   1419 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1420 					   rf_ReconstructInPlaceThread,
   1421 					   rrcopy,"raid_reconip");
   1422 		return(retcode);
   1423 
   1424 	case RAIDFRAME_GET_INFO:
   1425 		if (!raidPtr->valid)
   1426 			return (ENODEV);
   1427 		ucfgp = (RF_DeviceConfig_t **) data;
   1428 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1429 			  (RF_DeviceConfig_t *));
   1430 		if (d_cfg == NULL)
   1431 			return (ENOMEM);
   1432 		d_cfg->rows = 1; /* there is only 1 row now */
   1433 		d_cfg->cols = raidPtr->numCol;
   1434 		d_cfg->ndevs = raidPtr->numCol;
   1435 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1436 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1437 			return (ENOMEM);
   1438 		}
   1439 		d_cfg->nspares = raidPtr->numSpare;
   1440 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1441 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1442 			return (ENOMEM);
   1443 		}
   1444 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1445 		d = 0;
   1446 		for (j = 0; j < d_cfg->cols; j++) {
   1447 			d_cfg->devs[d] = raidPtr->Disks[j];
   1448 			d++;
   1449 		}
   1450 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1451 			d_cfg->spares[i] = raidPtr->Disks[j];
   1452 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1453 				/* XXX: raidctl(8) expects to see this as a used spare */
   1454 				d_cfg->spares[i].status = rf_ds_used_spare;
   1455 			}
   1456 		}
   1457 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1458 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1459 
   1460 		return (retcode);
   1461 
   1462 	case RAIDFRAME_CHECK_PARITY:
   1463 		*(int *) data = raidPtr->parity_good;
   1464 		return (0);
   1465 
   1466 	case RAIDFRAME_PARITYMAP_STATUS:
   1467 		if (rf_paritymap_ineligible(raidPtr))
   1468 			return EINVAL;
   1469 		rf_paritymap_status(raidPtr->parity_map,
   1470 		    (struct rf_pmstat *)data);
   1471 		return 0;
   1472 
   1473 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1474 		if (rf_paritymap_ineligible(raidPtr))
   1475 			return EINVAL;
   1476 		if (raidPtr->parity_map == NULL)
   1477 			return ENOENT; /* ??? */
   1478 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1479 			(struct rf_pmparams *)data, 1))
   1480 			return EINVAL;
   1481 		return 0;
   1482 
   1483 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1484 		if (rf_paritymap_ineligible(raidPtr))
   1485 			return EINVAL;
   1486 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1487 		return 0;
   1488 
   1489 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1490 		if (rf_paritymap_ineligible(raidPtr))
   1491 			return EINVAL;
   1492 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1493 		/* XXX should errors be passed up? */
   1494 		return 0;
   1495 
   1496 	case RAIDFRAME_RESET_ACCTOTALS:
   1497 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1498 		return (0);
   1499 
   1500 	case RAIDFRAME_GET_ACCTOTALS:
   1501 		totals = (RF_AccTotals_t *) data;
   1502 		*totals = raidPtr->acc_totals;
   1503 		return (0);
   1504 
   1505 	case RAIDFRAME_KEEP_ACCTOTALS:
   1506 		raidPtr->keep_acc_totals = *(int *)data;
   1507 		return (0);
   1508 
   1509 	case RAIDFRAME_GET_SIZE:
   1510 		*(int *) data = raidPtr->totalSectors;
   1511 		return (0);
   1512 
   1513 		/* fail a disk & optionally start reconstruction */
   1514 	case RAIDFRAME_FAIL_DISK:
   1515 
   1516 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1517 			/* Can't do this on a RAID 0!! */
   1518 			return(EINVAL);
   1519 		}
   1520 
   1521 		rr = (struct rf_recon_req *) data;
   1522 		rr->row = 0;
   1523 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1524 			return (EINVAL);
   1525 
   1526 
   1527 		rf_lock_mutex2(raidPtr->mutex);
   1528 		if (raidPtr->status == rf_rs_reconstructing) {
   1529 			/* you can't fail a disk while we're reconstructing! */
   1530 			/* XXX wrong for RAID6 */
   1531 			rf_unlock_mutex2(raidPtr->mutex);
   1532 			return (EINVAL);
   1533 		}
   1534 		if ((raidPtr->Disks[rr->col].status ==
   1535 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1536 			/* some other component has failed.  Let's not make
   1537 			   things worse. XXX wrong for RAID6 */
   1538 			rf_unlock_mutex2(raidPtr->mutex);
   1539 			return (EINVAL);
   1540 		}
   1541 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1542 			/* Can't fail a spared disk! */
   1543 			rf_unlock_mutex2(raidPtr->mutex);
   1544 			return (EINVAL);
   1545 		}
   1546 		rf_unlock_mutex2(raidPtr->mutex);
   1547 
   1548 		/* make a copy of the recon request so that we don't rely on
   1549 		 * the user's buffer */
   1550 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1551 		if (rrcopy == NULL)
   1552 			return(ENOMEM);
   1553 		memcpy(rrcopy, rr, sizeof(*rr));
   1554 		rrcopy->raidPtr = (void *) raidPtr;
   1555 
   1556 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1557 					   rf_ReconThread,
   1558 					   rrcopy,"raid_recon");
   1559 		return (0);
   1560 
   1561 		/* invoke a copyback operation after recon on whatever disk
   1562 		 * needs it, if any */
   1563 	case RAIDFRAME_COPYBACK:
   1564 
   1565 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1566 			/* This makes no sense on a RAID 0!! */
   1567 			return(EINVAL);
   1568 		}
   1569 
   1570 		if (raidPtr->copyback_in_progress == 1) {
   1571 			/* Copyback is already in progress! */
   1572 			return(EINVAL);
   1573 		}
   1574 
   1575 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1576 					   rf_CopybackThread,
   1577 					   raidPtr,"raid_copyback");
   1578 		return (retcode);
   1579 
   1580 		/* return the percentage completion of reconstruction */
   1581 	case RAIDFRAME_CHECK_RECON_STATUS:
   1582 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1583 			/* This makes no sense on a RAID 0, so tell the
   1584 			   user it's done. */
   1585 			*(int *) data = 100;
   1586 			return(0);
   1587 		}
   1588 		if (raidPtr->status != rf_rs_reconstructing)
   1589 			*(int *) data = 100;
   1590 		else {
   1591 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1592 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1593 			} else {
   1594 				*(int *) data = 0;
   1595 			}
   1596 		}
   1597 		return (0);
   1598 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1599 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1600 		if (raidPtr->status != rf_rs_reconstructing) {
   1601 			progressInfo.remaining = 0;
   1602 			progressInfo.completed = 100;
   1603 			progressInfo.total = 100;
   1604 		} else {
   1605 			progressInfo.total =
   1606 				raidPtr->reconControl->numRUsTotal;
   1607 			progressInfo.completed =
   1608 				raidPtr->reconControl->numRUsComplete;
   1609 			progressInfo.remaining = progressInfo.total -
   1610 				progressInfo.completed;
   1611 		}
   1612 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1613 				  sizeof(RF_ProgressInfo_t));
   1614 		return (retcode);
   1615 
   1616 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1617 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1618 			/* This makes no sense on a RAID 0, so tell the
   1619 			   user it's done. */
   1620 			*(int *) data = 100;
   1621 			return(0);
   1622 		}
   1623 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1624 			*(int *) data = 100 *
   1625 				raidPtr->parity_rewrite_stripes_done /
   1626 				raidPtr->Layout.numStripe;
   1627 		} else {
   1628 			*(int *) data = 100;
   1629 		}
   1630 		return (0);
   1631 
   1632 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1633 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1634 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1635 			progressInfo.total = raidPtr->Layout.numStripe;
   1636 			progressInfo.completed =
   1637 				raidPtr->parity_rewrite_stripes_done;
   1638 			progressInfo.remaining = progressInfo.total -
   1639 				progressInfo.completed;
   1640 		} else {
   1641 			progressInfo.remaining = 0;
   1642 			progressInfo.completed = 100;
   1643 			progressInfo.total = 100;
   1644 		}
   1645 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1646 				  sizeof(RF_ProgressInfo_t));
   1647 		return (retcode);
   1648 
   1649 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1650 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1651 			/* This makes no sense on a RAID 0 */
   1652 			*(int *) data = 100;
   1653 			return(0);
   1654 		}
   1655 		if (raidPtr->copyback_in_progress == 1) {
   1656 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1657 				raidPtr->Layout.numStripe;
   1658 		} else {
   1659 			*(int *) data = 100;
   1660 		}
   1661 		return (0);
   1662 
   1663 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1664 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1665 		if (raidPtr->copyback_in_progress == 1) {
   1666 			progressInfo.total = raidPtr->Layout.numStripe;
   1667 			progressInfo.completed =
   1668 				raidPtr->copyback_stripes_done;
   1669 			progressInfo.remaining = progressInfo.total -
   1670 				progressInfo.completed;
   1671 		} else {
   1672 			progressInfo.remaining = 0;
   1673 			progressInfo.completed = 100;
   1674 			progressInfo.total = 100;
   1675 		}
   1676 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1677 				  sizeof(RF_ProgressInfo_t));
   1678 		return (retcode);
   1679 
   1680 	case RAIDFRAME_SET_LAST_UNIT:
   1681 		for (column = 0; column < raidPtr->numCol; column++)
   1682 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1683 				return EBUSY;
   1684 
   1685 		for (column = 0; column < raidPtr->numCol; column++) {
   1686 			clabel = raidget_component_label(raidPtr, column);
   1687 			clabel->last_unit = *(int *)data;
   1688 			raidflush_component_label(raidPtr, column);
   1689 		}
   1690 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1691 		return 0;
   1692 
   1693 		/* the sparetable daemon calls this to wait for the kernel to
   1694 		 * need a spare table. this ioctl does not return until a
   1695 		 * spare table is needed. XXX -- calling mpsleep here in the
   1696 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1697 		 * -- I should either compute the spare table in the kernel,
   1698 		 * or have a different -- XXX XXX -- interface (a different
   1699 		 * character device) for delivering the table     -- XXX */
   1700 #if 0
   1701 	case RAIDFRAME_SPARET_WAIT:
   1702 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1703 		while (!rf_sparet_wait_queue)
   1704 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1705 		waitreq = rf_sparet_wait_queue;
   1706 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1707 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1708 
   1709 		/* structure assignment */
   1710 		*((RF_SparetWait_t *) data) = *waitreq;
   1711 
   1712 		RF_Free(waitreq, sizeof(*waitreq));
   1713 		return (0);
   1714 
   1715 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1716 		 * code in it that will cause the dameon to exit */
   1717 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1718 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1719 		waitreq->fcol = -1;
   1720 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1721 		waitreq->next = rf_sparet_wait_queue;
   1722 		rf_sparet_wait_queue = waitreq;
   1723 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1724 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1725 		return (0);
   1726 
   1727 		/* used by the spare table daemon to deliver a spare table
   1728 		 * into the kernel */
   1729 	case RAIDFRAME_SEND_SPARET:
   1730 
   1731 		/* install the spare table */
   1732 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1733 
   1734 		/* respond to the requestor.  the return status of the spare
   1735 		 * table installation is passed in the "fcol" field */
   1736 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1737 		waitreq->fcol = retcode;
   1738 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1739 		waitreq->next = rf_sparet_resp_queue;
   1740 		rf_sparet_resp_queue = waitreq;
   1741 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1742 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1743 
   1744 		return (retcode);
   1745 #endif
   1746 
   1747 	default:
   1748 		break; /* fall through to the os-specific code below */
   1749 
   1750 	}
   1751 
   1752 	if (!raidPtr->valid)
   1753 		return (EINVAL);
   1754 
   1755 	/*
   1756 	 * Add support for "regular" device ioctls here.
   1757 	 */
   1758 
   1759 	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1760 	if (error != EPASSTHROUGH)
   1761 		return (error);
   1762 
   1763 	switch (cmd) {
   1764 	case DIOCCACHESYNC:
   1765 		return rf_sync_component_caches(raidPtr);
   1766 
   1767 	default:
   1768 		retcode = ENOTTY;
   1769 	}
   1770 	return (retcode);
   1771 
   1772 }
   1773 
   1774 
   1775 /* raidinit -- complete the rest of the initialization for the
   1776    RAIDframe device.  */
   1777 
   1778 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	/* On failure we leak neither cf (freed below) nor mark INITED. */
	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* FCFS queue; RAIDframe does its own sorting internally. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Probe for wedges (GPT partitions etc.) on the new device. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1834 
   1835 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 *
 * Enqueues `req' on rf_sparet_wait_queue, wakes the daemon, then blocks
 * until a response appears on rf_sparet_resp_queue.  Returns the `fcol'
 * value from the response and frees the response structure.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Hand the request to the daemon under the sparet mutex. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* (rf_wait_cond2 drops rf_sparet_wait_mutex while sleeping and
	 * reacquires it before returning.) */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1868 #endif
   1869 
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Must drop the raid mutex around the label update, then
		 * retake it to decrement the failure count. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O on a unit that never finished raidinit(). */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk(4) layer; it will call back into raiddoaccess(). */
	dk_start(dksc, NULL);
}
   1904 
/*
 * raiddoaccess: translate a struct buf into an asynchronous
 * rf_DoAccess() call.  Returns EAGAIN when no openings are available
 * (caller retries later), ENOSPC when the request falls outside the
 * array or is not sector-aligned, otherwise the rf_DoAccess() result.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* No openings: ask the caller to come back later. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Whole sectors, plus one partial sector if bcount isn't aligned. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" makes this unconditional -- presumably
	 * deliberate since db1_printf is a debug hook; confirm. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject requests past the end of the array; the (sum < x) terms
	 * catch arithmetic wraparound of the unsigned sum. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Only full-sector transfers are supported. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening; released on I/O completion elsewhere. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1977 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one queued request to the underlying component device.
 * RF_IO_TYPE_NOP requests are completed immediately via
 * KernelWakeupFunc(); reads and writes are initialized with InitBP()
 * and handed to bdev_strategy().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp for the component read/write; completion will
		 * call KernelWakeupFunc(req). */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.

   Runs at biodone time: records any error, possibly marks the component
   failed, queues the request on the raid set's iodone list, and signals
   the raidio thread.  All bookkeeping is done under iodone_lock.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP()/dispatch. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2122 
   2123 
   2124 /*
   2125  * initialize a buf structure for doing an I/O in the kernel.
   2126  */
   2127 static void
   2128 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2129        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2130        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2131        struct proc *b_proc)
   2132 {
   2133 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2134 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2135 	bp->b_oflags = 0;
   2136 	bp->b_cflags = 0;
   2137 	bp->b_bcount = numSect << logBytesPerSector;
   2138 	bp->b_bufsize = bp->b_bcount;
   2139 	bp->b_error = 0;
   2140 	bp->b_dev = dev;
   2141 	bp->b_data = bf;
   2142 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2143 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2144 	if (bp->b_bcount == 0) {
   2145 		panic("bp->b_bcount is zero in InitBP!!");
   2146 	}
   2147 	bp->b_proc = b_proc;
   2148 	bp->b_iodone = cbFunc;
   2149 	bp->b_private = cbArg;
   2150 }
   2151 
   2152 /*
   2153  * Wait interruptibly for an exclusive lock.
   2154  *
   2155  * XXX
   2156  * Several drivers do this; it should be abstracted and made MP-safe.
   2157  * (Hmm... where have we seen this warning before :->  GO )
   2158  */
   2159 static int
   2160 raidlock(struct raid_softc *rs)
   2161 {
   2162 	int     error;
   2163 
   2164 	error = 0;
   2165 	mutex_enter(&rs->sc_mutex);
   2166 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2167 		rs->sc_flags |= RAIDF_WANTED;
   2168 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2169 		if (error != 0)
   2170 			goto done;
   2171 	}
   2172 	rs->sc_flags |= RAIDF_LOCKED;
   2173 done:
   2174 	mutex_exit(&rs->sc_mutex);
   2175 	return (error);
   2176 }
   2177 /*
   2178  * Unlock and wake up any waiters.
   2179  */
   2180 static void
   2181 raidunlock(struct raid_softc *rs)
   2182 {
   2183 
   2184 	mutex_enter(&rs->sc_mutex);
   2185 	rs->sc_flags &= ~RAIDF_LOCKED;
   2186 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2187 		rs->sc_flags &= ~RAIDF_WANTED;
   2188 		cv_broadcast(&rs->sc_cv);
   2189 	}
   2190 	mutex_exit(&rs->sc_mutex);
   2191 }
   2192 
   2193 
   2194 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2195 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2196 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2197 
/*
 * Byte offset from the start of a component at which the component
 * label ("component info") area lives.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2204 
   2205 static daddr_t
   2206 rf_component_info_size(unsigned secsize)
   2207 {
   2208 	daddr_t info_size;
   2209 
   2210 	KASSERT(secsize);
   2211 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2212 		info_size = secsize;
   2213 	else
   2214 		info_size = RF_COMPONENT_INFO_SIZE;
   2215 
   2216 	return info_size;
   2217 }
   2218 
   2219 static daddr_t
   2220 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2221 {
   2222 	daddr_t map_offset;
   2223 
   2224 	KASSERT(raidPtr->bytesPerSector);
   2225 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2226 		map_offset = raidPtr->bytesPerSector;
   2227 	else
   2228 		map_offset = RF_COMPONENT_INFO_SIZE;
   2229 	map_offset += rf_component_info_offset();
   2230 
   2231 	return map_offset;
   2232 }
   2233 
   2234 static daddr_t
   2235 rf_parity_map_size(RF_Raid_t *raidPtr)
   2236 {
   2237 	daddr_t map_size;
   2238 
   2239 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2240 		map_size = raidPtr->bytesPerSector;
   2241 	else
   2242 		map_size = RF_PARITY_MAP_SIZE;
   2243 
   2244 	return map_size;
   2245 }
   2246 
   2247 int
   2248 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2249 {
   2250 	RF_ComponentLabel_t *clabel;
   2251 
   2252 	clabel = raidget_component_label(raidPtr, col);
   2253 	clabel->clean = RF_RAID_CLEAN;
   2254 	raidflush_component_label(raidPtr, col);
   2255 	return(0);
   2256 }
   2257 
   2258 
   2259 int
   2260 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2261 {
   2262 	RF_ComponentLabel_t *clabel;
   2263 
   2264 	clabel = raidget_component_label(raidPtr, col);
   2265 	clabel->clean = RF_RAID_DIRTY;
   2266 	raidflush_component_label(raidPtr, col);
   2267 	return(0);
   2268 }
   2269 
/*
 * Read the component label for column `col' from disk into the in-core
 * copy (raid_cinfo[col].ci_label).  Returns the read error, if any.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2279 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2285 
/*
 * Write the in-core component label for column `col' to disk, stamping
 * it with the set's current mod_counter first.  Returns the write
 * error, if any.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	/* Keep the label's mod counter in sync with the raid set. */
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2300 
   2301 
/*
 * Read a component label from the component-info area of the device
 * backed by `b_vp' into *clabel.  Thin wrapper around
 * raidread_component_area().
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2311 
/* ARGSUSED */
/*
 * Read `dsize' bytes at byte offset `offset' from the raw device `dev'
 * and copy the first `msize' bytes into `data'.  Synchronous; returns
 * the error from biowait(), or EINVAL if the component has no vnode.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		/* Only the caller's msize bytes are meaningful. */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2349 
   2350 
/*
 * Write *clabel to the component-info area of the device backed by
 * `b_vp'.  Synchronous (asyncp == 0); thin wrapper around
 * raidwrite_component_area().
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2360 
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' (zero-padded out to `dsize') at byte
 * offset `offset' on the raw device `dev'.  If `asyncp' is set, the
 * write is fired off with B_ASYNC and 0 is returned immediately;
 * otherwise we wait for completion and return the biowait() error.
 *
 * NOTE(review): on the async path we return without biowait()/brelse();
 * presumably B_ASYNC completion releases the buffer -- confirm.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-fill, then copy in the caller's payload. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2395 
   2396 void
   2397 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2398 {
   2399 	int c;
   2400 
   2401 	for (c = 0; c < raidPtr->numCol; c++) {
   2402 		/* Skip dead disks. */
   2403 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2404 			continue;
   2405 		/* XXXjld: what if an error occurs here? */
   2406 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2407 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2408 		    RF_PARITYMAP_NBYTE,
   2409 		    rf_parity_map_offset(raidPtr),
   2410 		    rf_parity_map_size(raidPtr), 0);
   2411 	}
   2412 }
   2413 
   2414 void
   2415 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2416 {
   2417 	struct rf_paritymap_ondisk tmp;
   2418 	int c,first;
   2419 
   2420 	first=1;
   2421 	for (c = 0; c < raidPtr->numCol; c++) {
   2422 		/* Skip dead disks. */
   2423 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2424 			continue;
   2425 		raidread_component_area(raidPtr->Disks[c].dev,
   2426 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2427 		    RF_PARITYMAP_NBYTE,
   2428 		    rf_parity_map_offset(raidPtr),
   2429 		    rf_parity_map_size(raidPtr));
   2430 		if (first) {
   2431 			memcpy(map, &tmp, sizeof(*map));
   2432 			first = 0;
   2433 		} else {
   2434 			rf_paritymap_merge(map, &tmp);
   2435 		}
   2436 	}
   2437 }
   2438 
/*
 * Bump the set's mod counter and mark the component label of every
 * usable component (and every in-use spare) dirty on disk.  Called so
 * that an unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Now handle the spares that are actually in use. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare is standing in for.
			 * NOTE(review): scol is not reset per iteration;
			 * if no match is found it keeps the previous
			 * value (or -1) -- confirm intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2498 
   2499 
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump the mod counter, record the configured unit number, and
 * -- on a final update with good parity -- set the clean bit.
 *
 * `final' is RF_FINAL_COMPONENT_UPDATE at shutdown/unconfigure time,
 * RF_NORMAL_COMPONENT_UPDATE otherwise.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare is standing in for.
			 * NOTE(review): scol keeps its previous value (or
			 * -1) if no match is found -- confirm intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2577 
   2578 void
   2579 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2580 {
   2581 
   2582 	if (vp != NULL) {
   2583 		if (auto_configured == 1) {
   2584 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2585 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2586 			vput(vp);
   2587 
   2588 		} else {
   2589 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2590 		}
   2591 	}
   2592 }
   2593 
   2594 
   2595 void
   2596 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2597 {
   2598 	int r,c;
   2599 	struct vnode *vp;
   2600 	int acd;
   2601 
   2602 
   2603 	/* We take this opportunity to close the vnodes like we should.. */
   2604 
   2605 	for (c = 0; c < raidPtr->numCol; c++) {
   2606 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2607 		acd = raidPtr->Disks[c].auto_configured;
   2608 		rf_close_component(raidPtr, vp, acd);
   2609 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2610 		raidPtr->Disks[c].auto_configured = 0;
   2611 	}
   2612 
   2613 	for (r = 0; r < raidPtr->numSpare; r++) {
   2614 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2615 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2616 		rf_close_component(raidPtr, vp, acd);
   2617 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2618 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2619 	}
   2620 }
   2621 
   2622 
   2623 void
   2624 rf_ReconThread(struct rf_recon_req *req)
   2625 {
   2626 	int     s;
   2627 	RF_Raid_t *raidPtr;
   2628 
   2629 	s = splbio();
   2630 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2631 	raidPtr->recon_in_progress = 1;
   2632 
   2633 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2634 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2635 
   2636 	RF_Free(req, sizeof(*req));
   2637 
   2638 	raidPtr->recon_in_progress = 0;
   2639 	splx(s);
   2640 
   2641 	/* That's all... */
   2642 	kthread_exit(0);	/* does not return */
   2643 }
   2644 
   2645 void
   2646 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2647 {
   2648 	int retcode;
   2649 	int s;
   2650 
   2651 	raidPtr->parity_rewrite_stripes_done = 0;
   2652 	raidPtr->parity_rewrite_in_progress = 1;
   2653 	s = splbio();
   2654 	retcode = rf_RewriteParity(raidPtr);
   2655 	splx(s);
   2656 	if (retcode) {
   2657 		printf("raid%d: Error re-writing parity (%d)!\n",
   2658 		    raidPtr->raidid, retcode);
   2659 	} else {
   2660 		/* set the clean bit!  If we shutdown correctly,
   2661 		   the clean bit on each component label will get
   2662 		   set */
   2663 		raidPtr->parity_good = RF_RAID_CLEAN;
   2664 	}
   2665 	raidPtr->parity_rewrite_in_progress = 0;
   2666 
   2667 	/* Anyone waiting for us to stop?  If so, inform them... */
   2668 	if (raidPtr->waitShutdown) {
   2669 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2670 	}
   2671 
   2672 	/* That's all... */
   2673 	kthread_exit(0);	/* does not return */
   2674 }
   2675 
   2676 
   2677 void
   2678 rf_CopybackThread(RF_Raid_t *raidPtr)
   2679 {
   2680 	int s;
   2681 
   2682 	raidPtr->copyback_in_progress = 1;
   2683 	s = splbio();
   2684 	rf_CopybackReconstructedData(raidPtr);
   2685 	splx(s);
   2686 	raidPtr->copyback_in_progress = 0;
   2687 
   2688 	/* That's all... */
   2689 	kthread_exit(0);	/* does not return */
   2690 }
   2691 
   2692 
   2693 void
   2694 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2695 {
   2696 	int s;
   2697 	RF_Raid_t *raidPtr;
   2698 
   2699 	s = splbio();
   2700 	raidPtr = req->raidPtr;
   2701 	raidPtr->recon_in_progress = 1;
   2702 	rf_ReconstructInPlace(raidPtr, req->col);
   2703 	RF_Free(req, sizeof(*req));
   2704 	raidPtr->recon_in_progress = 0;
   2705 	splx(s);
   2706 
   2707 	/* That's all... */
   2708 	kthread_exit(0);	/* does not return */
   2709 }
   2710 
/*
 * Try to read a RAIDframe component label from the device backed by
 * `vp'.  If the label is reasonable, prepend a new RF_AutoConfig_t
 * (which takes ownership of the label and the vnode) to `ac_list' and
 * return the new list head.  If the label is unreadable/unreasonable,
 * the vnode is closed and released and the list is returned unchanged.
 * On allocation failure the entire list is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down everything collected so
		       far and give up on autoconfiguration. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* The new entry owns clabel and vp from here on. */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		/* Not usable: release the label and close the vnode. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2768 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * Returns a linked list of RF_AutoConfig_t entries (possibly NULL), one
 * per component found, built up by rf_get_component().  Each entry holds
 * an open, referenced vnode for the component; the caller owns the list
 * and is responsible for releasing the vnodes and freeing the entries.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges use the plain minor; disks use RAW_PART */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedges: accept only RAID-typed wedges;
				   the vnode is handed to rf_get_component */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2972 
   2973 
   2974 int
   2975 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2976 {
   2977 
   2978 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2979 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2980 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2981 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2982 	    clabel->row >=0 &&
   2983 	    clabel->column >= 0 &&
   2984 	    clabel->num_rows > 0 &&
   2985 	    clabel->num_columns > 0 &&
   2986 	    clabel->row < clabel->num_rows &&
   2987 	    clabel->column < clabel->num_columns &&
   2988 	    clabel->blockSize > 0 &&
   2989 	    /*
   2990 	     * numBlocksHi may contain garbage, but it is ok since
   2991 	     * the type is unsigned.  If it is really garbage,
   2992 	     * rf_fix_old_label_size() will fix it.
   2993 	     */
   2994 	    rf_component_label_numblocks(clabel) > 0) {
   2995 		/*
   2996 		 * label looks reasonable enough...
   2997 		 * let's make sure it has no old garbage.
   2998 		 */
   2999 		if (numsecs)
   3000 			rf_fix_old_label_size(clabel, numsecs);
   3001 		return(1);
   3002 	}
   3003 	return(0);
   3004 }
   3005 
   3006 
   3007 /*
   3008  * For reasons yet unknown, some old component labels have garbage in
   3009  * the newer numBlocksHi region, and this causes lossage.  Since those
   3010  * disks will also have numsecs set to less than 32 bits of sectors,
   3011  * we can determine when this corruption has occurred, and fix it.
   3012  *
   3013  * The exact same problem, with the same unknown reason, happens to
   3014  * the partitionSizeHi member as well.
   3015  */
   3016 static void
   3017 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3018 {
   3019 
   3020 	if (numsecs < ((uint64_t)1 << 32)) {
   3021 		if (clabel->numBlocksHi) {
   3022 			printf("WARNING: total sectors < 32 bits, yet "
   3023 			       "numBlocksHi set\n"
   3024 			       "WARNING: resetting numBlocksHi to zero.\n");
   3025 			clabel->numBlocksHi = 0;
   3026 		}
   3027 
   3028 		if (clabel->partitionSizeHi) {
   3029 			printf("WARNING: total sectors < 32 bits, yet "
   3030 			       "partitionSizeHi set\n"
   3031 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3032 			clabel->partitionSizeHi = 0;
   3033 		}
   3034 	}
   3035 }
   3036 
   3037 
   3038 #ifdef DEBUG
/* Pretty-print a component label to the console (DEBUG kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition display strings, indexed by (value & 3) below */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask with 3 so out-of-range values print "*invalid*" instead of
	   indexing past the table */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3070 #endif
   3071 
   3072 RF_ConfigSet_t *
   3073 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3074 {
   3075 	RF_AutoConfig_t *ac;
   3076 	RF_ConfigSet_t *config_sets;
   3077 	RF_ConfigSet_t *cset;
   3078 	RF_AutoConfig_t *ac_next;
   3079 
   3080 
   3081 	config_sets = NULL;
   3082 
   3083 	/* Go through the AutoConfig list, and figure out which components
   3084 	   belong to what sets.  */
   3085 	ac = ac_list;
   3086 	while(ac!=NULL) {
   3087 		/* we're going to putz with ac->next, so save it here
   3088 		   for use at the end of the loop */
   3089 		ac_next = ac->next;
   3090 
   3091 		if (config_sets == NULL) {
   3092 			/* will need at least this one... */
   3093 			config_sets = (RF_ConfigSet_t *)
   3094 				malloc(sizeof(RF_ConfigSet_t),
   3095 				       M_RAIDFRAME, M_NOWAIT);
   3096 			if (config_sets == NULL) {
   3097 				panic("rf_create_auto_sets: No memory!");
   3098 			}
   3099 			/* this one is easy :) */
   3100 			config_sets->ac = ac;
   3101 			config_sets->next = NULL;
   3102 			config_sets->rootable = 0;
   3103 			ac->next = NULL;
   3104 		} else {
   3105 			/* which set does this component fit into? */
   3106 			cset = config_sets;
   3107 			while(cset!=NULL) {
   3108 				if (rf_does_it_fit(cset, ac)) {
   3109 					/* looks like it matches... */
   3110 					ac->next = cset->ac;
   3111 					cset->ac = ac;
   3112 					break;
   3113 				}
   3114 				cset = cset->next;
   3115 			}
   3116 			if (cset==NULL) {
   3117 				/* didn't find a match above... new set..*/
   3118 				cset = (RF_ConfigSet_t *)
   3119 					malloc(sizeof(RF_ConfigSet_t),
   3120 					       M_RAIDFRAME, M_NOWAIT);
   3121 				if (cset == NULL) {
   3122 					panic("rf_create_auto_sets: No memory!");
   3123 				}
   3124 				cset->ac = ac;
   3125 				ac->next = NULL;
   3126 				cset->next = config_sets;
   3127 				cset->rootable = 0;
   3128 				config_sets = cset;
   3129 			}
   3130 		}
   3131 		ac = ac_next;
   3132 	}
   3133 
   3134 
   3135 	return(config_sets);
   3136 }
   3137 
   3138 static int
   3139 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3140 {
   3141 	RF_ComponentLabel_t *clabel1, *clabel2;
   3142 
   3143 	/* If this one matches the *first* one in the set, that's good
   3144 	   enough, since the other members of the set would have been
   3145 	   through here too... */
   3146 	/* note that we are not checking partitionSize here..
   3147 
   3148 	   Note that we are also not checking the mod_counters here.
   3149 	   If everything else matches except the mod_counter, that's
   3150 	   good enough for this test.  We will deal with the mod_counters
   3151 	   a little later in the autoconfiguration process.
   3152 
   3153 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3154 
   3155 	   The reason we don't check for this is that failed disks
   3156 	   will have lower modification counts.  If those disks are
   3157 	   not added to the set they used to belong to, then they will
   3158 	   form their own set, which may result in 2 different sets,
   3159 	   for example, competing to be configured at raid0, and
   3160 	   perhaps competing to be the root filesystem set.  If the
   3161 	   wrong ones get configured, or both attempt to become /,
   3162 	   weird behaviour and or serious lossage will occur.  Thus we
   3163 	   need to bring them into the fold here, and kick them out at
   3164 	   a later point.
   3165 
   3166 	*/
   3167 
   3168 	clabel1 = cset->ac->clabel;
   3169 	clabel2 = ac->clabel;
   3170 	if ((clabel1->version == clabel2->version) &&
   3171 	    (clabel1->serial_number == clabel2->serial_number) &&
   3172 	    (clabel1->num_rows == clabel2->num_rows) &&
   3173 	    (clabel1->num_columns == clabel2->num_columns) &&
   3174 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3175 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3176 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3177 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3178 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3179 	    (clabel1->blockSize == clabel2->blockSize) &&
   3180 	    rf_component_label_numblocks(clabel1) ==
   3181 	    rf_component_label_numblocks(clabel2) &&
   3182 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3183 	    (clabel1->root_partition == clabel2->root_partition) &&
   3184 	    (clabel1->last_unit == clabel2->last_unit) &&
   3185 	    (clabel1->config_order == clabel2->config_order)) {
   3186 		/* if it get's here, it almost *has* to be a match */
   3187 	} else {
   3188 		/* it's not consistent with somebody in the set..
   3189 		   punt */
   3190 		return(0);
   3191 	}
   3192 	/* all was fine.. it must fit... */
   3193 	return(1);
   3194 }
   3195 
/*
 * Decide whether configuration set 'cset' has enough live components to
 * be worth autoconfiguring.  Returns 1 if so, 0 if too many components
 * are missing for the set's RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The highest mod_counter seen is authoritative; components with
	   lower counters are stale (e.g. previously failed disks). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component at that column whose
	   mod_counter is current; anything stale counts as missing. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd (second) component
				   of a mirror pair without bailing.. reset
				   the even_pair_failed flag, and go on to
				   the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one.
	   (RAID 1 was fully handled by the pair logic above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3298 
   3299 void
   3300 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3301 			RF_Raid_t *raidPtr)
   3302 {
   3303 	RF_ComponentLabel_t *clabel;
   3304 	int i;
   3305 
   3306 	clabel = ac->clabel;
   3307 
   3308 	/* 1. Fill in the common stuff */
   3309 	config->numRow = clabel->num_rows = 1;
   3310 	config->numCol = clabel->num_columns;
   3311 	config->numSpare = 0; /* XXX should this be set here? */
   3312 	config->sectPerSU = clabel->sectPerSU;
   3313 	config->SUsPerPU = clabel->SUsPerPU;
   3314 	config->SUsPerRU = clabel->SUsPerRU;
   3315 	config->parityConfig = clabel->parityConfig;
   3316 	/* XXX... */
   3317 	strcpy(config->diskQueueType,"fifo");
   3318 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3319 	config->layoutSpecificSize = 0; /* XXX ?? */
   3320 
   3321 	while(ac!=NULL) {
   3322 		/* row/col values will be in range due to the checks
   3323 		   in reasonable_label() */
   3324 		strcpy(config->devnames[0][ac->clabel->column],
   3325 		       ac->devname);
   3326 		ac = ac->next;
   3327 	}
   3328 
   3329 	for(i=0;i<RF_MAXDBGV;i++) {
   3330 		config->debugVars[i][0] = 0;
   3331 	}
   3332 }
   3333 
   3334 int
   3335 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3336 {
   3337 	RF_ComponentLabel_t *clabel;
   3338 	int column;
   3339 	int sparecol;
   3340 
   3341 	raidPtr->autoconfigure = new_value;
   3342 
   3343 	for(column=0; column<raidPtr->numCol; column++) {
   3344 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3345 			clabel = raidget_component_label(raidPtr, column);
   3346 			clabel->autoconfigure = new_value;
   3347 			raidflush_component_label(raidPtr, column);
   3348 		}
   3349 	}
   3350 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3351 		sparecol = raidPtr->numCol + column;
   3352 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3353 			clabel = raidget_component_label(raidPtr, sparecol);
   3354 			clabel->autoconfigure = new_value;
   3355 			raidflush_component_label(raidPtr, sparecol);
   3356 		}
   3357 	}
   3358 	return(new_value);
   3359 }
   3360 
   3361 int
   3362 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3363 {
   3364 	RF_ComponentLabel_t *clabel;
   3365 	int column;
   3366 	int sparecol;
   3367 
   3368 	raidPtr->root_partition = new_value;
   3369 	for(column=0; column<raidPtr->numCol; column++) {
   3370 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3371 			clabel = raidget_component_label(raidPtr, column);
   3372 			clabel->root_partition = new_value;
   3373 			raidflush_component_label(raidPtr, column);
   3374 		}
   3375 	}
   3376 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3377 		sparecol = raidPtr->numCol + column;
   3378 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3379 			clabel = raidget_component_label(raidPtr, sparecol);
   3380 			clabel->root_partition = new_value;
   3381 			raidflush_component_label(raidPtr, sparecol);
   3382 		}
   3383 	}
   3384 	return(new_value);
   3385 }
   3386 
   3387 void
   3388 rf_release_all_vps(RF_ConfigSet_t *cset)
   3389 {
   3390 	RF_AutoConfig_t *ac;
   3391 
   3392 	ac = cset->ac;
   3393 	while(ac!=NULL) {
   3394 		/* Close the vp, and give it back */
   3395 		if (ac->vp) {
   3396 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3397 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3398 			vput(ac->vp);
   3399 			ac->vp = NULL;
   3400 		}
   3401 		ac = ac->next;
   3402 	}
   3403 }
   3404 
   3405 
   3406 void
   3407 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3408 {
   3409 	RF_AutoConfig_t *ac;
   3410 	RF_AutoConfig_t *next_ac;
   3411 
   3412 	ac = cset->ac;
   3413 	while(ac!=NULL) {
   3414 		next_ac = ac->next;
   3415 		/* nuke the label */
   3416 		free(ac->clabel, M_RAIDFRAME);
   3417 		/* cleanup the config structure */
   3418 		free(ac, M_RAIDFRAME);
   3419 		/* "next.." */
   3420 		ac = next_ac;
   3421 	}
   3422 	/* and, finally, nuke the config set */
   3423 	free(cset, M_RAIDFRAME);
   3424 }
   3425 
   3426 
/*
 * Initialize a component label from the RAID set's current state.
 * Position fields (row/column) are NOT set here; callers fill those in
 * per component.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3459 
/*
 * Autoconfigure one configuration set: pick a raid unit number, build a
 * config from the component labels, and configure the set.  Returns the
 * softc on success, NULL on failure (memory exhaustion or rf_Configure
 * error).  On success, cset->rootable is updated from the labels'
 * root_partition setting.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* start at the unit recorded in the label; walk forward past any
	   units that already have a valid (configured) RAID on them */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at the chosen unit: create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give back the unit we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3543 
/*
 * Initialize a pool(9) for RAIDframe use at IPL_BIO: pre-allocate xmin
 * items, keep at least xmin on the freelist, and cap it at xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3553 
   3554 /*
   3555  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3556  * to see if there is IO pending and if that IO could possibly be done
   3557  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3558  * otherwise.
   3559  *
   3560  */
   3561 int
   3562 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3563 {
   3564 	struct raid_softc *rs;
   3565 	struct dk_softc *dksc;
   3566 
   3567 	rs = raidPtr->softc;
   3568 	dksc = &rs->sc_dksc;
   3569 
   3570 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3571 		return 1;
   3572 
   3573 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3574 		/* there is work to do */
   3575 		return 0;
   3576 	}
   3577 	/* default is nothing to do */
   3578 	return 1;
   3579 }
   3580 
   3581 int
   3582 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3583 {
   3584 	uint64_t numsecs;
   3585 	unsigned secsize;
   3586 	int error;
   3587 
   3588 	error = getdisksize(vp, &numsecs, &secsize);
   3589 	if (error == 0) {
   3590 		diskPtr->blockSize = secsize;
   3591 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3592 		diskPtr->partitionSize = numsecs;
   3593 		return 0;
   3594 	}
   3595 	return error;
   3596 }
   3597 
/* Autoconf match: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3603 
/* Autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3608 
   3609 
   3610 static int
   3611 raid_detach(device_t self, int flags)
   3612 {
   3613 	int error;
   3614 	struct raid_softc *rs = raidsoftc(self);
   3615 
   3616 	if (rs == NULL)
   3617 		return ENXIO;
   3618 
   3619 	if ((error = raidlock(rs)) != 0)
   3620 		return (error);
   3621 
   3622 	error = raid_detach_unlocked(rs);
   3623 
   3624 	raidunlock(rs);
   3625 
   3626 	/* XXX raid can be referenced here */
   3627 
   3628 	if (error)
   3629 		return error;
   3630 
   3631 	/* Free the softc */
   3632 	raidput(rs);
   3633 
   3634 	return 0;
   3635 }
   3636 
/*
 * Fabricate a disk geometry for the RAID set from its layout parameters
 * and push it to the disk(9) layer.  The track count is synthetic
 * (4 * numCol); only secperunit/secsize are meaningful.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	/* clear first so unset fields are zero, not stale */
	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3652 
   3653 /*
   3654  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3655  * We end up returning whatever error was returned by the first cache flush
   3656  * that fails.
   3657  */
   3658 
   3659 int
   3660 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3661 {
   3662 	int c, sparecol;
   3663 	int e,error;
   3664 	int force = 1;
   3665 
   3666 	error = 0;
   3667 	for (c = 0; c < raidPtr->numCol; c++) {
   3668 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3669 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3670 					  &force, FWRITE, NOCRED);
   3671 			if (e) {
   3672 				if (e != ENODEV)
   3673 					printf("raid%d: cache flush to component %s failed.\n",
   3674 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3675 				if (error == 0) {
   3676 					error = e;
   3677 				}
   3678 			}
   3679 		}
   3680 	}
   3681 
   3682 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3683 		sparecol = raidPtr->numCol + c;
   3684 		/* Need to ensure that the reconstruct actually completed! */
   3685 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3686 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3687 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3688 			if (e) {
   3689 				if (e != ENODEV)
   3690 					printf("raid%d: cache flush to component %s failed.\n",
   3691 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3692 				if (error == 0) {
   3693 					error = e;
   3694 				}
   3695 			}
   3696 		}
   3697 	}
   3698 	return error;
   3699 }
   3700 
   3701 /*
   3702  * Module interface
   3703  */
   3704 
   3705 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3706 
   3707 #ifdef _MODULE
   3708 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3709 #endif
   3710 
   3711 static int raid_modcmd(modcmd_t, void *);
   3712 static int raid_modcmd_init(void);
   3713 static int raid_modcmd_fini(void);
   3714 
   3715 static int
   3716 raid_modcmd(modcmd_t cmd, void *data)
   3717 {
   3718 	int error;
   3719 
   3720 	error = 0;
   3721 	switch (cmd) {
   3722 	case MODULE_CMD_INIT:
   3723 		error = raid_modcmd_init();
   3724 		break;
   3725 	case MODULE_CMD_FINI:
   3726 		error = raid_modcmd_fini();
   3727 		break;
   3728 	default:
   3729 		error = ENOTTY;
   3730 		break;
   3731 	}
   3732 	return error;
   3733 }
   3734 
/*
 * Module initialization: create the global raid lock, attach the
 * block/character devsw entries and the autoconf driver/attachment,
 * boot the RAIDframe core, and register a config finalizer that will
 * autoconfigure RAID sets once all hardware has been found.  Unwinds
 * earlier registrations on failure.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors; EEXIST is tolerated for
	   the built-in (non-module) case where they are preassigned */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 at this point; the check is belt-and-braces */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: the module still works, just no autoconfig */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3805 
/*
 * MODULE_CMD_FINI handler: tear down the raid driver in the reverse
 * order of raid_modcmd_init().  Refuses to unload while any raid
 * device exists.  If a later detach step fails, the earlier ones are
 * rolled back by re-attaching them so the driver stays usable.
 *
 * Returns 0 on success, EBUSY if raid devices exist, or the error
 * from the failing detach step.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/*
		 * Roll back: re-attach the cfattach detached above.
		 * NOTE(review): the rollback's return value is ignored;
		 * if it fails the driver is left half-detached.
		 */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/*
		 * Roll back both autoconf attachments from above;
		 * return values are ignored here as well.
		 */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (inverse of rf_BootRaidframe(true)). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Destroy the spare-table synchronization objects from init. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	/* error is 0 here: all failing paths returned above. */
	return error;
}
   3855