Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.343
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.343 2016/01/07 14:15:26 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.343 2016/01/07 14:15:26 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 struct raid_softc;
    183 static void raidinit(struct raid_softc *);
    184 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    185 
    186 static int raid_match(device_t, cfdata_t, void *);
    187 static void raid_attach(device_t, device_t, void *);
    188 static int raid_detach(device_t, int);
    189 
    190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    191     daddr_t, daddr_t);
    192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    193     daddr_t, daddr_t, int);
    194 
    195 static int raidwrite_component_label(unsigned,
    196     dev_t, struct vnode *, RF_ComponentLabel_t *);
    197 static int raidread_component_label(unsigned,
    198     dev_t, struct vnode *, RF_ComponentLabel_t *);
    199 
    200 static int raid_diskstart(device_t, struct buf *bp);
    201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    202 static int raid_lastclose(device_t);
    203 
    204 static dev_type_open(raidopen);
    205 static dev_type_close(raidclose);
    206 static dev_type_read(raidread);
    207 static dev_type_write(raidwrite);
    208 static dev_type_ioctl(raidioctl);
    209 static dev_type_strategy(raidstrategy);
    210 static dev_type_dump(raiddump);
    211 static dev_type_size(raidsize);
    212 
/* Block-device switch: entry points for /dev/raidN block nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    223 
/* Character-device switch: raw (rraidN) access; read/write go via physio. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    238 
/* Hooks handed to the dk(9) disk framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    248 
/*
 * Per-unit software state.  One of these exists for each configured
 * (or in-configuration) RAID set; they are kept on the global "raids"
 * list under raid_lock.
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state; MUST be first */
	int	sc_unit;		/* raidN unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;			/* RAIDframe per-set state */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global list */
};
    261 /* sc_flags */
    262 #define RAIDF_INITED		0x01	/* unit has been initialized */
    263 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    264 #define RAIDF_DETACH  		0x04	/* detach after final close */
    265 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    266 #define RAIDF_LOCKED		0x10	/* unit is locked */
    267 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    268 
    269 #define	raidunit(x)	DISKUNIT(x)
    270 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    271 
    272 extern struct cfdriver raid_cd;
    273 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    274     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    275     DVF_DETACH_SHUTDOWN);
    276 
    277 /*
    278  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    279  * Be aware that large numbers can allow the driver to consume a lot of
    280  * kernel memory, especially on writes, and in degraded mode reads.
    281  *
    282  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    283  * a single 64K write will typically require 64K for the old data,
    284  * 64K for the old parity, and 64K for the new parity, for a total
    285  * of 192K (if the parity buffer is not re-used immediately).
    286  * Even it if is used immediately, that's still 128K, which when multiplied
    287  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    288  *
    289  * Now in degraded mode, for example, a 64K read on the above setup may
    290  * require data reconstruction, which will require *all* of the 4 remaining
    291  * disks to participate -- 4 * 32K/disk == 128K again.
    292  */
    293 
    294 #ifndef RAIDOUTSTANDING
    295 #define RAIDOUTSTANDING   6
    296 #endif
    297 
    298 #define RAIDLABELDEV(dev)	\
    299 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    300 
    301 /* declared here, and made public, for the benefit of KVM stuff.. */
    302 
    303 static int raidlock(struct raid_softc *);
    304 static void raidunlock(struct raid_softc *);
    305 
    306 static int raid_detach_unlocked(struct raid_softc *);
    307 
    308 static void rf_markalldirty(RF_Raid_t *);
    309 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    310 
    311 void rf_ReconThread(struct rf_recon_req *);
    312 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    313 void rf_CopybackThread(RF_Raid_t *raidPtr);
    314 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    315 int rf_autoconfig(device_t);
    316 void rf_buildroothack(RF_ConfigSet_t *);
    317 
    318 RF_AutoConfig_t *rf_find_raid_components(void);
    319 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    320 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    321 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    322 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    323 int rf_set_autoconfig(RF_Raid_t *, int);
    324 int rf_set_rootpartition(RF_Raid_t *, int);
    325 void rf_release_all_vps(RF_ConfigSet_t *);
    326 void rf_cleanup_config_set(RF_ConfigSet_t *);
    327 int rf_have_enough_components(RF_ConfigSet_t *);
    328 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    329 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    330 
    331 /*
    332  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    333  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    334  * in the kernel config file.
    335  */
    336 #ifdef RAID_AUTOCONFIG
    337 int raidautoconfig = 1;
    338 #else
    339 int raidautoconfig = 0;
    340 #endif
    341 static bool raidautoconfigdone = false;
    342 
    343 struct RF_Pools_s rf_pools;
    344 
    345 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    346 static kmutex_t raid_lock;
    347 
    348 static struct raid_softc *
    349 raidcreate(int unit) {
    350 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    351 	if (sc == NULL) {
    352 #ifdef DIAGNOSTIC
    353 		printf("%s: out of memory\n", __func__);
    354 #endif
    355 		return NULL;
    356 	}
    357 	sc->sc_unit = unit;
    358 	cv_init(&sc->sc_cv, "raidunit");
    359 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    360 	return sc;
    361 }
    362 
    363 static void
    364 raiddestroy(struct raid_softc *sc) {
    365 	cv_destroy(&sc->sc_cv);
    366 	mutex_destroy(&sc->sc_mutex);
    367 	kmem_free(sc, sizeof(*sc));
    368 }
    369 
    370 static struct raid_softc *
    371 raidget(int unit, bool create) {
    372 	struct raid_softc *sc;
    373 	if (unit < 0) {
    374 #ifdef DIAGNOSTIC
    375 		panic("%s: unit %d!", __func__, unit);
    376 #endif
    377 		return NULL;
    378 	}
    379 	mutex_enter(&raid_lock);
    380 	LIST_FOREACH(sc, &raids, sc_link) {
    381 		if (sc->sc_unit == unit) {
    382 			mutex_exit(&raid_lock);
    383 			return sc;
    384 		}
    385 	}
    386 	mutex_exit(&raid_lock);
    387 	if (!create)
    388 		return NULL;
    389 	if ((sc = raidcreate(unit)) == NULL)
    390 		return NULL;
    391 	mutex_enter(&raid_lock);
    392 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    393 	mutex_exit(&raid_lock);
    394 	return sc;
    395 }
    396 
/*
 * Remove "sc" from the global unit list and free it.  The unit must
 * already be unconfigured; callers hold no other reference.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    404 
/*
 * Legacy pseudo-device attach hook; intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    414 
/*
 * Autoconfigure RAID sets from labeled components found on the
 * system.  Runs at most once per boot (guarded by raidautoconfigdone
 * and the raidautoconfig knob).  Returns 1 when a configuration pass
 * was performed, 0 when skipped.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    452 
    453 static int
    454 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    455 	const char *bootname = device_xname(bdv);
    456 	size_t len = strlen(bootname);
    457 
    458 	for (int col = 0; col < r->numCol; col++) {
    459 		const char *devname = r->Disks[col].devname;
    460 		devname += sizeof("/dev/") - 1;
    461 		if (strncmp(devname, "dk", 2) == 0) {
    462 			const char *parent =
    463 			    dkwedge_get_parent_name(r->Disks[col].dev);
    464 			if (parent != NULL)
    465 				devname = parent;
    466 		}
    467 		if (strncmp(devname, bootname, len) == 0) {
    468 			struct raid_softc *sc = r->softc;
    469 			aprint_debug("raid%d includes boot device %s\n",
    470 			    sc->sc_unit, devname);
    471 			return 1;
    472 		}
    473 	}
    474 	return 0;
    475 }
    476 
    477 void
    478 rf_buildroothack(RF_ConfigSet_t *config_sets)
    479 {
    480 	RF_ConfigSet_t *cset;
    481 	RF_ConfigSet_t *next_cset;
    482 	int num_root;
    483 	struct raid_softc *sc, *rsc;
    484 	struct dk_softc *dksc;
    485 
    486 	sc = rsc = NULL;
    487 	num_root = 0;
    488 	cset = config_sets;
    489 	while (cset != NULL) {
    490 		next_cset = cset->next;
    491 		if (rf_have_enough_components(cset) &&
    492 		    cset->ac->clabel->autoconfigure == 1) {
    493 			sc = rf_auto_config_set(cset);
    494 			if (sc != NULL) {
    495 				aprint_debug("raid%d: configured ok\n",
    496 				    sc->sc_unit);
    497 				if (cset->rootable) {
    498 					rsc = sc;
    499 					num_root++;
    500 				}
    501 			} else {
    502 				/* The autoconfig didn't work :( */
    503 				aprint_debug("Autoconfig failed\n");
    504 				rf_release_all_vps(cset);
    505 			}
    506 		} else {
    507 			/* we're not autoconfiguring this set...
    508 			   release the associated resources */
    509 			rf_release_all_vps(cset);
    510 		}
    511 		/* cleanup */
    512 		rf_cleanup_config_set(cset);
    513 		cset = next_cset;
    514 	}
    515 	dksc = &rsc->sc_dksc;
    516 
    517 	/* if the user has specified what the root device should be
    518 	   then we don't touch booted_device or boothowto... */
    519 
    520 	if (rootspec != NULL)
    521 		return;
    522 
    523 	/* we found something bootable... */
    524 
    525 	/*
    526 	 * XXX: The following code assumes that the root raid
    527 	 * is the first ('a') partition. This is about the best
    528 	 * we can do with a BSD disklabel, but we might be able
    529 	 * to do better with a GPT label, by setting a specified
    530 	 * attribute to indicate the root partition. We can then
    531 	 * stash the partition number in the r->root_partition
    532 	 * high bits (the bottom 2 bits are already used). For
    533 	 * now we just set booted_partition to 0 when we override
    534 	 * root.
    535 	 */
    536 	if (num_root == 1) {
    537 		device_t candidate_root;
    538 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    539 			char cname[sizeof(cset->ac->devname)];
    540 			/* XXX: assume 'a' */
    541 			snprintf(cname, sizeof(cname), "%s%c",
    542 			    device_xname(dksc->sc_dev), 'a');
    543 			candidate_root = dkwedge_find_by_wname(cname);
    544 		} else
    545 			candidate_root = dksc->sc_dev;
    546 		if (booted_device == NULL ||
    547 		    rsc->sc_r.root_partition == 1 ||
    548 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    549 			booted_device = candidate_root;
    550 			booted_partition = 0;	/* XXX assume 'a' */
    551 		}
    552 	} else if (num_root > 1) {
    553 
    554 		/*
    555 		 * Maybe the MD code can help. If it cannot, then
    556 		 * setroot() will discover that we have no
    557 		 * booted_device and will ask the user if nothing was
    558 		 * hardwired in the kernel config file
    559 		 */
    560 		if (booted_device == NULL)
    561 			return;
    562 
    563 		num_root = 0;
    564 		mutex_enter(&raid_lock);
    565 		LIST_FOREACH(sc, &raids, sc_link) {
    566 			RF_Raid_t *r = &sc->sc_r;
    567 			if (r->valid == 0)
    568 				continue;
    569 
    570 			if (r->root_partition == 0)
    571 				continue;
    572 
    573 			if (rf_containsboot(r, booted_device)) {
    574 				num_root++;
    575 				rsc = sc;
    576 				dksc = &rsc->sc_dksc;
    577 			}
    578 		}
    579 		mutex_exit(&raid_lock);
    580 
    581 		if (num_root == 1) {
    582 			booted_device = dksc->sc_dev;
    583 			booted_partition = 0;	/* XXX assume 'a' */
    584 		} else {
    585 			/* we can't guess.. require the user to answer... */
    586 			boothowto |= RB_ASKNAME;
    587 		}
    588 	}
    589 }
    590 
    591 static int
    592 raidsize(dev_t dev)
    593 {
    594 	struct raid_softc *rs;
    595 	struct dk_softc *dksc;
    596 	unsigned int unit;
    597 
    598 	unit = raidunit(dev);
    599 	if ((rs = raidget(unit, false)) == NULL)
    600 		return -1;
    601 	dksc = &rs->sc_dksc;
    602 
    603 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    604 		return -1;
    605 
    606 	return dk_size(dksc, dev);
    607 }
    608 
    609 static int
    610 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    611 {
    612 	unsigned int unit;
    613 	struct raid_softc *rs;
    614 	struct dk_softc *dksc;
    615 
    616 	unit = raidunit(dev);
    617 	if ((rs = raidget(unit, false)) == NULL)
    618 		return ENXIO;
    619 	dksc = &rs->sc_dksc;
    620 
    621 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    622 		return ENODEV;
    623 
    624         /*
    625            Note that blkno is relative to this particular partition.
    626            By adding adding RF_PROTECTED_SECTORS, we get a value that
    627 	   is relative to the partition used for the underlying component.
    628         */
    629 	blkno += RF_PROTECTED_SECTORS;
    630 
    631 	return dk_dump(dksc, dev, blkno, va, size);
    632 }
    633 
/*
 * Write "nblk" sectors starting at "blkno" from "va" straight to one
 * live component of the set, for crash dumps.  Only RAID 1 layouts
 * are supported, since only there does a single component carry a
 * complete copy of the data.  Returns 0 or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare replaces. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump directly through the component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    739 
    740 /* ARGSUSED */
/*
 * Open entry point for both block and character devices.  Creates the
 * softc on first reference, and on the first open of a configured set
 * marks all components dirty before handing off to the disk layer.
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being shut down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	/* Only a configured set is handed to the disk layer. */
	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    789 
    790 static int
    791 raid_lastclose(device_t self)
    792 {
    793 	struct raid_softc *rs = raidsoftc(self);
    794 
    795 	/* Last one... device is not unconfigured yet.
    796 	   Device shutdown has taken care of setting the
    797 	   clean bits if RAIDF_INITED is not set
    798 	   mark things as clean... */
    799 
    800 	rf_update_component_labels(&rs->sc_r,
    801 	    RF_FINAL_COMPONENT_UPDATE);
    802 
    803 	/* pass to unlocked code */
    804 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    805 		rs->sc_flags |= RAIDF_DETACH;
    806 
    807 	return 0;
    808 }
    809 
    810 /* ARGSUSED */
/*
 * Close entry point.  Hands configured units to the disk layer; when
 * the last close has flagged the unit for detach (see
 * raid_lastclose()), tears down the pseudo-device.  An unconfigured
 * unit marked for shutdown is simply freed.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* detach/put must happen after the unit lock is dropped */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    849 
/*
 * Nudge the per-set I/O thread waiting on iodone_cv so queued
 * requests get processed.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    857 
/*
 * Block-device strategy routine: queue the buffer with the disk layer
 * and wake the I/O thread.  Buffers for missing or unconfigured units
 * fail immediately with ENXIO.
 */
static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	/* Error path: complete the buffer with nothing transferred. */
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    892 
    893 static int
    894 raid_diskstart(device_t dev, struct buf *bp)
    895 {
    896 	struct raid_softc *rs = raidsoftc(dev);
    897 	RF_Raid_t *raidPtr;
    898 
    899 	raidPtr = &rs->sc_r;
    900 	if (!raidPtr->valid) {
    901 		db1_printf(("raid is not valid..\n"));
    902 		return ENODEV;
    903 	}
    904 
    905 	/* XXX */
    906 	bp->b_resid = 0;
    907 
    908 	return raiddoaccess(raidPtr, bp);
    909 }
    910 
    911 void
    912 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    913 {
    914 	struct raid_softc *rs;
    915 	struct dk_softc *dksc;
    916 
    917 	rs = raidPtr->softc;
    918 	dksc = &rs->sc_dksc;
    919 
    920 	dk_done(dksc, bp);
    921 
    922 	rf_lock_mutex2(raidPtr->mutex);
    923 	raidPtr->openings++;
    924 	rf_unlock_mutex2(raidPtr->mutex);
    925 
    926 	/* schedule more IO */
    927 	raid_wakeup(raidPtr);
    928 }
    929 
    930 /* ARGSUSED */
    931 static int
    932 raidread(dev_t dev, struct uio *uio, int flags)
    933 {
    934 	int     unit = raidunit(dev);
    935 	struct raid_softc *rs;
    936 
    937 	if ((rs = raidget(unit, false)) == NULL)
    938 		return ENXIO;
    939 
    940 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    941 		return (ENXIO);
    942 
    943 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    944 
    945 }
    946 
    947 /* ARGSUSED */
    948 static int
    949 raidwrite(dev_t dev, struct uio *uio, int flags)
    950 {
    951 	int     unit = raidunit(dev);
    952 	struct raid_softc *rs;
    953 
    954 	if ((rs = raidget(unit, false)) == NULL)
    955 		return ENXIO;
    956 
    957 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    958 		return (ENXIO);
    959 
    960 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    961 
    962 }
    963 
/*
 * Tear down a configured RAID set.  The caller holds the unit lock
 * (raidlock); "unlocked" means this routine takes no locks itself.
 * Returns EBUSY if the device is still open or a background operation
 * (reconstruction, parity rewrite, copyback) is in progress.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while the unit is in use or busy. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to shut down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* The pending shutdown request is being serviced now. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk.  Order matters: wedges first, then the
	 * disk(9) structures, then the dk(4) state. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1001 
   1002 static int
   1003 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1004 {
   1005 	int     unit = raidunit(dev);
   1006 	int     error = 0;
   1007 	int     part, pmask;
   1008 	struct raid_softc *rs;
   1009 	struct dk_softc *dksc;
   1010 	RF_Config_t *k_cfg, *u_cfg;
   1011 	RF_Raid_t *raidPtr;
   1012 	RF_RaidDisk_t *diskPtr;
   1013 	RF_AccTotals_t *totals;
   1014 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1015 	u_char *specific_buf;
   1016 	int retcode = 0;
   1017 	int column;
   1018 /*	int raidid; */
   1019 	struct rf_recon_req *rrcopy, *rr;
   1020 	RF_ComponentLabel_t *clabel;
   1021 	RF_ComponentLabel_t *ci_label;
   1022 	RF_ComponentLabel_t **clabel_ptr;
   1023 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1024 	RF_SingleComponent_t component;
   1025 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1026 	int i, j, d;
   1027 
   1028 	if ((rs = raidget(unit, false)) == NULL)
   1029 		return ENXIO;
   1030 	dksc = &rs->sc_dksc;
   1031 	raidPtr = &rs->sc_r;
   1032 
   1033 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1034 		(int) DISKPART(dev), (int) unit, cmd));
   1035 
   1036 	/* Must be initialized for these... */
   1037 	switch (cmd) {
   1038 	case RAIDFRAME_REWRITEPARITY:
   1039 	case RAIDFRAME_GET_INFO:
   1040 	case RAIDFRAME_RESET_ACCTOTALS:
   1041 	case RAIDFRAME_GET_ACCTOTALS:
   1042 	case RAIDFRAME_KEEP_ACCTOTALS:
   1043 	case RAIDFRAME_GET_SIZE:
   1044 	case RAIDFRAME_FAIL_DISK:
   1045 	case RAIDFRAME_COPYBACK:
   1046 	case RAIDFRAME_CHECK_RECON_STATUS:
   1047 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1048 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1049 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1050 	case RAIDFRAME_ADD_HOT_SPARE:
   1051 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1052 	case RAIDFRAME_INIT_LABELS:
   1053 	case RAIDFRAME_REBUILD_IN_PLACE:
   1054 	case RAIDFRAME_CHECK_PARITY:
   1055 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1056 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1057 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1058 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1059 	case RAIDFRAME_SET_AUTOCONFIG:
   1060 	case RAIDFRAME_SET_ROOT:
   1061 	case RAIDFRAME_DELETE_COMPONENT:
   1062 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1063 	case RAIDFRAME_PARITYMAP_STATUS:
   1064 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1065 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1066 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1067 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1068 			return (ENXIO);
   1069 	}
   1070 
   1071 	switch (cmd) {
   1072 #ifdef COMPAT_50
   1073 	case RAIDFRAME_GET_INFO50:
   1074 		return rf_get_info50(raidPtr, data);
   1075 
   1076 	case RAIDFRAME_CONFIGURE50:
   1077 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1078 			return retcode;
   1079 		goto config;
   1080 #endif
   1081 		/* configure the system */
   1082 	case RAIDFRAME_CONFIGURE:
   1083 
   1084 		if (raidPtr->valid) {
   1085 			/* There is a valid RAID set running on this unit! */
   1086 			printf("raid%d: Device already configured!\n",unit);
   1087 			return(EINVAL);
   1088 		}
   1089 
   1090 		/* copy-in the configuration information */
   1091 		/* data points to a pointer to the configuration structure */
   1092 
   1093 		u_cfg = *((RF_Config_t **) data);
   1094 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1095 		if (k_cfg == NULL) {
   1096 			return (ENOMEM);
   1097 		}
   1098 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1099 		if (retcode) {
   1100 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1101 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1102 				retcode));
   1103 			goto no_config;
   1104 		}
   1105 		goto config;
   1106 	config:
   1107 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1108 
   1109 		/* allocate a buffer for the layout-specific data, and copy it
   1110 		 * in */
   1111 		if (k_cfg->layoutSpecificSize) {
   1112 			if (k_cfg->layoutSpecificSize > 10000) {
   1113 				/* sanity check */
   1114 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1115 				retcode = EINVAL;
   1116 				goto no_config;
   1117 			}
   1118 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1119 			    (u_char *));
   1120 			if (specific_buf == NULL) {
   1121 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1122 				retcode = ENOMEM;
   1123 				goto no_config;
   1124 			}
   1125 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1126 			    k_cfg->layoutSpecificSize);
   1127 			if (retcode) {
   1128 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1129 				RF_Free(specific_buf,
   1130 					k_cfg->layoutSpecificSize);
   1131 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1132 					retcode));
   1133 				goto no_config;
   1134 			}
   1135 		} else
   1136 			specific_buf = NULL;
   1137 		k_cfg->layoutSpecific = specific_buf;
   1138 
   1139 		/* should do some kind of sanity check on the configuration.
   1140 		 * Store the sum of all the bytes in the last byte? */
   1141 
   1142 		/* configure the system */
   1143 
   1144 		/*
   1145 		 * Clear the entire RAID descriptor, just to make sure
   1146 		 *  there is no stale data left in the case of a
   1147 		 *  reconfiguration
   1148 		 */
   1149 		memset(raidPtr, 0, sizeof(*raidPtr));
   1150 		raidPtr->softc = rs;
   1151 		raidPtr->raidid = unit;
   1152 
   1153 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1154 
   1155 		if (retcode == 0) {
   1156 
   1157 			/* allow this many simultaneous IO's to
   1158 			   this RAID device */
   1159 			raidPtr->openings = RAIDOUTSTANDING;
   1160 
   1161 			raidinit(rs);
   1162 			raid_wakeup(raidPtr);
   1163 			rf_markalldirty(raidPtr);
   1164 		}
   1165 		/* free the buffers.  No return code here. */
   1166 		if (k_cfg->layoutSpecificSize) {
   1167 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1168 		}
   1169 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1170 
   1171 	no_config:
   1172 		/*
   1173 		 * If configuration failed, set sc_flags so that we
   1174 		 * will detach the device when we close it.
   1175 		 */
   1176 		if (retcode != 0)
   1177 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1178 		return (retcode);
   1179 
   1180 		/* shutdown the system */
   1181 	case RAIDFRAME_SHUTDOWN:
   1182 
   1183 		part = DISKPART(dev);
   1184 		pmask = (1 << part);
   1185 
   1186 		if ((error = raidlock(rs)) != 0)
   1187 			return (error);
   1188 
   1189 		if (DK_BUSY(dksc, pmask) ||
   1190 		    raidPtr->recon_in_progress != 0 ||
   1191 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1192 		    raidPtr->copyback_in_progress != 0)
   1193 			retcode = EBUSY;
   1194 		else {
   1195 			/* detach and free on close */
   1196 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1197 			retcode = 0;
   1198 		}
   1199 
   1200 		raidunlock(rs);
   1201 
   1202 		return (retcode);
   1203 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1204 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1205 		/* need to read the component label for the disk indicated
   1206 		   by row,column in clabel */
   1207 
   1208 		/*
   1209 		 * Perhaps there should be an option to skip the in-core
   1210 		 * copy and hit the disk, as with disklabel(8).
   1211 		 */
   1212 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1213 
   1214 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1215 
   1216 		if (retcode) {
   1217 			RF_Free(clabel, sizeof(*clabel));
   1218 			return retcode;
   1219 		}
   1220 
   1221 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1222 
   1223 		column = clabel->column;
   1224 
   1225 		if ((column < 0) || (column >= raidPtr->numCol +
   1226 		    raidPtr->numSpare)) {
   1227 			RF_Free(clabel, sizeof(*clabel));
   1228 			return EINVAL;
   1229 		}
   1230 
   1231 		RF_Free(clabel, sizeof(*clabel));
   1232 
   1233 		clabel = raidget_component_label(raidPtr, column);
   1234 
   1235 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1236 
   1237 #if 0
   1238 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1239 		clabel = (RF_ComponentLabel_t *) data;
   1240 
   1241 		/* XXX check the label for valid stuff... */
   1242 		/* Note that some things *should not* get modified --
   1243 		   the user should be re-initing the labels instead of
   1244 		   trying to patch things.
   1245 		   */
   1246 
   1247 		raidid = raidPtr->raidid;
   1248 #ifdef DEBUG
   1249 		printf("raid%d: Got component label:\n", raidid);
   1250 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1251 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1252 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1253 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1254 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1255 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1256 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1257 #endif
   1258 		clabel->row = 0;
   1259 		column = clabel->column;
   1260 
   1261 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1262 			return(EINVAL);
   1263 		}
   1264 
   1265 		/* XXX this isn't allowed to do anything for now :-) */
   1266 
   1267 		/* XXX and before it is, we need to fill in the rest
   1268 		   of the fields!?!?!?! */
   1269 		memcpy(raidget_component_label(raidPtr, column),
   1270 		    clabel, sizeof(*clabel));
   1271 		raidflush_component_label(raidPtr, column);
   1272 		return (0);
   1273 #endif
   1274 
   1275 	case RAIDFRAME_INIT_LABELS:
   1276 		clabel = (RF_ComponentLabel_t *) data;
   1277 		/*
   1278 		   we only want the serial number from
   1279 		   the above.  We get all the rest of the information
   1280 		   from the config that was used to create this RAID
   1281 		   set.
   1282 		   */
   1283 
   1284 		raidPtr->serial_number = clabel->serial_number;
   1285 
   1286 		for(column=0;column<raidPtr->numCol;column++) {
   1287 			diskPtr = &raidPtr->Disks[column];
   1288 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1289 				ci_label = raidget_component_label(raidPtr,
   1290 				    column);
   1291 				/* Zeroing this is important. */
   1292 				memset(ci_label, 0, sizeof(*ci_label));
   1293 				raid_init_component_label(raidPtr, ci_label);
   1294 				ci_label->serial_number =
   1295 				    raidPtr->serial_number;
   1296 				ci_label->row = 0; /* we dont' pretend to support more */
   1297 				rf_component_label_set_partitionsize(ci_label,
   1298 				    diskPtr->partitionSize);
   1299 				ci_label->column = column;
   1300 				raidflush_component_label(raidPtr, column);
   1301 			}
   1302 			/* XXXjld what about the spares? */
   1303 		}
   1304 
   1305 		return (retcode);
   1306 	case RAIDFRAME_SET_AUTOCONFIG:
   1307 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1308 		printf("raid%d: New autoconfig value is: %d\n",
   1309 		       raidPtr->raidid, d);
   1310 		*(int *) data = d;
   1311 		return (retcode);
   1312 
   1313 	case RAIDFRAME_SET_ROOT:
   1314 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1315 		printf("raid%d: New rootpartition value is: %d\n",
   1316 		       raidPtr->raidid, d);
   1317 		*(int *) data = d;
   1318 		return (retcode);
   1319 
   1320 		/* initialize all parity */
   1321 	case RAIDFRAME_REWRITEPARITY:
   1322 
   1323 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1324 			/* Parity for RAID 0 is trivially correct */
   1325 			raidPtr->parity_good = RF_RAID_CLEAN;
   1326 			return(0);
   1327 		}
   1328 
   1329 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1330 			/* Re-write is already in progress! */
   1331 			return(EINVAL);
   1332 		}
   1333 
   1334 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1335 					   rf_RewriteParityThread,
   1336 					   raidPtr,"raid_parity");
   1337 		return (retcode);
   1338 
   1339 
   1340 	case RAIDFRAME_ADD_HOT_SPARE:
   1341 		sparePtr = (RF_SingleComponent_t *) data;
   1342 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1343 		retcode = rf_add_hot_spare(raidPtr, &component);
   1344 		return(retcode);
   1345 
   1346 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1347 		return(retcode);
   1348 
   1349 	case RAIDFRAME_DELETE_COMPONENT:
   1350 		componentPtr = (RF_SingleComponent_t *)data;
   1351 		memcpy( &component, componentPtr,
   1352 			sizeof(RF_SingleComponent_t));
   1353 		retcode = rf_delete_component(raidPtr, &component);
   1354 		return(retcode);
   1355 
   1356 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1357 		componentPtr = (RF_SingleComponent_t *)data;
   1358 		memcpy( &component, componentPtr,
   1359 			sizeof(RF_SingleComponent_t));
   1360 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1361 		return(retcode);
   1362 
   1363 	case RAIDFRAME_REBUILD_IN_PLACE:
   1364 
   1365 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1366 			/* Can't do this on a RAID 0!! */
   1367 			return(EINVAL);
   1368 		}
   1369 
   1370 		if (raidPtr->recon_in_progress == 1) {
   1371 			/* a reconstruct is already in progress! */
   1372 			return(EINVAL);
   1373 		}
   1374 
   1375 		componentPtr = (RF_SingleComponent_t *) data;
   1376 		memcpy( &component, componentPtr,
   1377 			sizeof(RF_SingleComponent_t));
   1378 		component.row = 0; /* we don't support any more */
   1379 		column = component.column;
   1380 
   1381 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1382 			return(EINVAL);
   1383 		}
   1384 
   1385 		rf_lock_mutex2(raidPtr->mutex);
   1386 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1387 		    (raidPtr->numFailures > 0)) {
   1388 			/* XXX 0 above shouldn't be constant!!! */
   1389 			/* some component other than this has failed.
   1390 			   Let's not make things worse than they already
   1391 			   are... */
   1392 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1393 			       raidPtr->raidid);
   1394 			printf("raid%d:     Col: %d   Too many failures.\n",
   1395 			       raidPtr->raidid, column);
   1396 			rf_unlock_mutex2(raidPtr->mutex);
   1397 			return (EINVAL);
   1398 		}
   1399 		if (raidPtr->Disks[column].status ==
   1400 		    rf_ds_reconstructing) {
   1401 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1402 			       raidPtr->raidid);
   1403 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1404 
   1405 			rf_unlock_mutex2(raidPtr->mutex);
   1406 			return (EINVAL);
   1407 		}
   1408 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1409 			rf_unlock_mutex2(raidPtr->mutex);
   1410 			return (EINVAL);
   1411 		}
   1412 		rf_unlock_mutex2(raidPtr->mutex);
   1413 
   1414 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1415 		if (rrcopy == NULL)
   1416 			return(ENOMEM);
   1417 
   1418 		rrcopy->raidPtr = (void *) raidPtr;
   1419 		rrcopy->col = column;
   1420 
   1421 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1422 					   rf_ReconstructInPlaceThread,
   1423 					   rrcopy,"raid_reconip");
   1424 		return(retcode);
   1425 
   1426 	case RAIDFRAME_GET_INFO:
   1427 		if (!raidPtr->valid)
   1428 			return (ENODEV);
   1429 		ucfgp = (RF_DeviceConfig_t **) data;
   1430 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1431 			  (RF_DeviceConfig_t *));
   1432 		if (d_cfg == NULL)
   1433 			return (ENOMEM);
   1434 		d_cfg->rows = 1; /* there is only 1 row now */
   1435 		d_cfg->cols = raidPtr->numCol;
   1436 		d_cfg->ndevs = raidPtr->numCol;
   1437 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1438 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1439 			return (ENOMEM);
   1440 		}
   1441 		d_cfg->nspares = raidPtr->numSpare;
   1442 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1443 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1444 			return (ENOMEM);
   1445 		}
   1446 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1447 		d = 0;
   1448 		for (j = 0; j < d_cfg->cols; j++) {
   1449 			d_cfg->devs[d] = raidPtr->Disks[j];
   1450 			d++;
   1451 		}
   1452 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1453 			d_cfg->spares[i] = raidPtr->Disks[j];
   1454 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1455 				/* XXX: raidctl(8) expects to see this as a used spare */
   1456 				d_cfg->spares[i].status = rf_ds_used_spare;
   1457 			}
   1458 		}
   1459 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1460 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1461 
   1462 		return (retcode);
   1463 
   1464 	case RAIDFRAME_CHECK_PARITY:
   1465 		*(int *) data = raidPtr->parity_good;
   1466 		return (0);
   1467 
   1468 	case RAIDFRAME_PARITYMAP_STATUS:
   1469 		if (rf_paritymap_ineligible(raidPtr))
   1470 			return EINVAL;
   1471 		rf_paritymap_status(raidPtr->parity_map,
   1472 		    (struct rf_pmstat *)data);
   1473 		return 0;
   1474 
   1475 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1476 		if (rf_paritymap_ineligible(raidPtr))
   1477 			return EINVAL;
   1478 		if (raidPtr->parity_map == NULL)
   1479 			return ENOENT; /* ??? */
   1480 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1481 			(struct rf_pmparams *)data, 1))
   1482 			return EINVAL;
   1483 		return 0;
   1484 
   1485 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1486 		if (rf_paritymap_ineligible(raidPtr))
   1487 			return EINVAL;
   1488 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1489 		return 0;
   1490 
   1491 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1492 		if (rf_paritymap_ineligible(raidPtr))
   1493 			return EINVAL;
   1494 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1495 		/* XXX should errors be passed up? */
   1496 		return 0;
   1497 
   1498 	case RAIDFRAME_RESET_ACCTOTALS:
   1499 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1500 		return (0);
   1501 
   1502 	case RAIDFRAME_GET_ACCTOTALS:
   1503 		totals = (RF_AccTotals_t *) data;
   1504 		*totals = raidPtr->acc_totals;
   1505 		return (0);
   1506 
   1507 	case RAIDFRAME_KEEP_ACCTOTALS:
   1508 		raidPtr->keep_acc_totals = *(int *)data;
   1509 		return (0);
   1510 
   1511 	case RAIDFRAME_GET_SIZE:
   1512 		*(int *) data = raidPtr->totalSectors;
   1513 		return (0);
   1514 
   1515 		/* fail a disk & optionally start reconstruction */
   1516 	case RAIDFRAME_FAIL_DISK:
   1517 
   1518 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1519 			/* Can't do this on a RAID 0!! */
   1520 			return(EINVAL);
   1521 		}
   1522 
   1523 		rr = (struct rf_recon_req *) data;
   1524 		rr->row = 0;
   1525 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1526 			return (EINVAL);
   1527 
   1528 
   1529 		rf_lock_mutex2(raidPtr->mutex);
   1530 		if (raidPtr->status == rf_rs_reconstructing) {
   1531 			/* you can't fail a disk while we're reconstructing! */
   1532 			/* XXX wrong for RAID6 */
   1533 			rf_unlock_mutex2(raidPtr->mutex);
   1534 			return (EINVAL);
   1535 		}
   1536 		if ((raidPtr->Disks[rr->col].status ==
   1537 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1538 			/* some other component has failed.  Let's not make
   1539 			   things worse. XXX wrong for RAID6 */
   1540 			rf_unlock_mutex2(raidPtr->mutex);
   1541 			return (EINVAL);
   1542 		}
   1543 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1544 			/* Can't fail a spared disk! */
   1545 			rf_unlock_mutex2(raidPtr->mutex);
   1546 			return (EINVAL);
   1547 		}
   1548 		rf_unlock_mutex2(raidPtr->mutex);
   1549 
   1550 		/* make a copy of the recon request so that we don't rely on
   1551 		 * the user's buffer */
   1552 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1553 		if (rrcopy == NULL)
   1554 			return(ENOMEM);
   1555 		memcpy(rrcopy, rr, sizeof(*rr));
   1556 		rrcopy->raidPtr = (void *) raidPtr;
   1557 
   1558 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1559 					   rf_ReconThread,
   1560 					   rrcopy,"raid_recon");
   1561 		return (0);
   1562 
   1563 		/* invoke a copyback operation after recon on whatever disk
   1564 		 * needs it, if any */
   1565 	case RAIDFRAME_COPYBACK:
   1566 
   1567 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1568 			/* This makes no sense on a RAID 0!! */
   1569 			return(EINVAL);
   1570 		}
   1571 
   1572 		if (raidPtr->copyback_in_progress == 1) {
   1573 			/* Copyback is already in progress! */
   1574 			return(EINVAL);
   1575 		}
   1576 
   1577 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1578 					   rf_CopybackThread,
   1579 					   raidPtr,"raid_copyback");
   1580 		return (retcode);
   1581 
   1582 		/* return the percentage completion of reconstruction */
   1583 	case RAIDFRAME_CHECK_RECON_STATUS:
   1584 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1585 			/* This makes no sense on a RAID 0, so tell the
   1586 			   user it's done. */
   1587 			*(int *) data = 100;
   1588 			return(0);
   1589 		}
   1590 		if (raidPtr->status != rf_rs_reconstructing)
   1591 			*(int *) data = 100;
   1592 		else {
   1593 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1594 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1595 			} else {
   1596 				*(int *) data = 0;
   1597 			}
   1598 		}
   1599 		return (0);
   1600 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1601 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1602 		if (raidPtr->status != rf_rs_reconstructing) {
   1603 			progressInfo.remaining = 0;
   1604 			progressInfo.completed = 100;
   1605 			progressInfo.total = 100;
   1606 		} else {
   1607 			progressInfo.total =
   1608 				raidPtr->reconControl->numRUsTotal;
   1609 			progressInfo.completed =
   1610 				raidPtr->reconControl->numRUsComplete;
   1611 			progressInfo.remaining = progressInfo.total -
   1612 				progressInfo.completed;
   1613 		}
   1614 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1615 				  sizeof(RF_ProgressInfo_t));
   1616 		return (retcode);
   1617 
   1618 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1619 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1620 			/* This makes no sense on a RAID 0, so tell the
   1621 			   user it's done. */
   1622 			*(int *) data = 100;
   1623 			return(0);
   1624 		}
   1625 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1626 			*(int *) data = 100 *
   1627 				raidPtr->parity_rewrite_stripes_done /
   1628 				raidPtr->Layout.numStripe;
   1629 		} else {
   1630 			*(int *) data = 100;
   1631 		}
   1632 		return (0);
   1633 
   1634 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1635 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1636 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1637 			progressInfo.total = raidPtr->Layout.numStripe;
   1638 			progressInfo.completed =
   1639 				raidPtr->parity_rewrite_stripes_done;
   1640 			progressInfo.remaining = progressInfo.total -
   1641 				progressInfo.completed;
   1642 		} else {
   1643 			progressInfo.remaining = 0;
   1644 			progressInfo.completed = 100;
   1645 			progressInfo.total = 100;
   1646 		}
   1647 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1648 				  sizeof(RF_ProgressInfo_t));
   1649 		return (retcode);
   1650 
   1651 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1652 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1653 			/* This makes no sense on a RAID 0 */
   1654 			*(int *) data = 100;
   1655 			return(0);
   1656 		}
   1657 		if (raidPtr->copyback_in_progress == 1) {
   1658 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1659 				raidPtr->Layout.numStripe;
   1660 		} else {
   1661 			*(int *) data = 100;
   1662 		}
   1663 		return (0);
   1664 
   1665 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1666 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1667 		if (raidPtr->copyback_in_progress == 1) {
   1668 			progressInfo.total = raidPtr->Layout.numStripe;
   1669 			progressInfo.completed =
   1670 				raidPtr->copyback_stripes_done;
   1671 			progressInfo.remaining = progressInfo.total -
   1672 				progressInfo.completed;
   1673 		} else {
   1674 			progressInfo.remaining = 0;
   1675 			progressInfo.completed = 100;
   1676 			progressInfo.total = 100;
   1677 		}
   1678 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1679 				  sizeof(RF_ProgressInfo_t));
   1680 		return (retcode);
   1681 
   1682 	case RAIDFRAME_SET_LAST_UNIT:
   1683 		for (column = 0; column < raidPtr->numCol; column++)
   1684 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1685 				return EBUSY;
   1686 
   1687 		for (column = 0; column < raidPtr->numCol; column++) {
   1688 			clabel = raidget_component_label(raidPtr, column);
   1689 			clabel->last_unit = *(int *)data;
   1690 			raidflush_component_label(raidPtr, column);
   1691 		}
   1692 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1693 		return 0;
   1694 
   1695 		/* the sparetable daemon calls this to wait for the kernel to
   1696 		 * need a spare table. this ioctl does not return until a
   1697 		 * spare table is needed. XXX -- calling mpsleep here in the
   1698 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1699 		 * -- I should either compute the spare table in the kernel,
   1700 		 * or have a different -- XXX XXX -- interface (a different
   1701 		 * character device) for delivering the table     -- XXX */
   1702 #if 0
   1703 	case RAIDFRAME_SPARET_WAIT:
   1704 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1705 		while (!rf_sparet_wait_queue)
   1706 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1707 		waitreq = rf_sparet_wait_queue;
   1708 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1709 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1710 
   1711 		/* structure assignment */
   1712 		*((RF_SparetWait_t *) data) = *waitreq;
   1713 
   1714 		RF_Free(waitreq, sizeof(*waitreq));
   1715 		return (0);
   1716 
   1717 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1718 		 * code in it that will cause the dameon to exit */
   1719 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1720 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1721 		waitreq->fcol = -1;
   1722 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1723 		waitreq->next = rf_sparet_wait_queue;
   1724 		rf_sparet_wait_queue = waitreq;
   1725 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1726 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1727 		return (0);
   1728 
   1729 		/* used by the spare table daemon to deliver a spare table
   1730 		 * into the kernel */
   1731 	case RAIDFRAME_SEND_SPARET:
   1732 
   1733 		/* install the spare table */
   1734 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1735 
   1736 		/* respond to the requestor.  the return status of the spare
   1737 		 * table installation is passed in the "fcol" field */
   1738 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1739 		waitreq->fcol = retcode;
   1740 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1741 		waitreq->next = rf_sparet_resp_queue;
   1742 		rf_sparet_resp_queue = waitreq;
   1743 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1744 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1745 
   1746 		return (retcode);
   1747 #endif
   1748 
   1749 	default:
   1750 		break; /* fall through to the os-specific code below */
   1751 
   1752 	}
   1753 
   1754 	if (!raidPtr->valid)
   1755 		return (EINVAL);
   1756 
   1757 	/*
   1758 	 * Add support for "regular" device ioctls here.
   1759 	 */
   1760 
   1761 	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1762 	if (error != EPASSTHROUGH)
   1763 		return (error);
   1764 
   1765 	switch (cmd) {
   1766 	case DIOCCACHESYNC:
   1767 		return rf_sync_component_caches(raidPtr);
   1768 
   1769 	default:
   1770 		retcode = ENOTTY;
   1771 	}
   1772 	return (retcode);
   1773 
   1774 }
   1775 
   1776 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, hook the unit into the
   dk(4)/disk(9) subsystems, and mark it ready for use.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: release the cfdata and leave the unit
		 * unconfigured (RAIDF_INITED stays clear). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Look for wedges (e.g. GPT partitions) on the new device. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1836 
   1837 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1838 /* wake up the daemon & tell it to get us a spare table
   1839  * XXX
   1840  * the entries in the queues should be tagged with the raidPtr
   1841  * so that in the extremely rare case that two recons happen at once,
   1842  * we know for which device were requesting a spare table
   1843  * XXX
   1844  *
   1845  * XXX This code is not currently used. GO
   1846  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/*
	 * Queue the request for the daemon, wake anybody waiting on
	 * rf_sparet_wait_cv, then sleep until a response appears on
	 * rf_sparet_resp_queue.
	 */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while asleep and re-takes it */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* dequeue the response; note that req now points at a different
	 * object than the one passed in */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1870 #endif
   1871 
   1872 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1873  * bp & passes it down.
   1874  * any calls originating in the kernel must use non-blocking I/O
   1875  * do some extra sanity checking to return "appropriate" error values for
   1876  * certain conditions (to make some standard utilities work)
   1877  *
   1878  * Formerly known as: rf_DoAccessKernel
   1879  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex across the label update, which performs
		 * component I/O and must not run under raidPtr->mutex */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* have the dk(4) layer process its queued requests */
	dk_start(dksc, NULL);
}
   1906 
   1907 static int
   1908 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1909 {
   1910 	RF_SectorCount_t num_blocks, pb, sum;
   1911 	RF_RaidAddr_t raid_addr;
   1912 	daddr_t blocknum;
   1913 	int     do_async;
   1914 	int rc;
   1915 
   1916 	rf_lock_mutex2(raidPtr->mutex);
   1917 	if (raidPtr->openings == 0) {
   1918 		rf_unlock_mutex2(raidPtr->mutex);
   1919 		return EAGAIN;
   1920 	}
   1921 	rf_unlock_mutex2(raidPtr->mutex);
   1922 
   1923 	blocknum = bp->b_rawblkno;
   1924 
   1925 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1926 		    (int) blocknum));
   1927 
   1928 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1929 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1930 
   1931 	/* *THIS* is where we adjust what block we're going to...
   1932 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1933 	raid_addr = blocknum;
   1934 
   1935 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1936 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1937 	sum = raid_addr + num_blocks + pb;
   1938 	if (1 || rf_debugKernelAccess) {
   1939 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1940 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1941 			    (int) pb, (int) bp->b_resid));
   1942 	}
   1943 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1944 	    || (sum < num_blocks) || (sum < pb)) {
   1945 		rc = ENOSPC;
   1946 		goto done;
   1947 	}
   1948 	/*
   1949 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1950 	 */
   1951 
   1952 	if (bp->b_bcount & raidPtr->sectorMask) {
   1953 		rc = ENOSPC;
   1954 		goto done;
   1955 	}
   1956 	db1_printf(("Calling DoAccess..\n"));
   1957 
   1958 
   1959 	rf_lock_mutex2(raidPtr->mutex);
   1960 	raidPtr->openings--;
   1961 	rf_unlock_mutex2(raidPtr->mutex);
   1962 
   1963 	/*
   1964 	 * Everything is async.
   1965 	 */
   1966 	do_async = 1;
   1967 
   1968 	/* don't ever condition on bp->b_flags & B_WRITE.
   1969 	 * always condition on B_READ instead */
   1970 
   1971 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1972 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1973 			 do_async, raid_addr, num_blocks,
   1974 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1975 
   1976 done:
   1977 	return rc;
   1978 }
   1979 
   1980 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1981 
   1982 int
   1983 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1984 {
   1985 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   1986 	struct buf *bp;
   1987 
   1988 	req->queue = queue;
   1989 	bp = req->bp;
   1990 
   1991 	switch (req->type) {
   1992 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   1993 		/* XXX need to do something extra here.. */
   1994 		/* I'm leaving this in, as I've never actually seen it used,
   1995 		 * and I'd like folks to report it... GO */
   1996 		printf(("WAKEUP CALLED\n"));
   1997 		queue->numOutstanding++;
   1998 
   1999 		bp->b_flags = 0;
   2000 		bp->b_private = req;
   2001 
   2002 		KernelWakeupFunc(bp);
   2003 		break;
   2004 
   2005 	case RF_IO_TYPE_READ:
   2006 	case RF_IO_TYPE_WRITE:
   2007 #if RF_ACC_TRACE > 0
   2008 		if (req->tracerec) {
   2009 			RF_ETIMER_START(req->tracerec->timer);
   2010 		}
   2011 #endif
   2012 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2013 		    op, queue->rf_cinfo->ci_dev,
   2014 		    req->sectorOffset, req->numSector,
   2015 		    req->buf, KernelWakeupFunc, (void *) req,
   2016 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2017 
   2018 		if (rf_debugKernelAccess) {
   2019 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2020 				(long) bp->b_blkno));
   2021 		}
   2022 		queue->numOutstanding++;
   2023 		queue->last_deq_sector = req->sectorOffset;
   2024 		/* acc wouldn't have been let in if there were any pending
   2025 		 * reqs at any other priority */
   2026 		queue->curPriority = req->priority;
   2027 
   2028 		db1_printf(("Going for %c to unit %d col %d\n",
   2029 			    req->type, queue->raidPtr->raidid,
   2030 			    queue->col));
   2031 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2032 			(int) req->sectorOffset, (int) req->numSector,
   2033 			(int) (req->numSector <<
   2034 			    queue->raidPtr->logBytesPerSector),
   2035 			(int) queue->raidPtr->logBytesPerSector));
   2036 
   2037 		/*
   2038 		 * XXX: drop lock here since this can block at
   2039 		 * least with backing SCSI devices.  Retake it
   2040 		 * to minimize fuss with calling interfaces.
   2041 		 */
   2042 
   2043 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2044 		bdev_strategy(bp);
   2045 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2046 		break;
   2047 
   2048 	default:
   2049 		panic("bad req->type in rf_DispatchKernelIO");
   2050 	}
   2051 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2052 
   2053 	return (0);
   2054 }
   2055 /* this is the callback function associated with a I/O invoked from
   2056    kernel code.
   2057  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP()/dispatch */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the elapsed physical-I/O time for tracing */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2124 
   2125 
   2126 /*
   2127  * initialize a buf structure for doing an I/O in the kernel.
   2128  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* sectors -> bytes -> DEV_BSIZE blocks */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	/* cbFunc is run from biodone() with cbArg in b_private */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2153 
   2154 /*
   2155  * Wait interruptibly for an exclusive lock.
   2156  *
   2157  * XXX
   2158  * Several drivers do this; it should be abstracted and made MP-safe.
   2159  * (Hmm... where have we seen this warning before :->  GO )
   2160  */
   2161 static int
   2162 raidlock(struct raid_softc *rs)
   2163 {
   2164 	int     error;
   2165 
   2166 	error = 0;
   2167 	mutex_enter(&rs->sc_mutex);
   2168 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2169 		rs->sc_flags |= RAIDF_WANTED;
   2170 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2171 		if (error != 0)
   2172 			goto done;
   2173 	}
   2174 	rs->sc_flags |= RAIDF_LOCKED;
   2175 done:
   2176 	mutex_exit(&rs->sc_mutex);
   2177 	return (error);
   2178 }
   2179 /*
   2180  * Unlock and wake up any waiters.
   2181  */
   2182 static void
   2183 raidunlock(struct raid_softc *rs)
   2184 {
   2185 
   2186 	mutex_enter(&rs->sc_mutex);
   2187 	rs->sc_flags &= ~RAIDF_LOCKED;
   2188 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2189 		rs->sc_flags &= ~RAIDF_WANTED;
   2190 		cv_broadcast(&rs->sc_cv);
   2191 	}
   2192 	mutex_exit(&rs->sc_mutex);
   2193 }
   2194 
   2195 
   2196 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2197 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2198 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2199 
static daddr_t
rf_component_info_offset(void)
{

	/* byte offset of the component-label area from the start of
	 * the component (fixed at RF_COMPONENT_INFO_OFFSET) */
	return RF_COMPONENT_INFO_OFFSET;
}
   2206 
   2207 static daddr_t
   2208 rf_component_info_size(unsigned secsize)
   2209 {
   2210 	daddr_t info_size;
   2211 
   2212 	KASSERT(secsize);
   2213 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2214 		info_size = secsize;
   2215 	else
   2216 		info_size = RF_COMPONENT_INFO_SIZE;
   2217 
   2218 	return info_size;
   2219 }
   2220 
   2221 static daddr_t
   2222 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2223 {
   2224 	daddr_t map_offset;
   2225 
   2226 	KASSERT(raidPtr->bytesPerSector);
   2227 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2228 		map_offset = raidPtr->bytesPerSector;
   2229 	else
   2230 		map_offset = RF_COMPONENT_INFO_SIZE;
   2231 	map_offset += rf_component_info_offset();
   2232 
   2233 	return map_offset;
   2234 }
   2235 
   2236 static daddr_t
   2237 rf_parity_map_size(RF_Raid_t *raidPtr)
   2238 {
   2239 	daddr_t map_size;
   2240 
   2241 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2242 		map_size = raidPtr->bytesPerSector;
   2243 	else
   2244 		map_size = RF_PARITY_MAP_SIZE;
   2245 
   2246 	return map_size;
   2247 }
   2248 
   2249 int
   2250 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2251 {
   2252 	RF_ComponentLabel_t *clabel;
   2253 
   2254 	clabel = raidget_component_label(raidPtr, col);
   2255 	clabel->clean = RF_RAID_CLEAN;
   2256 	raidflush_component_label(raidPtr, col);
   2257 	return(0);
   2258 }
   2259 
   2260 
   2261 int
   2262 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2263 {
   2264 	RF_ComponentLabel_t *clabel;
   2265 
   2266 	clabel = raidget_component_label(raidPtr, col);
   2267 	clabel->clean = RF_RAID_DIRTY;
   2268 	raidflush_component_label(raidPtr, col);
   2269 	return(0);
   2270 }
   2271 
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* read component `col's on-disk label into the in-core copy
	 * kept in raid_cinfo[col].ci_label */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2281 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* pointer to the in-core label only; performs no disk I/O */
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2287 
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	/* stamp the in-core label with the current mod_counter and
	 * write it to the component; returns the write's error code */
	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod counter in lockstep */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2302 
   2303 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* thin wrapper: read the component-label area into *clabel */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2313 
   2314 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* Read dsize bytes at byte `offset' of the component and copy the
	 * first msize bytes into *data.  Returns 0 or the biowait error. */

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read straight from the block device */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2351 
   2352 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* thin wrapper: synchronously write *clabel to the
	 * component-label area (asyncp = 0) */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2362 
   2363 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* Write msize bytes from *data (zero-padded to dsize) at byte
	 * `offset' of the component.  If asyncp, fire and forget and
	 * return 0; otherwise wait and return the biowait error. */

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): the B_ASYNC buffer is presumably released
		 * on completion by the buffer cache - confirm */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2397 
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	/* synchronously write the same parity map to every live component */
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2415 
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	/* read the parity map from each live component, taking the first
	 * as the base and merging the rest into it */
	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2440 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* bump mod_counter and mark every usable component (and in-use
	 * spare) dirty on disk */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for;
			 * NOTE(review): scol stays -1 if no column
			 * matches - confirm that cannot happen here */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2500 
   2501 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* Bump mod_counter and rewrite the label of every optimal
	 * component and every in-use spare.  With
	 * final == RF_FINAL_COMPONENT_UPDATE and clean parity, also mark
	 * the components clean. */

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2579 
   2580 void
   2581 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2582 {
   2583 
   2584 	if (vp != NULL) {
   2585 		if (auto_configured == 1) {
   2586 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2587 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2588 			vput(vp);
   2589 
   2590 		} else {
   2591 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2592 		}
   2593 	}
   2594 }
   2595 
   2596 
   2597 void
   2598 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2599 {
   2600 	int r,c;
   2601 	struct vnode *vp;
   2602 	int acd;
   2603 
   2604 
   2605 	/* We take this opportunity to close the vnodes like we should.. */
   2606 
   2607 	for (c = 0; c < raidPtr->numCol; c++) {
   2608 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2609 		acd = raidPtr->Disks[c].auto_configured;
   2610 		rf_close_component(raidPtr, vp, acd);
   2611 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2612 		raidPtr->Disks[c].auto_configured = 0;
   2613 	}
   2614 
   2615 	for (r = 0; r < raidPtr->numSpare; r++) {
   2616 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2617 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2618 		rf_close_component(raidPtr, vp, acd);
   2619 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2620 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2621 	}
   2622 }
   2623 
   2624 
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* kthread body: fail the requested component and (optionally)
	 * reconstruct onto a spare; frees req and exits */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2646 
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* kthread body: rewrite all parity; on success mark the set's
	 * parity good so the clean bits can be set at shutdown */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2677 
   2678 
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	/* kthread body: copy reconstructed data back from the spare to
	 * the replaced component, then exit */
	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2693 
   2694 
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* kthread body: rebuild the named column in place (onto the
	 * same device); frees req and exits */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2712 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/* Try to read a component label from (dev, vp); if it looks
	 * reasonable, prepend a new RF_AutoConfig_t to ac_list and keep
	 * the vnode open.  Otherwise the vnode is closed and ac_list is
	 * returned unchanged.  On allocation failure the whole list is
	 * torn down and NULL is returned. */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: free every entry accumulated so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;	/* ac now owns clabel */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: label was unreadable or unreasonable, so
		 * release it and close the component's vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2770 
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a list of RF_AutoConfig_t records describing the components
 * found (NULL if none).  Wedges are scanned in a first pass and
 * plain disks in a second, so that a wedge covering a disk is
 * preferred over that disk's raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: don't complain about devices that fail to open */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: a wedge is a component iff its
				   partition type says "raidframe" */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* component name is e.g. "wd0e" */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2974 
   2975 
   2976 int
   2977 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2978 {
   2979 
   2980 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2981 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2982 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2983 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2984 	    clabel->row >=0 &&
   2985 	    clabel->column >= 0 &&
   2986 	    clabel->num_rows > 0 &&
   2987 	    clabel->num_columns > 0 &&
   2988 	    clabel->row < clabel->num_rows &&
   2989 	    clabel->column < clabel->num_columns &&
   2990 	    clabel->blockSize > 0 &&
   2991 	    /*
   2992 	     * numBlocksHi may contain garbage, but it is ok since
   2993 	     * the type is unsigned.  If it is really garbage,
   2994 	     * rf_fix_old_label_size() will fix it.
   2995 	     */
   2996 	    rf_component_label_numblocks(clabel) > 0) {
   2997 		/*
   2998 		 * label looks reasonable enough...
   2999 		 * let's make sure it has no old garbage.
   3000 		 */
   3001 		if (numsecs)
   3002 			rf_fix_old_label_size(clabel, numsecs);
   3003 		return(1);
   3004 	}
   3005 	return(0);
   3006 }
   3007 
   3008 
   3009 /*
   3010  * For reasons yet unknown, some old component labels have garbage in
   3011  * the newer numBlocksHi region, and this causes lossage.  Since those
   3012  * disks will also have numsecs set to less than 32 bits of sectors,
   3013  * we can determine when this corruption has occurred, and fix it.
   3014  *
   3015  * The exact same problem, with the same unknown reason, happens to
   3016  * the partitionSizeHi member as well.
   3017  */
   3018 static void
   3019 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3020 {
   3021 
   3022 	if (numsecs < ((uint64_t)1 << 32)) {
   3023 		if (clabel->numBlocksHi) {
   3024 			printf("WARNING: total sectors < 32 bits, yet "
   3025 			       "numBlocksHi set\n"
   3026 			       "WARNING: resetting numBlocksHi to zero.\n");
   3027 			clabel->numBlocksHi = 0;
   3028 		}
   3029 
   3030 		if (clabel->partitionSizeHi) {
   3031 			printf("WARNING: total sectors < 32 bits, yet "
   3032 			       "partitionSizeHi set\n"
   3033 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3034 			clabel->partitionSizeHi = 0;
   3035 		}
   3036 	}
   3037 }
   3038 
   3039 
#ifdef DEBUG
/*
 * Dump the contents of a component label to the console.
 * Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* names for root_partition values 0-2; index is masked to 2 bits */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3073 
   3074 RF_ConfigSet_t *
   3075 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3076 {
   3077 	RF_AutoConfig_t *ac;
   3078 	RF_ConfigSet_t *config_sets;
   3079 	RF_ConfigSet_t *cset;
   3080 	RF_AutoConfig_t *ac_next;
   3081 
   3082 
   3083 	config_sets = NULL;
   3084 
   3085 	/* Go through the AutoConfig list, and figure out which components
   3086 	   belong to what sets.  */
   3087 	ac = ac_list;
   3088 	while(ac!=NULL) {
   3089 		/* we're going to putz with ac->next, so save it here
   3090 		   for use at the end of the loop */
   3091 		ac_next = ac->next;
   3092 
   3093 		if (config_sets == NULL) {
   3094 			/* will need at least this one... */
   3095 			config_sets = (RF_ConfigSet_t *)
   3096 				malloc(sizeof(RF_ConfigSet_t),
   3097 				       M_RAIDFRAME, M_NOWAIT);
   3098 			if (config_sets == NULL) {
   3099 				panic("rf_create_auto_sets: No memory!");
   3100 			}
   3101 			/* this one is easy :) */
   3102 			config_sets->ac = ac;
   3103 			config_sets->next = NULL;
   3104 			config_sets->rootable = 0;
   3105 			ac->next = NULL;
   3106 		} else {
   3107 			/* which set does this component fit into? */
   3108 			cset = config_sets;
   3109 			while(cset!=NULL) {
   3110 				if (rf_does_it_fit(cset, ac)) {
   3111 					/* looks like it matches... */
   3112 					ac->next = cset->ac;
   3113 					cset->ac = ac;
   3114 					break;
   3115 				}
   3116 				cset = cset->next;
   3117 			}
   3118 			if (cset==NULL) {
   3119 				/* didn't find a match above... new set..*/
   3120 				cset = (RF_ConfigSet_t *)
   3121 					malloc(sizeof(RF_ConfigSet_t),
   3122 					       M_RAIDFRAME, M_NOWAIT);
   3123 				if (cset == NULL) {
   3124 					panic("rf_create_auto_sets: No memory!");
   3125 				}
   3126 				cset->ac = ac;
   3127 				ac->next = NULL;
   3128 				cset->next = config_sets;
   3129 				cset->rootable = 0;
   3130 				config_sets = cset;
   3131 			}
   3132 		}
   3133 		ac = ac_next;
   3134 	}
   3135 
   3136 
   3137 	return(config_sets);
   3138 }
   3139 
   3140 static int
   3141 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3142 {
   3143 	RF_ComponentLabel_t *clabel1, *clabel2;
   3144 
   3145 	/* If this one matches the *first* one in the set, that's good
   3146 	   enough, since the other members of the set would have been
   3147 	   through here too... */
   3148 	/* note that we are not checking partitionSize here..
   3149 
   3150 	   Note that we are also not checking the mod_counters here.
   3151 	   If everything else matches except the mod_counter, that's
   3152 	   good enough for this test.  We will deal with the mod_counters
   3153 	   a little later in the autoconfiguration process.
   3154 
   3155 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3156 
   3157 	   The reason we don't check for this is that failed disks
   3158 	   will have lower modification counts.  If those disks are
   3159 	   not added to the set they used to belong to, then they will
   3160 	   form their own set, which may result in 2 different sets,
   3161 	   for example, competing to be configured at raid0, and
   3162 	   perhaps competing to be the root filesystem set.  If the
   3163 	   wrong ones get configured, or both attempt to become /,
   3164 	   weird behaviour and or serious lossage will occur.  Thus we
   3165 	   need to bring them into the fold here, and kick them out at
   3166 	   a later point.
   3167 
   3168 	*/
   3169 
   3170 	clabel1 = cset->ac->clabel;
   3171 	clabel2 = ac->clabel;
   3172 	if ((clabel1->version == clabel2->version) &&
   3173 	    (clabel1->serial_number == clabel2->serial_number) &&
   3174 	    (clabel1->num_rows == clabel2->num_rows) &&
   3175 	    (clabel1->num_columns == clabel2->num_columns) &&
   3176 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3177 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3178 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3179 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3180 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3181 	    (clabel1->blockSize == clabel2->blockSize) &&
   3182 	    rf_component_label_numblocks(clabel1) ==
   3183 	    rf_component_label_numblocks(clabel2) &&
   3184 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3185 	    (clabel1->root_partition == clabel2->root_partition) &&
   3186 	    (clabel1->last_unit == clabel2->last_unit) &&
   3187 	    (clabel1->config_order == clabel2->config_order)) {
   3188 		/* if it get's here, it almost *has* to be a match */
   3189 	} else {
   3190 		/* it's not consistent with somebody in the set..
   3191 		   punt */
   3192 		return(0);
   3193 	}
   3194 	/* all was fine.. it must fit... */
   3195 	return(1);
   3196 }
   3197 
/*
 * Check whether the config set has enough live components (those with
 * the highest mod_counter seen in the set) to be configured.  Returns
 * 1 if the set is viable, 0 if too many components are missing/stale.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the highest mod_counter among the members; stale components
	   carry lower values) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a member that is both in that
	   column and up-to-date (matching mod_counter) */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair without bailing out, so
				   reset even_pair_failed before moving
				   on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3300 
   3301 void
   3302 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3303 			RF_Raid_t *raidPtr)
   3304 {
   3305 	RF_ComponentLabel_t *clabel;
   3306 	int i;
   3307 
   3308 	clabel = ac->clabel;
   3309 
   3310 	/* 1. Fill in the common stuff */
   3311 	config->numRow = clabel->num_rows = 1;
   3312 	config->numCol = clabel->num_columns;
   3313 	config->numSpare = 0; /* XXX should this be set here? */
   3314 	config->sectPerSU = clabel->sectPerSU;
   3315 	config->SUsPerPU = clabel->SUsPerPU;
   3316 	config->SUsPerRU = clabel->SUsPerRU;
   3317 	config->parityConfig = clabel->parityConfig;
   3318 	/* XXX... */
   3319 	strcpy(config->diskQueueType,"fifo");
   3320 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3321 	config->layoutSpecificSize = 0; /* XXX ?? */
   3322 
   3323 	while(ac!=NULL) {
   3324 		/* row/col values will be in range due to the checks
   3325 		   in reasonable_label() */
   3326 		strcpy(config->devnames[0][ac->clabel->column],
   3327 		       ac->devname);
   3328 		ac = ac->next;
   3329 	}
   3330 
   3331 	for(i=0;i<RF_MAXDBGV;i++) {
   3332 		config->debugVars[i][0] = 0;
   3333 	}
   3334 }
   3335 
   3336 int
   3337 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3338 {
   3339 	RF_ComponentLabel_t *clabel;
   3340 	int column;
   3341 	int sparecol;
   3342 
   3343 	raidPtr->autoconfigure = new_value;
   3344 
   3345 	for(column=0; column<raidPtr->numCol; column++) {
   3346 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3347 			clabel = raidget_component_label(raidPtr, column);
   3348 			clabel->autoconfigure = new_value;
   3349 			raidflush_component_label(raidPtr, column);
   3350 		}
   3351 	}
   3352 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3353 		sparecol = raidPtr->numCol + column;
   3354 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3355 			clabel = raidget_component_label(raidPtr, sparecol);
   3356 			clabel->autoconfigure = new_value;
   3357 			raidflush_component_label(raidPtr, sparecol);
   3358 		}
   3359 	}
   3360 	return(new_value);
   3361 }
   3362 
   3363 int
   3364 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3365 {
   3366 	RF_ComponentLabel_t *clabel;
   3367 	int column;
   3368 	int sparecol;
   3369 
   3370 	raidPtr->root_partition = new_value;
   3371 	for(column=0; column<raidPtr->numCol; column++) {
   3372 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3373 			clabel = raidget_component_label(raidPtr, column);
   3374 			clabel->root_partition = new_value;
   3375 			raidflush_component_label(raidPtr, column);
   3376 		}
   3377 	}
   3378 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3379 		sparecol = raidPtr->numCol + column;
   3380 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3381 			clabel = raidget_component_label(raidPtr, sparecol);
   3382 			clabel->root_partition = new_value;
   3383 			raidflush_component_label(raidPtr, sparecol);
   3384 		}
   3385 	}
   3386 	return(new_value);
   3387 }
   3388 
   3389 void
   3390 rf_release_all_vps(RF_ConfigSet_t *cset)
   3391 {
   3392 	RF_AutoConfig_t *ac;
   3393 
   3394 	ac = cset->ac;
   3395 	while(ac!=NULL) {
   3396 		/* Close the vp, and give it back */
   3397 		if (ac->vp) {
   3398 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3399 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3400 			vput(ac->vp);
   3401 			ac->vp = NULL;
   3402 		}
   3403 		ac = ac->next;
   3404 	}
   3405 }
   3406 
   3407 
   3408 void
   3409 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3410 {
   3411 	RF_AutoConfig_t *ac;
   3412 	RF_AutoConfig_t *next_ac;
   3413 
   3414 	ac = cset->ac;
   3415 	while(ac!=NULL) {
   3416 		next_ac = ac->next;
   3417 		/* nuke the label */
   3418 		free(ac->clabel, M_RAIDFRAME);
   3419 		/* cleanup the config structure */
   3420 		free(ac, M_RAIDFRAME);
   3421 		/* "next.." */
   3422 		ac = next_ac;
   3423 	}
   3424 	/* and, finally, nuke the config set */
   3425 	free(cset, M_RAIDFRAME);
   3426 }
   3427 
   3428 
/*
 * Fill in a component label from the current state of the RAID set.
 * The caller is responsible for writing the label out to disk.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* only a single row is recorded (cf. rf_create_configuration) */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the lo/hi label fields */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3461 
/*
 * Configure the RAID set described by 'cset'.  Tries to reuse the
 * unit number the set was last configured at, falling back to the
 * next free unit.  Returns the softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk forward from last_unit until a free (or absent) unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at that unit: allocate one now */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the softc again */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3545 
/*
 * Initialize a pool of fixed-size items for RAIDframe use: prime it
 * with xmin pre-allocated items (also the low-water mark) and cap
 * growth at xmax via the high-water mark.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3555 
   3556 /*
   3557  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3558  * to see if there is IO pending and if that IO could possibly be done
   3559  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3560  * otherwise.
   3561  *
   3562  */
   3563 int
   3564 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3565 {
   3566 	struct raid_softc *rs;
   3567 	struct dk_softc *dksc;
   3568 
   3569 	rs = raidPtr->softc;
   3570 	dksc = &rs->sc_dksc;
   3571 
   3572 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3573 		return 1;
   3574 
   3575 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3576 		/* there is work to do */
   3577 		return 0;
   3578 	}
   3579 	/* default is nothing to do */
   3580 	return 1;
   3581 }
   3582 
   3583 int
   3584 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3585 {
   3586 	uint64_t numsecs;
   3587 	unsigned secsize;
   3588 	int error;
   3589 
   3590 	error = getdisksize(vp, &numsecs, &secsize);
   3591 	if (error == 0) {
   3592 		diskPtr->blockSize = secsize;
   3593 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3594 		diskPtr->partitionSize = numsecs;
   3595 		return 0;
   3596 	}
   3597 	return error;
   3598 }
   3599 
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3605 
/*
 * Autoconfiguration attach function: nothing to do here; the real
 * setup happens when a RAID set is configured (see rf_auto_config_set()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3610 
   3611 
   3612 static int
   3613 raid_detach(device_t self, int flags)
   3614 {
   3615 	int error;
   3616 	struct raid_softc *rs = raidsoftc(self);
   3617 
   3618 	if (rs == NULL)
   3619 		return ENXIO;
   3620 
   3621 	if ((error = raidlock(rs)) != 0)
   3622 		return (error);
   3623 
   3624 	error = raid_detach_unlocked(rs);
   3625 
   3626 	raidunlock(rs);
   3627 
   3628 	/* XXX raid can be referenced here */
   3629 
   3630 	if (error)
   3631 		return error;
   3632 
   3633 	/* Free the softc */
   3634 	raidput(rs);
   3635 
   3636 	return 0;
   3637 }
   3638 
/*
 * Publish the RAID set's geometry to the disk(9) layer, derived from
 * the RAIDframe configuration.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* synthetic track count; RAID sets have no physical geometry */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3654 
   3655 /*
   3656  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3657  * We end up returning whatever error was returned by the first cache flush
   3658  * that fails.
   3659  */
   3660 
   3661 int
   3662 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3663 {
   3664 	int c, sparecol;
   3665 	int e,error;
   3666 	int force = 1;
   3667 
   3668 	error = 0;
   3669 	for (c = 0; c < raidPtr->numCol; c++) {
   3670 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3671 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3672 					  &force, FWRITE, NOCRED);
   3673 			if (e) {
   3674 				if (e != ENODEV)
   3675 					printf("raid%d: cache flush to component %s failed.\n",
   3676 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3677 				if (error == 0) {
   3678 					error = e;
   3679 				}
   3680 			}
   3681 		}
   3682 	}
   3683 
   3684 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3685 		sparecol = raidPtr->numCol + c;
   3686 		/* Need to ensure that the reconstruct actually completed! */
   3687 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3688 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3689 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3690 			if (e) {
   3691 				if (e != ENODEV)
   3692 					printf("raid%d: cache flush to component %s failed.\n",
   3693 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3694 				if (error == 0) {
   3695 					error = e;
   3696 				}
   3697 			}
   3698 		}
   3699 	}
   3700 	return error;
   3701 }
   3702 
/*
 * Module interface
 */

/* the raid module depends on dk_subr (struct dk_softc support) */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* module command dispatcher and its init/fini helpers */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3716 
   3717 static int
   3718 raid_modcmd(modcmd_t cmd, void *data)
   3719 {
   3720 	int error;
   3721 
   3722 	error = 0;
   3723 	switch (cmd) {
   3724 	case MODULE_CMD_INIT:
   3725 		error = raid_modcmd_init();
   3726 		break;
   3727 	case MODULE_CMD_FINI:
   3728 		error = raid_modcmd_fini();
   3729 		break;
   3730 	default:
   3731 		error = ENOTTY;
   3732 		break;
   3733 	}
   3734 	return error;
   3735 }
   3736 
/*
 * Module initialization: set up locks, attach the devsw and the
 * autoconf driver/attachment, boot RAIDframe, and register the
 * finalizer that performs RAID autoconfiguration.  Partially
 * completed steps are rolled back on failure.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1: let devsw_attach pick the majors */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST (already attached, e.g. built-in) is not fatal */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* roll back the devsw attachment */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* roll back everything attached so far */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* failure to register is survivable; warn and carry on */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3807 
/*
 * Module teardown: refuse to unload while any raid device exists,
 * otherwise detach the autoconf attach/driver structures and the
 * device switch entries, then shut down the RAIDframe core and
 * destroy the driver-wide lock.
 *
 * Each detach step that fails re-attaches whatever was already
 * detached (reverse of raid_modcmd_init) so the module is left in a
 * consistent, still-loaded state before returning the error.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach so the driver stays usable. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach cfdriver (module case) and cfattach to roll
		 * back to the fully-attached state. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All detaches succeeded: shut the RAIDframe engine down. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3857