/* rf_netbsdkintf.c revision 1.347.2.1 (web-viewer navigation header removed) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.347.2.1 2017/04/21 16:53:52 bouyer Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.347.2.1 2017/04/21 16:53:52 bouyer Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #ifdef DEBUG_ROOT
    165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    166 #else
    167 #define DPRINTF(a, ...)
    168 #endif
    169 
    170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    171 static rf_declare_mutex2(rf_sparet_wait_mutex);
    172 static rf_declare_cond2(rf_sparet_wait_cv);
    173 static rf_declare_cond2(rf_sparet_resp_cv);
    174 
    175 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    176 						 * spare table */
    177 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    178 						 * installation process */
    179 #endif
    180 
    181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    182 
    183 /* prototypes */
    184 static void KernelWakeupFunc(struct buf *);
    185 static void InitBP(struct buf *, struct vnode *, unsigned,
    186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    187     void *, int, struct proc *);
    188 struct raid_softc;
    189 static void raidinit(struct raid_softc *);
    190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
/* Block-device switch: entry points for the raid block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,	/* discard not supported */
	.d_flag = D_DISK
};
    230 
/* Character-device switch: raw (rraidN) device entry points. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,	/* no tty-style operations on a disk */
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    245 
/* dk(4) driver glue: lets the generic disk framework drive RAIDframe IO. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,	/* called by dk to start queued IO */
	.d_dumpblocks = raid_dumpblocks,	/* kernel crash-dump path */
	.d_lastclose = raid_lastclose,	/* final close of last partition */
	.d_minphys = minphys
};
    255 
/*
 * Per-unit softc for a RAIDframe set.  Instances live on the global
 * 'raids' list and are looked up by unit number via raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk framework state */
	int	sc_unit;	/* raid unit number (raidN) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* the RAIDframe engine state proper */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* Extract the unit number from a dev_t. */
#define	raidunit(x)	DISKUNIT(x)
/* Map an autoconf device_t back to its raid_softc. */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    278 
    279 extern struct cfdriver raid_cd;
    280 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    281     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    282     DVF_DETACH_SHUTDOWN);
    283 
    284 /*
    285  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    286  * Be aware that large numbers can allow the driver to consume a lot of
    287  * kernel memory, especially on writes, and in degraded mode reads.
    288  *
    289  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    290  * a single 64K write will typically require 64K for the old data,
    291  * 64K for the old parity, and 64K for the new parity, for a total
    292  * of 192K (if the parity buffer is not re-used immediately).
    293  * Even it if is used immediately, that's still 128K, which when multiplied
    294  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    295  *
    296  * Now in degraded mode, for example, a 64K read on the above setup may
    297  * require data reconstruction, which will require *all* of the 4 remaining
    298  * disks to participate -- 4 * 32K/disk == 128K again.
    299  */
    300 
    301 #ifndef RAIDOUTSTANDING
    302 #define RAIDOUTSTANDING   6
    303 #endif
    304 
    305 #define RAIDLABELDEV(dev)	\
    306 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    307 
    308 /* declared here, and made public, for the benefit of KVM stuff.. */
    309 
    310 static int raidlock(struct raid_softc *);
    311 static void raidunlock(struct raid_softc *);
    312 
    313 static int raid_detach_unlocked(struct raid_softc *);
    314 
    315 static void rf_markalldirty(RF_Raid_t *);
    316 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    317 
    318 void rf_ReconThread(struct rf_recon_req *);
    319 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    320 void rf_CopybackThread(RF_Raid_t *raidPtr);
    321 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    322 int rf_autoconfig(device_t);
    323 void rf_buildroothack(RF_ConfigSet_t *);
    324 
    325 RF_AutoConfig_t *rf_find_raid_components(void);
    326 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    327 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    328 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    329 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    330 int rf_set_autoconfig(RF_Raid_t *, int);
    331 int rf_set_rootpartition(RF_Raid_t *, int);
    332 void rf_release_all_vps(RF_ConfigSet_t *);
    333 void rf_cleanup_config_set(RF_ConfigSet_t *);
    334 int rf_have_enough_components(RF_ConfigSet_t *);
    335 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    336 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    337 
    338 /*
    339  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    340  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    341  * in the kernel config file.
    342  */
    343 #ifdef RAID_AUTOCONFIG
    344 int raidautoconfig = 1;
    345 #else
    346 int raidautoconfig = 0;
    347 #endif
    348 static bool raidautoconfigdone = false;
    349 
    350 struct RF_Pools_s rf_pools;
    351 
    352 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    353 static kmutex_t raid_lock;
    354 
    355 static struct raid_softc *
    356 raidcreate(int unit) {
    357 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    358 	if (sc == NULL) {
    359 #ifdef DIAGNOSTIC
    360 		printf("%s: out of memory\n", __func__);
    361 #endif
    362 		return NULL;
    363 	}
    364 	sc->sc_unit = unit;
    365 	cv_init(&sc->sc_cv, "raidunit");
    366 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    367 	return sc;
    368 }
    369 
    370 static void
    371 raiddestroy(struct raid_softc *sc) {
    372 	cv_destroy(&sc->sc_cv);
    373 	mutex_destroy(&sc->sc_mutex);
    374 	kmem_free(sc, sizeof(*sc));
    375 }
    376 
    377 static struct raid_softc *
    378 raidget(int unit, bool create) {
    379 	struct raid_softc *sc;
    380 	if (unit < 0) {
    381 #ifdef DIAGNOSTIC
    382 		panic("%s: unit %d!", __func__, unit);
    383 #endif
    384 		return NULL;
    385 	}
    386 	mutex_enter(&raid_lock);
    387 	LIST_FOREACH(sc, &raids, sc_link) {
    388 		if (sc->sc_unit == unit) {
    389 			mutex_exit(&raid_lock);
    390 			return sc;
    391 		}
    392 	}
    393 	mutex_exit(&raid_lock);
    394 	if (!create)
    395 		return NULL;
    396 	if ((sc = raidcreate(unit)) == NULL)
    397 		return NULL;
    398 	mutex_enter(&raid_lock);
    399 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    400 	mutex_exit(&raid_lock);
    401 	return sc;
    402 }
    403 
    404 static void
    405 raidput(struct raid_softc *sc) {
    406 	mutex_enter(&raid_lock);
    407 	LIST_REMOVE(sc, sc_link);
    408 	mutex_exit(&raid_lock);
    409 	raiddestroy(sc);
    410 }
    411 
/*
 * Legacy pseudo-device attach hook; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    421 
    422 int
    423 rf_autoconfig(device_t self)
    424 {
    425 	RF_AutoConfig_t *ac_list;
    426 	RF_ConfigSet_t *config_sets;
    427 
    428 	if (!raidautoconfig || raidautoconfigdone == true)
    429 		return (0);
    430 
    431 	/* XXX This code can only be run once. */
    432 	raidautoconfigdone = true;
    433 
    434 #ifdef __HAVE_CPU_BOOTCONF
    435 	/*
    436 	 * 0. find the boot device if needed first so we can use it later
    437 	 * this needs to be done before we autoconfigure any raid sets,
    438 	 * because if we use wedges we are not going to be able to open
    439 	 * the boot device later
    440 	 */
    441 	if (booted_device == NULL)
    442 		cpu_bootconf();
    443 #endif
    444 	/* 1. locate all RAID components on the system */
    445 	aprint_debug("Searching for RAID components...\n");
    446 	ac_list = rf_find_raid_components();
    447 
    448 	/* 2. Sort them into their respective sets. */
    449 	config_sets = rf_create_auto_sets(ac_list);
    450 
    451 	/*
    452 	 * 3. Evaluate each set and configure the valid ones.
    453 	 * This gets done in rf_buildroothack().
    454 	 */
    455 	rf_buildroothack(config_sets);
    456 
    457 	return 1;
    458 }
    459 
    460 static int
    461 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    462 	const char *bootname = device_xname(bdv);
    463 	size_t len = strlen(bootname);
    464 
    465 	for (int col = 0; col < r->numCol; col++) {
    466 		const char *devname = r->Disks[col].devname;
    467 		devname += sizeof("/dev/") - 1;
    468 		if (strncmp(devname, "dk", 2) == 0) {
    469 			const char *parent =
    470 			    dkwedge_get_parent_name(r->Disks[col].dev);
    471 			if (parent != NULL)
    472 				devname = parent;
    473 		}
    474 		if (strncmp(devname, bootname, len) == 0) {
    475 			struct raid_softc *sc = r->softc;
    476 			aprint_debug("raid%d includes boot device %s\n",
    477 			    sc->sc_unit, devname);
    478 			return 1;
    479 		}
    480 	}
    481 	return 0;
    482 }
    483 
    484 void
    485 rf_buildroothack(RF_ConfigSet_t *config_sets)
    486 {
    487 	RF_ConfigSet_t *cset;
    488 	RF_ConfigSet_t *next_cset;
    489 	int num_root;
    490 	struct raid_softc *sc, *rsc;
    491 	struct dk_softc *dksc;
    492 
    493 	sc = rsc = NULL;
    494 	num_root = 0;
    495 	cset = config_sets;
    496 	while (cset != NULL) {
    497 		next_cset = cset->next;
    498 		if (rf_have_enough_components(cset) &&
    499 		    cset->ac->clabel->autoconfigure == 1) {
    500 			sc = rf_auto_config_set(cset);
    501 			if (sc != NULL) {
    502 				aprint_debug("raid%d: configured ok\n",
    503 				    sc->sc_unit);
    504 				if (cset->rootable) {
    505 					rsc = sc;
    506 					num_root++;
    507 				}
    508 			} else {
    509 				/* The autoconfig didn't work :( */
    510 				aprint_debug("Autoconfig failed\n");
    511 				rf_release_all_vps(cset);
    512 			}
    513 		} else {
    514 			/* we're not autoconfiguring this set...
    515 			   release the associated resources */
    516 			rf_release_all_vps(cset);
    517 		}
    518 		/* cleanup */
    519 		rf_cleanup_config_set(cset);
    520 		cset = next_cset;
    521 	}
    522 	dksc = &rsc->sc_dksc;
    523 
    524 	/* if the user has specified what the root device should be
    525 	   then we don't touch booted_device or boothowto... */
    526 
    527 	if (rootspec != NULL)
    528 		return;
    529 
    530 	/* we found something bootable... */
    531 
    532 	/*
    533 	 * XXX: The following code assumes that the root raid
    534 	 * is the first ('a') partition. This is about the best
    535 	 * we can do with a BSD disklabel, but we might be able
    536 	 * to do better with a GPT label, by setting a specified
    537 	 * attribute to indicate the root partition. We can then
    538 	 * stash the partition number in the r->root_partition
    539 	 * high bits (the bottom 2 bits are already used). For
    540 	 * now we just set booted_partition to 0 when we override
    541 	 * root.
    542 	 */
    543 	if (num_root == 1) {
    544 		device_t candidate_root;
    545 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    546 			char cname[sizeof(cset->ac->devname)];
    547 			/* XXX: assume partition 'a' first */
    548 			snprintf(cname, sizeof(cname), "%s%c",
    549 			    device_xname(dksc->sc_dev), 'a');
    550 			candidate_root = dkwedge_find_by_wname(cname);
    551 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    552 			    cname);
    553 			if (candidate_root == NULL) {
    554 				/*
    555 				 * If that is not found, because we don't use
    556 				 * disklabel, return the first dk child
    557 				 * XXX: we can skip the 'a' check above
    558 				 * and always do this...
    559 				 */
    560 				size_t i = 0;
    561 				candidate_root = dkwedge_find_by_parent(
    562 				    device_xname(dksc->sc_dev), &i);
    563 			}
    564 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    565 			    candidate_root);
    566 		} else
    567 			candidate_root = dksc->sc_dev;
    568 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    569 		DPRINTF("%s: booted_device=%p root_partition=%d "
    570 		   "contains_boot=%d\n", __func__, booted_device,
    571 		   rsc->sc_r.root_partition,
    572 		   rf_containsboot(&rsc->sc_r, booted_device));
    573 		if (booted_device == NULL ||
    574 		    rsc->sc_r.root_partition == 1 ||
    575 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    576 			booted_device = candidate_root;
    577 			booted_partition = 0;	/* XXX assume 'a' */
    578 		}
    579 	} else if (num_root > 1) {
    580 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    581 		    booted_device);
    582 
    583 		/*
    584 		 * Maybe the MD code can help. If it cannot, then
    585 		 * setroot() will discover that we have no
    586 		 * booted_device and will ask the user if nothing was
    587 		 * hardwired in the kernel config file
    588 		 */
    589 		if (booted_device == NULL)
    590 			return;
    591 
    592 		num_root = 0;
    593 		mutex_enter(&raid_lock);
    594 		LIST_FOREACH(sc, &raids, sc_link) {
    595 			RF_Raid_t *r = &sc->sc_r;
    596 			if (r->valid == 0)
    597 				continue;
    598 
    599 			if (r->root_partition == 0)
    600 				continue;
    601 
    602 			if (rf_containsboot(r, booted_device)) {
    603 				num_root++;
    604 				rsc = sc;
    605 				dksc = &rsc->sc_dksc;
    606 			}
    607 		}
    608 		mutex_exit(&raid_lock);
    609 
    610 		if (num_root == 1) {
    611 			booted_device = dksc->sc_dev;
    612 			booted_partition = 0;	/* XXX assume 'a' */
    613 		} else {
    614 			/* we can't guess.. require the user to answer... */
    615 			boothowto |= RB_ASKNAME;
    616 		}
    617 	}
    618 }
    619 
    620 static int
    621 raidsize(dev_t dev)
    622 {
    623 	struct raid_softc *rs;
    624 	struct dk_softc *dksc;
    625 	unsigned int unit;
    626 
    627 	unit = raidunit(dev);
    628 	if ((rs = raidget(unit, false)) == NULL)
    629 		return -1;
    630 	dksc = &rs->sc_dksc;
    631 
    632 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    633 		return -1;
    634 
    635 	return dk_size(dksc, dev);
    636 }
    637 
    638 static int
    639 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    640 {
    641 	unsigned int unit;
    642 	struct raid_softc *rs;
    643 	struct dk_softc *dksc;
    644 
    645 	unit = raidunit(dev);
    646 	if ((rs = raidget(unit, false)) == NULL)
    647 		return ENXIO;
    648 	dksc = &rs->sc_dksc;
    649 
    650 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    651 		return ENODEV;
    652 
    653         /*
    654            Note that blkno is relative to this particular partition.
    655            By adding adding RF_PROTECTED_SECTORS, we get a value that
    656 	   is relative to the partition used for the underlying component.
    657         */
    658 	blkno += RF_PROTECTED_SECTORS;
    659 
    660 	return dk_dump(dksc, dev, blkno, va, size);
    661 }
    662 
/*
 * Dump 'nblk' blocks at 'blkno' to a live component of this set.
 * Only RAID 1 sets (one data + one parity column) are supported.
 * Picks the best surviving component (master, spared master, slave,
 * spared slave -- in that order) and dumps directly through its
 * block device's d_dump entry.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight to the chosen component's block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    768 
    769 /* ARGSUSED */
    770 static int
    771 raidopen(dev_t dev, int flags, int fmt,
    772     struct lwp *l)
    773 {
    774 	int     unit = raidunit(dev);
    775 	struct raid_softc *rs;
    776 	struct dk_softc *dksc;
    777 	int     error = 0;
    778 	int     part, pmask;
    779 
    780 	if ((rs = raidget(unit, true)) == NULL)
    781 		return ENXIO;
    782 	if ((error = raidlock(rs)) != 0)
    783 		return (error);
    784 
    785 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    786 		error = EBUSY;
    787 		goto bad;
    788 	}
    789 
    790 	dksc = &rs->sc_dksc;
    791 
    792 	part = DISKPART(dev);
    793 	pmask = (1 << part);
    794 
    795 	if (!DK_BUSY(dksc, pmask) &&
    796 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    797 		/* First one... mark things as dirty... Note that we *MUST*
    798 		 have done a configure before this.  I DO NOT WANT TO BE
    799 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    800 		 THAT THEY BELONG TOGETHER!!!!! */
    801 		/* XXX should check to see if we're only open for reading
    802 		   here... If so, we needn't do this, but then need some
    803 		   other way of keeping track of what's happened.. */
    804 
    805 		rf_markalldirty(&rs->sc_r);
    806 	}
    807 
    808 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    809 		error = dk_open(dksc, dev, flags, fmt, l);
    810 
    811 bad:
    812 	raidunlock(rs);
    813 
    814 	return (error);
    815 
    816 
    817 }
    818 
    819 static int
    820 raid_lastclose(device_t self)
    821 {
    822 	struct raid_softc *rs = raidsoftc(self);
    823 
    824 	/* Last one... device is not unconfigured yet.
    825 	   Device shutdown has taken care of setting the
    826 	   clean bits if RAIDF_INITED is not set
    827 	   mark things as clean... */
    828 
    829 	rf_update_component_labels(&rs->sc_r,
    830 	    RF_FINAL_COMPONENT_UPDATE);
    831 
    832 	/* pass to unlocked code */
    833 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    834 		rs->sc_flags |= RAIDF_DETACH;
    835 
    836 	return 0;
    837 }
    838 
/* ARGSUSED */
/*
 * d_close entry point.  Closes via the dk layer when configured;
 * afterwards (outside the unit lock) either detaches the pseudo
 * device (RAIDF_DETACH, set by raid_lastclose on shutdown) or frees
 * an unconfigured softc that was flagged for shutdown.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have requested a detach */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* detach/put must happen without holding the unit lock */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    878 
/*
 * Wake the per-set iodone thread: signal iodone_cv under its lock so
 * that queued IO gets serviced.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    886 
    887 static void
    888 raidstrategy(struct buf *bp)
    889 {
    890 	unsigned int unit;
    891 	struct raid_softc *rs;
    892 	struct dk_softc *dksc;
    893 	RF_Raid_t *raidPtr;
    894 
    895 	unit = raidunit(bp->b_dev);
    896 	if ((rs = raidget(unit, false)) == NULL) {
    897 		bp->b_error = ENXIO;
    898 		goto fail;
    899 	}
    900 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    901 		bp->b_error = ENXIO;
    902 		goto fail;
    903 	}
    904 	dksc = &rs->sc_dksc;
    905 	raidPtr = &rs->sc_r;
    906 
    907 	/* Queue IO only */
    908 	if (dk_strategy_defer(dksc, bp))
    909 		goto done;
    910 
    911 	/* schedule the IO to happen at the next convenient time */
    912 	raid_wakeup(raidPtr);
    913 
    914 done:
    915 	return;
    916 
    917 fail:
    918 	bp->b_resid = bp->b_bcount;
    919 	biodone(bp);
    920 }
    921 
    922 static int
    923 raid_diskstart(device_t dev, struct buf *bp)
    924 {
    925 	struct raid_softc *rs = raidsoftc(dev);
    926 	RF_Raid_t *raidPtr;
    927 
    928 	raidPtr = &rs->sc_r;
    929 	if (!raidPtr->valid) {
    930 		db1_printf(("raid is not valid..\n"));
    931 		return ENODEV;
    932 	}
    933 
    934 	/* XXX */
    935 	bp->b_resid = 0;
    936 
    937 	return raiddoaccess(raidPtr, bp);
    938 }
    939 
    940 void
    941 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    942 {
    943 	struct raid_softc *rs;
    944 	struct dk_softc *dksc;
    945 
    946 	rs = raidPtr->softc;
    947 	dksc = &rs->sc_dksc;
    948 
    949 	dk_done(dksc, bp);
    950 
    951 	rf_lock_mutex2(raidPtr->mutex);
    952 	raidPtr->openings++;
    953 	rf_unlock_mutex2(raidPtr->mutex);
    954 
    955 	/* schedule more IO */
    956 	raid_wakeup(raidPtr);
    957 }
    958 
    959 /* ARGSUSED */
    960 static int
    961 raidread(dev_t dev, struct uio *uio, int flags)
    962 {
    963 	int     unit = raidunit(dev);
    964 	struct raid_softc *rs;
    965 
    966 	if ((rs = raidget(unit, false)) == NULL)
    967 		return ENXIO;
    968 
    969 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    970 		return (ENXIO);
    971 
    972 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    973 
    974 }
    975 
    976 /* ARGSUSED */
    977 static int
    978 raidwrite(dev_t dev, struct uio *uio, int flags)
    979 {
    980 	int     unit = raidunit(dev);
    981 	struct raid_softc *rs;
    982 
    983 	if ((rs = raidget(unit, false)) == NULL)
    984 		return ENXIO;
    985 
    986 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    987 		return (ENXIO);
    988 
    989 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    990 
    991 }
    992 
/*
 * raid_detach_unlocked: tear down a configured RAID set and detach the
 * associated disk.
 *
 * NOTE(review): despite the name, this appears to be intended to run
 * with the per-unit lock already held by the caller ("unlocked" seems
 * to refer to the RAIDframe internals) -- confirm against the callers.
 *
 * Returns 0 on success, EBUSY if the device is open or a
 * reconstruction / parity rewrite / copyback is in progress, or the
 * error from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while the device is open or background ops are running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1030 
/*
 * raidioctl: character-device ioctl entry point for RAIDframe.
 *
 * Dispatches RAIDframe-specific commands (set configuration and
 * shutdown, component-label access, hot-spare management, failure /
 * reconstruction / parity-rewrite / copyback control and status) and
 * finally falls through to dk_ioctl() for generic disk ioctls.
 *
 * Returns 0 on success or an errno value.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Only mark for shutdown if nothing is busy and no
		   background operation is in flight; actual detach
		   happens on close. */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		/* NOTE(review): the RF_Malloc result is not NULL-checked
		   before the copyin below, unlike the k_cfg allocation in
		   RAIDFRAME_CONFIGURE -- confirm RF_Malloc cannot fail
		   here. */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		/* Re-point clabel at the in-core label for this column
		   and copy that out to the user. */
		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		/* Write a fresh label to every non-dead component. */
		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* Rewrite runs asynchronously in its own kernel thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* XXX not implemented: returns retcode (still 0 here)
		   without removing anything. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Validate the target component's state under the mutex
		   before kicking off the rebuild thread. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The request outlives this ioctl: pass a kernel copy
		   to the reconstruction thread. */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares live after the data columns in Disks[]. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/* NOTE(review): retcode from RF_CREATE_THREAD is
		   discarded here; other thread-spawning cases return
		   it -- confirm this is intentional. */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is optimal. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): rf_broadcast_conf2 looks like a typo for
		   rf_broadcast_cond2 (cf. the SEND_SPARET case below);
		   harmless while this block stays under #if 0. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1807 
   1808 
   1809 /* raidinit -- complete the rest of the initialization for the
   1810    RAIDframe device.  */
   1811 
   1812 
/*
 * raidinit: complete kernel-side initialization of a configured RAID set.
 * Attaches a pseudo-device instance for the unit, initializes the dk(4)
 * and disk(9) layers, allocates the buffer queue, marks the unit usable
 * and kicks off wedge discovery.  On attach failure the unit is simply
 * left without RAIDF_INITED set.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan for wedges on the new device */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1868 
   1869 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1870 /* wake up the daemon & tell it to get us a spare table
   1871  * XXX
   1872  * the entries in the queues should be tagged with the raidPtr
   1873  * so that in the extremely rare case that two recons happen at once,
   1874  * we know for which device were requesting a spare table
   1875  * XXX
   1876  *
   1877  * XXX This code is not currently used. GO
   1878  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue the request where the spare-table daemon will find it and
	 * wake the daemon; then sleep until a response shows up on the
	 * reply queue.  The daemon's status comes back in "fcol". */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* NOTE(review): takes whatever response is at the head of the queue;
	 * entries are not tagged with a raidPtr (see XXX above function). */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1902 #endif
   1903 
   1904 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1905  * bp & passes it down.
   1906  * any calls originating in the kernel must use non-blocking I/O
   1907  * do some extra sanity checking to return "appropriate" error values for
   1908  * certain conditions (to make some standard utilities work)
   1909  *
   1910  * Formerly known as: rf_DoAccessKernel
   1911  */
   1912 void
   1913 raidstart(RF_Raid_t *raidPtr)
   1914 {
   1915 	struct raid_softc *rs;
   1916 	struct dk_softc *dksc;
   1917 
   1918 	rs = raidPtr->softc;
   1919 	dksc = &rs->sc_dksc;
   1920 	/* quick check to see if anything has died recently */
   1921 	rf_lock_mutex2(raidPtr->mutex);
   1922 	if (raidPtr->numNewFailures > 0) {
   1923 		rf_unlock_mutex2(raidPtr->mutex);
   1924 		rf_update_component_labels(raidPtr,
   1925 					   RF_NORMAL_COMPONENT_UPDATE);
   1926 		rf_lock_mutex2(raidPtr->mutex);
   1927 		raidPtr->numNewFailures--;
   1928 	}
   1929 	rf_unlock_mutex2(raidPtr->mutex);
   1930 
   1931 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1932 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1933 		return;
   1934 	}
   1935 
   1936 	dk_start(dksc, NULL);
   1937 }
   1938 
/*
 * raiddoaccess: translate a struct buf into a RAIDframe access and submit
 * it via rf_DoAccess().  Returns EAGAIN when the array has no openings
 * (caller retries later), ENOSPC for out-of-range or non-sector-multiple
 * requests, otherwise the rf_DoAccess() result.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* no openings -> ask the caller to come back later */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* whole sectors, plus one for any trailing partial sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* XXX condition is always true; presumably db1_printf() compiles
	 * away unless debugging is enabled -- confirm */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject past-the-end accesses; the "sum < x" comparisons also
	 * catch arithmetic wrap-around of the unsigned sum */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject requests that aren't a multiple of the sector size */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume an opening; released again on I/O completion elsewhere */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2011 
   2012 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2013 
   2014 int
   2015 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2016 {
   2017 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2018 	struct buf *bp;
   2019 
   2020 	req->queue = queue;
   2021 	bp = req->bp;
   2022 
   2023 	switch (req->type) {
   2024 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2025 		/* XXX need to do something extra here.. */
   2026 		/* I'm leaving this in, as I've never actually seen it used,
   2027 		 * and I'd like folks to report it... GO */
   2028 		printf(("WAKEUP CALLED\n"));
   2029 		queue->numOutstanding++;
   2030 
   2031 		bp->b_flags = 0;
   2032 		bp->b_private = req;
   2033 
   2034 		KernelWakeupFunc(bp);
   2035 		break;
   2036 
   2037 	case RF_IO_TYPE_READ:
   2038 	case RF_IO_TYPE_WRITE:
   2039 #if RF_ACC_TRACE > 0
   2040 		if (req->tracerec) {
   2041 			RF_ETIMER_START(req->tracerec->timer);
   2042 		}
   2043 #endif
   2044 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2045 		    op, queue->rf_cinfo->ci_dev,
   2046 		    req->sectorOffset, req->numSector,
   2047 		    req->buf, KernelWakeupFunc, (void *) req,
   2048 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2049 
   2050 		if (rf_debugKernelAccess) {
   2051 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2052 				(long) bp->b_blkno));
   2053 		}
   2054 		queue->numOutstanding++;
   2055 		queue->last_deq_sector = req->sectorOffset;
   2056 		/* acc wouldn't have been let in if there were any pending
   2057 		 * reqs at any other priority */
   2058 		queue->curPriority = req->priority;
   2059 
   2060 		db1_printf(("Going for %c to unit %d col %d\n",
   2061 			    req->type, queue->raidPtr->raidid,
   2062 			    queue->col));
   2063 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2064 			(int) req->sectorOffset, (int) req->numSector,
   2065 			(int) (req->numSector <<
   2066 			    queue->raidPtr->logBytesPerSector),
   2067 			(int) queue->raidPtr->logBytesPerSector));
   2068 
   2069 		/*
   2070 		 * XXX: drop lock here since this can block at
   2071 		 * least with backing SCSI devices.  Retake it
   2072 		 * to minimize fuss with calling interfaces.
   2073 		 */
   2074 
   2075 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2076 		bdev_strategy(bp);
   2077 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2078 		break;
   2079 
   2080 	default:
   2081 		panic("bad req->type in rf_DispatchKernelIO");
   2082 	}
   2083 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2084 
   2085 	return (0);
   2086 }
   2087 /* this is the callback function associated with a I/O invoked from
   2088    kernel code.
   2089  */
/*
 * KernelWakeupFunc: iodone callback for component I/O issued by
 * rf_DispatchKernelIO().  Records per-access trace timing (if enabled),
 * marks the component failed on I/O error (unless that would break the
 * set beyond its fault tolerance), then moves the request onto the
 * raidPtr->iodone list and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is consumed by raidstart(), which
			 * triggers a component-label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2156 
   2157 
   2158 /*
   2159  * initialize a buf structure for doing an I/O in the kernel.
   2160  */
   2161 static void
   2162 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2163        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2164        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2165        struct proc *b_proc)
   2166 {
   2167 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2168 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2169 	bp->b_oflags = 0;
   2170 	bp->b_cflags = 0;
   2171 	bp->b_bcount = numSect << logBytesPerSector;
   2172 	bp->b_bufsize = bp->b_bcount;
   2173 	bp->b_error = 0;
   2174 	bp->b_dev = dev;
   2175 	bp->b_data = bf;
   2176 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2177 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2178 	if (bp->b_bcount == 0) {
   2179 		panic("bp->b_bcount is zero in InitBP!!");
   2180 	}
   2181 	bp->b_proc = b_proc;
   2182 	bp->b_iodone = cbFunc;
   2183 	bp->b_private = cbArg;
   2184 }
   2185 
   2186 /*
   2187  * Wait interruptibly for an exclusive lock.
   2188  *
   2189  * XXX
   2190  * Several drivers do this; it should be abstracted and made MP-safe.
   2191  * (Hmm... where have we seen this warning before :->  GO )
   2192  */
   2193 static int
   2194 raidlock(struct raid_softc *rs)
   2195 {
   2196 	int     error;
   2197 
   2198 	error = 0;
   2199 	mutex_enter(&rs->sc_mutex);
   2200 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2201 		rs->sc_flags |= RAIDF_WANTED;
   2202 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2203 		if (error != 0)
   2204 			goto done;
   2205 	}
   2206 	rs->sc_flags |= RAIDF_LOCKED;
   2207 done:
   2208 	mutex_exit(&rs->sc_mutex);
   2209 	return (error);
   2210 }
   2211 /*
   2212  * Unlock and wake up any waiters.
   2213  */
   2214 static void
   2215 raidunlock(struct raid_softc *rs)
   2216 {
   2217 
   2218 	mutex_enter(&rs->sc_mutex);
   2219 	rs->sc_flags &= ~RAIDF_LOCKED;
   2220 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2221 		rs->sc_flags &= ~RAIDF_WANTED;
   2222 		cv_broadcast(&rs->sc_cv);
   2223 	}
   2224 	mutex_exit(&rs->sc_mutex);
   2225 }
   2226 
   2227 
   2228 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2229 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2230 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2231 
/*
 * Byte offset of the component-label area on each component.  Fixed,
 * but kept behind a function so callers don't hard-code the constant.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2238 
   2239 static daddr_t
   2240 rf_component_info_size(unsigned secsize)
   2241 {
   2242 	daddr_t info_size;
   2243 
   2244 	KASSERT(secsize);
   2245 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2246 		info_size = secsize;
   2247 	else
   2248 		info_size = RF_COMPONENT_INFO_SIZE;
   2249 
   2250 	return info_size;
   2251 }
   2252 
   2253 static daddr_t
   2254 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2255 {
   2256 	daddr_t map_offset;
   2257 
   2258 	KASSERT(raidPtr->bytesPerSector);
   2259 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2260 		map_offset = raidPtr->bytesPerSector;
   2261 	else
   2262 		map_offset = RF_COMPONENT_INFO_SIZE;
   2263 	map_offset += rf_component_info_offset();
   2264 
   2265 	return map_offset;
   2266 }
   2267 
   2268 static daddr_t
   2269 rf_parity_map_size(RF_Raid_t *raidPtr)
   2270 {
   2271 	daddr_t map_size;
   2272 
   2273 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2274 		map_size = raidPtr->bytesPerSector;
   2275 	else
   2276 		map_size = RF_PARITY_MAP_SIZE;
   2277 
   2278 	return map_size;
   2279 }
   2280 
   2281 int
   2282 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2283 {
   2284 	RF_ComponentLabel_t *clabel;
   2285 
   2286 	clabel = raidget_component_label(raidPtr, col);
   2287 	clabel->clean = RF_RAID_CLEAN;
   2288 	raidflush_component_label(raidPtr, col);
   2289 	return(0);
   2290 }
   2291 
   2292 
   2293 int
   2294 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2295 {
   2296 	RF_ComponentLabel_t *clabel;
   2297 
   2298 	clabel = raidget_component_label(raidPtr, col);
   2299 	clabel->clean = RF_RAID_DIRTY;
   2300 	raidflush_component_label(raidPtr, col);
   2301 	return(0);
   2302 }
   2303 
/*
 * Read component col's label from disk into its in-core copy
 * (raid_cinfo[col].ci_label).  Returns the raidread_component_label()
 * status.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2313 
/*
 * Return a pointer to the in-core copy of component col's label.
 * Callers modify it in place and then raidflush_component_label()
 * to push it to disk.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2319 
   2320 int
   2321 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2322 {
   2323 	RF_ComponentLabel_t *label;
   2324 
   2325 	label = &raidPtr->raid_cinfo[col].ci_label;
   2326 	label->mod_counter = raidPtr->mod_counter;
   2327 #ifndef RF_NO_PARITY_MAP
   2328 	label->parity_map_modcount = label->mod_counter;
   2329 #endif
   2330 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2331 	    raidPtr->Disks[col].dev,
   2332 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2333 }
   2334 
   2335 
/*
 * Read a component label from the label area of the given component
 * device.  Thin wrapper around raidread_component_area() with the
 * standard label offset/size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2345 
   2346 /* ARGSUSED */
/*
 * Read dsize bytes starting at byte offset "offset" from a component
 * device, copying the first msize bytes into "data" on success.
 * Returns 0 or a biowait() error; EINVAL if the component has no vnode.
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read straight through the block device */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2383 
   2384 
/*
 * Write a component label to the label area of the given component
 * device (synchronously).  Thin wrapper around
 * raidwrite_component_area() with the standard label offset/size.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2394 
   2395 /* ARGSUSED */
/*
 * Write msize bytes from "data" (zero-padded to dsize) at byte offset
 * "offset" on a component device.  When asyncp is set the write is
 * fired off with B_ASYNC and 0 is returned immediately; otherwise the
 * biowait() status is returned.
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* pad the area with zeros beyond the caller's data */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): the async path returns without biowait()/
		 * brelse(); presumably the buffer is reclaimed on I/O
		 * completion -- confirm.  All callers visible here pass
		 * asyncp == 0. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2429 
   2430 void
   2431 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2432 {
   2433 	int c;
   2434 
   2435 	for (c = 0; c < raidPtr->numCol; c++) {
   2436 		/* Skip dead disks. */
   2437 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2438 			continue;
   2439 		/* XXXjld: what if an error occurs here? */
   2440 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2441 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2442 		    RF_PARITYMAP_NBYTE,
   2443 		    rf_parity_map_offset(raidPtr),
   2444 		    rf_parity_map_size(raidPtr), 0);
   2445 	}
   2446 }
   2447 
   2448 void
   2449 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2450 {
   2451 	struct rf_paritymap_ondisk tmp;
   2452 	int c,first;
   2453 
   2454 	first=1;
   2455 	for (c = 0; c < raidPtr->numCol; c++) {
   2456 		/* Skip dead disks. */
   2457 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2458 			continue;
   2459 		raidread_component_area(raidPtr->Disks[c].dev,
   2460 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2461 		    RF_PARITYMAP_NBYTE,
   2462 		    rf_parity_map_offset(raidPtr),
   2463 		    rf_parity_map_size(raidPtr));
   2464 		if (first) {
   2465 			memcpy(map, &tmp, sizeof(*map));
   2466 			first = 0;
   2467 		} else {
   2468 			rf_paritymap_merge(map, &tmp);
   2469 		}
   2470 	}
   2471 }
   2472 
/*
 * rf_markalldirty: bump the modification counter and mark the component
 * label of every non-failed component (and every in-use spare) dirty.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is substituting for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* NOTE(review): if no column maps to this spare,
			 * scol keeps its previous value (initially -1) and
			 * is written into clabel->column below -- presumably
			 * the mapping always exists for rf_ds_used_spare;
			 * confirm. */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2532 
   2533 
/*
 * rf_update_component_labels: bump the modification counter and rewrite
 * the component labels of all optimal components and in-use spares.
 * When final == RF_FINAL_COMPONENT_UPDATE and parity is known good, the
 * labels are additionally marked clean (i.e. at clean shutdown).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is substituting for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* NOTE(review): if no column maps to this spare, scol
			 * keeps its previous value (initially -1) -- presumably
			 * the mapping always exists for rf_ds_used_spare;
			 * confirm. */
			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2611 
   2612 void
   2613 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2614 {
   2615 
   2616 	if (vp != NULL) {
   2617 		if (auto_configured == 1) {
   2618 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2619 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2620 			vput(vp);
   2621 
   2622 		} else {
   2623 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2624 		}
   2625 	}
   2626 }
   2627 
   2628 
   2629 void
   2630 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2631 {
   2632 	int r,c;
   2633 	struct vnode *vp;
   2634 	int acd;
   2635 
   2636 
   2637 	/* We take this opportunity to close the vnodes like we should.. */
   2638 
   2639 	for (c = 0; c < raidPtr->numCol; c++) {
   2640 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2641 		acd = raidPtr->Disks[c].auto_configured;
   2642 		rf_close_component(raidPtr, vp, acd);
   2643 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2644 		raidPtr->Disks[c].auto_configured = 0;
   2645 	}
   2646 
   2647 	for (r = 0; r < raidPtr->numSpare; r++) {
   2648 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2649 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2650 		rf_close_component(raidPtr, vp, acd);
   2651 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2652 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2653 	}
   2654 }
   2655 
   2656 
/*
 * rf_ReconThread: kernel thread body.  Fails the requested component
 * (reconstructing to a spare when RF_FDFLAGS_RECON is set), frees the
 * request, and exits.  recon_in_progress brackets the operation so
 * other code can tell a reconstruction is running.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* this thread owns the request and is responsible for freeing it */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2678 
/*
 * rf_RewriteParityThread: kernel thread body.  Rewrites all parity on
 * the set; on success marks parity good so labels can be marked clean
 * at shutdown.  Wakes anyone blocked on parity_rewrite_in_progress if a
 * shutdown is waiting, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2709 
   2710 
/*
 * rf_CopybackThread: kernel thread body.  Copies reconstructed data
 * back from spares to replaced components, bracketing the operation
 * with copyback_in_progress, then exits.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2725 
   2726 
/*
 * rf_ReconstructInPlaceThread: kernel thread body.  Reconstructs the
 * requested column in place, frees the request, and exits.
 * recon_in_progress brackets the operation.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* this thread owns the request and is responsible for freeing it */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2744 
/*
 * rf_get_component: during autoconfiguration, try to read a component
 * label from the given device.  If the label looks reasonable, prepend
 * a new RF_AutoConfig_t for it to ac_list and return the new list head;
 * otherwise close/release the vnode and return ac_list unchanged.  On
 * allocation failure the entire ac_list is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* vnode ownership moves to the list */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a RAID component, release label and vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2802 
/*
 * rf_find_raid_components: scan all disk-class devices in the system
 * for RAIDframe component labels and return them as an
 * RF_AutoConfig_t list (most recently found component at the head).
 *
 * The device tree is walked twice: wedges (dk(4)) on the first pass,
 * everything else on the second, so that a wedge covering a whole
 * disk is found before that disk's raw partition would be.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* Open read-only; a failure just means there is
			   nothing usable behind this device node. */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only wedges whose partition
				   type is RAIDframe are candidates. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over vp on success
				   and closes it otherwise. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3006 
   3007 
   3008 int
   3009 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3010 {
   3011 
   3012 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3013 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3014 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3015 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3016 	    clabel->row >=0 &&
   3017 	    clabel->column >= 0 &&
   3018 	    clabel->num_rows > 0 &&
   3019 	    clabel->num_columns > 0 &&
   3020 	    clabel->row < clabel->num_rows &&
   3021 	    clabel->column < clabel->num_columns &&
   3022 	    clabel->blockSize > 0 &&
   3023 	    /*
   3024 	     * numBlocksHi may contain garbage, but it is ok since
   3025 	     * the type is unsigned.  If it is really garbage,
   3026 	     * rf_fix_old_label_size() will fix it.
   3027 	     */
   3028 	    rf_component_label_numblocks(clabel) > 0) {
   3029 		/*
   3030 		 * label looks reasonable enough...
   3031 		 * let's make sure it has no old garbage.
   3032 		 */
   3033 		if (numsecs)
   3034 			rf_fix_old_label_size(clabel, numsecs);
   3035 		return(1);
   3036 	}
   3037 	return(0);
   3038 }
   3039 
   3040 
   3041 /*
   3042  * For reasons yet unknown, some old component labels have garbage in
   3043  * the newer numBlocksHi region, and this causes lossage.  Since those
   3044  * disks will also have numsecs set to less than 32 bits of sectors,
   3045  * we can determine when this corruption has occurred, and fix it.
   3046  *
   3047  * The exact same problem, with the same unknown reason, happens to
   3048  * the partitionSizeHi member as well.
   3049  */
   3050 static void
   3051 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3052 {
   3053 
   3054 	if (numsecs < ((uint64_t)1 << 32)) {
   3055 		if (clabel->numBlocksHi) {
   3056 			printf("WARNING: total sectors < 32 bits, yet "
   3057 			       "numBlocksHi set\n"
   3058 			       "WARNING: resetting numBlocksHi to zero.\n");
   3059 			clabel->numBlocksHi = 0;
   3060 		}
   3061 
   3062 		if (clabel->partitionSizeHi) {
   3063 			printf("WARNING: total sectors < 32 bits, yet "
   3064 			       "partitionSizeHi set\n"
   3065 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3066 			clabel->partitionSizeHi = 0;
   3067 		}
   3068 	}
   3069 }
   3070 
   3071 
#ifdef DEBUG
/*
 * rf_print_component_label: dump the interesting fields of a
 * component label to the console (debug kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Indexed by (root_partition & 3); 3 is out of range -> invalid. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3105 
   3106 RF_ConfigSet_t *
   3107 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3108 {
   3109 	RF_AutoConfig_t *ac;
   3110 	RF_ConfigSet_t *config_sets;
   3111 	RF_ConfigSet_t *cset;
   3112 	RF_AutoConfig_t *ac_next;
   3113 
   3114 
   3115 	config_sets = NULL;
   3116 
   3117 	/* Go through the AutoConfig list, and figure out which components
   3118 	   belong to what sets.  */
   3119 	ac = ac_list;
   3120 	while(ac!=NULL) {
   3121 		/* we're going to putz with ac->next, so save it here
   3122 		   for use at the end of the loop */
   3123 		ac_next = ac->next;
   3124 
   3125 		if (config_sets == NULL) {
   3126 			/* will need at least this one... */
   3127 			config_sets = (RF_ConfigSet_t *)
   3128 				malloc(sizeof(RF_ConfigSet_t),
   3129 				       M_RAIDFRAME, M_NOWAIT);
   3130 			if (config_sets == NULL) {
   3131 				panic("rf_create_auto_sets: No memory!");
   3132 			}
   3133 			/* this one is easy :) */
   3134 			config_sets->ac = ac;
   3135 			config_sets->next = NULL;
   3136 			config_sets->rootable = 0;
   3137 			ac->next = NULL;
   3138 		} else {
   3139 			/* which set does this component fit into? */
   3140 			cset = config_sets;
   3141 			while(cset!=NULL) {
   3142 				if (rf_does_it_fit(cset, ac)) {
   3143 					/* looks like it matches... */
   3144 					ac->next = cset->ac;
   3145 					cset->ac = ac;
   3146 					break;
   3147 				}
   3148 				cset = cset->next;
   3149 			}
   3150 			if (cset==NULL) {
   3151 				/* didn't find a match above... new set..*/
   3152 				cset = (RF_ConfigSet_t *)
   3153 					malloc(sizeof(RF_ConfigSet_t),
   3154 					       M_RAIDFRAME, M_NOWAIT);
   3155 				if (cset == NULL) {
   3156 					panic("rf_create_auto_sets: No memory!");
   3157 				}
   3158 				cset->ac = ac;
   3159 				ac->next = NULL;
   3160 				cset->next = config_sets;
   3161 				cset->rootable = 0;
   3162 				config_sets = cset;
   3163 			}
   3164 		}
   3165 		ac = ac_next;
   3166 	}
   3167 
   3168 
   3169 	return(config_sets);
   3170 }
   3171 
   3172 static int
   3173 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3174 {
   3175 	RF_ComponentLabel_t *clabel1, *clabel2;
   3176 
   3177 	/* If this one matches the *first* one in the set, that's good
   3178 	   enough, since the other members of the set would have been
   3179 	   through here too... */
   3180 	/* note that we are not checking partitionSize here..
   3181 
   3182 	   Note that we are also not checking the mod_counters here.
   3183 	   If everything else matches except the mod_counter, that's
   3184 	   good enough for this test.  We will deal with the mod_counters
   3185 	   a little later in the autoconfiguration process.
   3186 
   3187 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3188 
   3189 	   The reason we don't check for this is that failed disks
   3190 	   will have lower modification counts.  If those disks are
   3191 	   not added to the set they used to belong to, then they will
   3192 	   form their own set, which may result in 2 different sets,
   3193 	   for example, competing to be configured at raid0, and
   3194 	   perhaps competing to be the root filesystem set.  If the
   3195 	   wrong ones get configured, or both attempt to become /,
   3196 	   weird behaviour and or serious lossage will occur.  Thus we
   3197 	   need to bring them into the fold here, and kick them out at
   3198 	   a later point.
   3199 
   3200 	*/
   3201 
   3202 	clabel1 = cset->ac->clabel;
   3203 	clabel2 = ac->clabel;
   3204 	if ((clabel1->version == clabel2->version) &&
   3205 	    (clabel1->serial_number == clabel2->serial_number) &&
   3206 	    (clabel1->num_rows == clabel2->num_rows) &&
   3207 	    (clabel1->num_columns == clabel2->num_columns) &&
   3208 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3209 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3210 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3211 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3212 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3213 	    (clabel1->blockSize == clabel2->blockSize) &&
   3214 	    rf_component_label_numblocks(clabel1) ==
   3215 	    rf_component_label_numblocks(clabel2) &&
   3216 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3217 	    (clabel1->root_partition == clabel2->root_partition) &&
   3218 	    (clabel1->last_unit == clabel2->last_unit) &&
   3219 	    (clabel1->config_order == clabel2->config_order)) {
   3220 		/* if it get's here, it almost *has* to be a match */
   3221 	} else {
   3222 		/* it's not consistent with somebody in the set..
   3223 		   punt */
   3224 		return(0);
   3225 	}
   3226 	/* all was fine.. it must fit... */
   3227 	return(1);
   3228 }
   3229 
/*
 * rf_have_enough_components: decide whether config set `cset' has
 * enough live, current components to be worth configuring.
 *
 * A component is "current" when its mod_counter equals the maximum
 * mod_counter seen in the set; stale components count as missing.
 * RAID 1 is special-cased: columns pair up (even, odd), and the set
 * is only rejected when both members of a pair are missing.  For
 * RAID 0 no column may be missing; for RAID 4/5 at most one may be.
 *
 * Returns 1 if the set is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The expected mod_counter is the maximum over all members. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a member that is both in that
	   column and carries the current mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3332 
   3333 void
   3334 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3335 			RF_Raid_t *raidPtr)
   3336 {
   3337 	RF_ComponentLabel_t *clabel;
   3338 	int i;
   3339 
   3340 	clabel = ac->clabel;
   3341 
   3342 	/* 1. Fill in the common stuff */
   3343 	config->numRow = clabel->num_rows = 1;
   3344 	config->numCol = clabel->num_columns;
   3345 	config->numSpare = 0; /* XXX should this be set here? */
   3346 	config->sectPerSU = clabel->sectPerSU;
   3347 	config->SUsPerPU = clabel->SUsPerPU;
   3348 	config->SUsPerRU = clabel->SUsPerRU;
   3349 	config->parityConfig = clabel->parityConfig;
   3350 	/* XXX... */
   3351 	strcpy(config->diskQueueType,"fifo");
   3352 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3353 	config->layoutSpecificSize = 0; /* XXX ?? */
   3354 
   3355 	while(ac!=NULL) {
   3356 		/* row/col values will be in range due to the checks
   3357 		   in reasonable_label() */
   3358 		strcpy(config->devnames[0][ac->clabel->column],
   3359 		       ac->devname);
   3360 		ac = ac->next;
   3361 	}
   3362 
   3363 	for(i=0;i<RF_MAXDBGV;i++) {
   3364 		config->debugVars[i][0] = 0;
   3365 	}
   3366 }
   3367 
   3368 int
   3369 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3370 {
   3371 	RF_ComponentLabel_t *clabel;
   3372 	int column;
   3373 	int sparecol;
   3374 
   3375 	raidPtr->autoconfigure = new_value;
   3376 
   3377 	for(column=0; column<raidPtr->numCol; column++) {
   3378 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3379 			clabel = raidget_component_label(raidPtr, column);
   3380 			clabel->autoconfigure = new_value;
   3381 			raidflush_component_label(raidPtr, column);
   3382 		}
   3383 	}
   3384 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3385 		sparecol = raidPtr->numCol + column;
   3386 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3387 			clabel = raidget_component_label(raidPtr, sparecol);
   3388 			clabel->autoconfigure = new_value;
   3389 			raidflush_component_label(raidPtr, sparecol);
   3390 		}
   3391 	}
   3392 	return(new_value);
   3393 }
   3394 
   3395 int
   3396 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3397 {
   3398 	RF_ComponentLabel_t *clabel;
   3399 	int column;
   3400 	int sparecol;
   3401 
   3402 	raidPtr->root_partition = new_value;
   3403 	for(column=0; column<raidPtr->numCol; column++) {
   3404 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3405 			clabel = raidget_component_label(raidPtr, column);
   3406 			clabel->root_partition = new_value;
   3407 			raidflush_component_label(raidPtr, column);
   3408 		}
   3409 	}
   3410 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3411 		sparecol = raidPtr->numCol + column;
   3412 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3413 			clabel = raidget_component_label(raidPtr, sparecol);
   3414 			clabel->root_partition = new_value;
   3415 			raidflush_component_label(raidPtr, sparecol);
   3416 		}
   3417 	}
   3418 	return(new_value);
   3419 }
   3420 
   3421 void
   3422 rf_release_all_vps(RF_ConfigSet_t *cset)
   3423 {
   3424 	RF_AutoConfig_t *ac;
   3425 
   3426 	ac = cset->ac;
   3427 	while(ac!=NULL) {
   3428 		/* Close the vp, and give it back */
   3429 		if (ac->vp) {
   3430 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3431 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3432 			vput(ac->vp);
   3433 			ac->vp = NULL;
   3434 		}
   3435 		ac = ac->next;
   3436 	}
   3437 }
   3438 
   3439 
   3440 void
   3441 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3442 {
   3443 	RF_AutoConfig_t *ac;
   3444 	RF_AutoConfig_t *next_ac;
   3445 
   3446 	ac = cset->ac;
   3447 	while(ac!=NULL) {
   3448 		next_ac = ac->next;
   3449 		/* nuke the label */
   3450 		free(ac->clabel, M_RAIDFRAME);
   3451 		/* cleanup the config structure */
   3452 		free(ac, M_RAIDFRAME);
   3453 		/* "next.." */
   3454 		ac = next_ac;
   3455 	}
   3456 	/* and, finally, nuke the config set */
   3457 	free(cset, M_RAIDFRAME);
   3458 }
   3459 
   3460 
/*
 * raid_init_component_label: fill in a component label from the
 * current state and geometry of raidPtr.  Per-component fields
 * (e.g. row/column) are not set here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	/* Let the parity map code add its own label fields. */
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3493 
/*
 * rf_auto_config_set: configure one config set as a raid(4) unit.
 *
 * Tries the unit number recorded in the component labels (last_unit)
 * first, walking upward past units that are already valid.  On
 * success returns the configured softc and marks the set rootable if
 * its labels request it; on failure returns NULL.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Probe upward from last_unit until a free (or absent) unit
	   is found. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: give the softc back. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3577 
/*
 * rf_pool_init: initialize a RAIDframe resource pool on wait channel
 * w_chan, pre-priming it with xmin items (which is also the low
 * watermark) and capping it at xmax items.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3587 
   3588 /*
   3589  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3590  * to see if there is IO pending and if that IO could possibly be done
   3591  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3592  * otherwise.
   3593  *
   3594  */
   3595 int
   3596 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3597 {
   3598 	struct raid_softc *rs;
   3599 	struct dk_softc *dksc;
   3600 
   3601 	rs = raidPtr->softc;
   3602 	dksc = &rs->sc_dksc;
   3603 
   3604 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3605 		return 1;
   3606 
   3607 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3608 		/* there is work to do */
   3609 		return 0;
   3610 	}
   3611 	/* default is nothing to do */
   3612 	return 1;
   3613 }
   3614 
   3615 int
   3616 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3617 {
   3618 	uint64_t numsecs;
   3619 	unsigned secsize;
   3620 	int error;
   3621 
   3622 	error = getdisksize(vp, &numsecs, &secsize);
   3623 	if (error == 0) {
   3624 		diskPtr->blockSize = secsize;
   3625 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3626 		diskPtr->partitionSize = numsecs;
   3627 		return 0;
   3628 	}
   3629 	return error;
   3630 }
   3631 
   3632 static int
   3633 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3634 {
   3635 	return 1;
   3636 }
   3637 
/*
 * raid_attach: autoconf attach hook.  Intentionally empty; softc
 * setup is handled elsewhere (see raidget()/raidinit()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3642 
   3643 
   3644 static int
   3645 raid_detach(device_t self, int flags)
   3646 {
   3647 	int error;
   3648 	struct raid_softc *rs = raidsoftc(self);
   3649 
   3650 	if (rs == NULL)
   3651 		return ENXIO;
   3652 
   3653 	if ((error = raidlock(rs)) != 0)
   3654 		return (error);
   3655 
   3656 	error = raid_detach_unlocked(rs);
   3657 
   3658 	raidunlock(rs);
   3659 
   3660 	/* XXX raid can be referenced here */
   3661 
   3662 	if (error)
   3663 		return error;
   3664 
   3665 	/* Free the softc */
   3666 	raidput(rs);
   3667 
   3668 	return 0;
   3669 }
   3670 
/*
 * rf_set_geometry: publish a synthetic disk geometry for the RAID
 * pseudo-disk.  Only total size, sector size, sectors-per-track and
 * track count are filled in; the geometry is fictitious.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	/* "sectors per track" := data sectors per stripe */
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count, scaled by the number of columns */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3686 
   3687 /*
   3688  * Get cache info for all the components (including spares).
   3689  * Returns intersection of all the cache flags of all disks, or first
   3690  * error if any encountered.
   3691  * XXXfua feature flags can change as spares are added - lock down somehow
   3692  */
   3693 static int
   3694 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3695 {
   3696 	int c;
   3697 	int error;
   3698 	int dkwhole = 0, dkpart;
   3699 
   3700 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3701 		/*
   3702 		 * Check any non-dead disk, even when currently being
   3703 		 * reconstructed.
   3704 		 */
   3705 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3706 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3707 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3708 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3709 			if (error) {
   3710 				if (error != ENODEV) {
   3711 					printf("raid%d: get cache for component %s failed\n",
   3712 					    raidPtr->raidid,
   3713 					    raidPtr->Disks[c].devname);
   3714 				}
   3715 
   3716 				return error;
   3717 			}
   3718 
   3719 			if (c == 0)
   3720 				dkwhole = dkpart;
   3721 			else
   3722 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3723 		}
   3724 	}
   3725 
   3726 	*data = dkwhole;
   3727 
   3728 	return 0;
   3729 }
   3730 
   3731 /*
   3732  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3733  * We end up returning whatever error was returned by the first cache flush
   3734  * that fails.
   3735  */
   3736 
   3737 int
   3738 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3739 {
   3740 	int c, sparecol;
   3741 	int e,error;
   3742 	int force = 1;
   3743 
   3744 	error = 0;
   3745 	for (c = 0; c < raidPtr->numCol; c++) {
   3746 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3747 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3748 					  &force, FWRITE, NOCRED);
   3749 			if (e) {
   3750 				if (e != ENODEV)
   3751 					printf("raid%d: cache flush to component %s failed.\n",
   3752 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3753 				if (error == 0) {
   3754 					error = e;
   3755 				}
   3756 			}
   3757 		}
   3758 	}
   3759 
   3760 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3761 		sparecol = raidPtr->numCol + c;
   3762 		/* Need to ensure that the reconstruct actually completed! */
   3763 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3764 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3765 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3766 			if (e) {
   3767 				if (e != ENODEV)
   3768 					printf("raid%d: cache flush to component %s failed.\n",
   3769 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3770 				if (error == 0) {
   3771 					error = e;
   3772 				}
   3773 			}
   3774 		}
   3775 	}
   3776 	return error;
   3777 }
   3778 
   3779 /*
   3780  * Module interface
   3781  */
   3782 
   3783 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3784 
   3785 #ifdef _MODULE
   3786 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3787 #endif
   3788 
   3789 static int raid_modcmd(modcmd_t, void *);
   3790 static int raid_modcmd_init(void);
   3791 static int raid_modcmd_fini(void);
   3792 
   3793 static int
   3794 raid_modcmd(modcmd_t cmd, void *data)
   3795 {
   3796 	int error;
   3797 
   3798 	error = 0;
   3799 	switch (cmd) {
   3800 	case MODULE_CMD_INIT:
   3801 		error = raid_modcmd_init();
   3802 		break;
   3803 	case MODULE_CMD_FINI:
   3804 		error = raid_modcmd_fini();
   3805 		break;
   3806 	default:
   3807 		error = ENOTTY;
   3808 		break;
   3809 	}
   3810 	return error;
   3811 }
   3812 
/*
 * One-time module initialization: create the global raid_lock, attach
 * the block/character device switch entries and the autoconf driver
 * glue, boot the RAIDframe core, and register a finalizer that will
 * auto-configure RAID sets after device discovery.  Each attach step
 * that fails rolls back the earlier ones before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state used by the spare-table request path. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors; EEXIST (already attached,
	 * e.g. built-in driver) is tolerated and execution continues. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Undo the previous attaches in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is necessarily 0 here — every non-zero
	 * path above returned — so this check is redundant but harmless.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Failure to register the finalizer is non-fatal. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3883 
/*
 * Module teardown, the mirror image of raid_modcmd_init(): refuse to
 * unload while any raid unit exists, then detach the autoconf glue and
 * device switch entries, shut down the RAIDframe core and destroy the
 * global lock.  If a detach step fails, the steps already undone are
 * re-attached so the module is left in a consistent, loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach removed above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detaches, in reverse order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All glue detached: shut the RAIDframe core down. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3933