Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.349.4.1
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.349.4.1 2017/04/27 05:36:36 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.349.4.1 2017/04/27 05:36:36 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/localcount.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #include "ioconf.h"
    157 
    158 #ifdef DEBUG
    159 int     rf_kdebug_level = 0;
    160 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    161 #else				/* DEBUG */
    162 #define db1_printf(a) { }
    163 #endif				/* DEBUG */
    164 
    165 #ifdef DEBUG_ROOT
    166 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    167 #else
    168 #define DPRINTF(a, ...)
    169 #endif
    170 
    171 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    172 static rf_declare_mutex2(rf_sparet_wait_mutex);
    173 static rf_declare_cond2(rf_sparet_wait_cv);
    174 static rf_declare_cond2(rf_sparet_resp_cv);
    175 
    176 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    177 						 * spare table */
    178 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    179 						 * installation process */
    180 #endif
    181 
    182 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    183 
    184 /* prototypes */
    185 static void KernelWakeupFunc(struct buf *);
    186 static void InitBP(struct buf *, struct vnode *, unsigned,
    187     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    188     void *, int, struct proc *);
    189 struct raid_softc;
    190 static void raidinit(struct raid_softc *);
    191 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    192 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    193 
    194 static int raid_match(device_t, cfdata_t, void *);
    195 static void raid_attach(device_t, device_t, void *);
    196 static int raid_detach(device_t, int);
    197 
    198 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    199     daddr_t, daddr_t);
    200 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    201     daddr_t, daddr_t, int);
    202 
    203 static int raidwrite_component_label(unsigned,
    204     dev_t, struct vnode *, RF_ComponentLabel_t *);
    205 static int raidread_component_label(unsigned,
    206     dev_t, struct vnode *, RF_ComponentLabel_t *);
    207 
    208 static int raid_diskstart(device_t, struct buf *bp);
    209 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    210 static int raid_lastclose(device_t);
    211 
    212 static dev_type_open(raidopen);
    213 static dev_type_close(raidclose);
    214 static dev_type_read(raidread);
    215 static dev_type_write(raidwrite);
    216 static dev_type_ioctl(raidioctl);
    217 static dev_type_strategy(raidstrategy);
    218 static dev_type_dump(raiddump);
    219 static dev_type_size(raidsize);
    220 
/*
 * Block device switch for the raid block nodes.  All real work is done
 * by the raid* entry points below, mostly via the dk(9) framework.
 */
const struct bdevsw raid_bdevsw = {
	DEVSW_MODULE_INIT
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    232 
/*
 * Character (raw) device switch for the raid nodes; shares open/close/
 * ioctl with the block side and adds read/write entry points.
 */
const struct cdevsw raid_cdevsw = {
	DEVSW_MODULE_INIT
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    248 
/* dk(9) driver hooks: how the generic disk layer calls back into RAIDframe. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    258 
/* Per-unit software state for one configured (or configuring) RAID set. */
struct raid_softc {
	struct dk_softc sc_dksc; /* generic dk(9) state; must be first (see raidsoftc) */
	int	sc_unit; /* raid unit number (raidN) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r; /* the RAIDframe per-array state proper */
	LIST_ENTRY(raid_softc) sc_link; /* entry on the global 'raids' list */
};
    271 /* sc_flags */
    272 #define RAIDF_INITED		0x01	/* unit has been initialized */
    273 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    274 #define RAIDF_DETACH  		0x04	/* detach after final close */
    275 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    276 #define RAIDF_LOCKED		0x10	/* unit is locked */
    277 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    278 
    279 #define	raidunit(x)	DISKUNIT(x)
    280 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    281 
    282 extern struct cfdriver raid_cd;
    283 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    284     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    285     DVF_DETACH_SHUTDOWN);
    286 
    287 /*
    288  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    289  * Be aware that large numbers can allow the driver to consume a lot of
    290  * kernel memory, especially on writes, and in degraded mode reads.
    291  *
    292  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    293  * a single 64K write will typically require 64K for the old data,
    294  * 64K for the old parity, and 64K for the new parity, for a total
    295  * of 192K (if the parity buffer is not re-used immediately).
    296  * Even it if is used immediately, that's still 128K, which when multiplied
    297  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    298  *
    299  * Now in degraded mode, for example, a 64K read on the above setup may
    300  * require data reconstruction, which will require *all* of the 4 remaining
    301  * disks to participate -- 4 * 32K/disk == 128K again.
    302  */
    303 
    304 #ifndef RAIDOUTSTANDING
    305 #define RAIDOUTSTANDING   6
    306 #endif
    307 
    308 #define RAIDLABELDEV(dev)	\
    309 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    310 
    311 /* declared here, and made public, for the benefit of KVM stuff.. */
    312 
    313 static int raidlock(struct raid_softc *);
    314 static void raidunlock(struct raid_softc *);
    315 
    316 static int raid_detach_unlocked(struct raid_softc *);
    317 
    318 static void rf_markalldirty(RF_Raid_t *);
    319 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    320 
    321 void rf_ReconThread(struct rf_recon_req *);
    322 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    323 void rf_CopybackThread(RF_Raid_t *raidPtr);
    324 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    325 int rf_autoconfig(device_t);
    326 void rf_buildroothack(RF_ConfigSet_t *);
    327 
    328 RF_AutoConfig_t *rf_find_raid_components(void);
    329 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    330 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    331 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    332 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    333 int rf_set_autoconfig(RF_Raid_t *, int);
    334 int rf_set_rootpartition(RF_Raid_t *, int);
    335 void rf_release_all_vps(RF_ConfigSet_t *);
    336 void rf_cleanup_config_set(RF_ConfigSet_t *);
    337 int rf_have_enough_components(RF_ConfigSet_t *);
    338 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    339 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    340 
    341 /*
    342  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    343  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    344  * in the kernel config file.
    345  */
    346 #ifdef RAID_AUTOCONFIG
    347 int raidautoconfig = 1;
    348 #else
    349 int raidautoconfig = 0;
    350 #endif
    351 static bool raidautoconfigdone = false;
    352 
    353 struct RF_Pools_s rf_pools;
    354 
    355 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    356 static kmutex_t raid_lock;
    357 
    358 static struct raid_softc *
    359 raidcreate(int unit) {
    360 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    361 	if (sc == NULL) {
    362 #ifdef DIAGNOSTIC
    363 		printf("%s: out of memory\n", __func__);
    364 #endif
    365 		return NULL;
    366 	}
    367 	sc->sc_unit = unit;
    368 	cv_init(&sc->sc_cv, "raidunit");
    369 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    370 	return sc;
    371 }
    372 
    373 static void
    374 raiddestroy(struct raid_softc *sc) {
    375 	cv_destroy(&sc->sc_cv);
    376 	mutex_destroy(&sc->sc_mutex);
    377 	kmem_free(sc, sizeof(*sc));
    378 }
    379 
    380 static struct raid_softc *
    381 raidget(int unit, bool create) {
    382 	struct raid_softc *sc;
    383 	if (unit < 0) {
    384 #ifdef DIAGNOSTIC
    385 		panic("%s: unit %d!", __func__, unit);
    386 #endif
    387 		return NULL;
    388 	}
    389 	mutex_enter(&raid_lock);
    390 	LIST_FOREACH(sc, &raids, sc_link) {
    391 		if (sc->sc_unit == unit) {
    392 			mutex_exit(&raid_lock);
    393 			return sc;
    394 		}
    395 	}
    396 	mutex_exit(&raid_lock);
    397 	if (!create)
    398 		return NULL;
    399 	if ((sc = raidcreate(unit)) == NULL)
    400 		return NULL;
    401 	mutex_enter(&raid_lock);
    402 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    403 	mutex_exit(&raid_lock);
    404 	return sc;
    405 }
    406 
    407 static void
    408 raidput(struct raid_softc *sc) {
    409 	mutex_enter(&raid_lock);
    410 	LIST_REMOVE(sc, sc_link);
    411 	mutex_exit(&raid_lock);
    412 	raiddestroy(sc);
    413 }
    414 
void
raidattach(int num)
{

	/*
	 * Legacy pseudo-device attach hook; intentionally empty.
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    424 
    425 int
    426 rf_autoconfig(device_t self)
    427 {
    428 	RF_AutoConfig_t *ac_list;
    429 	RF_ConfigSet_t *config_sets;
    430 
    431 	if (!raidautoconfig || raidautoconfigdone == true)
    432 		return (0);
    433 
    434 	/* XXX This code can only be run once. */
    435 	raidautoconfigdone = true;
    436 
    437 #ifdef __HAVE_CPU_BOOTCONF
    438 	/*
    439 	 * 0. find the boot device if needed first so we can use it later
    440 	 * this needs to be done before we autoconfigure any raid sets,
    441 	 * because if we use wedges we are not going to be able to open
    442 	 * the boot device later
    443 	 */
    444 	if (booted_device == NULL)
    445 		cpu_bootconf();
    446 #endif
    447 	/* 1. locate all RAID components on the system */
    448 	aprint_debug("Searching for RAID components...\n");
    449 	ac_list = rf_find_raid_components();
    450 
    451 	/* 2. Sort them into their respective sets. */
    452 	config_sets = rf_create_auto_sets(ac_list);
    453 
    454 	/*
    455 	 * 3. Evaluate each set and configure the valid ones.
    456 	 * This gets done in rf_buildroothack().
    457 	 */
    458 	rf_buildroothack(config_sets);
    459 
    460 	return 1;
    461 }
    462 
/*
 * Return non-zero if any component of RAID set 'r' lives on the boot
 * device 'bdv' (or, for dk wedges, on the wedge's parent disk).
 *
 * NOTE(review): the final strncmp() is a prefix match on the boot
 * device name, so boot device "wd1" would also match a component on
 * "wd10" -- confirm whether that can arise in practice.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* Skip the leading "/dev/" in the component name. */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* A wedge: compare against its parent disk instead. */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    486 
    487 void
    488 rf_buildroothack(RF_ConfigSet_t *config_sets)
    489 {
    490 	RF_ConfigSet_t *cset;
    491 	RF_ConfigSet_t *next_cset;
    492 	int num_root;
    493 	struct raid_softc *sc, *rsc;
    494 	struct dk_softc *dksc;
    495 
    496 	sc = rsc = NULL;
    497 	num_root = 0;
    498 	cset = config_sets;
    499 	while (cset != NULL) {
    500 		next_cset = cset->next;
    501 		if (rf_have_enough_components(cset) &&
    502 		    cset->ac->clabel->autoconfigure == 1) {
    503 			sc = rf_auto_config_set(cset);
    504 			if (sc != NULL) {
    505 				aprint_debug("raid%d: configured ok\n",
    506 				    sc->sc_unit);
    507 				if (cset->rootable) {
    508 					rsc = sc;
    509 					num_root++;
    510 				}
    511 			} else {
    512 				/* The autoconfig didn't work :( */
    513 				aprint_debug("Autoconfig failed\n");
    514 				rf_release_all_vps(cset);
    515 			}
    516 		} else {
    517 			/* we're not autoconfiguring this set...
    518 			   release the associated resources */
    519 			rf_release_all_vps(cset);
    520 		}
    521 		/* cleanup */
    522 		rf_cleanup_config_set(cset);
    523 		cset = next_cset;
    524 	}
    525 	dksc = &rsc->sc_dksc;
    526 
    527 	/* if the user has specified what the root device should be
    528 	   then we don't touch booted_device or boothowto... */
    529 
    530 	if (rootspec != NULL)
    531 		return;
    532 
    533 	/* we found something bootable... */
    534 
    535 	/*
    536 	 * XXX: The following code assumes that the root raid
    537 	 * is the first ('a') partition. This is about the best
    538 	 * we can do with a BSD disklabel, but we might be able
    539 	 * to do better with a GPT label, by setting a specified
    540 	 * attribute to indicate the root partition. We can then
    541 	 * stash the partition number in the r->root_partition
    542 	 * high bits (the bottom 2 bits are already used). For
    543 	 * now we just set booted_partition to 0 when we override
    544 	 * root.
    545 	 */
    546 	if (num_root == 1) {
    547 		device_t candidate_root;
    548 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    549 			char cname[sizeof(cset->ac->devname)];
    550 			/* XXX: assume partition 'a' first */
    551 			snprintf(cname, sizeof(cname), "%s%c",
    552 			    device_xname(dksc->sc_dev), 'a');
    553 			candidate_root = dkwedge_find_by_wname(cname);
    554 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    555 			    cname);
    556 			if (candidate_root == NULL) {
    557 				/*
    558 				 * If that is not found, because we don't use
    559 				 * disklabel, return the first dk child
    560 				 * XXX: we can skip the 'a' check above
    561 				 * and always do this...
    562 				 */
    563 				size_t i = 0;
    564 				candidate_root = dkwedge_find_by_parent(
    565 				    device_xname(dksc->sc_dev), &i);
    566 			}
    567 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    568 			    candidate_root);
    569 		} else
    570 			candidate_root = dksc->sc_dev;
    571 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    572 		DPRINTF("%s: booted_device=%p root_partition=%d "
    573 		   "contains_boot=%d\n", __func__, booted_device,
    574 		   rsc->sc_r.root_partition,
    575 		   rf_containsboot(&rsc->sc_r, booted_device));
    576 		if (booted_device == NULL ||
    577 		    rsc->sc_r.root_partition == 1 ||
    578 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    579 			booted_device = candidate_root;
    580 			booted_partition = 0;	/* XXX assume 'a' */
    581 		}
    582 	} else if (num_root > 1) {
    583 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    584 		    booted_device);
    585 
    586 		/*
    587 		 * Maybe the MD code can help. If it cannot, then
    588 		 * setroot() will discover that we have no
    589 		 * booted_device and will ask the user if nothing was
    590 		 * hardwired in the kernel config file
    591 		 */
    592 		if (booted_device == NULL)
    593 			return;
    594 
    595 		num_root = 0;
    596 		mutex_enter(&raid_lock);
    597 		LIST_FOREACH(sc, &raids, sc_link) {
    598 			RF_Raid_t *r = &sc->sc_r;
    599 			if (r->valid == 0)
    600 				continue;
    601 
    602 			if (r->root_partition == 0)
    603 				continue;
    604 
    605 			if (rf_containsboot(r, booted_device)) {
    606 				num_root++;
    607 				rsc = sc;
    608 				dksc = &rsc->sc_dksc;
    609 			}
    610 		}
    611 		mutex_exit(&raid_lock);
    612 
    613 		if (num_root == 1) {
    614 			booted_device = dksc->sc_dev;
    615 			booted_partition = 0;	/* XXX assume 'a' */
    616 		} else {
    617 			/* we can't guess.. require the user to answer... */
    618 			boothowto |= RB_ASKNAME;
    619 		}
    620 	}
    621 }
    622 
    623 static int
    624 raidsize(dev_t dev)
    625 {
    626 	struct raid_softc *rs;
    627 	struct dk_softc *dksc;
    628 	unsigned int unit;
    629 
    630 	unit = raidunit(dev);
    631 	if ((rs = raidget(unit, false)) == NULL)
    632 		return -1;
    633 	dksc = &rs->sc_dksc;
    634 
    635 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    636 		return -1;
    637 
    638 	return dk_size(dksc, dev);
    639 }
    640 
    641 static int
    642 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    643 {
    644 	unsigned int unit;
    645 	struct raid_softc *rs;
    646 	struct dk_softc *dksc;
    647 
    648 	unit = raidunit(dev);
    649 	if ((rs = raidget(unit, false)) == NULL)
    650 		return ENXIO;
    651 	dksc = &rs->sc_dksc;
    652 
    653 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    654 		return ENODEV;
    655 
    656         /*
    657            Note that blkno is relative to this particular partition.
    658            By adding adding RF_PROTECTED_SECTORS, we get a value that
    659 	   is relative to the partition used for the underlying component.
    660         */
    661 	blkno += RF_PROTECTED_SECTORS;
    662 
    663 	return dk_dump(dksc, dev, blkno, va, size);
    664 }
    665 
/*
 * dk(9) dumpblocks hook: write nblk blocks from va starting at blkno
 * to a single live component of this set.  Only RAID 1 layouts (one
 * data column, one parity column) are supported, since each component
 * then carries a complete copy of the data.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: take the first optimal (live) component, if any. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Hold a reference on the component's bdevsw across the dump. */
	bdev = bdevsw_lookup_acquire(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);
	bdevsw_release(bdev);
out:
	raidunlock(rs);

	return error;
}
    771 
    772 /* ARGSUSED */
    773 static int
    774 raidopen(dev_t dev, int flags, int fmt,
    775     struct lwp *l)
    776 {
    777 	int     unit = raidunit(dev);
    778 	struct raid_softc *rs;
    779 	struct dk_softc *dksc;
    780 	int     error = 0;
    781 	int     part, pmask;
    782 
    783 	if ((rs = raidget(unit, true)) == NULL)
    784 		return ENXIO;
    785 	if ((error = raidlock(rs)) != 0)
    786 		return (error);
    787 
    788 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    789 		error = EBUSY;
    790 		goto bad;
    791 	}
    792 
    793 	dksc = &rs->sc_dksc;
    794 
    795 	part = DISKPART(dev);
    796 	pmask = (1 << part);
    797 
    798 	if (!DK_BUSY(dksc, pmask) &&
    799 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    800 		/* First one... mark things as dirty... Note that we *MUST*
    801 		 have done a configure before this.  I DO NOT WANT TO BE
    802 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    803 		 THAT THEY BELONG TOGETHER!!!!! */
    804 		/* XXX should check to see if we're only open for reading
    805 		   here... If so, we needn't do this, but then need some
    806 		   other way of keeping track of what's happened.. */
    807 
    808 		rf_markalldirty(&rs->sc_r);
    809 	}
    810 
    811 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    812 		error = dk_open(dksc, dev, flags, fmt, l);
    813 
    814 bad:
    815 	raidunlock(rs);
    816 
    817 	return (error);
    818 
    819 
    820 }
    821 
    822 static int
    823 raid_lastclose(device_t self)
    824 {
    825 	struct raid_softc *rs = raidsoftc(self);
    826 
    827 	/* Last one... device is not unconfigured yet.
    828 	   Device shutdown has taken care of setting the
    829 	   clean bits if RAIDF_INITED is not set
    830 	   mark things as clean... */
    831 
    832 	rf_update_component_labels(&rs->sc_r,
    833 	    RF_FINAL_COMPONENT_UPDATE);
    834 
    835 	/* pass to unlocked code */
    836 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    837 		rs->sc_flags |= RAIDF_DETACH;
    838 
    839 	return 0;
    840 }
    841 
    842 /* ARGSUSED */
    843 static int
    844 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    845 {
    846 	int     unit = raidunit(dev);
    847 	struct raid_softc *rs;
    848 	struct dk_softc *dksc;
    849 	cfdata_t cf;
    850 	int     error = 0, do_detach = 0, do_put = 0;
    851 
    852 	if ((rs = raidget(unit, false)) == NULL)
    853 		return ENXIO;
    854 	dksc = &rs->sc_dksc;
    855 
    856 	if ((error = raidlock(rs)) != 0)
    857 		return (error);
    858 
    859 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    860 		error = dk_close(dksc, dev, flags, fmt, l);
    861 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    862 			do_detach = 1;
    863 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    864 		do_put = 1;
    865 
    866 	raidunlock(rs);
    867 
    868 	if (do_detach) {
    869 		/* free the pseudo device attach bits */
    870 		cf = device_cfdata(dksc->sc_dev);
    871 		error = config_detach(dksc->sc_dev, 0);
    872 		if (error == 0)
    873 			free(cf, M_RAIDFRAME);
    874 	} else if (do_put) {
    875 		raidput(rs);
    876 	}
    877 
    878 	return (error);
    879 
    880 }
    881 
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	/* Signal iodone_cv (under its lock) so queued work -- see
	   raidstrategy() and raiddone() -- gets picked up. */
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    889 
    890 static void
    891 raidstrategy(struct buf *bp)
    892 {
    893 	unsigned int unit;
    894 	struct raid_softc *rs;
    895 	struct dk_softc *dksc;
    896 	RF_Raid_t *raidPtr;
    897 
    898 	unit = raidunit(bp->b_dev);
    899 	if ((rs = raidget(unit, false)) == NULL) {
    900 		bp->b_error = ENXIO;
    901 		goto fail;
    902 	}
    903 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    904 		bp->b_error = ENXIO;
    905 		goto fail;
    906 	}
    907 	dksc = &rs->sc_dksc;
    908 	raidPtr = &rs->sc_r;
    909 
    910 	/* Queue IO only */
    911 	if (dk_strategy_defer(dksc, bp))
    912 		goto done;
    913 
    914 	/* schedule the IO to happen at the next convenient time */
    915 	raid_wakeup(raidPtr);
    916 
    917 done:
    918 	return;
    919 
    920 fail:
    921 	bp->b_resid = bp->b_bcount;
    922 	biodone(bp);
    923 }
    924 
    925 static int
    926 raid_diskstart(device_t dev, struct buf *bp)
    927 {
    928 	struct raid_softc *rs = raidsoftc(dev);
    929 	RF_Raid_t *raidPtr;
    930 
    931 	raidPtr = &rs->sc_r;
    932 	if (!raidPtr->valid) {
    933 		db1_printf(("raid is not valid..\n"));
    934 		return ENODEV;
    935 	}
    936 
    937 	/* XXX */
    938 	bp->b_resid = 0;
    939 
    940 	return raiddoaccess(raidPtr, bp);
    941 }
    942 
    943 void
    944 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    945 {
    946 	struct raid_softc *rs;
    947 	struct dk_softc *dksc;
    948 
    949 	rs = raidPtr->softc;
    950 	dksc = &rs->sc_dksc;
    951 
    952 	dk_done(dksc, bp);
    953 
    954 	rf_lock_mutex2(raidPtr->mutex);
    955 	raidPtr->openings++;
    956 	rf_unlock_mutex2(raidPtr->mutex);
    957 
    958 	/* schedule more IO */
    959 	raid_wakeup(raidPtr);
    960 }
    961 
    962 /* ARGSUSED */
    963 static int
    964 raidread(dev_t dev, struct uio *uio, int flags)
    965 {
    966 	int     unit = raidunit(dev);
    967 	struct raid_softc *rs;
    968 
    969 	if ((rs = raidget(unit, false)) == NULL)
    970 		return ENXIO;
    971 
    972 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    973 		return (ENXIO);
    974 
    975 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    976 
    977 }
    978 
    979 /* ARGSUSED */
    980 static int
    981 raidwrite(dev_t dev, struct uio *uio, int flags)
    982 {
    983 	int     unit = raidunit(dev);
    984 	struct raid_softc *rs;
    985 
    986 	if ((rs = raidget(unit, false)) == NULL)
    987 		return ENXIO;
    988 
    989 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    990 		return (ENXIO);
    991 
    992 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    993 
    994 }
    995 
/*
 * Tear down an initialized raid unit: shut down RAIDframe, drain and
 * free the buffer queue, and detach the disk from the dk layer.
 * Fails with EBUSY while the device is open or a reconstruction,
 * parity rewrite or copyback is still in progress.
 * NOTE(review): the caller appears responsible for serialization
 * (the unit lock is not taken here) -- confirm against call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while the unit is busy or rebuilding. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the unit was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1033 
/*
 * raidioctl: dispatch RAIDframe-specific ioctls (configuration,
 * shutdown, component-label manipulation, fail/rebuild/copyback
 * control and their status queries) for one raid unit.  Commands
 * not handled here fall through to a small set of generic disk
 * ioctls and finally to dk_ioctl().
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		/* COMPAT_50 entry joins the common path at "config:" below. */
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		/* Common configuration path: k_cfg is a kernel copy here. */
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Only mark for shutdown if nothing is using the set. */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		/* The scratch copy is done with; copy out the in-core label. */
		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* The rewrite proceeds asynchronously in its own thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented; returns the (zero) retcode. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Validate the target disk state under the raid mutex. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The request lives past this ioctl; the recon thread frees it. */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares follow the data columns in the Disks array. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		/* NOTE(review): retcode from RF_CREATE_THREAD is assigned
		   but 0 is returned unconditionally here -- a thread
		   creation failure is not reported to the caller. */
		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is optimal. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1810 
   1811 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach a pseudo-device instance, hook the unit
   into the dk(4)/disk(9) layers, and mark it ready for use.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device: build a cfdata record by hand and
	 * hand it to autoconf */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* FCFS queue: RAIDframe does its own scheduling below this layer */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan the new unit for wedges */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1871 
   1872 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* queue the request and wake the userland spare-table daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 drops the mutex while asleep and reacquires it
	 * before returning (historically this was mpsleep) */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon reports its status back in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1905 #endif
   1906 
   1907 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1908  * bp & passes it down.
   1909  * any calls originating in the kernel must use non-blocking I/O
   1910  * do some extra sanity checking to return "appropriate" error values for
   1911  * certain conditions (to make some standard utilities work)
   1912  *
   1913  * Formerly known as: rf_DoAccessKernel
   1914  */
   1915 void
   1916 raidstart(RF_Raid_t *raidPtr)
   1917 {
   1918 	struct raid_softc *rs;
   1919 	struct dk_softc *dksc;
   1920 
   1921 	rs = raidPtr->softc;
   1922 	dksc = &rs->sc_dksc;
   1923 	/* quick check to see if anything has died recently */
   1924 	rf_lock_mutex2(raidPtr->mutex);
   1925 	if (raidPtr->numNewFailures > 0) {
   1926 		rf_unlock_mutex2(raidPtr->mutex);
   1927 		rf_update_component_labels(raidPtr,
   1928 					   RF_NORMAL_COMPONENT_UPDATE);
   1929 		rf_lock_mutex2(raidPtr->mutex);
   1930 		raidPtr->numNewFailures--;
   1931 	}
   1932 	rf_unlock_mutex2(raidPtr->mutex);
   1933 
   1934 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1935 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1936 		return;
   1937 	}
   1938 
   1939 	dk_start(dksc, NULL);
   1940 }
   1941 
/*
 * Translate a struct buf into a RAIDframe access and submit it via
 * rf_DoAccess().  Returns 0 when the I/O was accepted, EAGAIN when the
 * array currently has no free openings (caller retries later), or
 * ENOSPC when the request falls outside the array or is not a whole
 * number of sectors.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* NOTE(review): this check and the openings-- below are separate
	 * critical sections; presumably submissions are serialized by the
	 * caller so openings cannot go negative -- confirm. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* transfer length in sectors, plus one for a trailing partial
	 * sector (rejected below anyway) */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		/* NOTE(review): the "1 ||" forces this branch; db1_printf
		 * is presumably compiled out in non-debug builds -- confirm */
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject accesses past the end of the array; the "sum <" tests
	 * catch arithmetic wrap-around */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject transfers that are not a whole number of sectors */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening for this access */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2014 
   2015 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2016 
   2017 int
   2018 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2019 {
   2020 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2021 	struct buf *bp;
   2022 
   2023 	req->queue = queue;
   2024 	bp = req->bp;
   2025 
   2026 	switch (req->type) {
   2027 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2028 		/* XXX need to do something extra here.. */
   2029 		/* I'm leaving this in, as I've never actually seen it used,
   2030 		 * and I'd like folks to report it... GO */
   2031 		printf(("WAKEUP CALLED\n"));
   2032 		queue->numOutstanding++;
   2033 
   2034 		bp->b_flags = 0;
   2035 		bp->b_private = req;
   2036 
   2037 		KernelWakeupFunc(bp);
   2038 		break;
   2039 
   2040 	case RF_IO_TYPE_READ:
   2041 	case RF_IO_TYPE_WRITE:
   2042 #if RF_ACC_TRACE > 0
   2043 		if (req->tracerec) {
   2044 			RF_ETIMER_START(req->tracerec->timer);
   2045 		}
   2046 #endif
   2047 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2048 		    op, queue->rf_cinfo->ci_dev,
   2049 		    req->sectorOffset, req->numSector,
   2050 		    req->buf, KernelWakeupFunc, (void *) req,
   2051 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2052 
   2053 		if (rf_debugKernelAccess) {
   2054 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2055 				(long) bp->b_blkno));
   2056 		}
   2057 		queue->numOutstanding++;
   2058 		queue->last_deq_sector = req->sectorOffset;
   2059 		/* acc wouldn't have been let in if there were any pending
   2060 		 * reqs at any other priority */
   2061 		queue->curPriority = req->priority;
   2062 
   2063 		db1_printf(("Going for %c to unit %d col %d\n",
   2064 			    req->type, queue->raidPtr->raidid,
   2065 			    queue->col));
   2066 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2067 			(int) req->sectorOffset, (int) req->numSector,
   2068 			(int) (req->numSector <<
   2069 			    queue->raidPtr->logBytesPerSector),
   2070 			(int) queue->raidPtr->logBytesPerSector));
   2071 
   2072 		/*
   2073 		 * XXX: drop lock here since this can block at
   2074 		 * least with backing SCSI devices.  Retake it
   2075 		 * to minimize fuss with calling interfaces.
   2076 		 */
   2077 
   2078 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2079 		bdev_strategy(bp);
   2080 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2081 		break;
   2082 
   2083 	default:
   2084 		panic("bad req->type in rf_DispatchKernelIO");
   2085 	}
   2086 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2087 
   2088 	return (0);
   2089 }
/* this is the callback function associated with a I/O invoked from
   kernel code.  Runs at biodone time: accounts trace timing, fails the
   component on error (when the set can tolerate it), and hands the
   completed request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by rf_DispatchKernelIO()
	 * via InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update from
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2159 
   2160 
   2161 /*
   2162  * initialize a buf structure for doing an I/O in the kernel.
   2163  */
   2164 static void
   2165 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2166        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2167        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2168        struct proc *b_proc)
   2169 {
   2170 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2171 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2172 	bp->b_oflags = 0;
   2173 	bp->b_cflags = 0;
   2174 	bp->b_bcount = numSect << logBytesPerSector;
   2175 	bp->b_bufsize = bp->b_bcount;
   2176 	bp->b_error = 0;
   2177 	bp->b_dev = dev;
   2178 	bp->b_data = bf;
   2179 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2180 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2181 	if (bp->b_bcount == 0) {
   2182 		panic("bp->b_bcount is zero in InitBP!!");
   2183 	}
   2184 	bp->b_proc = b_proc;
   2185 	bp->b_iodone = cbFunc;
   2186 	bp->b_private = cbArg;
   2187 }
   2188 
   2189 /*
   2190  * Wait interruptibly for an exclusive lock.
   2191  *
   2192  * XXX
   2193  * Several drivers do this; it should be abstracted and made MP-safe.
   2194  * (Hmm... where have we seen this warning before :->  GO )
   2195  */
   2196 static int
   2197 raidlock(struct raid_softc *rs)
   2198 {
   2199 	int     error;
   2200 
   2201 	error = 0;
   2202 	mutex_enter(&rs->sc_mutex);
   2203 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2204 		rs->sc_flags |= RAIDF_WANTED;
   2205 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2206 		if (error != 0)
   2207 			goto done;
   2208 	}
   2209 	rs->sc_flags |= RAIDF_LOCKED;
   2210 done:
   2211 	mutex_exit(&rs->sc_mutex);
   2212 	return (error);
   2213 }
   2214 /*
   2215  * Unlock and wake up any waiters.
   2216  */
   2217 static void
   2218 raidunlock(struct raid_softc *rs)
   2219 {
   2220 
   2221 	mutex_enter(&rs->sc_mutex);
   2222 	rs->sc_flags &= ~RAIDF_LOCKED;
   2223 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2224 		rs->sc_flags &= ~RAIDF_WANTED;
   2225 		cv_broadcast(&rs->sc_cv);
   2226 	}
   2227 	mutex_exit(&rs->sc_mutex);
   2228 }
   2229 
   2230 
   2231 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2232 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2233 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2234 
/*
 * Fixed byte offset of the component-info (label) area on each
 * component.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2241 
   2242 static daddr_t
   2243 rf_component_info_size(unsigned secsize)
   2244 {
   2245 	daddr_t info_size;
   2246 
   2247 	KASSERT(secsize);
   2248 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2249 		info_size = secsize;
   2250 	else
   2251 		info_size = RF_COMPONENT_INFO_SIZE;
   2252 
   2253 	return info_size;
   2254 }
   2255 
   2256 static daddr_t
   2257 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2258 {
   2259 	daddr_t map_offset;
   2260 
   2261 	KASSERT(raidPtr->bytesPerSector);
   2262 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2263 		map_offset = raidPtr->bytesPerSector;
   2264 	else
   2265 		map_offset = RF_COMPONENT_INFO_SIZE;
   2266 	map_offset += rf_component_info_offset();
   2267 
   2268 	return map_offset;
   2269 }
   2270 
   2271 static daddr_t
   2272 rf_parity_map_size(RF_Raid_t *raidPtr)
   2273 {
   2274 	daddr_t map_size;
   2275 
   2276 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2277 		map_size = raidPtr->bytesPerSector;
   2278 	else
   2279 		map_size = RF_PARITY_MAP_SIZE;
   2280 
   2281 	return map_size;
   2282 }
   2283 
   2284 int
   2285 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2286 {
   2287 	RF_ComponentLabel_t *clabel;
   2288 
   2289 	clabel = raidget_component_label(raidPtr, col);
   2290 	clabel->clean = RF_RAID_CLEAN;
   2291 	raidflush_component_label(raidPtr, col);
   2292 	return(0);
   2293 }
   2294 
   2295 
   2296 int
   2297 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2298 {
   2299 	RF_ComponentLabel_t *clabel;
   2300 
   2301 	clabel = raidget_component_label(raidPtr, col);
   2302 	clabel->clean = RF_RAID_DIRTY;
   2303 	raidflush_component_label(raidPtr, col);
   2304 	return(0);
   2305 }
   2306 
/*
 * Re-read this component's on-disk label into the in-core copy.
 * Returns 0 or an error from the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2316 
/*
 * Return a pointer to this component's in-core label (no I/O done).
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2322 
   2323 int
   2324 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2325 {
   2326 	RF_ComponentLabel_t *label;
   2327 
   2328 	label = &raidPtr->raid_cinfo[col].ci_label;
   2329 	label->mod_counter = raidPtr->mod_counter;
   2330 #ifndef RF_NO_PARITY_MAP
   2331 	label->parity_map_modcount = label->mod_counter;
   2332 #endif
   2333 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2334 	    raidPtr->Disks[col].dev,
   2335 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2336 }
   2337 
   2338 
   2339 static int
   2340 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2341     RF_ComponentLabel_t *clabel)
   2342 {
   2343 	return raidread_component_area(dev, b_vp, clabel,
   2344 	    sizeof(RF_ComponentLabel_t),
   2345 	    rf_component_info_offset(),
   2346 	    rf_component_info_size(secsize));
   2347 }
   2348 
/* ARGSUSED */
/*
 * Synchronously read `dsize' bytes at byte `offset' from the raw
 * component device, copying the first `msize' bytes into `data'.
 * Used for component labels and the parity map.  Returns 0 or an
 * error from biowait().
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue synchronously against the component's block device */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2386 
   2387 
   2388 static int
   2389 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2390     RF_ComponentLabel_t *clabel)
   2391 {
   2392 	return raidwrite_component_area(dev, b_vp, clabel,
   2393 	    sizeof(RF_ComponentLabel_t),
   2394 	    rf_component_info_offset(),
   2395 	    rf_component_info_size(secsize), 0);
   2396 }
   2397 
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' (zero-padded out to `dsize') at
 * byte `offset' on the raw component device.  If `asyncp' is set the
 * write is issued B_ASYNC and 0 is returned immediately -- the buf is
 * not released here in that case; NOTE(review): presumably the async
 * completion path releases it -- verify.  Otherwise waits for the
 * write and returns 0 or the biowait() error.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad the area beyond the caller's data */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2432 
   2433 void
   2434 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2435 {
   2436 	int c;
   2437 
   2438 	for (c = 0; c < raidPtr->numCol; c++) {
   2439 		/* Skip dead disks. */
   2440 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2441 			continue;
   2442 		/* XXXjld: what if an error occurs here? */
   2443 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2444 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2445 		    RF_PARITYMAP_NBYTE,
   2446 		    rf_parity_map_offset(raidPtr),
   2447 		    rf_parity_map_size(raidPtr), 0);
   2448 	}
   2449 }
   2450 
   2451 void
   2452 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2453 {
   2454 	struct rf_paritymap_ondisk tmp;
   2455 	int c,first;
   2456 
   2457 	first=1;
   2458 	for (c = 0; c < raidPtr->numCol; c++) {
   2459 		/* Skip dead disks. */
   2460 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2461 			continue;
   2462 		raidread_component_area(raidPtr->Disks[c].dev,
   2463 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2464 		    RF_PARITYMAP_NBYTE,
   2465 		    rf_parity_map_offset(raidPtr),
   2466 		    rf_parity_map_size(raidPtr));
   2467 		if (first) {
   2468 			memcpy(map, &tmp, sizeof(*map));
   2469 			first = 0;
   2470 		} else {
   2471 			rf_paritymap_merge(map, &tmp);
   2472 		}
   2473 	}
   2474 }
   2475 
/*
 * Bump the set's modification counter and mark every live component
 * (and every in-use spare) dirty on disk, so that an unclean shutdown
 * can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for.
			   NOTE(review): scol is not reset between spares;
			   a used spare with no matching spareCol would
			   inherit the previous iteration's value (or -1)
			   -- confirm every used spare has a match. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2535 
   2536 
/*
 * Push fresh component labels to every optimal component and every
 * in-use spare.  With final == RF_FINAL_COMPONENT_UPDATE and clean
 * parity, the components are additionally marked clean (shutdown
 * path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for.
			   NOTE(review): scol is not reset between spares;
			   see the matching note in rf_markalldirty(). */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2614 
   2615 void
   2616 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2617 {
   2618 
   2619 	if (vp != NULL) {
   2620 		if (auto_configured == 1) {
   2621 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2622 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2623 			vput(vp);
   2624 
   2625 		} else {
   2626 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2627 		}
   2628 	}
   2629 }
   2630 
   2631 
   2632 void
   2633 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2634 {
   2635 	int r,c;
   2636 	struct vnode *vp;
   2637 	int acd;
   2638 
   2639 
   2640 	/* We take this opportunity to close the vnodes like we should.. */
   2641 
   2642 	for (c = 0; c < raidPtr->numCol; c++) {
   2643 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2644 		acd = raidPtr->Disks[c].auto_configured;
   2645 		rf_close_component(raidPtr, vp, acd);
   2646 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2647 		raidPtr->Disks[c].auto_configured = 0;
   2648 	}
   2649 
   2650 	for (r = 0; r < raidPtr->numSpare; r++) {
   2651 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2652 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2653 		rf_close_component(raidPtr, vp, acd);
   2654 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2655 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2656 	}
   2657 }
   2658 
   2659 
   2660 void
   2661 rf_ReconThread(struct rf_recon_req *req)
   2662 {
   2663 	int     s;
   2664 	RF_Raid_t *raidPtr;
   2665 
   2666 	s = splbio();
   2667 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2668 	raidPtr->recon_in_progress = 1;
   2669 
   2670 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2671 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2672 
   2673 	RF_Free(req, sizeof(*req));
   2674 
   2675 	raidPtr->recon_in_progress = 0;
   2676 	splx(s);
   2677 
   2678 	/* That's all... */
   2679 	kthread_exit(0);	/* does not return */
   2680 }
   2681 
   2682 void
   2683 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2684 {
   2685 	int retcode;
   2686 	int s;
   2687 
   2688 	raidPtr->parity_rewrite_stripes_done = 0;
   2689 	raidPtr->parity_rewrite_in_progress = 1;
   2690 	s = splbio();
   2691 	retcode = rf_RewriteParity(raidPtr);
   2692 	splx(s);
   2693 	if (retcode) {
   2694 		printf("raid%d: Error re-writing parity (%d)!\n",
   2695 		    raidPtr->raidid, retcode);
   2696 	} else {
   2697 		/* set the clean bit!  If we shutdown correctly,
   2698 		   the clean bit on each component label will get
   2699 		   set */
   2700 		raidPtr->parity_good = RF_RAID_CLEAN;
   2701 	}
   2702 	raidPtr->parity_rewrite_in_progress = 0;
   2703 
   2704 	/* Anyone waiting for us to stop?  If so, inform them... */
   2705 	if (raidPtr->waitShutdown) {
   2706 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2707 	}
   2708 
   2709 	/* That's all... */
   2710 	kthread_exit(0);	/* does not return */
   2711 }
   2712 
   2713 
/*
 * Kernel thread body: copy reconstructed data back from a spare onto
 * a replaced component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	/* copyback_in_progress gates concurrent copyback requests */
	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2728 
   2729 
   2730 void
   2731 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2732 {
   2733 	int s;
   2734 	RF_Raid_t *raidPtr;
   2735 
   2736 	s = splbio();
   2737 	raidPtr = req->raidPtr;
   2738 	raidPtr->recon_in_progress = 1;
   2739 	rf_ReconstructInPlace(raidPtr, req->col);
   2740 	RF_Free(req, sizeof(*req));
   2741 	raidPtr->recon_in_progress = 0;
   2742 	splx(s);
   2743 
   2744 	/* That's all... */
   2745 	kthread_exit(0);	/* does not return */
   2746 }
   2747 
/*
 * Try to read a RAIDframe component label from the open raw device
 * (dev/vp).  If the label reads cleanly and passes the sanity checks,
 * allocate an RF_AutoConfig_t for it, prepend it to ac_list, and return
 * the new list head; the component keeps ownership of the open vnode.
 * Otherwise the vnode is closed and released and ac_list is returned
 * unchanged.  On memory exhaustion the entire accumulated ac_list is
 * torn down and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: free every component (and its label)
		       collected so far, then give up on autoconfig. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* No usable label: drop it and give the vnode back. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2805 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return them as a linked RF_AutoConfig_t list (built up via
 * rf_get_component()).  Wedges are scanned in a first pass and
 * everything else in a second pass, so a wedge covering a whole disk
 * wins over that disk's raw partition.  For non-wedge disks each
 * FS_RAID disklabel partition is probed; if none is found, the raw
 * partition itself is probed as a last resort.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges are addressed by unit minor; plain disks
			   by (unit, RAW_PART) */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only probe wedges whose
				   partition type is explicitly RAIDframe. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the vnode. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3009 
   3010 
   3011 int
   3012 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3013 {
   3014 
   3015 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3016 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3017 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3018 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3019 	    clabel->row >=0 &&
   3020 	    clabel->column >= 0 &&
   3021 	    clabel->num_rows > 0 &&
   3022 	    clabel->num_columns > 0 &&
   3023 	    clabel->row < clabel->num_rows &&
   3024 	    clabel->column < clabel->num_columns &&
   3025 	    clabel->blockSize > 0 &&
   3026 	    /*
   3027 	     * numBlocksHi may contain garbage, but it is ok since
   3028 	     * the type is unsigned.  If it is really garbage,
   3029 	     * rf_fix_old_label_size() will fix it.
   3030 	     */
   3031 	    rf_component_label_numblocks(clabel) > 0) {
   3032 		/*
   3033 		 * label looks reasonable enough...
   3034 		 * let's make sure it has no old garbage.
   3035 		 */
   3036 		if (numsecs)
   3037 			rf_fix_old_label_size(clabel, numsecs);
   3038 		return(1);
   3039 	}
   3040 	return(0);
   3041 }
   3042 
   3043 
   3044 /*
   3045  * For reasons yet unknown, some old component labels have garbage in
   3046  * the newer numBlocksHi region, and this causes lossage.  Since those
   3047  * disks will also have numsecs set to less than 32 bits of sectors,
   3048  * we can determine when this corruption has occurred, and fix it.
   3049  *
   3050  * The exact same problem, with the same unknown reason, happens to
   3051  * the partitionSizeHi member as well.
   3052  */
   3053 static void
   3054 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3055 {
   3056 
   3057 	if (numsecs < ((uint64_t)1 << 32)) {
   3058 		if (clabel->numBlocksHi) {
   3059 			printf("WARNING: total sectors < 32 bits, yet "
   3060 			       "numBlocksHi set\n"
   3061 			       "WARNING: resetting numBlocksHi to zero.\n");
   3062 			clabel->numBlocksHi = 0;
   3063 		}
   3064 
   3065 		if (clabel->partitionSizeHi) {
   3066 			printf("WARNING: total sectors < 32 bits, yet "
   3067 			       "partitionSizeHi set\n"
   3068 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3069 			clabel->partitionSizeHi = 0;
   3070 		}
   3071 	}
   3072 }
   3073 
   3074 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to the
 * console in human-readable form.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is printed via this table, masked to 2 bits. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3108 
   3109 RF_ConfigSet_t *
   3110 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3111 {
   3112 	RF_AutoConfig_t *ac;
   3113 	RF_ConfigSet_t *config_sets;
   3114 	RF_ConfigSet_t *cset;
   3115 	RF_AutoConfig_t *ac_next;
   3116 
   3117 
   3118 	config_sets = NULL;
   3119 
   3120 	/* Go through the AutoConfig list, and figure out which components
   3121 	   belong to what sets.  */
   3122 	ac = ac_list;
   3123 	while(ac!=NULL) {
   3124 		/* we're going to putz with ac->next, so save it here
   3125 		   for use at the end of the loop */
   3126 		ac_next = ac->next;
   3127 
   3128 		if (config_sets == NULL) {
   3129 			/* will need at least this one... */
   3130 			config_sets = (RF_ConfigSet_t *)
   3131 				malloc(sizeof(RF_ConfigSet_t),
   3132 				       M_RAIDFRAME, M_NOWAIT);
   3133 			if (config_sets == NULL) {
   3134 				panic("rf_create_auto_sets: No memory!");
   3135 			}
   3136 			/* this one is easy :) */
   3137 			config_sets->ac = ac;
   3138 			config_sets->next = NULL;
   3139 			config_sets->rootable = 0;
   3140 			ac->next = NULL;
   3141 		} else {
   3142 			/* which set does this component fit into? */
   3143 			cset = config_sets;
   3144 			while(cset!=NULL) {
   3145 				if (rf_does_it_fit(cset, ac)) {
   3146 					/* looks like it matches... */
   3147 					ac->next = cset->ac;
   3148 					cset->ac = ac;
   3149 					break;
   3150 				}
   3151 				cset = cset->next;
   3152 			}
   3153 			if (cset==NULL) {
   3154 				/* didn't find a match above... new set..*/
   3155 				cset = (RF_ConfigSet_t *)
   3156 					malloc(sizeof(RF_ConfigSet_t),
   3157 					       M_RAIDFRAME, M_NOWAIT);
   3158 				if (cset == NULL) {
   3159 					panic("rf_create_auto_sets: No memory!");
   3160 				}
   3161 				cset->ac = ac;
   3162 				ac->next = NULL;
   3163 				cset->next = config_sets;
   3164 				cset->rootable = 0;
   3165 				config_sets = cset;
   3166 			}
   3167 		}
   3168 		ac = ac_next;
   3169 	}
   3170 
   3171 
   3172 	return(config_sets);
   3173 }
   3174 
   3175 static int
   3176 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3177 {
   3178 	RF_ComponentLabel_t *clabel1, *clabel2;
   3179 
   3180 	/* If this one matches the *first* one in the set, that's good
   3181 	   enough, since the other members of the set would have been
   3182 	   through here too... */
   3183 	/* note that we are not checking partitionSize here..
   3184 
   3185 	   Note that we are also not checking the mod_counters here.
   3186 	   If everything else matches except the mod_counter, that's
   3187 	   good enough for this test.  We will deal with the mod_counters
   3188 	   a little later in the autoconfiguration process.
   3189 
   3190 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3191 
   3192 	   The reason we don't check for this is that failed disks
   3193 	   will have lower modification counts.  If those disks are
   3194 	   not added to the set they used to belong to, then they will
   3195 	   form their own set, which may result in 2 different sets,
   3196 	   for example, competing to be configured at raid0, and
   3197 	   perhaps competing to be the root filesystem set.  If the
   3198 	   wrong ones get configured, or both attempt to become /,
   3199 	   weird behaviour and or serious lossage will occur.  Thus we
   3200 	   need to bring them into the fold here, and kick them out at
   3201 	   a later point.
   3202 
   3203 	*/
   3204 
   3205 	clabel1 = cset->ac->clabel;
   3206 	clabel2 = ac->clabel;
   3207 	if ((clabel1->version == clabel2->version) &&
   3208 	    (clabel1->serial_number == clabel2->serial_number) &&
   3209 	    (clabel1->num_rows == clabel2->num_rows) &&
   3210 	    (clabel1->num_columns == clabel2->num_columns) &&
   3211 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3212 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3213 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3214 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3215 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3216 	    (clabel1->blockSize == clabel2->blockSize) &&
   3217 	    rf_component_label_numblocks(clabel1) ==
   3218 	    rf_component_label_numblocks(clabel2) &&
   3219 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3220 	    (clabel1->root_partition == clabel2->root_partition) &&
   3221 	    (clabel1->last_unit == clabel2->last_unit) &&
   3222 	    (clabel1->config_order == clabel2->config_order)) {
   3223 		/* if it get's here, it almost *has* to be a match */
   3224 	} else {
   3225 		/* it's not consistent with somebody in the set..
   3226 		   punt */
   3227 		return(0);
   3228 	}
   3229 	/* all was fine.. it must fit... */
   3230 	return(1);
   3231 }
   3232 
/*
 * Check whether a configuration set has enough live components to be
 * configured.  A component counts as "live" here only if its
 * mod_counter matches the highest mod_counter seen in the set (stale
 * components are treated as missing).  Returns 1 if the set can be
 * configured, 0 if too many components are missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The authoritative mod_counter is the maximum over all
	   components in the set. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a component at that column with an
	   up-to-date mod_counter; account for the ones we can't find. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 1 was fully handled above; for the striped/parity levels
	   apply the per-level tolerance for missing components. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3335 
/*
 * Build an RF_Config_t for a set of auto-discovered components, using
 * the first component's label as the template for the common
 * parameters and filling in one device name per column.  Note that
 * "raidPtr" is not referenced by this function.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Record each component's device name at its column. */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. No debug variables for autoconfigured sets. */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3370 
   3371 int
   3372 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3373 {
   3374 	RF_ComponentLabel_t *clabel;
   3375 	int column;
   3376 	int sparecol;
   3377 
   3378 	raidPtr->autoconfigure = new_value;
   3379 
   3380 	for(column=0; column<raidPtr->numCol; column++) {
   3381 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3382 			clabel = raidget_component_label(raidPtr, column);
   3383 			clabel->autoconfigure = new_value;
   3384 			raidflush_component_label(raidPtr, column);
   3385 		}
   3386 	}
   3387 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3388 		sparecol = raidPtr->numCol + column;
   3389 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3390 			clabel = raidget_component_label(raidPtr, sparecol);
   3391 			clabel->autoconfigure = new_value;
   3392 			raidflush_component_label(raidPtr, sparecol);
   3393 		}
   3394 	}
   3395 	return(new_value);
   3396 }
   3397 
   3398 int
   3399 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3400 {
   3401 	RF_ComponentLabel_t *clabel;
   3402 	int column;
   3403 	int sparecol;
   3404 
   3405 	raidPtr->root_partition = new_value;
   3406 	for(column=0; column<raidPtr->numCol; column++) {
   3407 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3408 			clabel = raidget_component_label(raidPtr, column);
   3409 			clabel->root_partition = new_value;
   3410 			raidflush_component_label(raidPtr, column);
   3411 		}
   3412 	}
   3413 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3414 		sparecol = raidPtr->numCol + column;
   3415 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3416 			clabel = raidget_component_label(raidPtr, sparecol);
   3417 			clabel->root_partition = new_value;
   3418 			raidflush_component_label(raidPtr, sparecol);
   3419 		}
   3420 	}
   3421 	return(new_value);
   3422 }
   3423 
   3424 void
   3425 rf_release_all_vps(RF_ConfigSet_t *cset)
   3426 {
   3427 	RF_AutoConfig_t *ac;
   3428 
   3429 	ac = cset->ac;
   3430 	while(ac!=NULL) {
   3431 		/* Close the vp, and give it back */
   3432 		if (ac->vp) {
   3433 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3434 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3435 			vput(ac->vp);
   3436 			ac->vp = NULL;
   3437 		}
   3438 		ac = ac->next;
   3439 	}
   3440 }
   3441 
   3442 
   3443 void
   3444 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3445 {
   3446 	RF_AutoConfig_t *ac;
   3447 	RF_AutoConfig_t *next_ac;
   3448 
   3449 	ac = cset->ac;
   3450 	while(ac!=NULL) {
   3451 		next_ac = ac->next;
   3452 		/* nuke the label */
   3453 		free(ac->clabel, M_RAIDFRAME);
   3454 		/* cleanup the config structure */
   3455 		free(ac, M_RAIDFRAME);
   3456 		/* "next.." */
   3457 		ac = next_ac;
   3458 	}
   3459 	/* and, finally, nuke the config set */
   3460 	free(cset, M_RAIDFRAME);
   3461 }
   3462 
   3463 
   3464 void
   3465 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3466 {
   3467 	/* current version number */
   3468 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3469 	clabel->serial_number = raidPtr->serial_number;
   3470 	clabel->mod_counter = raidPtr->mod_counter;
   3471 
   3472 	clabel->num_rows = 1;
   3473 	clabel->num_columns = raidPtr->numCol;
   3474 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3475 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3476 
   3477 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3478 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3479 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3480 
   3481 	clabel->blockSize = raidPtr->bytesPerSector;
   3482 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3483 
   3484 	/* XXX not portable */
   3485 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3486 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3487 	clabel->autoconfigure = raidPtr->autoconfigure;
   3488 	clabel->root_partition = raidPtr->root_partition;
   3489 	clabel->last_unit = raidPtr->raidid;
   3490 	clabel->config_order = raidPtr->config_order;
   3491 
   3492 #ifndef RF_NO_PARITY_MAP
   3493 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3494 #endif
   3495 }
   3496 
/*
 * Autoconfigure one configuration set: build an RF_Config_t from the
 * set's components, find a free raid unit (preferring the unit the set
 * was last configured as), and configure it via rf_Configure().
 * Returns the configured softc, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from last_unit until we find a unit that is
	   either unallocated (sc == NULL) or allocated but not valid. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* Unit not allocated yet: allocate it now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the unit we grabbed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3580 
/*
 * Initialize a RAIDframe memory pool: prime it with "xmin" elements
 * (also the low-water mark) and cap it at "xmax" elements.  IPL_BIO is
 * used since these pools are accessed from the block-I/O path.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3590 
   3591 /*
   3592  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3593  * to see if there is IO pending and if that IO could possibly be done
   3594  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3595  * otherwise.
   3596  *
   3597  */
   3598 int
   3599 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3600 {
   3601 	struct raid_softc *rs;
   3602 	struct dk_softc *dksc;
   3603 
   3604 	rs = raidPtr->softc;
   3605 	dksc = &rs->sc_dksc;
   3606 
   3607 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3608 		return 1;
   3609 
   3610 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3611 		/* there is work to do */
   3612 		return 0;
   3613 	}
   3614 	/* default is nothing to do */
   3615 	return 1;
   3616 }
   3617 
   3618 int
   3619 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3620 {
   3621 	uint64_t numsecs;
   3622 	unsigned secsize;
   3623 	int error;
   3624 
   3625 	error = getdisksize(vp, &numsecs, &secsize);
   3626 	if (error == 0) {
   3627 		diskPtr->blockSize = secsize;
   3628 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3629 		diskPtr->partitionSize = numsecs;
   3630 		return 0;
   3631 	}
   3632 	return error;
   3633 }
   3634 
/*
 * Autoconfiguration match hook: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3640 
/*
 * Autoconfiguration attach hook: nothing to do here; per-set setup is
 * performed elsewhere when a RAID set is actually configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3645 
   3646 
   3647 static int
   3648 raid_detach(device_t self, int flags)
   3649 {
   3650 	int error;
   3651 	struct raid_softc *rs = raidsoftc(self);
   3652 
   3653 	if (rs == NULL)
   3654 		return ENXIO;
   3655 
   3656 	if ((error = raidlock(rs)) != 0)
   3657 		return (error);
   3658 
   3659 	error = raid_detach_unlocked(rs);
   3660 
   3661 	raidunlock(rs);
   3662 
   3663 	/* XXX raid can be referenced here */
   3664 
   3665 	if (error)
   3666 		return error;
   3667 
   3668 	/* Free the softc */
   3669 	raidput(rs);
   3670 
   3671 	return 0;
   3672 }
   3673 
/*
 * Publish a synthesized disk geometry for the RAID set to the disk(9)
 * layer: total sectors, sector size, one "track" per data stripe, and
 * an arbitrary 4 tracks per column.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* 4 is an arbitrary fiction; RAID sets have no real tracks. */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3689 
   3690 /*
   3691  * Get cache info for all the components (including spares).
   3692  * Returns intersection of all the cache flags of all disks, or first
   3693  * error if any encountered.
   3694  * XXXfua feature flags can change as spares are added - lock down somehow
   3695  */
   3696 static int
   3697 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3698 {
   3699 	int c;
   3700 	int error;
   3701 	int dkwhole = 0, dkpart;
   3702 
   3703 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3704 		/*
   3705 		 * Check any non-dead disk, even when currently being
   3706 		 * reconstructed.
   3707 		 */
   3708 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3709 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3710 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3711 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3712 			if (error) {
   3713 				if (error != ENODEV) {
   3714 					printf("raid%d: get cache for component %s failed\n",
   3715 					    raidPtr->raidid,
   3716 					    raidPtr->Disks[c].devname);
   3717 				}
   3718 
   3719 				return error;
   3720 			}
   3721 
   3722 			if (c == 0)
   3723 				dkwhole = dkpart;
   3724 			else
   3725 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3726 		}
   3727 	}
   3728 
   3729 	*data = dkwhole;
   3730 
   3731 	return 0;
   3732 }
   3733 
   3734 /*
   3735  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3736  * We end up returning whatever error was returned by the first cache flush
   3737  * that fails.
   3738  */
   3739 
   3740 int
   3741 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3742 {
   3743 	int c, sparecol;
   3744 	int e,error;
   3745 	int force = 1;
   3746 
   3747 	error = 0;
   3748 	for (c = 0; c < raidPtr->numCol; c++) {
   3749 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3750 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3751 					  &force, FWRITE, NOCRED);
   3752 			if (e) {
   3753 				if (e != ENODEV)
   3754 					printf("raid%d: cache flush to component %s failed.\n",
   3755 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3756 				if (error == 0) {
   3757 					error = e;
   3758 				}
   3759 			}
   3760 		}
   3761 	}
   3762 
   3763 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3764 		sparecol = raidPtr->numCol + c;
   3765 		/* Need to ensure that the reconstruct actually completed! */
   3766 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3767 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3768 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3769 			if (e) {
   3770 				if (e != ENODEV)
   3771 					printf("raid%d: cache flush to component %s failed.\n",
   3772 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3773 				if (error == 0) {
   3774 					error = e;
   3775 				}
   3776 			}
   3777 		}
   3778 	}
   3779 	return error;
   3780 }
   3781 
   3782 /*
   3783  * Module interface
   3784  */
   3785 
   3786 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3787 
   3788 #ifdef _MODULE
   3789 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3790 #endif
   3791 
   3792 static int raid_modcmd(modcmd_t, void *);
   3793 static int raid_modcmd_init(void);
   3794 static int raid_modcmd_fini(void);
   3795 
   3796 static int
   3797 raid_modcmd(modcmd_t cmd, void *data)
   3798 {
   3799 	int error;
   3800 
   3801 	error = 0;
   3802 	switch (cmd) {
   3803 	case MODULE_CMD_INIT:
   3804 		error = raid_modcmd_init();
   3805 		break;
   3806 	case MODULE_CMD_FINI:
   3807 		error = raid_modcmd_fini();
   3808 		break;
   3809 	default:
   3810 		error = ENOTTY;
   3811 		break;
   3812 	}
   3813 	return error;
   3814 }
   3815 
/*
 * Module initialization: attach the devsw, cfdriver and cfattach glue
 * (unwinding on partial failure), boot the RAIDframe core, and register
 * a finalizer that auto-configures RAID sets once all real hardware has
 * been found.  Returns 0 on success or the error from an attach routine.
 */
static int
raid_modcmd_init(void)
{
	int error;
#ifdef _MODULE
	int bmajor, cmajor;
#endif

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table requests. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

#ifdef _MODULE
	/* -1 asks devsw_attach() to allocate the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		/*
		 * NOTE(review): this devsw_detach() is compiled in even
		 * when !_MODULE, where no matching devsw_attach() was
		 * performed above -- confirm this unwind is intended for
		 * built-in kernels.
		 */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is always 0 here: every failure path returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: RAID sets just won't auto-configure. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3888 
/*
 * Module finalization: refuse to unload while any raid unit exists;
 * otherwise detach the autoconf/devsw glue -- re-attaching what was
 * already detached if a later step fails -- and shut the RAIDframe
 * core down.  Returns 0 on success, EBUSY or a detach error otherwise.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back the cfattach detach done above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detaches. */
		config_cfdriver_attach(&raid_cd);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	/* All glue detached; shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3936