Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.316.2.7
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.316.2.7 2016/10/05 20:55:56 skrll Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.316.2.7 2016/10/05 20:55:56 skrll Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #ifdef DEBUG_ROOT
    165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    166 #else
    167 #define DPRINTF(a, ...)
    168 #endif
    169 
    170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    171 static rf_declare_mutex2(rf_sparet_wait_mutex);
    172 static rf_declare_cond2(rf_sparet_wait_cv);
    173 static rf_declare_cond2(rf_sparet_resp_cv);
    174 
    175 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    176 						 * spare table */
    177 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    178 						 * installation process */
    179 #endif
    180 
    181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    182 
    183 /* prototypes */
    184 static void KernelWakeupFunc(struct buf *);
    185 static void InitBP(struct buf *, struct vnode *, unsigned,
    186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    187     void *, int, struct proc *);
    188 struct raid_softc;
    189 static void raidinit(struct raid_softc *);
    190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    191 
    192 static int raid_match(device_t, cfdata_t, void *);
    193 static void raid_attach(device_t, device_t, void *);
    194 static int raid_detach(device_t, int);
    195 
    196 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    197     daddr_t, daddr_t);
    198 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    199     daddr_t, daddr_t, int);
    200 
    201 static int raidwrite_component_label(unsigned,
    202     dev_t, struct vnode *, RF_ComponentLabel_t *);
    203 static int raidread_component_label(unsigned,
    204     dev_t, struct vnode *, RF_ComponentLabel_t *);
    205 
    206 static int raid_diskstart(device_t, struct buf *bp);
    207 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    208 static int raid_lastclose(device_t);
    209 
    210 static dev_type_open(raidopen);
    211 static dev_type_close(raidclose);
    212 static dev_type_read(raidread);
    213 static dev_type_write(raidwrite);
    214 static dev_type_ioctl(raidioctl);
    215 static dev_type_strategy(raidstrategy);
    216 static dev_type_dump(raiddump);
    217 static dev_type_size(raidsize);
    218 
/* Block-device switch: entry points for the raid block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    229 
/* Character-device switch: raw I/O entry points for the raid device. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    244 
/* Hooks handed to the common disk(9) framework (dk_softc) for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    254 
/*
 * Per-unit software state.  One of these exists for every raid unit that
 * has been opened or auto-configured; they are linked on the global
 * 'raids' list under raid_lock.
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state; must be first */
	int	sc_unit;		/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* the RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    277 
    278 extern struct cfdriver raid_cd;
    279 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    280     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    281     DVF_DETACH_SHUTDOWN);
    282 
    283 /*
    284  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    285  * Be aware that large numbers can allow the driver to consume a lot of
    286  * kernel memory, especially on writes, and in degraded mode reads.
    287  *
    288  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    289  * a single 64K write will typically require 64K for the old data,
    290  * 64K for the old parity, and 64K for the new parity, for a total
    291  * of 192K (if the parity buffer is not re-used immediately).
    292  * Even it if is used immediately, that's still 128K, which when multiplied
    293  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    294  *
    295  * Now in degraded mode, for example, a 64K read on the above setup may
    296  * require data reconstruction, which will require *all* of the 4 remaining
    297  * disks to participate -- 4 * 32K/disk == 128K again.
    298  */
    299 
    300 #ifndef RAIDOUTSTANDING
    301 #define RAIDOUTSTANDING   6
    302 #endif
    303 
    304 #define RAIDLABELDEV(dev)	\
    305 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    306 
    307 /* declared here, and made public, for the benefit of KVM stuff.. */
    308 
    309 static int raidlock(struct raid_softc *);
    310 static void raidunlock(struct raid_softc *);
    311 
    312 static int raid_detach_unlocked(struct raid_softc *);
    313 
    314 static void rf_markalldirty(RF_Raid_t *);
    315 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    316 
    317 void rf_ReconThread(struct rf_recon_req *);
    318 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    319 void rf_CopybackThread(RF_Raid_t *raidPtr);
    320 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    321 int rf_autoconfig(device_t);
    322 void rf_buildroothack(RF_ConfigSet_t *);
    323 
    324 RF_AutoConfig_t *rf_find_raid_components(void);
    325 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    326 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    327 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    328 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    329 int rf_set_autoconfig(RF_Raid_t *, int);
    330 int rf_set_rootpartition(RF_Raid_t *, int);
    331 void rf_release_all_vps(RF_ConfigSet_t *);
    332 void rf_cleanup_config_set(RF_ConfigSet_t *);
    333 int rf_have_enough_components(RF_ConfigSet_t *);
    334 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    335 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    336 
    337 /*
    338  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    339  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    340  * in the kernel config file.
    341  */
    342 #ifdef RAID_AUTOCONFIG
    343 int raidautoconfig = 1;
    344 #else
    345 int raidautoconfig = 0;
    346 #endif
    347 static bool raidautoconfigdone = false;
    348 
    349 struct RF_Pools_s rf_pools;
    350 
    351 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    352 static kmutex_t raid_lock;
    353 
    354 static struct raid_softc *
    355 raidcreate(int unit) {
    356 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    357 	if (sc == NULL) {
    358 #ifdef DIAGNOSTIC
    359 		printf("%s: out of memory\n", __func__);
    360 #endif
    361 		return NULL;
    362 	}
    363 	sc->sc_unit = unit;
    364 	cv_init(&sc->sc_cv, "raidunit");
    365 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    366 	return sc;
    367 }
    368 
    369 static void
    370 raiddestroy(struct raid_softc *sc) {
    371 	cv_destroy(&sc->sc_cv);
    372 	mutex_destroy(&sc->sc_mutex);
    373 	kmem_free(sc, sizeof(*sc));
    374 }
    375 
    376 static struct raid_softc *
    377 raidget(int unit, bool create) {
    378 	struct raid_softc *sc;
    379 	if (unit < 0) {
    380 #ifdef DIAGNOSTIC
    381 		panic("%s: unit %d!", __func__, unit);
    382 #endif
    383 		return NULL;
    384 	}
    385 	mutex_enter(&raid_lock);
    386 	LIST_FOREACH(sc, &raids, sc_link) {
    387 		if (sc->sc_unit == unit) {
    388 			mutex_exit(&raid_lock);
    389 			return sc;
    390 		}
    391 	}
    392 	mutex_exit(&raid_lock);
    393 	if (!create)
    394 		return NULL;
    395 	if ((sc = raidcreate(unit)) == NULL)
    396 		return NULL;
    397 	mutex_enter(&raid_lock);
    398 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    399 	mutex_exit(&raid_lock);
    400 	return sc;
    401 }
    402 
/*
 * Unlink a softc from the global list and destroy it.  The caller must
 * guarantee no other references to the softc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    410 
/* Legacy pseudo-device attach hook; intentionally empty (see below). */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    420 
    421 int
    422 rf_autoconfig(device_t self)
    423 {
    424 	RF_AutoConfig_t *ac_list;
    425 	RF_ConfigSet_t *config_sets;
    426 
    427 	if (!raidautoconfig || raidautoconfigdone == true)
    428 		return (0);
    429 
    430 	/* XXX This code can only be run once. */
    431 	raidautoconfigdone = true;
    432 
    433 #ifdef __HAVE_CPU_BOOTCONF
    434 	/*
    435 	 * 0. find the boot device if needed first so we can use it later
    436 	 * this needs to be done before we autoconfigure any raid sets,
    437 	 * because if we use wedges we are not going to be able to open
    438 	 * the boot device later
    439 	 */
    440 	if (booted_device == NULL)
    441 		cpu_bootconf();
    442 #endif
    443 	/* 1. locate all RAID components on the system */
    444 	aprint_debug("Searching for RAID components...\n");
    445 	ac_list = rf_find_raid_components();
    446 
    447 	/* 2. Sort them into their respective sets. */
    448 	config_sets = rf_create_auto_sets(ac_list);
    449 
    450 	/*
    451 	 * 3. Evaluate each set and configure the valid ones.
    452 	 * This gets done in rf_buildroothack().
    453 	 */
    454 	rf_buildroothack(config_sets);
    455 
    456 	return 1;
    457 }
    458 
    459 static int
    460 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    461 	const char *bootname = device_xname(bdv);
    462 	size_t len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
    483 void
    484 rf_buildroothack(RF_ConfigSet_t *config_sets)
    485 {
    486 	RF_ConfigSet_t *cset;
    487 	RF_ConfigSet_t *next_cset;
    488 	int num_root;
    489 	struct raid_softc *sc, *rsc;
    490 	struct dk_softc *dksc;
    491 
    492 	sc = rsc = NULL;
    493 	num_root = 0;
    494 	cset = config_sets;
    495 	while (cset != NULL) {
    496 		next_cset = cset->next;
    497 		if (rf_have_enough_components(cset) &&
    498 		    cset->ac->clabel->autoconfigure == 1) {
    499 			sc = rf_auto_config_set(cset);
    500 			if (sc != NULL) {
    501 				aprint_debug("raid%d: configured ok\n",
    502 				    sc->sc_unit);
    503 				if (cset->rootable) {
    504 					rsc = sc;
    505 					num_root++;
    506 				}
    507 			} else {
    508 				/* The autoconfig didn't work :( */
    509 				aprint_debug("Autoconfig failed\n");
    510 				rf_release_all_vps(cset);
    511 			}
    512 		} else {
    513 			/* we're not autoconfiguring this set...
    514 			   release the associated resources */
    515 			rf_release_all_vps(cset);
    516 		}
    517 		/* cleanup */
    518 		rf_cleanup_config_set(cset);
    519 		cset = next_cset;
    520 	}
    521 	dksc = &rsc->sc_dksc;
    522 
    523 	/* if the user has specified what the root device should be
    524 	   then we don't touch booted_device or boothowto... */
    525 
    526 	if (rootspec != NULL)
    527 		return;
    528 
    529 	/* we found something bootable... */
    530 
    531 	/*
    532 	 * XXX: The following code assumes that the root raid
    533 	 * is the first ('a') partition. This is about the best
    534 	 * we can do with a BSD disklabel, but we might be able
    535 	 * to do better with a GPT label, by setting a specified
    536 	 * attribute to indicate the root partition. We can then
    537 	 * stash the partition number in the r->root_partition
    538 	 * high bits (the bottom 2 bits are already used). For
    539 	 * now we just set booted_partition to 0 when we override
    540 	 * root.
    541 	 */
    542 	if (num_root == 1) {
    543 		device_t candidate_root;
    544 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    545 			char cname[sizeof(cset->ac->devname)];
    546 			/* XXX: assume partition 'a' first */
    547 			snprintf(cname, sizeof(cname), "%s%c",
    548 			    device_xname(dksc->sc_dev), 'a');
    549 			candidate_root = dkwedge_find_by_wname(cname);
    550 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    551 			    cname);
    552 			if (candidate_root == NULL) {
    553 				/*
    554 				 * If that is not found, because we don't use
    555 				 * disklabel, return the first dk child
    556 				 * XXX: we can skip the 'a' check above
    557 				 * and always do this...
    558 				 */
    559 				size_t i = 0;
    560 				candidate_root = dkwedge_find_by_parent(
    561 				    device_xname(dksc->sc_dev), &i);
    562 			}
    563 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    564 			    candidate_root);
    565 		} else
    566 			candidate_root = dksc->sc_dev;
    567 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    568 		DPRINTF("%s: booted_device=%p root_partition=%d "
    569 		   "contains_boot=%d\n", __func__, booted_device,
    570 		   rsc->sc_r.root_partition,
    571 		   rf_containsboot(&rsc->sc_r, booted_device));
    572 		if (booted_device == NULL ||
    573 		    rsc->sc_r.root_partition == 1 ||
    574 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    575 			booted_device = candidate_root;
    576 			booted_partition = 0;	/* XXX assume 'a' */
    577 		}
    578 	} else if (num_root > 1) {
    579 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    580 		    booted_device);
    581 
    582 		/*
    583 		 * Maybe the MD code can help. If it cannot, then
    584 		 * setroot() will discover that we have no
    585 		 * booted_device and will ask the user if nothing was
    586 		 * hardwired in the kernel config file
    587 		 */
    588 		if (booted_device == NULL)
    589 			return;
    590 
    591 		num_root = 0;
    592 		mutex_enter(&raid_lock);
    593 		LIST_FOREACH(sc, &raids, sc_link) {
    594 			RF_Raid_t *r = &sc->sc_r;
    595 			if (r->valid == 0)
    596 				continue;
    597 
    598 			if (r->root_partition == 0)
    599 				continue;
    600 
    601 			if (rf_containsboot(r, booted_device)) {
    602 				num_root++;
    603 				rsc = sc;
    604 				dksc = &rsc->sc_dksc;
    605 			}
    606 		}
    607 		mutex_exit(&raid_lock);
    608 
    609 		if (num_root == 1) {
    610 			booted_device = dksc->sc_dev;
    611 			booted_partition = 0;	/* XXX assume 'a' */
    612 		} else {
    613 			/* we can't guess.. require the user to answer... */
    614 			boothowto |= RB_ASKNAME;
    615 		}
    616 	}
    617 }
    618 
    619 static int
    620 raidsize(dev_t dev)
    621 {
    622 	struct raid_softc *rs;
    623 	struct dk_softc *dksc;
    624 	unsigned int unit;
    625 
    626 	unit = raidunit(dev);
    627 	if ((rs = raidget(unit, false)) == NULL)
    628 		return -1;
    629 	dksc = &rs->sc_dksc;
    630 
    631 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    632 		return -1;
    633 
    634 	return dk_size(dksc, dev);
    635 }
    636 
    637 static int
    638 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    639 {
    640 	unsigned int unit;
    641 	struct raid_softc *rs;
    642 	struct dk_softc *dksc;
    643 
    644 	unit = raidunit(dev);
    645 	if ((rs = raidget(unit, false)) == NULL)
    646 		return ENXIO;
    647 	dksc = &rs->sc_dksc;
    648 
    649 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    650 		return ENODEV;
    651 
    652         /*
    653            Note that blkno is relative to this particular partition.
    654            By adding adding RF_PROTECTED_SECTORS, we get a value that
    655 	   is relative to the partition used for the underlying component.
    656         */
    657 	blkno += RF_PROTECTED_SECTORS;
    658 
    659 	return dk_dump(dksc, dev, blkno, va, size);
    660 }
    661 
/*
 * d_dumpblocks hook: write dump data directly to one live component of a
 * RAID 1 set.  Only RAID 1 (1 data + 1 parity column) is supported,
 * since only there does a single component hold a complete copy.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		/* component's block device driver is not available */
		error = ENXIO;
		goto out;
	}

	/* hand the dump straight to the chosen component's driver */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    767 
/*
 * d_open entry point.  Creates the softc on first reference, marks all
 * components dirty on the first open of a configured unit, then defers
 * to the common disk layer.
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	/* may create a fresh (not yet configured) softc for this unit */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is shutting down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	/* only a configured unit can actually be opened */
	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    817 
    818 static int
    819 raid_lastclose(device_t self)
    820 {
    821 	struct raid_softc *rs = raidsoftc(self);
    822 
    823 	/* Last one... device is not unconfigured yet.
    824 	   Device shutdown has taken care of setting the
    825 	   clean bits if RAIDF_INITED is not set
    826 	   mark things as clean... */
    827 
    828 	rf_update_component_labels(&rs->sc_r,
    829 	    RF_FINAL_COMPONENT_UPDATE);
    830 
    831 	/* pass to unlocked code */
    832 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    833 		rs->sc_flags |= RAIDF_DETACH;
    834 
    835 	return 0;
    836 }
    837 
/*
 * d_close entry point.  Closes via the common disk layer and, when
 * flagged, detaches the pseudo-device or frees the softc afterwards
 * (both done outside the unit lock).
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have requested a detach */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    877 
/*
 * Poke the RAIDframe I/O thread: signal iodone_cv (under iodone_lock)
 * so queued work is picked up at the next convenient time.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    885 
    886 static void
    887 raidstrategy(struct buf *bp)
    888 {
    889 	unsigned int unit;
    890 	struct raid_softc *rs;
    891 	struct dk_softc *dksc;
    892 	RF_Raid_t *raidPtr;
    893 
    894 	unit = raidunit(bp->b_dev);
    895 	if ((rs = raidget(unit, false)) == NULL) {
    896 		bp->b_error = ENXIO;
    897 		goto fail;
    898 	}
    899 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    900 		bp->b_error = ENXIO;
    901 		goto fail;
    902 	}
    903 	dksc = &rs->sc_dksc;
    904 	raidPtr = &rs->sc_r;
    905 
    906 	/* Queue IO only */
    907 	if (dk_strategy_defer(dksc, bp))
    908 		goto done;
    909 
    910 	/* schedule the IO to happen at the next convenient time */
    911 	raid_wakeup(raidPtr);
    912 
    913 done:
    914 	return;
    915 
    916 fail:
    917 	bp->b_resid = bp->b_bcount;
    918 	biodone(bp);
    919 }
    920 
    921 static int
    922 raid_diskstart(device_t dev, struct buf *bp)
    923 {
    924 	struct raid_softc *rs = raidsoftc(dev);
    925 	RF_Raid_t *raidPtr;
    926 
    927 	raidPtr = &rs->sc_r;
    928 	if (!raidPtr->valid) {
    929 		db1_printf(("raid is not valid..\n"));
    930 		return ENODEV;
    931 	}
    932 
    933 	/* XXX */
    934 	bp->b_resid = 0;
    935 
    936 	return raiddoaccess(raidPtr, bp);
    937 }
    938 
/*
 * I/O completion callback, invoked by RAIDframe when an access
 * finishes.  Reports completion to dk(4), returns the I/O slot
 * ("opening") consumed by the access, and wakes the I/O machinery
 * so further queued work can proceed.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* openings bounds the number of simultaneous accesses to the
	   set (initialized to RAIDOUTSTANDING at configure time). */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    957 
    958 /* ARGSUSED */
    959 static int
    960 raidread(dev_t dev, struct uio *uio, int flags)
    961 {
    962 	int     unit = raidunit(dev);
    963 	struct raid_softc *rs;
    964 
    965 	if ((rs = raidget(unit, false)) == NULL)
    966 		return ENXIO;
    967 
    968 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    969 		return (ENXIO);
    970 
    971 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    972 
    973 }
    974 
    975 /* ARGSUSED */
    976 static int
    977 raidwrite(dev_t dev, struct uio *uio, int flags)
    978 {
    979 	int     unit = raidunit(dev);
    980 	struct raid_softc *rs;
    981 
    982 	if ((rs = raidget(unit, false)) == NULL)
    983 		return ENXIO;
    984 
    985 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    986 		return (ENXIO);
    987 
    988 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    989 
    990 }
    991 
/*
 * Tear down a configured RAID set: shut down RAIDframe, drain and
 * free the buffer queue, then detach the wedge/disk/dk state.
 * Fails with EBUSY while the device is open or a reconstruction,
 * parity rewrite, or copyback is in progress.
 *
 * NOTE(review): the "unlocked" in the name suggests the caller is
 * expected to hold the softc lock -- confirm at the call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Clear the pending-shutdown request; we are doing it now. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1029 
/*
 * Ioctl entry point for the raid device.  RAIDframe-specific commands
 * (configure, shutdown, fail/rebuild, label manipulation, and the
 * various status queries) are handled here; any command not
 * recognized falls through to dk_ioctl() at the bottom.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		/* rf_config50 translates the old config and leaves the
		   kernel copy in k_cfg; shares the config path below. */
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		/* NOTE(review): the RF_Malloc result is used by copyin
		   without a NULL check -- presumably RF_Malloc sleeps and
		   cannot fail here; confirm against its definition. */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		/* the temporary copy was only needed for validation;
		   return the in-core label instead */
		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* the rewrite runs asynchronously in its own thread */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* not implemented; returns the (zero) retcode */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* the request outlives this ioctl, so heap-allocate it;
		   the recon thread owns and frees it */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* spares are stored after the data columns in Disks[] */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/* NOTE(review): unlike the other thread-creating cases,
		   retcode from RF_CREATE_THREAD is discarded here and 0 is
		   returned unconditionally -- confirm this is intentional. */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		/* only allowed when every component is optimal */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): 'rf_broadcast_conf2' looks like a typo for
		   rf_broadcast_cond2; harmless while compiled out by #if 0,
		   but would not build if re-enabled. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1802 
   1803 
   1804 /* raidinit -- complete the rest of the initialization for the
   1805    RAIDframe device.  */
   1806 
   1807 
/*
 * Complete device initialization after a successful rf_Configure():
 * attach the pseudo-device, initialize the dk(4)/disk(9) state,
 * allocate the buffer queue, mark the unit usable, and kick off
 * wedge discovery.
 *
 * NOTE(review): returns void; if config_attach_pseudo() fails the
 * unit is left without RAIDF_INITED set and the caller is not told.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* snprintf bounds the copy; overly long names are truncated. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1863 
   1864 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1865 /* wake up the daemon & tell it to get us a spare table
   1866  * XXX
   1867  * the entries in the queues should be tagged with the raidPtr
   1868  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1870  * XXX
   1871  *
   1872  * XXX This code is not currently used. GO
   1873  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/*
	 * Queue the request for the spare-table daemon and wake any
	 * waiters, then sleep until a response appears on the response
	 * queue.
	 */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/*
	 * NOTE(review): this takes whichever response is first on the
	 * queue; with two concurrent recons it may not correspond to
	 * this request (see the XXX above about tagging entries).
	 */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the installation status travels in the "fcol" field */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1897 #endif
   1898 
   1899 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1900  * bp & passes it down.
   1901  * any calls originating in the kernel must use non-blocking I/O
   1902  * do some extra sanity checking to return "appropriate" error values for
   1903  * certain conditions (to make some standard utilities work)
   1904  *
   1905  * Formerly known as: rf_DoAccessKernel
   1906  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the raid mutex while updating labels, presumably
		 * because the update must run unlocked -- TODO confirm */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to issue I/O until raidinit() has completed */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand the queued buffers to the dk(4) layer for dispatch */
	dk_start(dksc, NULL);
}
   1933 
/* Validate a buf against the array geometry and launch it through
 * rf_DoAccess() as a non-blocking I/O.  Returns EAGAIN when no
 * openings are available, ENOSPC on a range/alignment violation. */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* no openings -> the array cannot take another I/O right now */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* sector count of the transfer; pb flags a partial last sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" makes this condition always true,
	 * so the debug printf is compiled in unconditionally */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject transfers past the end of the array; the extra sum
	 * comparisons catch wrap-around in the addition above */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject transfers that are not a whole number of sectors */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening; KernelWakeupFunc/iodone path returns it */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2006 
   2007 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2008 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* (double parens are harmless: the inner pair is just a
		 * parenthesized string expression) */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete immediately, going straight to the callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device; completion
		 * comes back through KernelWakeupFunc */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private at dispatch time */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* everything below runs under the array's iodone lock */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2151 
   2152 
   2153 /*
   2154  * initialize a buf structure for doing an I/O in the kernel.
   2155  */
   2156 static void
   2157 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2158        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2159        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2160        struct proc *b_proc)
   2161 {
   2162 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2163 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2164 	bp->b_oflags = 0;
   2165 	bp->b_cflags = 0;
   2166 	bp->b_bcount = numSect << logBytesPerSector;
   2167 	bp->b_bufsize = bp->b_bcount;
   2168 	bp->b_error = 0;
   2169 	bp->b_dev = dev;
   2170 	bp->b_data = bf;
   2171 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2172 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2173 	if (bp->b_bcount == 0) {
   2174 		panic("bp->b_bcount is zero in InitBP!!");
   2175 	}
   2176 	bp->b_proc = b_proc;
   2177 	bp->b_iodone = cbFunc;
   2178 	bp->b_private = cbArg;
   2179 }
   2180 
   2181 /*
   2182  * Wait interruptibly for an exclusive lock.
   2183  *
   2184  * XXX
   2185  * Several drivers do this; it should be abstracted and made MP-safe.
   2186  * (Hmm... where have we seen this warning before :->  GO )
   2187  */
   2188 static int
   2189 raidlock(struct raid_softc *rs)
   2190 {
   2191 	int     error;
   2192 
   2193 	error = 0;
   2194 	mutex_enter(&rs->sc_mutex);
   2195 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2196 		rs->sc_flags |= RAIDF_WANTED;
   2197 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2198 		if (error != 0)
   2199 			goto done;
   2200 	}
   2201 	rs->sc_flags |= RAIDF_LOCKED;
   2202 done:
   2203 	mutex_exit(&rs->sc_mutex);
   2204 	return (error);
   2205 }
   2206 /*
   2207  * Unlock and wake up any waiters.
   2208  */
   2209 static void
   2210 raidunlock(struct raid_softc *rs)
   2211 {
   2212 
   2213 	mutex_enter(&rs->sc_mutex);
   2214 	rs->sc_flags &= ~RAIDF_LOCKED;
   2215 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2216 		rs->sc_flags &= ~RAIDF_WANTED;
   2217 		cv_broadcast(&rs->sc_cv);
   2218 	}
   2219 	mutex_exit(&rs->sc_mutex);
   2220 }
   2221 
   2222 
   2223 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2224 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2225 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2226 
   2227 static daddr_t
   2228 rf_component_info_offset(void)
   2229 {
   2230 
   2231 	return RF_COMPONENT_INFO_OFFSET;
   2232 }
   2233 
   2234 static daddr_t
   2235 rf_component_info_size(unsigned secsize)
   2236 {
   2237 	daddr_t info_size;
   2238 
   2239 	KASSERT(secsize);
   2240 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2241 		info_size = secsize;
   2242 	else
   2243 		info_size = RF_COMPONENT_INFO_SIZE;
   2244 
   2245 	return info_size;
   2246 }
   2247 
   2248 static daddr_t
   2249 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2250 {
   2251 	daddr_t map_offset;
   2252 
   2253 	KASSERT(raidPtr->bytesPerSector);
   2254 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2255 		map_offset = raidPtr->bytesPerSector;
   2256 	else
   2257 		map_offset = RF_COMPONENT_INFO_SIZE;
   2258 	map_offset += rf_component_info_offset();
   2259 
   2260 	return map_offset;
   2261 }
   2262 
   2263 static daddr_t
   2264 rf_parity_map_size(RF_Raid_t *raidPtr)
   2265 {
   2266 	daddr_t map_size;
   2267 
   2268 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2269 		map_size = raidPtr->bytesPerSector;
   2270 	else
   2271 		map_size = RF_PARITY_MAP_SIZE;
   2272 
   2273 	return map_size;
   2274 }
   2275 
   2276 int
   2277 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2278 {
   2279 	RF_ComponentLabel_t *clabel;
   2280 
   2281 	clabel = raidget_component_label(raidPtr, col);
   2282 	clabel->clean = RF_RAID_CLEAN;
   2283 	raidflush_component_label(raidPtr, col);
   2284 	return(0);
   2285 }
   2286 
   2287 
   2288 int
   2289 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2290 {
   2291 	RF_ComponentLabel_t *clabel;
   2292 
   2293 	clabel = raidget_component_label(raidPtr, col);
   2294 	clabel->clean = RF_RAID_DIRTY;
   2295 	raidflush_component_label(raidPtr, col);
   2296 	return(0);
   2297 }
   2298 
   2299 int
   2300 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2301 {
   2302 	KASSERT(raidPtr->bytesPerSector);
   2303 	return raidread_component_label(raidPtr->bytesPerSector,
   2304 	    raidPtr->Disks[col].dev,
   2305 	    raidPtr->raid_cinfo[col].ci_vp,
   2306 	    &raidPtr->raid_cinfo[col].ci_label);
   2307 }
   2308 
   2309 RF_ComponentLabel_t *
   2310 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2311 {
   2312 	return &raidPtr->raid_cinfo[col].ci_label;
   2313 }
   2314 
   2315 int
   2316 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2317 {
   2318 	RF_ComponentLabel_t *label;
   2319 
   2320 	label = &raidPtr->raid_cinfo[col].ci_label;
   2321 	label->mod_counter = raidPtr->mod_counter;
   2322 #ifndef RF_NO_PARITY_MAP
   2323 	label->parity_map_modcount = label->mod_counter;
   2324 #endif
   2325 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2326 	    raidPtr->Disks[col].dev,
   2327 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2328 }
   2329 
   2330 
   2331 static int
   2332 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2333     RF_ComponentLabel_t *clabel)
   2334 {
   2335 	return raidread_component_area(dev, b_vp, clabel,
   2336 	    sizeof(RF_ComponentLabel_t),
   2337 	    rf_component_info_offset(),
   2338 	    rf_component_info_size(secsize));
   2339 }
   2340 
   2341 /* ARGSUSED */
   2342 static int
   2343 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2344     size_t msize, daddr_t offset, daddr_t dsize)
   2345 {
   2346 	struct buf *bp;
   2347 	int error;
   2348 
   2349 	/* XXX should probably ensure that we don't try to do this if
   2350 	   someone has changed rf_protected_sectors. */
   2351 
   2352 	if (b_vp == NULL) {
   2353 		/* For whatever reason, this component is not valid.
   2354 		   Don't try to read a component label from it. */
   2355 		return(EINVAL);
   2356 	}
   2357 
   2358 	/* get a block of the appropriate size... */
   2359 	bp = geteblk((int)dsize);
   2360 	bp->b_dev = dev;
   2361 
   2362 	/* get our ducks in a row for the read */
   2363 	bp->b_blkno = offset / DEV_BSIZE;
   2364 	bp->b_bcount = dsize;
   2365 	bp->b_flags |= B_READ;
   2366  	bp->b_resid = dsize;
   2367 
   2368 	bdev_strategy(bp);
   2369 	error = biowait(bp);
   2370 
   2371 	if (!error) {
   2372 		memcpy(data, bp->b_data, msize);
   2373 	}
   2374 
   2375 	brelse(bp, 0);
   2376 	return(error);
   2377 }
   2378 
   2379 
   2380 static int
   2381 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2382     RF_ComponentLabel_t *clabel)
   2383 {
   2384 	return raidwrite_component_area(dev, b_vp, clabel,
   2385 	    sizeof(RF_ComponentLabel_t),
   2386 	    rf_component_info_offset(),
   2387 	    rf_component_info_size(secsize), 0);
   2388 }
   2389 
   2390 /* ARGSUSED */
   2391 static int
   2392 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2393     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2394 {
   2395 	struct buf *bp;
   2396 	int error;
   2397 
   2398 	/* get a block of the appropriate size... */
   2399 	bp = geteblk((int)dsize);
   2400 	bp->b_dev = dev;
   2401 
   2402 	/* get our ducks in a row for the write */
   2403 	bp->b_blkno = offset / DEV_BSIZE;
   2404 	bp->b_bcount = dsize;
   2405 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2406  	bp->b_resid = dsize;
   2407 
   2408 	memset(bp->b_data, 0, dsize);
   2409 	memcpy(bp->b_data, data, msize);
   2410 
   2411 	bdev_strategy(bp);
   2412 	if (asyncp)
   2413 		return 0;
   2414 	error = biowait(bp);
   2415 	brelse(bp, 0);
   2416 	if (error) {
   2417 #if 1
   2418 		printf("Failed to write RAID component info!\n");
   2419 #endif
   2420 	}
   2421 
   2422 	return(error);
   2423 }
   2424 
   2425 void
   2426 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2427 {
   2428 	int c;
   2429 
   2430 	for (c = 0; c < raidPtr->numCol; c++) {
   2431 		/* Skip dead disks. */
   2432 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2433 			continue;
   2434 		/* XXXjld: what if an error occurs here? */
   2435 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2436 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2437 		    RF_PARITYMAP_NBYTE,
   2438 		    rf_parity_map_offset(raidPtr),
   2439 		    rf_parity_map_size(raidPtr), 0);
   2440 	}
   2441 }
   2442 
   2443 void
   2444 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2445 {
   2446 	struct rf_paritymap_ondisk tmp;
   2447 	int c,first;
   2448 
   2449 	first=1;
   2450 	for (c = 0; c < raidPtr->numCol; c++) {
   2451 		/* Skip dead disks. */
   2452 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2453 			continue;
   2454 		raidread_component_area(raidPtr->Disks[c].dev,
   2455 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2456 		    RF_PARITYMAP_NBYTE,
   2457 		    rf_parity_map_offset(raidPtr),
   2458 		    rf_parity_map_size(raidPtr));
   2459 		if (first) {
   2460 			memcpy(map, &tmp, sizeof(*map));
   2461 			first = 0;
   2462 		} else {
   2463 			rf_paritymap_merge(map, &tmp);
   2464 		}
   2465 	}
   2466 }
   2467 
/* Bump the mod counter and mark every usable component (and active
 * spare) dirty on disk, e.g. when the set goes into active use. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matches, scol keeps
			 * its previous value (initially -1) -- presumably
			 * a match always exists for a used spare */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2527 
   2528 
/* Rewrite the component labels on all optimal components and active
 * spares; with final == RF_FINAL_COMPONENT_UPDATE and clean parity,
 * additionally mark them clean (e.g. at shutdown/unconfigure). */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2606 
   2607 void
   2608 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2609 {
   2610 
   2611 	if (vp != NULL) {
   2612 		if (auto_configured == 1) {
   2613 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2614 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2615 			vput(vp);
   2616 
   2617 		} else {
   2618 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2619 		}
   2620 	}
   2621 }
   2622 
   2623 
   2624 void
   2625 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2626 {
   2627 	int r,c;
   2628 	struct vnode *vp;
   2629 	int acd;
   2630 
   2631 
   2632 	/* We take this opportunity to close the vnodes like we should.. */
   2633 
   2634 	for (c = 0; c < raidPtr->numCol; c++) {
   2635 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2636 		acd = raidPtr->Disks[c].auto_configured;
   2637 		rf_close_component(raidPtr, vp, acd);
   2638 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2639 		raidPtr->Disks[c].auto_configured = 0;
   2640 	}
   2641 
   2642 	for (r = 0; r < raidPtr->numSpare; r++) {
   2643 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2644 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2645 		rf_close_component(raidPtr, vp, acd);
   2646 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2647 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2648 	}
   2649 }
   2650 
   2651 
/* Kernel thread body: fail a disk (optionally reconstructing to a
 * spare) and exit.  Owns and frees req. */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* run the whole operation at splbio */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* reconstruct to a spare only if RF_FDFLAGS_RECON is set */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* this thread owns the request and frees it when done */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2673 
/* Kernel thread body: rewrite all parity, then exit.  On success the
 * in-core parity_good state becomes RF_RAID_CLEAN. */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2704 
   2705 
/* Kernel thread body: copy reconstructed data back from spares to
 * replaced components, then exit. */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	/* run the copyback at splbio */
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2720 
   2721 
/* Kernel thread body: reconstruct a component in place (onto the same
 * column), then exit.  Owns and frees req. */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* run the reconstruction at splbio */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* this thread owns the request and frees it when done */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2739 
/* Probe one candidate component: read its label and, if it looks
 * reasonable, prepend an RF_AutoConfig_t for it to ac_list.  On a
 * rejected component the vnode is closed here; on an accepted one it
 * stays open and is owned by the list entry.  Returns the (possibly
 * new) list head, or NULL if an allocation failure tore the whole
 * list down. */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/* M_NOWAIT: autoconfig runs where sleeping for memory is not ok */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: free the entire list built so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* list entry now owns the vnode */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: rejected component, release label and vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2797 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t records for the ones found.
 *
 * Two full device passes are made: wedges (dk) first, then everything
 * else, so that a wedge covering a whole disk is preferred over the
 * disk's raw partition.  For each candidate device the component label
 * is read via rf_get_component(), which appends matches to the list.
 *
 * Returns the (possibly empty, i.e. NULL) list of discovered components.
 * Vnodes for accepted components are left open and owned by the list;
 * rejected candidates are closed here or inside rf_get_component().
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: probing, so suppress open-time noise */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: accept only wedges explicitly
				   typed as RAIDframe components. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3001 
   3002 
   3003 int
   3004 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3005 {
   3006 
   3007 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3008 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3009 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3010 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3011 	    clabel->row >=0 &&
   3012 	    clabel->column >= 0 &&
   3013 	    clabel->num_rows > 0 &&
   3014 	    clabel->num_columns > 0 &&
   3015 	    clabel->row < clabel->num_rows &&
   3016 	    clabel->column < clabel->num_columns &&
   3017 	    clabel->blockSize > 0 &&
   3018 	    /*
   3019 	     * numBlocksHi may contain garbage, but it is ok since
   3020 	     * the type is unsigned.  If it is really garbage,
   3021 	     * rf_fix_old_label_size() will fix it.
   3022 	     */
   3023 	    rf_component_label_numblocks(clabel) > 0) {
   3024 		/*
   3025 		 * label looks reasonable enough...
   3026 		 * let's make sure it has no old garbage.
   3027 		 */
   3028 		if (numsecs)
   3029 			rf_fix_old_label_size(clabel, numsecs);
   3030 		return(1);
   3031 	}
   3032 	return(0);
   3033 }
   3034 
   3035 
   3036 /*
   3037  * For reasons yet unknown, some old component labels have garbage in
   3038  * the newer numBlocksHi region, and this causes lossage.  Since those
   3039  * disks will also have numsecs set to less than 32 bits of sectors,
   3040  * we can determine when this corruption has occurred, and fix it.
   3041  *
   3042  * The exact same problem, with the same unknown reason, happens to
   3043  * the partitionSizeHi member as well.
   3044  */
   3045 static void
   3046 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3047 {
   3048 
   3049 	if (numsecs < ((uint64_t)1 << 32)) {
   3050 		if (clabel->numBlocksHi) {
   3051 			printf("WARNING: total sectors < 32 bits, yet "
   3052 			       "numBlocksHi set\n"
   3053 			       "WARNING: resetting numBlocksHi to zero.\n");
   3054 			clabel->numBlocksHi = 0;
   3055 		}
   3056 
   3057 		if (clabel->partitionSizeHi) {
   3058 			printf("WARNING: total sectors < 32 bits, yet "
   3059 			       "partitionSizeHi set\n"
   3060 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3061 			clabel->partitionSizeHi = 0;
   3062 		}
   3063 	}
   3064 }
   3065 
   3066 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only under DEBUG; output format is free-form.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Human-readable names for root_partition values 0-2; index 3
	   catches anything out of range (masked below with & 3). */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3100 
   3101 RF_ConfigSet_t *
   3102 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3103 {
   3104 	RF_AutoConfig_t *ac;
   3105 	RF_ConfigSet_t *config_sets;
   3106 	RF_ConfigSet_t *cset;
   3107 	RF_AutoConfig_t *ac_next;
   3108 
   3109 
   3110 	config_sets = NULL;
   3111 
   3112 	/* Go through the AutoConfig list, and figure out which components
   3113 	   belong to what sets.  */
   3114 	ac = ac_list;
   3115 	while(ac!=NULL) {
   3116 		/* we're going to putz with ac->next, so save it here
   3117 		   for use at the end of the loop */
   3118 		ac_next = ac->next;
   3119 
   3120 		if (config_sets == NULL) {
   3121 			/* will need at least this one... */
   3122 			config_sets = (RF_ConfigSet_t *)
   3123 				malloc(sizeof(RF_ConfigSet_t),
   3124 				       M_RAIDFRAME, M_NOWAIT);
   3125 			if (config_sets == NULL) {
   3126 				panic("rf_create_auto_sets: No memory!");
   3127 			}
   3128 			/* this one is easy :) */
   3129 			config_sets->ac = ac;
   3130 			config_sets->next = NULL;
   3131 			config_sets->rootable = 0;
   3132 			ac->next = NULL;
   3133 		} else {
   3134 			/* which set does this component fit into? */
   3135 			cset = config_sets;
   3136 			while(cset!=NULL) {
   3137 				if (rf_does_it_fit(cset, ac)) {
   3138 					/* looks like it matches... */
   3139 					ac->next = cset->ac;
   3140 					cset->ac = ac;
   3141 					break;
   3142 				}
   3143 				cset = cset->next;
   3144 			}
   3145 			if (cset==NULL) {
   3146 				/* didn't find a match above... new set..*/
   3147 				cset = (RF_ConfigSet_t *)
   3148 					malloc(sizeof(RF_ConfigSet_t),
   3149 					       M_RAIDFRAME, M_NOWAIT);
   3150 				if (cset == NULL) {
   3151 					panic("rf_create_auto_sets: No memory!");
   3152 				}
   3153 				cset->ac = ac;
   3154 				ac->next = NULL;
   3155 				cset->next = config_sets;
   3156 				cset->rootable = 0;
   3157 				config_sets = cset;
   3158 			}
   3159 		}
   3160 		ac = ac_next;
   3161 	}
   3162 
   3163 
   3164 	return(config_sets);
   3165 }
   3166 
   3167 static int
   3168 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3169 {
   3170 	RF_ComponentLabel_t *clabel1, *clabel2;
   3171 
   3172 	/* If this one matches the *first* one in the set, that's good
   3173 	   enough, since the other members of the set would have been
   3174 	   through here too... */
   3175 	/* note that we are not checking partitionSize here..
   3176 
   3177 	   Note that we are also not checking the mod_counters here.
   3178 	   If everything else matches except the mod_counter, that's
   3179 	   good enough for this test.  We will deal with the mod_counters
   3180 	   a little later in the autoconfiguration process.
   3181 
   3182 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3183 
   3184 	   The reason we don't check for this is that failed disks
   3185 	   will have lower modification counts.  If those disks are
   3186 	   not added to the set they used to belong to, then they will
   3187 	   form their own set, which may result in 2 different sets,
   3188 	   for example, competing to be configured at raid0, and
   3189 	   perhaps competing to be the root filesystem set.  If the
   3190 	   wrong ones get configured, or both attempt to become /,
   3191 	   weird behaviour and or serious lossage will occur.  Thus we
   3192 	   need to bring them into the fold here, and kick them out at
   3193 	   a later point.
   3194 
   3195 	*/
   3196 
   3197 	clabel1 = cset->ac->clabel;
   3198 	clabel2 = ac->clabel;
   3199 	if ((clabel1->version == clabel2->version) &&
   3200 	    (clabel1->serial_number == clabel2->serial_number) &&
   3201 	    (clabel1->num_rows == clabel2->num_rows) &&
   3202 	    (clabel1->num_columns == clabel2->num_columns) &&
   3203 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3204 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3205 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3206 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3207 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3208 	    (clabel1->blockSize == clabel2->blockSize) &&
   3209 	    rf_component_label_numblocks(clabel1) ==
   3210 	    rf_component_label_numblocks(clabel2) &&
   3211 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3212 	    (clabel1->root_partition == clabel2->root_partition) &&
   3213 	    (clabel1->last_unit == clabel2->last_unit) &&
   3214 	    (clabel1->config_order == clabel2->config_order)) {
   3215 		/* if it get's here, it almost *has* to be a match */
   3216 	} else {
   3217 		/* it's not consistent with somebody in the set..
   3218 		   punt */
   3219 		return(0);
   3220 	}
   3221 	/* all was fine.. it must fit... */
   3222 	return(1);
   3223 }
   3224 
/*
 * Decide whether configuration set 'cset' has enough live components
 * to be configured.  Returns 1 if so, 0 if too many components are
 * missing or stale for the set's RAID level.
 *
 * "Live" means a component whose mod_counter equals the highest
 * mod_counter seen in the set; components with older counters are
 * treated as failed.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component at that column with the
	   winning mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a mirror
				   pair without bailing.. reset the
				   even_pair_failed flag for the next
				   pair and go on to the next component. */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Per-level failure tolerance: RAID 0 tolerates none, RAID 4/5
	   tolerate one missing component.  (RAID 1 was handled above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3327 
/*
 * Build an RF_Config_t from the component labels of an autoconfigured
 * set, suitable for passing to rf_Configure().
 *
 * Geometry and queueing parameters are copied from the first
 * component's label; device names are filled in from every component
 * in the list, indexed by the column recorded in each label.
 *
 * NOTE: this also writes clabel->num_rows = 1 as a side effect (the
 * single assignment below updates both config and label).
 * raidPtr is currently unused here.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Fill in the component device names, one per column. */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. No debug variables. */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3362 
   3363 int
   3364 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3365 {
   3366 	RF_ComponentLabel_t *clabel;
   3367 	int column;
   3368 	int sparecol;
   3369 
   3370 	raidPtr->autoconfigure = new_value;
   3371 
   3372 	for(column=0; column<raidPtr->numCol; column++) {
   3373 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3374 			clabel = raidget_component_label(raidPtr, column);
   3375 			clabel->autoconfigure = new_value;
   3376 			raidflush_component_label(raidPtr, column);
   3377 		}
   3378 	}
   3379 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3380 		sparecol = raidPtr->numCol + column;
   3381 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3382 			clabel = raidget_component_label(raidPtr, sparecol);
   3383 			clabel->autoconfigure = new_value;
   3384 			raidflush_component_label(raidPtr, sparecol);
   3385 		}
   3386 	}
   3387 	return(new_value);
   3388 }
   3389 
   3390 int
   3391 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3392 {
   3393 	RF_ComponentLabel_t *clabel;
   3394 	int column;
   3395 	int sparecol;
   3396 
   3397 	raidPtr->root_partition = new_value;
   3398 	for(column=0; column<raidPtr->numCol; column++) {
   3399 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3400 			clabel = raidget_component_label(raidPtr, column);
   3401 			clabel->root_partition = new_value;
   3402 			raidflush_component_label(raidPtr, column);
   3403 		}
   3404 	}
   3405 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3406 		sparecol = raidPtr->numCol + column;
   3407 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3408 			clabel = raidget_component_label(raidPtr, sparecol);
   3409 			clabel->root_partition = new_value;
   3410 			raidflush_component_label(raidPtr, sparecol);
   3411 		}
   3412 	}
   3413 	return(new_value);
   3414 }
   3415 
   3416 void
   3417 rf_release_all_vps(RF_ConfigSet_t *cset)
   3418 {
   3419 	RF_AutoConfig_t *ac;
   3420 
   3421 	ac = cset->ac;
   3422 	while(ac!=NULL) {
   3423 		/* Close the vp, and give it back */
   3424 		if (ac->vp) {
   3425 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3426 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3427 			vput(ac->vp);
   3428 			ac->vp = NULL;
   3429 		}
   3430 		ac = ac->next;
   3431 	}
   3432 }
   3433 
   3434 
   3435 void
   3436 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3437 {
   3438 	RF_AutoConfig_t *ac;
   3439 	RF_AutoConfig_t *next_ac;
   3440 
   3441 	ac = cset->ac;
   3442 	while(ac!=NULL) {
   3443 		next_ac = ac->next;
   3444 		/* nuke the label */
   3445 		free(ac->clabel, M_RAIDFRAME);
   3446 		/* cleanup the config structure */
   3447 		free(ac, M_RAIDFRAME);
   3448 		/* "next.." */
   3449 		ac = next_ac;
   3450 	}
   3451 	/* and, finally, nuke the config set */
   3452 	free(cset, M_RAIDFRAME);
   3453 }
   3454 
   3455 
/*
 * Initialize a component label from the current in-core state of the
 * raid set.  Fills in every field of *clabel; per-component fields
 * (row/column, partitionSize) are the caller's responsibility.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3488 
/*
 * Autoconfigure a single configuration set: build an RF_Config_t from
 * the component labels, find a free raid unit (preferring the unit the
 * set was last configured on), and configure it.
 *
 * Returns the softc of the configured raid set, or NULL on failure
 * (out of memory, no softc available, or rf_Configure() failure).
 * On success cset->rootable is updated from the label's root_partition
 * setting.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from the last-used unit until we find one that is
	   free (or not yet created -- sc == NULL). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3572 
/*
 * Initialize a memory pool for RAIDframe use: create it at IPL_BIO,
 * pre-allocate xmin items, and set the low/high watermarks to
 * xmin/xmax respectively.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3582 
   3583 /*
   3584  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3585  * to see if there is IO pending and if that IO could possibly be done
   3586  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3587  * otherwise.
   3588  *
   3589  */
   3590 int
   3591 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3592 {
   3593 	struct raid_softc *rs;
   3594 	struct dk_softc *dksc;
   3595 
   3596 	rs = raidPtr->softc;
   3597 	dksc = &rs->sc_dksc;
   3598 
   3599 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3600 		return 1;
   3601 
   3602 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3603 		/* there is work to do */
   3604 		return 0;
   3605 	}
   3606 	/* default is nothing to do */
   3607 	return 1;
   3608 }
   3609 
   3610 int
   3611 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3612 {
   3613 	uint64_t numsecs;
   3614 	unsigned secsize;
   3615 	int error;
   3616 
   3617 	error = getdisksize(vp, &numsecs, &secsize);
   3618 	if (error == 0) {
   3619 		diskPtr->blockSize = secsize;
   3620 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3621 		diskPtr->partitionSize = numsecs;
   3622 		return 0;
   3623 	}
   3624 	return error;
   3625 }
   3626 
/* Autoconf match function: always matches (raid is a pseudo-device). */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3632 
/* Autoconf attach function: intentionally a no-op; per-set state is
 * set up when the raid set is actually configured. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3637 
   3638 
/*
 * Autoconf detach function: detach the raid unit under the softc lock
 * and free the softc on success.  Returns 0 on success, ENXIO if no
 * softc is attached, or the error from raidlock()/raid_detach_unlocked().
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3665 
/*
 * Publish a fake disk geometry for the raid set, derived from its
 * layout: one "sector per unit" count, one sector size, and a
 * nominal sectors/track and track count.  Remaining disk_geom
 * fields are left zero for disk_set_info() to fill in.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* 4 tracks per column is an arbitrary synthetic geometry */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3681 
   3682 /*
   3683  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3684  * We end up returning whatever error was returned by the first cache flush
   3685  * that fails.
   3686  */
   3687 
   3688 int
   3689 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3690 {
   3691 	int c, sparecol;
   3692 	int e,error;
   3693 	int force = 1;
   3694 
   3695 	error = 0;
   3696 	for (c = 0; c < raidPtr->numCol; c++) {
   3697 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3698 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3699 					  &force, FWRITE, NOCRED);
   3700 			if (e) {
   3701 				if (e != ENODEV)
   3702 					printf("raid%d: cache flush to component %s failed.\n",
   3703 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3704 				if (error == 0) {
   3705 					error = e;
   3706 				}
   3707 			}
   3708 		}
   3709 	}
   3710 
   3711 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3712 		sparecol = raidPtr->numCol + c;
   3713 		/* Need to ensure that the reconstruct actually completed! */
   3714 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3715 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3716 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3717 			if (e) {
   3718 				if (e != ENODEV)
   3719 					printf("raid%d: cache flush to component %s failed.\n",
   3720 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3721 				if (error == 0) {
   3722 					error = e;
   3723 				}
   3724 			}
   3725 		}
   3726 	}
   3727 	return error;
   3728 }
   3729 
   3730 /*
   3731  * Module interface
   3732  */
   3733 
   3734 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3735 
   3736 #ifdef _MODULE
   3737 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3738 #endif
   3739 
   3740 static int raid_modcmd(modcmd_t, void *);
   3741 static int raid_modcmd_init(void);
   3742 static int raid_modcmd_fini(void);
   3743 
   3744 static int
   3745 raid_modcmd(modcmd_t cmd, void *data)
   3746 {
   3747 	int error;
   3748 
   3749 	error = 0;
   3750 	switch (cmd) {
   3751 	case MODULE_CMD_INIT:
   3752 		error = raid_modcmd_init();
   3753 		break;
   3754 	case MODULE_CMD_FINI:
   3755 		error = raid_modcmd_fini();
   3756 		break;
   3757 	default:
   3758 		error = ENOTTY;
   3759 		break;
   3760 	}
   3761 	return error;
   3762 }
   3763 
/*
 * MODULE_CMD_INIT handler: create the raid_lock, attach the block/char
 * devsw and the autoconfiguration glue, boot the RAIDframe core, and
 * register a finalizer that auto-configures RAID sets once all real
 * hardware has been found.  On failure each step already performed is
 * rolled back before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/*
	 * EEXIST (devsw already attached, e.g. compiled into the kernel)
	 * is tolerated here, but note it leaves error != 0 below.
	 */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back both earlier attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * error is either 0 or EEXIST here; skip booting the core when
	 * the devsw was already present (it was booted previously).
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Not fatal: auto-configuration just won't happen. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3834 
/*
 * MODULE_CMD_FINI handler: refuse to unload while any raid unit exists,
 * then undo raid_modcmd_init() in reverse order (cfattach, cfdriver,
 * devsw, RAIDframe core).  If a later detach step fails, the earlier
 * ones are re-attached so the module remains in a usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Restore both the cfdriver and cfattach detached above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (counterpart of the INIT boot). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3884