Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.298.2.5
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.298.2.5 2017/12/03 11:37:31 jdolecek Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.298.2.5 2017/12/03 11:37:31 jdolecek Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
    157 #ifdef DEBUG
    158 int     rf_kdebug_level = 0;
    159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    160 #else				/* DEBUG */
    161 #define db1_printf(a) { }
    162 #endif				/* DEBUG */
    163 
    164 #ifdef DEBUG_ROOT
    165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    166 #else
    167 #define DPRINTF(a, ...)
    168 #endif
    169 
    170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    171 static rf_declare_mutex2(rf_sparet_wait_mutex);
    172 static rf_declare_cond2(rf_sparet_wait_cv);
    173 static rf_declare_cond2(rf_sparet_resp_cv);
    174 
    175 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    176 						 * spare table */
    177 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    178 						 * installation process */
    179 #endif
    180 
    181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    182 
    183 /* prototypes */
    184 static void KernelWakeupFunc(struct buf *);
    185 static void InitBP(struct buf *, struct vnode *, unsigned,
    186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    187     void *, int, struct proc *);
    188 struct raid_softc;
    189 static void raidinit(struct raid_softc *);
    190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
    220 const struct bdevsw raid_bdevsw = {
    221 	.d_open = raidopen,
    222 	.d_close = raidclose,
    223 	.d_strategy = raidstrategy,
    224 	.d_ioctl = raidioctl,
    225 	.d_dump = raiddump,
    226 	.d_psize = raidsize,
    227 	.d_discard = nodiscard,
    228 	.d_flag = D_DISK
    229 };
    230 
    231 const struct cdevsw raid_cdevsw = {
    232 	.d_open = raidopen,
    233 	.d_close = raidclose,
    234 	.d_read = raidread,
    235 	.d_write = raidwrite,
    236 	.d_ioctl = raidioctl,
    237 	.d_stop = nostop,
    238 	.d_tty = notty,
    239 	.d_poll = nopoll,
    240 	.d_mmap = nommap,
    241 	.d_kqfilter = nokqfilter,
    242 	.d_discard = nodiscard,
    243 	.d_flag = D_DISK
    244 };
    245 
    246 static void	raidminphys(struct buf *);
    247 
    248 static struct dkdriver rf_dkdriver = {
    249 	.d_open = raidopen,
    250 	.d_close = raidclose,
    251 	.d_strategy = raidstrategy,
    252 	.d_diskstart = raid_diskstart,
    253 	.d_dumpblocks = raid_dumpblocks,
    254 	.d_lastclose = raid_lastclose,
    255 	.d_minphys = raidminphys
    256 };
    257 
    258 struct raid_softc {
    259 	struct dk_softc sc_dksc;
    260 	int	sc_unit;
    261 	int     sc_flags;	/* flags */
    262 	int     sc_cflags;	/* configuration flags */
    263 	kmutex_t sc_mutex;	/* interlock mutex */
    264 	kcondvar_t sc_cv;	/* and the condvar */
    265 	uint64_t sc_size;	/* size of the raid device */
    266 	char    sc_xname[20];	/* XXX external name */
    267 	RF_Raid_t sc_r;
    268 	LIST_ENTRY(raid_softc) sc_link;
    269 };
    270 /* sc_flags */
    271 #define RAIDF_INITED		0x01	/* unit has been initialized */
    272 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    273 #define RAIDF_DETACH  		0x04	/* detach after final close */
    274 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    275 #define RAIDF_LOCKED		0x10	/* unit is locked */
    276 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    277 
    278 #define	raidunit(x)	DISKUNIT(x)
    279 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    280 
    281 extern struct cfdriver raid_cd;
    282 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    283     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    284     DVF_DETACH_SHUTDOWN);
    285 
    286 /*
    287  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    288  * Be aware that large numbers can allow the driver to consume a lot of
    289  * kernel memory, especially on writes, and in degraded mode reads.
    290  *
    291  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    292  * a single 64K write will typically require 64K for the old data,
    293  * 64K for the old parity, and 64K for the new parity, for a total
    294  * of 192K (if the parity buffer is not re-used immediately).
    295  * Even it if is used immediately, that's still 128K, which when multiplied
    296  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    297  *
    298  * Now in degraded mode, for example, a 64K read on the above setup may
    299  * require data reconstruction, which will require *all* of the 4 remaining
    300  * disks to participate -- 4 * 32K/disk == 128K again.
    301  */
    302 
    303 #ifndef RAIDOUTSTANDING
    304 #define RAIDOUTSTANDING   6
    305 #endif
    306 
    307 #define RAIDLABELDEV(dev)	\
    308 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    309 
    310 /* declared here, and made public, for the benefit of KVM stuff.. */
    311 
    312 static int raidlock(struct raid_softc *);
    313 static void raidunlock(struct raid_softc *);
    314 
    315 static int raid_detach_unlocked(struct raid_softc *);
    316 
    317 static void rf_markalldirty(RF_Raid_t *);
    318 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    319 
    320 void rf_ReconThread(struct rf_recon_req *);
    321 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    322 void rf_CopybackThread(RF_Raid_t *raidPtr);
    323 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    324 int rf_autoconfig(device_t);
    325 void rf_buildroothack(RF_ConfigSet_t *);
    326 
    327 RF_AutoConfig_t *rf_find_raid_components(void);
    328 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    329 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    330 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    331 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    332 int rf_set_autoconfig(RF_Raid_t *, int);
    333 int rf_set_rootpartition(RF_Raid_t *, int);
    334 void rf_release_all_vps(RF_ConfigSet_t *);
    335 void rf_cleanup_config_set(RF_ConfigSet_t *);
    336 int rf_have_enough_components(RF_ConfigSet_t *);
    337 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    338 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    339 
    340 /*
    341  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    342  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    343  * in the kernel config file.
    344  */
    345 #ifdef RAID_AUTOCONFIG
    346 int raidautoconfig = 1;
    347 #else
    348 int raidautoconfig = 0;
    349 #endif
    350 static bool raidautoconfigdone = false;
    351 
    352 struct RF_Pools_s rf_pools;
    353 
    354 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    355 static kmutex_t raid_lock;
    356 
    357 static struct raid_softc *
    358 raidcreate(int unit) {
    359 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    360 	sc->sc_unit = unit;
    361 	cv_init(&sc->sc_cv, "raidunit");
    362 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    363 	return sc;
    364 }
    365 
    366 static void
    367 raiddestroy(struct raid_softc *sc) {
    368 	cv_destroy(&sc->sc_cv);
    369 	mutex_destroy(&sc->sc_mutex);
    370 	kmem_free(sc, sizeof(*sc));
    371 }
    372 
    373 static struct raid_softc *
    374 raidget(int unit, bool create) {
    375 	struct raid_softc *sc;
    376 	if (unit < 0) {
    377 #ifdef DIAGNOSTIC
    378 		panic("%s: unit %d!", __func__, unit);
    379 #endif
    380 		return NULL;
    381 	}
    382 	mutex_enter(&raid_lock);
    383 	LIST_FOREACH(sc, &raids, sc_link) {
    384 		if (sc->sc_unit == unit) {
    385 			mutex_exit(&raid_lock);
    386 			return sc;
    387 		}
    388 	}
    389 	mutex_exit(&raid_lock);
    390 	if (!create)
    391 		return NULL;
    392 	if ((sc = raidcreate(unit)) == NULL)
    393 		return NULL;
    394 	mutex_enter(&raid_lock);
    395 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    396 	mutex_exit(&raid_lock);
    397 	return sc;
    398 }
    399 
/*
 * Unlink "sc" from the global raids list and free it.  The caller
 * must guarantee no other references to the softc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    407 
/*
 * Pseudo-device attach entry point (presumably still referenced by
 * autoconf glue -- verify against ioconf).  Intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    417 
    418 int
    419 rf_autoconfig(device_t self)
    420 {
    421 	RF_AutoConfig_t *ac_list;
    422 	RF_ConfigSet_t *config_sets;
    423 
    424 	if (!raidautoconfig || raidautoconfigdone == true)
    425 		return (0);
    426 
    427 	/* XXX This code can only be run once. */
    428 	raidautoconfigdone = true;
    429 
    430 #ifdef __HAVE_CPU_BOOTCONF
    431 	/*
    432 	 * 0. find the boot device if needed first so we can use it later
    433 	 * this needs to be done before we autoconfigure any raid sets,
    434 	 * because if we use wedges we are not going to be able to open
    435 	 * the boot device later
    436 	 */
    437 	if (booted_device == NULL)
    438 		cpu_bootconf();
    439 #endif
    440 	/* 1. locate all RAID components on the system */
    441 	aprint_debug("Searching for RAID components...\n");
    442 	ac_list = rf_find_raid_components();
    443 
    444 	/* 2. Sort them into their respective sets. */
    445 	config_sets = rf_create_auto_sets(ac_list);
    446 
    447 	/*
    448 	 * 3. Evaluate each set and configure the valid ones.
    449 	 * This gets done in rf_buildroothack().
    450 	 */
    451 	rf_buildroothack(config_sets);
    452 
    453 	return 1;
    454 }
    455 
    456 static int
    457 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    458 	const char *bootname = device_xname(bdv);
    459 	size_t len = strlen(bootname);
    460 
    461 	for (int col = 0; col < r->numCol; col++) {
    462 		const char *devname = r->Disks[col].devname;
    463 		devname += sizeof("/dev/") - 1;
    464 		if (strncmp(devname, "dk", 2) == 0) {
    465 			const char *parent =
    466 			    dkwedge_get_parent_name(r->Disks[col].dev);
    467 			if (parent != NULL)
    468 				devname = parent;
    469 		}
    470 		if (strncmp(devname, bootname, len) == 0) {
    471 			struct raid_softc *sc = r->softc;
    472 			aprint_debug("raid%d includes boot device %s\n",
    473 			    sc->sc_unit, devname);
    474 			return 1;
    475 		}
    476 	}
    477 	return 0;
    478 }
    479 
    480 void
    481 rf_buildroothack(RF_ConfigSet_t *config_sets)
    482 {
    483 	RF_ConfigSet_t *cset;
    484 	RF_ConfigSet_t *next_cset;
    485 	int num_root;
    486 	struct raid_softc *sc, *rsc;
    487 	struct dk_softc *dksc;
    488 
    489 	sc = rsc = NULL;
    490 	num_root = 0;
    491 	cset = config_sets;
    492 	while (cset != NULL) {
    493 		next_cset = cset->next;
    494 		if (rf_have_enough_components(cset) &&
    495 		    cset->ac->clabel->autoconfigure == 1) {
    496 			sc = rf_auto_config_set(cset);
    497 			if (sc != NULL) {
    498 				aprint_debug("raid%d: configured ok\n",
    499 				    sc->sc_unit);
    500 				if (cset->rootable) {
    501 					rsc = sc;
    502 					num_root++;
    503 				}
    504 			} else {
    505 				/* The autoconfig didn't work :( */
    506 				aprint_debug("Autoconfig failed\n");
    507 				rf_release_all_vps(cset);
    508 			}
    509 		} else {
    510 			/* we're not autoconfiguring this set...
    511 			   release the associated resources */
    512 			rf_release_all_vps(cset);
    513 		}
    514 		/* cleanup */
    515 		rf_cleanup_config_set(cset);
    516 		cset = next_cset;
    517 	}
    518 	dksc = &rsc->sc_dksc;
    519 
    520 	/* if the user has specified what the root device should be
    521 	   then we don't touch booted_device or boothowto... */
    522 
    523 	if (rootspec != NULL)
    524 		return;
    525 
    526 	/* we found something bootable... */
    527 
    528 	/*
    529 	 * XXX: The following code assumes that the root raid
    530 	 * is the first ('a') partition. This is about the best
    531 	 * we can do with a BSD disklabel, but we might be able
    532 	 * to do better with a GPT label, by setting a specified
    533 	 * attribute to indicate the root partition. We can then
    534 	 * stash the partition number in the r->root_partition
    535 	 * high bits (the bottom 2 bits are already used). For
    536 	 * now we just set booted_partition to 0 when we override
    537 	 * root.
    538 	 */
    539 	if (num_root == 1) {
    540 		device_t candidate_root;
    541 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    542 			char cname[sizeof(cset->ac->devname)];
    543 			/* XXX: assume partition 'a' first */
    544 			snprintf(cname, sizeof(cname), "%s%c",
    545 			    device_xname(dksc->sc_dev), 'a');
    546 			candidate_root = dkwedge_find_by_wname(cname);
    547 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    548 			    cname);
    549 			if (candidate_root == NULL) {
    550 				/*
    551 				 * If that is not found, because we don't use
    552 				 * disklabel, return the first dk child
    553 				 * XXX: we can skip the 'a' check above
    554 				 * and always do this...
    555 				 */
    556 				size_t i = 0;
    557 				candidate_root = dkwedge_find_by_parent(
    558 				    device_xname(dksc->sc_dev), &i);
    559 			}
    560 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    561 			    candidate_root);
    562 		} else
    563 			candidate_root = dksc->sc_dev;
    564 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    565 		DPRINTF("%s: booted_device=%p root_partition=%d "
    566 		   "contains_boot=%d\n", __func__, booted_device,
    567 		   rsc->sc_r.root_partition,
    568 		   rf_containsboot(&rsc->sc_r, booted_device));
    569 		if (booted_device == NULL ||
    570 		    rsc->sc_r.root_partition == 1 ||
    571 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    572 			booted_device = candidate_root;
    573 			booted_method = "raidframe/single";
    574 			booted_partition = 0;	/* XXX assume 'a' */
    575 		}
    576 	} else if (num_root > 1) {
    577 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    578 		    booted_device);
    579 
    580 		/*
    581 		 * Maybe the MD code can help. If it cannot, then
    582 		 * setroot() will discover that we have no
    583 		 * booted_device and will ask the user if nothing was
    584 		 * hardwired in the kernel config file
    585 		 */
    586 		if (booted_device == NULL)
    587 			return;
    588 
    589 		num_root = 0;
    590 		mutex_enter(&raid_lock);
    591 		LIST_FOREACH(sc, &raids, sc_link) {
    592 			RF_Raid_t *r = &sc->sc_r;
    593 			if (r->valid == 0)
    594 				continue;
    595 
    596 			if (r->root_partition == 0)
    597 				continue;
    598 
    599 			if (rf_containsboot(r, booted_device)) {
    600 				num_root++;
    601 				rsc = sc;
    602 				dksc = &rsc->sc_dksc;
    603 			}
    604 		}
    605 		mutex_exit(&raid_lock);
    606 
    607 		if (num_root == 1) {
    608 			booted_device = dksc->sc_dev;
    609 			booted_method = "raidframe/multi";
    610 			booted_partition = 0;	/* XXX assume 'a' */
    611 		} else {
    612 			/* we can't guess.. require the user to answer... */
    613 			boothowto |= RB_ASKNAME;
    614 		}
    615 	}
    616 }
    617 
    618 static int
    619 raidsize(dev_t dev)
    620 {
    621 	struct raid_softc *rs;
    622 	struct dk_softc *dksc;
    623 	unsigned int unit;
    624 
    625 	unit = raidunit(dev);
    626 	if ((rs = raidget(unit, false)) == NULL)
    627 		return -1;
    628 	dksc = &rs->sc_dksc;
    629 
    630 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    631 		return -1;
    632 
    633 	return dk_size(dksc, dev);
    634 }
    635 
    636 static int
    637 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    638 {
    639 	unsigned int unit;
    640 	struct raid_softc *rs;
    641 	struct dk_softc *dksc;
    642 
    643 	unit = raidunit(dev);
    644 	if ((rs = raidget(unit, false)) == NULL)
    645 		return ENXIO;
    646 	dksc = &rs->sc_dksc;
    647 
    648 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    649 		return ENODEV;
    650 
    651         /*
    652            Note that blkno is relative to this particular partition.
    653            By adding adding RF_PROTECTED_SECTORS, we get a value that
    654 	   is relative to the partition used for the underlying component.
    655         */
    656 	blkno += RF_PROTECTED_SECTORS;
    657 
    658 	return dk_dump(dksc, dev, blkno, va, size);
    659 }
    660 
/*
 * dk(9) d_dumpblocks hook: write "nblk" blocks of dump data from
 * "va", starting at block "blkno", directly to one live component
 * of the set.  Only RAID 1 layouts (one data column, one parity
 * column) are supported.  Returns 0 on success, EINVAL when the
 * layout is unsupported or no live component exists, or the error
 * from the component's own dump routine.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: take the lowest-numbered optimal component. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight to the chosen component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    766 
    767 /* ARGSUSED */
    768 static int
    769 raidopen(dev_t dev, int flags, int fmt,
    770     struct lwp *l)
    771 {
    772 	int     unit = raidunit(dev);
    773 	struct raid_softc *rs;
    774 	struct dk_softc *dksc;
    775 	int     error = 0;
    776 	int     part, pmask;
    777 
    778 	if ((rs = raidget(unit, true)) == NULL)
    779 		return ENXIO;
    780 	if ((error = raidlock(rs)) != 0)
    781 		return (error);
    782 
    783 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    784 		error = EBUSY;
    785 		goto bad;
    786 	}
    787 
    788 	dksc = &rs->sc_dksc;
    789 
    790 	part = DISKPART(dev);
    791 	pmask = (1 << part);
    792 
    793 	if (!DK_BUSY(dksc, pmask) &&
    794 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    795 		/* First one... mark things as dirty... Note that we *MUST*
    796 		 have done a configure before this.  I DO NOT WANT TO BE
    797 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    798 		 THAT THEY BELONG TOGETHER!!!!! */
    799 		/* XXX should check to see if we're only open for reading
    800 		   here... If so, we needn't do this, but then need some
    801 		   other way of keeping track of what's happened.. */
    802 
    803 		rf_markalldirty(&rs->sc_r);
    804 	}
    805 
    806 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    807 		error = dk_open(dksc, dev, flags, fmt, l);
    808 
    809 bad:
    810 	raidunlock(rs);
    811 
    812 	return (error);
    813 
    814 
    815 }
    816 
    817 static int
    818 raid_lastclose(device_t self)
    819 {
    820 	struct raid_softc *rs = raidsoftc(self);
    821 
    822 	/* Last one... device is not unconfigured yet.
    823 	   Device shutdown has taken care of setting the
    824 	   clean bits if RAIDF_INITED is not set
    825 	   mark things as clean... */
    826 
    827 	rf_update_component_labels(&rs->sc_r,
    828 	    RF_FINAL_COMPONENT_UPDATE);
    829 
    830 	/* pass to unlocked code */
    831 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    832 		rs->sc_flags |= RAIDF_DETACH;
    833 
    834 	return 0;
    835 }
    836 
/* ARGSUSED */
/*
 * Close the raid device.  The close itself is delegated to dk(9);
 * afterwards this performs any detach (requested by raid_lastclose()
 * via RAIDF_DETACH) or softc destruction (RAIDF_SHUTDOWN on an
 * unconfigured unit) -- both deliberately done after raidunlock().
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() sets RAIDF_DETACH on the final close. */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		/* cf is only freed when the detach actually succeeded. */
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    876 
/*
 * Signal the iodone condition variable (under its lock) so that
 * queued I/O gets picked up and processed.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    884 
    885 static void
    886 raidstrategy(struct buf *bp)
    887 {
    888 	unsigned int unit;
    889 	struct raid_softc *rs;
    890 	struct dk_softc *dksc;
    891 	RF_Raid_t *raidPtr;
    892 
    893 	unit = raidunit(bp->b_dev);
    894 	if ((rs = raidget(unit, false)) == NULL) {
    895 		bp->b_error = ENXIO;
    896 		goto fail;
    897 	}
    898 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    899 		bp->b_error = ENXIO;
    900 		goto fail;
    901 	}
    902 	dksc = &rs->sc_dksc;
    903 	raidPtr = &rs->sc_r;
    904 
    905 	/* Queue IO only */
    906 	if (dk_strategy_defer(dksc, bp))
    907 		goto done;
    908 
    909 	/* schedule the IO to happen at the next convenient time */
    910 	raid_wakeup(raidPtr);
    911 
    912 done:
    913 	return;
    914 
    915 fail:
    916 	bp->b_resid = bp->b_bcount;
    917 	biodone(bp);
    918 }
    919 
    920 static int
    921 raid_diskstart(device_t dev, struct buf *bp)
    922 {
    923 	struct raid_softc *rs = raidsoftc(dev);
    924 	RF_Raid_t *raidPtr;
    925 
    926 	raidPtr = &rs->sc_r;
    927 	if (!raidPtr->valid) {
    928 		db1_printf(("raid is not valid..\n"));
    929 		return ENODEV;
    930 	}
    931 
    932 	/* XXX */
    933 	bp->b_resid = 0;
    934 
    935 	return raiddoaccess(raidPtr, bp);
    936 }
    937 
    938 void
    939 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    940 {
    941 	struct raid_softc *rs;
    942 	struct dk_softc *dksc;
    943 
    944 	rs = raidPtr->softc;
    945 	dksc = &rs->sc_dksc;
    946 
    947 	dk_done(dksc, bp);
    948 
    949 	rf_lock_mutex2(raidPtr->mutex);
    950 	raidPtr->openings++;
    951 	rf_unlock_mutex2(raidPtr->mutex);
    952 
    953 	/* schedule more IO */
    954 	raid_wakeup(raidPtr);
    955 }
    956 
    957 /* ARGSUSED */
    958 static int
    959 raidread(dev_t dev, struct uio *uio, int flags)
    960 {
    961 	int     unit = raidunit(dev);
    962 	struct raid_softc *rs;
    963 
    964 	if ((rs = raidget(unit, false)) == NULL)
    965 		return ENXIO;
    966 
    967 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    968 		return (ENXIO);
    969 
    970 	return (physio(raidstrategy, NULL, dev, B_READ, raidminphys, uio));
    971 
    972 }
    973 
    974 /* ARGSUSED */
    975 static int
    976 raidwrite(dev_t dev, struct uio *uio, int flags)
    977 {
    978 	int     unit = raidunit(dev);
    979 	struct raid_softc *rs;
    980 
    981 	if ((rs = raidget(unit, false)) == NULL)
    982 		return ENXIO;
    983 
    984 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    985 		return (ENXIO);
    986 
    987 	return (physio(raidstrategy, NULL, dev, B_WRITE, raidminphys, uio));
    988 
    989 }
    990 
    991 static int
    992 raid_detach_unlocked(struct raid_softc *rs)
    993 {
    994 	struct dk_softc *dksc = &rs->sc_dksc;
    995 	RF_Raid_t *raidPtr;
    996 	int error;
    997 
    998 	raidPtr = &rs->sc_r;
    999 
   1000 	if (DK_BUSY(dksc, 0) ||
   1001 	    raidPtr->recon_in_progress != 0 ||
   1002 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1003 	    raidPtr->copyback_in_progress != 0)
   1004 		return EBUSY;
   1005 
   1006 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1007 		return 0;
   1008 
   1009 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1010 
   1011 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1012 		return error;
   1013 
   1014 	rs->sc_flags &= ~RAIDF_INITED;
   1015 
   1016 	/* Kill off any queued buffers */
   1017 	dk_drain(dksc);
   1018 	bufq_free(dksc->sc_bufq);
   1019 
   1020 	/* Detach the disk. */
   1021 	dkwedge_delall(&dksc->sc_dkdev);
   1022 	disk_detach(&dksc->sc_dkdev);
   1023 	disk_destroy(&dksc->sc_dkdev);
   1024 	dk_detach(dksc);
   1025 
   1026 	return 0;
   1027 }
   1028 
   1029 static int
   1030 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1031 {
   1032 	int     unit = raidunit(dev);
   1033 	int     error = 0;
   1034 	int     part, pmask;
   1035 	struct raid_softc *rs;
   1036 	struct dk_softc *dksc;
   1037 	RF_Config_t *k_cfg, *u_cfg;
   1038 	RF_Raid_t *raidPtr;
   1039 	RF_RaidDisk_t *diskPtr;
   1040 	RF_AccTotals_t *totals;
   1041 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1042 	u_char *specific_buf;
   1043 	int retcode = 0;
   1044 	int column;
   1045 /*	int raidid; */
   1046 	struct rf_recon_req *rrcopy, *rr;
   1047 	RF_ComponentLabel_t *clabel;
   1048 	RF_ComponentLabel_t *ci_label;
   1049 	RF_ComponentLabel_t **clabel_ptr;
   1050 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1051 	RF_SingleComponent_t component;
   1052 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1053 	int i, j, d;
   1054 
   1055 	if ((rs = raidget(unit, false)) == NULL)
   1056 		return ENXIO;
   1057 	dksc = &rs->sc_dksc;
   1058 	raidPtr = &rs->sc_r;
   1059 
   1060 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1061 		(int) DISKPART(dev), (int) unit, cmd));
   1062 
   1063 	/* Must be initialized for these... */
   1064 	switch (cmd) {
   1065 	case RAIDFRAME_REWRITEPARITY:
   1066 	case RAIDFRAME_GET_INFO:
   1067 	case RAIDFRAME_RESET_ACCTOTALS:
   1068 	case RAIDFRAME_GET_ACCTOTALS:
   1069 	case RAIDFRAME_KEEP_ACCTOTALS:
   1070 	case RAIDFRAME_GET_SIZE:
   1071 	case RAIDFRAME_FAIL_DISK:
   1072 	case RAIDFRAME_COPYBACK:
   1073 	case RAIDFRAME_CHECK_RECON_STATUS:
   1074 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1075 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1076 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1077 	case RAIDFRAME_ADD_HOT_SPARE:
   1078 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1079 	case RAIDFRAME_INIT_LABELS:
   1080 	case RAIDFRAME_REBUILD_IN_PLACE:
   1081 	case RAIDFRAME_CHECK_PARITY:
   1082 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1083 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1084 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1085 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1086 	case RAIDFRAME_SET_AUTOCONFIG:
   1087 	case RAIDFRAME_SET_ROOT:
   1088 	case RAIDFRAME_DELETE_COMPONENT:
   1089 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1090 	case RAIDFRAME_PARITYMAP_STATUS:
   1091 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1092 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1093 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1094 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1095 			return (ENXIO);
   1096 	}
   1097 
   1098 	switch (cmd) {
   1099 #ifdef COMPAT_50
   1100 	case RAIDFRAME_GET_INFO50:
   1101 		return rf_get_info50(raidPtr, data);
   1102 
   1103 	case RAIDFRAME_CONFIGURE50:
   1104 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1105 			return retcode;
   1106 		goto config;
   1107 #endif
   1108 		/* configure the system */
   1109 	case RAIDFRAME_CONFIGURE:
   1110 
   1111 		if (raidPtr->valid) {
   1112 			/* There is a valid RAID set running on this unit! */
   1113 			printf("raid%d: Device already configured!\n",unit);
   1114 			return(EINVAL);
   1115 		}
   1116 
   1117 		/* copy-in the configuration information */
   1118 		/* data points to a pointer to the configuration structure */
   1119 
   1120 		u_cfg = *((RF_Config_t **) data);
   1121 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1122 		if (k_cfg == NULL) {
   1123 			return (ENOMEM);
   1124 		}
   1125 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1126 		if (retcode) {
   1127 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1128 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1129 				retcode));
   1130 			goto no_config;
   1131 		}
   1132 		goto config;
   1133 	config:
   1134 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1135 
   1136 		/* allocate a buffer for the layout-specific data, and copy it
   1137 		 * in */
   1138 		if (k_cfg->layoutSpecificSize) {
   1139 			if (k_cfg->layoutSpecificSize > 10000) {
   1140 				/* sanity check */
   1141 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1142 				retcode = EINVAL;
   1143 				goto no_config;
   1144 			}
   1145 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1146 			    (u_char *));
   1147 			if (specific_buf == NULL) {
   1148 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1149 				retcode = ENOMEM;
   1150 				goto no_config;
   1151 			}
   1152 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1153 			    k_cfg->layoutSpecificSize);
   1154 			if (retcode) {
   1155 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1156 				RF_Free(specific_buf,
   1157 					k_cfg->layoutSpecificSize);
   1158 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1159 					retcode));
   1160 				goto no_config;
   1161 			}
   1162 		} else
   1163 			specific_buf = NULL;
   1164 		k_cfg->layoutSpecific = specific_buf;
   1165 
   1166 		/* should do some kind of sanity check on the configuration.
   1167 		 * Store the sum of all the bytes in the last byte? */
   1168 
   1169 		/* configure the system */
   1170 
   1171 		/*
   1172 		 * Clear the entire RAID descriptor, just to make sure
   1173 		 *  there is no stale data left in the case of a
   1174 		 *  reconfiguration
   1175 		 */
   1176 		memset(raidPtr, 0, sizeof(*raidPtr));
   1177 		raidPtr->softc = rs;
   1178 		raidPtr->raidid = unit;
   1179 
   1180 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1181 
   1182 		if (retcode == 0) {
   1183 
   1184 			/* allow this many simultaneous IO's to
   1185 			   this RAID device */
   1186 			raidPtr->openings = RAIDOUTSTANDING;
   1187 
   1188 			raidinit(rs);
   1189 			raid_wakeup(raidPtr);
   1190 			rf_markalldirty(raidPtr);
   1191 		}
   1192 		/* free the buffers.  No return code here. */
   1193 		if (k_cfg->layoutSpecificSize) {
   1194 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1195 		}
   1196 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1197 
   1198 	no_config:
   1199 		/*
   1200 		 * If configuration failed, set sc_flags so that we
   1201 		 * will detach the device when we close it.
   1202 		 */
   1203 		if (retcode != 0)
   1204 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1205 		return (retcode);
   1206 
   1207 		/* shutdown the system */
   1208 	case RAIDFRAME_SHUTDOWN:
   1209 
   1210 		part = DISKPART(dev);
   1211 		pmask = (1 << part);
   1212 
   1213 		if ((error = raidlock(rs)) != 0)
   1214 			return (error);
   1215 
   1216 		if (DK_BUSY(dksc, pmask) ||
   1217 		    raidPtr->recon_in_progress != 0 ||
   1218 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1219 		    raidPtr->copyback_in_progress != 0)
   1220 			retcode = EBUSY;
   1221 		else {
   1222 			/* detach and free on close */
   1223 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1224 			retcode = 0;
   1225 		}
   1226 
   1227 		raidunlock(rs);
   1228 
   1229 		return (retcode);
   1230 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1231 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1232 		/* need to read the component label for the disk indicated
   1233 		   by row,column in clabel */
   1234 
   1235 		/*
   1236 		 * Perhaps there should be an option to skip the in-core
   1237 		 * copy and hit the disk, as with disklabel(8).
   1238 		 */
   1239 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1240 
   1241 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1242 
   1243 		if (retcode) {
   1244 			RF_Free(clabel, sizeof(*clabel));
   1245 			return retcode;
   1246 		}
   1247 
   1248 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1249 
   1250 		column = clabel->column;
   1251 
   1252 		if ((column < 0) || (column >= raidPtr->numCol +
   1253 		    raidPtr->numSpare)) {
   1254 			RF_Free(clabel, sizeof(*clabel));
   1255 			return EINVAL;
   1256 		}
   1257 
   1258 		RF_Free(clabel, sizeof(*clabel));
   1259 
   1260 		clabel = raidget_component_label(raidPtr, column);
   1261 
   1262 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1263 
   1264 #if 0
   1265 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1266 		clabel = (RF_ComponentLabel_t *) data;
   1267 
   1268 		/* XXX check the label for valid stuff... */
   1269 		/* Note that some things *should not* get modified --
   1270 		   the user should be re-initing the labels instead of
   1271 		   trying to patch things.
   1272 		   */
   1273 
   1274 		raidid = raidPtr->raidid;
   1275 #ifdef DEBUG
   1276 		printf("raid%d: Got component label:\n", raidid);
   1277 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1278 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1279 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1280 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1281 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1282 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1283 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1284 #endif
   1285 		clabel->row = 0;
   1286 		column = clabel->column;
   1287 
   1288 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1289 			return(EINVAL);
   1290 		}
   1291 
   1292 		/* XXX this isn't allowed to do anything for now :-) */
   1293 
   1294 		/* XXX and before it is, we need to fill in the rest
   1295 		   of the fields!?!?!?! */
   1296 		memcpy(raidget_component_label(raidPtr, column),
   1297 		    clabel, sizeof(*clabel));
   1298 		raidflush_component_label(raidPtr, column);
   1299 		return (0);
   1300 #endif
   1301 
   1302 	case RAIDFRAME_INIT_LABELS:
   1303 		clabel = (RF_ComponentLabel_t *) data;
   1304 		/*
   1305 		   we only want the serial number from
   1306 		   the above.  We get all the rest of the information
   1307 		   from the config that was used to create this RAID
   1308 		   set.
   1309 		   */
   1310 
   1311 		raidPtr->serial_number = clabel->serial_number;
   1312 
   1313 		for(column=0;column<raidPtr->numCol;column++) {
   1314 			diskPtr = &raidPtr->Disks[column];
   1315 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1316 				ci_label = raidget_component_label(raidPtr,
   1317 				    column);
   1318 				/* Zeroing this is important. */
   1319 				memset(ci_label, 0, sizeof(*ci_label));
   1320 				raid_init_component_label(raidPtr, ci_label);
   1321 				ci_label->serial_number =
   1322 				    raidPtr->serial_number;
   1323 				ci_label->row = 0; /* we dont' pretend to support more */
   1324 				rf_component_label_set_partitionsize(ci_label,
   1325 				    diskPtr->partitionSize);
   1326 				ci_label->column = column;
   1327 				raidflush_component_label(raidPtr, column);
   1328 			}
   1329 			/* XXXjld what about the spares? */
   1330 		}
   1331 
   1332 		return (retcode);
   1333 	case RAIDFRAME_SET_AUTOCONFIG:
   1334 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1335 		printf("raid%d: New autoconfig value is: %d\n",
   1336 		       raidPtr->raidid, d);
   1337 		*(int *) data = d;
   1338 		return (retcode);
   1339 
   1340 	case RAIDFRAME_SET_ROOT:
   1341 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1342 		printf("raid%d: New rootpartition value is: %d\n",
   1343 		       raidPtr->raidid, d);
   1344 		*(int *) data = d;
   1345 		return (retcode);
   1346 
   1347 		/* initialize all parity */
   1348 	case RAIDFRAME_REWRITEPARITY:
   1349 
   1350 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1351 			/* Parity for RAID 0 is trivially correct */
   1352 			raidPtr->parity_good = RF_RAID_CLEAN;
   1353 			return(0);
   1354 		}
   1355 
   1356 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1357 			/* Re-write is already in progress! */
   1358 			return(EINVAL);
   1359 		}
   1360 
   1361 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1362 					   rf_RewriteParityThread,
   1363 					   raidPtr,"raid_parity");
   1364 		return (retcode);
   1365 
   1366 
   1367 	case RAIDFRAME_ADD_HOT_SPARE:
   1368 		sparePtr = (RF_SingleComponent_t *) data;
   1369 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1370 		retcode = rf_add_hot_spare(raidPtr, &component);
   1371 		return(retcode);
   1372 
   1373 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1374 		return(retcode);
   1375 
   1376 	case RAIDFRAME_DELETE_COMPONENT:
   1377 		componentPtr = (RF_SingleComponent_t *)data;
   1378 		memcpy( &component, componentPtr,
   1379 			sizeof(RF_SingleComponent_t));
   1380 		retcode = rf_delete_component(raidPtr, &component);
   1381 		return(retcode);
   1382 
   1383 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1384 		componentPtr = (RF_SingleComponent_t *)data;
   1385 		memcpy( &component, componentPtr,
   1386 			sizeof(RF_SingleComponent_t));
   1387 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1388 		return(retcode);
   1389 
   1390 	case RAIDFRAME_REBUILD_IN_PLACE:
   1391 
   1392 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1393 			/* Can't do this on a RAID 0!! */
   1394 			return(EINVAL);
   1395 		}
   1396 
   1397 		if (raidPtr->recon_in_progress == 1) {
   1398 			/* a reconstruct is already in progress! */
   1399 			return(EINVAL);
   1400 		}
   1401 
   1402 		componentPtr = (RF_SingleComponent_t *) data;
   1403 		memcpy( &component, componentPtr,
   1404 			sizeof(RF_SingleComponent_t));
   1405 		component.row = 0; /* we don't support any more */
   1406 		column = component.column;
   1407 
   1408 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1409 			return(EINVAL);
   1410 		}
   1411 
   1412 		rf_lock_mutex2(raidPtr->mutex);
   1413 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1414 		    (raidPtr->numFailures > 0)) {
   1415 			/* XXX 0 above shouldn't be constant!!! */
   1416 			/* some component other than this has failed.
   1417 			   Let's not make things worse than they already
   1418 			   are... */
   1419 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1420 			       raidPtr->raidid);
   1421 			printf("raid%d:     Col: %d   Too many failures.\n",
   1422 			       raidPtr->raidid, column);
   1423 			rf_unlock_mutex2(raidPtr->mutex);
   1424 			return (EINVAL);
   1425 		}
   1426 		if (raidPtr->Disks[column].status ==
   1427 		    rf_ds_reconstructing) {
   1428 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1429 			       raidPtr->raidid);
   1430 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1431 
   1432 			rf_unlock_mutex2(raidPtr->mutex);
   1433 			return (EINVAL);
   1434 		}
   1435 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1436 			rf_unlock_mutex2(raidPtr->mutex);
   1437 			return (EINVAL);
   1438 		}
   1439 		rf_unlock_mutex2(raidPtr->mutex);
   1440 
   1441 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1442 		if (rrcopy == NULL)
   1443 			return(ENOMEM);
   1444 
   1445 		rrcopy->raidPtr = (void *) raidPtr;
   1446 		rrcopy->col = column;
   1447 
   1448 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1449 					   rf_ReconstructInPlaceThread,
   1450 					   rrcopy,"raid_reconip");
   1451 		return(retcode);
   1452 
   1453 	case RAIDFRAME_GET_INFO:
   1454 		if (!raidPtr->valid)
   1455 			return (ENODEV);
   1456 		ucfgp = (RF_DeviceConfig_t **) data;
   1457 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1458 			  (RF_DeviceConfig_t *));
   1459 		if (d_cfg == NULL)
   1460 			return (ENOMEM);
   1461 		d_cfg->rows = 1; /* there is only 1 row now */
   1462 		d_cfg->cols = raidPtr->numCol;
   1463 		d_cfg->ndevs = raidPtr->numCol;
   1464 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1465 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1466 			return (ENOMEM);
   1467 		}
   1468 		d_cfg->nspares = raidPtr->numSpare;
   1469 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1470 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1471 			return (ENOMEM);
   1472 		}
   1473 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1474 		d = 0;
   1475 		for (j = 0; j < d_cfg->cols; j++) {
   1476 			d_cfg->devs[d] = raidPtr->Disks[j];
   1477 			d++;
   1478 		}
   1479 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1480 			d_cfg->spares[i] = raidPtr->Disks[j];
   1481 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1482 				/* XXX: raidctl(8) expects to see this as a used spare */
   1483 				d_cfg->spares[i].status = rf_ds_used_spare;
   1484 			}
   1485 		}
   1486 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1487 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1488 
   1489 		return (retcode);
   1490 
   1491 	case RAIDFRAME_CHECK_PARITY:
   1492 		*(int *) data = raidPtr->parity_good;
   1493 		return (0);
   1494 
   1495 	case RAIDFRAME_PARITYMAP_STATUS:
   1496 		if (rf_paritymap_ineligible(raidPtr))
   1497 			return EINVAL;
   1498 		rf_paritymap_status(raidPtr->parity_map,
   1499 		    (struct rf_pmstat *)data);
   1500 		return 0;
   1501 
   1502 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1503 		if (rf_paritymap_ineligible(raidPtr))
   1504 			return EINVAL;
   1505 		if (raidPtr->parity_map == NULL)
   1506 			return ENOENT; /* ??? */
   1507 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1508 			(struct rf_pmparams *)data, 1))
   1509 			return EINVAL;
   1510 		return 0;
   1511 
   1512 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1513 		if (rf_paritymap_ineligible(raidPtr))
   1514 			return EINVAL;
   1515 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1516 		return 0;
   1517 
   1518 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1519 		if (rf_paritymap_ineligible(raidPtr))
   1520 			return EINVAL;
   1521 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1522 		/* XXX should errors be passed up? */
   1523 		return 0;
   1524 
   1525 	case RAIDFRAME_RESET_ACCTOTALS:
   1526 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1527 		return (0);
   1528 
   1529 	case RAIDFRAME_GET_ACCTOTALS:
   1530 		totals = (RF_AccTotals_t *) data;
   1531 		*totals = raidPtr->acc_totals;
   1532 		return (0);
   1533 
   1534 	case RAIDFRAME_KEEP_ACCTOTALS:
   1535 		raidPtr->keep_acc_totals = *(int *)data;
   1536 		return (0);
   1537 
   1538 	case RAIDFRAME_GET_SIZE:
   1539 		*(int *) data = raidPtr->totalSectors;
   1540 		return (0);
   1541 
   1542 		/* fail a disk & optionally start reconstruction */
   1543 	case RAIDFRAME_FAIL_DISK:
   1544 
   1545 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1546 			/* Can't do this on a RAID 0!! */
   1547 			return(EINVAL);
   1548 		}
   1549 
   1550 		rr = (struct rf_recon_req *) data;
   1551 		rr->row = 0;
   1552 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1553 			return (EINVAL);
   1554 
   1555 
   1556 		rf_lock_mutex2(raidPtr->mutex);
   1557 		if (raidPtr->status == rf_rs_reconstructing) {
   1558 			/* you can't fail a disk while we're reconstructing! */
   1559 			/* XXX wrong for RAID6 */
   1560 			rf_unlock_mutex2(raidPtr->mutex);
   1561 			return (EINVAL);
   1562 		}
   1563 		if ((raidPtr->Disks[rr->col].status ==
   1564 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1565 			/* some other component has failed.  Let's not make
   1566 			   things worse. XXX wrong for RAID6 */
   1567 			rf_unlock_mutex2(raidPtr->mutex);
   1568 			return (EINVAL);
   1569 		}
   1570 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1571 			/* Can't fail a spared disk! */
   1572 			rf_unlock_mutex2(raidPtr->mutex);
   1573 			return (EINVAL);
   1574 		}
   1575 		rf_unlock_mutex2(raidPtr->mutex);
   1576 
   1577 		/* make a copy of the recon request so that we don't rely on
   1578 		 * the user's buffer */
   1579 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1580 		if (rrcopy == NULL)
   1581 			return(ENOMEM);
   1582 		memcpy(rrcopy, rr, sizeof(*rr));
   1583 		rrcopy->raidPtr = (void *) raidPtr;
   1584 
   1585 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1586 					   rf_ReconThread,
   1587 					   rrcopy,"raid_recon");
   1588 		return (0);
   1589 
   1590 		/* invoke a copyback operation after recon on whatever disk
   1591 		 * needs it, if any */
   1592 	case RAIDFRAME_COPYBACK:
   1593 
   1594 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1595 			/* This makes no sense on a RAID 0!! */
   1596 			return(EINVAL);
   1597 		}
   1598 
   1599 		if (raidPtr->copyback_in_progress == 1) {
   1600 			/* Copyback is already in progress! */
   1601 			return(EINVAL);
   1602 		}
   1603 
   1604 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1605 					   rf_CopybackThread,
   1606 					   raidPtr,"raid_copyback");
   1607 		return (retcode);
   1608 
   1609 		/* return the percentage completion of reconstruction */
   1610 	case RAIDFRAME_CHECK_RECON_STATUS:
   1611 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1612 			/* This makes no sense on a RAID 0, so tell the
   1613 			   user it's done. */
   1614 			*(int *) data = 100;
   1615 			return(0);
   1616 		}
   1617 		if (raidPtr->status != rf_rs_reconstructing)
   1618 			*(int *) data = 100;
   1619 		else {
   1620 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1621 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1622 			} else {
   1623 				*(int *) data = 0;
   1624 			}
   1625 		}
   1626 		return (0);
   1627 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1628 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1629 		if (raidPtr->status != rf_rs_reconstructing) {
   1630 			progressInfo.remaining = 0;
   1631 			progressInfo.completed = 100;
   1632 			progressInfo.total = 100;
   1633 		} else {
   1634 			progressInfo.total =
   1635 				raidPtr->reconControl->numRUsTotal;
   1636 			progressInfo.completed =
   1637 				raidPtr->reconControl->numRUsComplete;
   1638 			progressInfo.remaining = progressInfo.total -
   1639 				progressInfo.completed;
   1640 		}
   1641 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1642 				  sizeof(RF_ProgressInfo_t));
   1643 		return (retcode);
   1644 
   1645 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1646 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1647 			/* This makes no sense on a RAID 0, so tell the
   1648 			   user it's done. */
   1649 			*(int *) data = 100;
   1650 			return(0);
   1651 		}
   1652 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1653 			*(int *) data = 100 *
   1654 				raidPtr->parity_rewrite_stripes_done /
   1655 				raidPtr->Layout.numStripe;
   1656 		} else {
   1657 			*(int *) data = 100;
   1658 		}
   1659 		return (0);
   1660 
   1661 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1662 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1663 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1664 			progressInfo.total = raidPtr->Layout.numStripe;
   1665 			progressInfo.completed =
   1666 				raidPtr->parity_rewrite_stripes_done;
   1667 			progressInfo.remaining = progressInfo.total -
   1668 				progressInfo.completed;
   1669 		} else {
   1670 			progressInfo.remaining = 0;
   1671 			progressInfo.completed = 100;
   1672 			progressInfo.total = 100;
   1673 		}
   1674 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1675 				  sizeof(RF_ProgressInfo_t));
   1676 		return (retcode);
   1677 
   1678 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1679 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1680 			/* This makes no sense on a RAID 0 */
   1681 			*(int *) data = 100;
   1682 			return(0);
   1683 		}
   1684 		if (raidPtr->copyback_in_progress == 1) {
   1685 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1686 				raidPtr->Layout.numStripe;
   1687 		} else {
   1688 			*(int *) data = 100;
   1689 		}
   1690 		return (0);
   1691 
   1692 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1693 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1694 		if (raidPtr->copyback_in_progress == 1) {
   1695 			progressInfo.total = raidPtr->Layout.numStripe;
   1696 			progressInfo.completed =
   1697 				raidPtr->copyback_stripes_done;
   1698 			progressInfo.remaining = progressInfo.total -
   1699 				progressInfo.completed;
   1700 		} else {
   1701 			progressInfo.remaining = 0;
   1702 			progressInfo.completed = 100;
   1703 			progressInfo.total = 100;
   1704 		}
   1705 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1706 				  sizeof(RF_ProgressInfo_t));
   1707 		return (retcode);
   1708 
   1709 	case RAIDFRAME_SET_LAST_UNIT:
   1710 		for (column = 0; column < raidPtr->numCol; column++)
   1711 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1712 				return EBUSY;
   1713 
   1714 		for (column = 0; column < raidPtr->numCol; column++) {
   1715 			clabel = raidget_component_label(raidPtr, column);
   1716 			clabel->last_unit = *(int *)data;
   1717 			raidflush_component_label(raidPtr, column);
   1718 		}
   1719 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1720 		return 0;
   1721 
   1722 		/* the sparetable daemon calls this to wait for the kernel to
   1723 		 * need a spare table. this ioctl does not return until a
   1724 		 * spare table is needed. XXX -- calling mpsleep here in the
   1725 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1726 		 * -- I should either compute the spare table in the kernel,
   1727 		 * or have a different -- XXX XXX -- interface (a different
   1728 		 * character device) for delivering the table     -- XXX */
   1729 #if 0
   1730 	case RAIDFRAME_SPARET_WAIT:
   1731 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1732 		while (!rf_sparet_wait_queue)
   1733 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1734 		waitreq = rf_sparet_wait_queue;
   1735 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1736 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1737 
   1738 		/* structure assignment */
   1739 		*((RF_SparetWait_t *) data) = *waitreq;
   1740 
   1741 		RF_Free(waitreq, sizeof(*waitreq));
   1742 		return (0);
   1743 
   1744 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1745 		 * code in it that will cause the dameon to exit */
   1746 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1747 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1748 		waitreq->fcol = -1;
   1749 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1750 		waitreq->next = rf_sparet_wait_queue;
   1751 		rf_sparet_wait_queue = waitreq;
   1752 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1753 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1754 		return (0);
   1755 
   1756 		/* used by the spare table daemon to deliver a spare table
   1757 		 * into the kernel */
   1758 	case RAIDFRAME_SEND_SPARET:
   1759 
   1760 		/* install the spare table */
   1761 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1762 
   1763 		/* respond to the requestor.  the return status of the spare
   1764 		 * table installation is passed in the "fcol" field */
   1765 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1766 		waitreq->fcol = retcode;
   1767 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1768 		waitreq->next = rf_sparet_resp_queue;
   1769 		rf_sparet_resp_queue = waitreq;
   1770 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1771 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1772 
   1773 		return (retcode);
   1774 #endif
   1775 
   1776 	default:
   1777 		break; /* fall through to the os-specific code below */
   1778 
   1779 	}
   1780 
   1781 	if (!raidPtr->valid)
   1782 		return (EINVAL);
   1783 
   1784 	/*
   1785 	 * Add support for "regular" device ioctls here.
   1786 	 */
   1787 
   1788 	switch (cmd) {
   1789 	case DIOCGCACHE:
   1790 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1791 		break;
   1792 
   1793 	case DIOCCACHESYNC:
   1794 		retcode = rf_sync_component_caches(raidPtr);
   1795 		break;
   1796 
   1797 	default:
   1798 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1799 		break;
   1800 	}
   1801 
   1802 	return (retcode);
   1803 
   1804 }
   1805 
   1806 
   1807 /* raidinit -- complete the rest of the initialization for the
   1808    RAIDframe device.  */
   1809 
   1810 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device; the cfdata is freed on failure below,
	 * otherwise it is owned by autoconf from here on */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* have the wedge code scan the new unit for wedges */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1866 
   1867 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1868 /* wake up the daemon & tell it to get us a spare table
   1869  * XXX
   1870  * the entries in the queues should be tagged with the raidPtr
   1871  * so that in the extremely rare case that two recons happen at once,
   1872  * we know for which device were requesting a spare table
   1873  * XXX
   1874  *
   1875  * XXX This code is not currently used. GO
   1876  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* queue the request where the user-level daemon will find it,
	 * and wake the daemon up */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* the condvar wait drops the mutex while we sleep */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* dequeue the daemon's response; its status is carried in fcol */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1900 #endif
   1901 
   1902 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1903  * bp & passes it down.
   1904  * any calls originating in the kernel must use non-blocking I/O
   1905  * do some extra sanity checking to return "appropriate" error values for
   1906  * certain conditions (to make some standard utilities work)
   1907  *
   1908  * Formerly known as: rf_DoAccessKernel
   1909  */
   1910 void
   1911 raidstart(RF_Raid_t *raidPtr)
   1912 {
   1913 	struct raid_softc *rs;
   1914 	struct dk_softc *dksc;
   1915 
   1916 	rs = raidPtr->softc;
   1917 	dksc = &rs->sc_dksc;
   1918 	/* quick check to see if anything has died recently */
   1919 	rf_lock_mutex2(raidPtr->mutex);
   1920 	if (raidPtr->numNewFailures > 0) {
   1921 		rf_unlock_mutex2(raidPtr->mutex);
   1922 		rf_update_component_labels(raidPtr,
   1923 					   RF_NORMAL_COMPONENT_UPDATE);
   1924 		rf_lock_mutex2(raidPtr->mutex);
   1925 		raidPtr->numNewFailures--;
   1926 	}
   1927 	rf_unlock_mutex2(raidPtr->mutex);
   1928 
   1929 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1930 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1931 		return;
   1932 	}
   1933 
   1934 	dk_start(dksc, NULL);
   1935 }
   1936 
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* flow control: if there are no openings left, have the caller
	 * retry this request later */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* whole sectors in the request, plus one partial sector if the
	 * byte count isn't sector-aligned */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject requests past the end of the array; the "sum <" tests
	 * also catch arithmetic wraparound in the addition above */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject requests that aren't a multiple of the sector size */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening; it is returned when the access completes */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2009 
   2010 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2011 
   2012 int
   2013 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2014 {
   2015 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2016 	struct buf *bp;
   2017 
   2018 	req->queue = queue;
   2019 	bp = req->bp;
   2020 
   2021 	switch (req->type) {
   2022 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2023 		/* XXX need to do something extra here.. */
   2024 		/* I'm leaving this in, as I've never actually seen it used,
   2025 		 * and I'd like folks to report it... GO */
   2026 		printf(("WAKEUP CALLED\n"));
   2027 		queue->numOutstanding++;
   2028 
   2029 		bp->b_flags = 0;
   2030 		bp->b_private = req;
   2031 
   2032 		KernelWakeupFunc(bp);
   2033 		break;
   2034 
   2035 	case RF_IO_TYPE_READ:
   2036 	case RF_IO_TYPE_WRITE:
   2037 #if RF_ACC_TRACE > 0
   2038 		if (req->tracerec) {
   2039 			RF_ETIMER_START(req->tracerec->timer);
   2040 		}
   2041 #endif
   2042 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2043 		    op, queue->rf_cinfo->ci_dev,
   2044 		    req->sectorOffset, req->numSector,
   2045 		    req->buf, KernelWakeupFunc, (void *) req,
   2046 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2047 
   2048 		if (rf_debugKernelAccess) {
   2049 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2050 				(long) bp->b_blkno));
   2051 		}
   2052 		queue->numOutstanding++;
   2053 		queue->last_deq_sector = req->sectorOffset;
   2054 		/* acc wouldn't have been let in if there were any pending
   2055 		 * reqs at any other priority */
   2056 		queue->curPriority = req->priority;
   2057 
   2058 		db1_printf(("Going for %c to unit %d col %d\n",
   2059 			    req->type, queue->raidPtr->raidid,
   2060 			    queue->col));
   2061 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2062 			(int) req->sectorOffset, (int) req->numSector,
   2063 			(int) (req->numSector <<
   2064 			    queue->raidPtr->logBytesPerSector),
   2065 			(int) queue->raidPtr->logBytesPerSector));
   2066 
   2067 		/*
   2068 		 * XXX: drop lock here since this can block at
   2069 		 * least with backing SCSI devices.  Retake it
   2070 		 * to minimize fuss with calling interfaces.
   2071 		 */
   2072 
   2073 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2074 		bdev_strategy(bp);
   2075 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2076 		break;
   2077 
   2078 	default:
   2079 		panic("bad req->type in rf_DispatchKernelIO");
   2080 	}
   2081 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2082 
   2083 	return (0);
   2084 }
   2085 /* this is the callback function associated with a I/O invoked from
   2086    kernel code.
   2087  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP()/DispatchKernelIO */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the physical I/O time against this access's trace record */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is noticed by raidstart(), which
			 * triggers a component label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2154 
   2155 
   2156 /*
   2157  * initialize a buf structure for doing an I/O in the kernel.
   2158  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	/* transfer length in bytes */
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert the sector address into DEV_BSIZE-sized blocks */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;	/* invoked from biodone() on completion */
	bp->b_private = cbArg;
}
   2183 
   2184 /*
   2185  * Wait interruptibly for an exclusive lock.
   2186  *
   2187  * XXX
   2188  * Several drivers do this; it should be abstracted and made MP-safe.
   2189  * (Hmm... where have we seen this warning before :->  GO )
   2190  */
   2191 static int
   2192 raidlock(struct raid_softc *rs)
   2193 {
   2194 	int     error;
   2195 
   2196 	error = 0;
   2197 	mutex_enter(&rs->sc_mutex);
   2198 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2199 		rs->sc_flags |= RAIDF_WANTED;
   2200 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2201 		if (error != 0)
   2202 			goto done;
   2203 	}
   2204 	rs->sc_flags |= RAIDF_LOCKED;
   2205 done:
   2206 	mutex_exit(&rs->sc_mutex);
   2207 	return (error);
   2208 }
   2209 /*
   2210  * Unlock and wake up any waiters.
   2211  */
   2212 static void
   2213 raidunlock(struct raid_softc *rs)
   2214 {
   2215 
   2216 	mutex_enter(&rs->sc_mutex);
   2217 	rs->sc_flags &= ~RAIDF_LOCKED;
   2218 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2219 		rs->sc_flags &= ~RAIDF_WANTED;
   2220 		cv_broadcast(&rs->sc_cv);
   2221 	}
   2222 	mutex_exit(&rs->sc_mutex);
   2223 }
   2224 
   2225 
   2226 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2227 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2228 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2229 
   2230 static daddr_t
   2231 rf_component_info_offset(void)
   2232 {
   2233 
   2234 	return RF_COMPONENT_INFO_OFFSET;
   2235 }
   2236 
   2237 static daddr_t
   2238 rf_component_info_size(unsigned secsize)
   2239 {
   2240 	daddr_t info_size;
   2241 
   2242 	KASSERT(secsize);
   2243 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2244 		info_size = secsize;
   2245 	else
   2246 		info_size = RF_COMPONENT_INFO_SIZE;
   2247 
   2248 	return info_size;
   2249 }
   2250 
   2251 static daddr_t
   2252 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2253 {
   2254 	daddr_t map_offset;
   2255 
   2256 	KASSERT(raidPtr->bytesPerSector);
   2257 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2258 		map_offset = raidPtr->bytesPerSector;
   2259 	else
   2260 		map_offset = RF_COMPONENT_INFO_SIZE;
   2261 	map_offset += rf_component_info_offset();
   2262 
   2263 	return map_offset;
   2264 }
   2265 
   2266 static daddr_t
   2267 rf_parity_map_size(RF_Raid_t *raidPtr)
   2268 {
   2269 	daddr_t map_size;
   2270 
   2271 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2272 		map_size = raidPtr->bytesPerSector;
   2273 	else
   2274 		map_size = RF_PARITY_MAP_SIZE;
   2275 
   2276 	return map_size;
   2277 }
   2278 
   2279 int
   2280 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2281 {
   2282 	RF_ComponentLabel_t *clabel;
   2283 
   2284 	clabel = raidget_component_label(raidPtr, col);
   2285 	clabel->clean = RF_RAID_CLEAN;
   2286 	raidflush_component_label(raidPtr, col);
   2287 	return(0);
   2288 }
   2289 
   2290 
   2291 int
   2292 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2293 {
   2294 	RF_ComponentLabel_t *clabel;
   2295 
   2296 	clabel = raidget_component_label(raidPtr, col);
   2297 	clabel->clean = RF_RAID_DIRTY;
   2298 	raidflush_component_label(raidPtr, col);
   2299 	return(0);
   2300 }
   2301 
   2302 int
   2303 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2304 {
   2305 	KASSERT(raidPtr->bytesPerSector);
   2306 	return raidread_component_label(raidPtr->bytesPerSector,
   2307 	    raidPtr->Disks[col].dev,
   2308 	    raidPtr->raid_cinfo[col].ci_vp,
   2309 	    &raidPtr->raid_cinfo[col].ci_label);
   2310 }
   2311 
   2312 RF_ComponentLabel_t *
   2313 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2314 {
   2315 	return &raidPtr->raid_cinfo[col].ci_label;
   2316 }
   2317 
   2318 int
   2319 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2320 {
   2321 	RF_ComponentLabel_t *label;
   2322 
   2323 	label = &raidPtr->raid_cinfo[col].ci_label;
   2324 	label->mod_counter = raidPtr->mod_counter;
   2325 #ifndef RF_NO_PARITY_MAP
   2326 	label->parity_map_modcount = label->mod_counter;
   2327 #endif
   2328 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2329 	    raidPtr->Disks[col].dev,
   2330 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2331 }
   2332 
   2333 
   2334 static int
   2335 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2336     RF_ComponentLabel_t *clabel)
   2337 {
   2338 	return raidread_component_area(dev, b_vp, clabel,
   2339 	    sizeof(RF_ComponentLabel_t),
   2340 	    rf_component_info_offset(),
   2341 	    rf_component_info_size(secsize));
   2342 }
   2343 
   2344 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read synchronously and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	/* only the first msize bytes of the dsize-byte area are
	 * meaningful to the caller */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2381 
   2382 
   2383 static int
   2384 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2385     RF_ComponentLabel_t *clabel)
   2386 {
   2387 	return raidwrite_component_area(dev, b_vp, clabel,
   2388 	    sizeof(RF_ComponentLabel_t),
   2389 	    rf_component_info_offset(),
   2390 	    rf_component_info_size(secsize), 0);
   2391 }
   2392 
   2393 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad the area beyond the msize bytes of payload */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): async writes return immediately; the buffer
		 * is presumably released via the iodone path -- verify */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2427 
   2428 void
   2429 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2430 {
   2431 	int c;
   2432 
   2433 	for (c = 0; c < raidPtr->numCol; c++) {
   2434 		/* Skip dead disks. */
   2435 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2436 			continue;
   2437 		/* XXXjld: what if an error occurs here? */
   2438 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2439 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2440 		    RF_PARITYMAP_NBYTE,
   2441 		    rf_parity_map_offset(raidPtr),
   2442 		    rf_parity_map_size(raidPtr), 0);
   2443 	}
   2444 }
   2445 
   2446 void
   2447 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2448 {
   2449 	struct rf_paritymap_ondisk tmp;
   2450 	int c,first;
   2451 
   2452 	first=1;
   2453 	for (c = 0; c < raidPtr->numCol; c++) {
   2454 		/* Skip dead disks. */
   2455 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2456 			continue;
   2457 		raidread_component_area(raidPtr->Disks[c].dev,
   2458 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2459 		    RF_PARITYMAP_NBYTE,
   2460 		    rf_parity_map_offset(raidPtr),
   2461 		    rf_parity_map_size(raidPtr));
   2462 		if (first) {
   2463 			memcpy(map, &tmp, sizeof(*map));
   2464 			first = 0;
   2465 		} else {
   2466 			rf_paritymap_merge(map, &tmp);
   2467 		}
   2468 	}
   2469 }
   2470 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* bump the mod counter so these labels supersede older copies */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now handle in-use spares: they carry the label of the column
	 * they substitute for */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2530 
   2531 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* bump the mod counter so these labels supersede older copies */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* at final (shutdown) update, also set the clean
			 * bit if parity is known good */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2609 
   2610 void
   2611 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2612 {
   2613 
   2614 	if (vp != NULL) {
   2615 		if (auto_configured == 1) {
   2616 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2617 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2618 			vput(vp);
   2619 
   2620 		} else {
   2621 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2622 		}
   2623 	}
   2624 }
   2625 
   2626 
   2627 void
   2628 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2629 {
   2630 	int r,c;
   2631 	struct vnode *vp;
   2632 	int acd;
   2633 
   2634 
   2635 	/* We take this opportunity to close the vnodes like we should.. */
   2636 
   2637 	for (c = 0; c < raidPtr->numCol; c++) {
   2638 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2639 		acd = raidPtr->Disks[c].auto_configured;
   2640 		rf_close_component(raidPtr, vp, acd);
   2641 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2642 		raidPtr->Disks[c].auto_configured = 0;
   2643 	}
   2644 
   2645 	for (r = 0; r < raidPtr->numSpare; r++) {
   2646 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2647 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2648 		rf_close_component(raidPtr, vp, acd);
   2649 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2650 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2651 	}
   2652 }
   2653 
   2654 
   2655 void
   2656 rf_ReconThread(struct rf_recon_req *req)
   2657 {
   2658 	int     s;
   2659 	RF_Raid_t *raidPtr;
   2660 
   2661 	s = splbio();
   2662 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2663 	raidPtr->recon_in_progress = 1;
   2664 
   2665 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2666 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2667 
   2668 	RF_Free(req, sizeof(*req));
   2669 
   2670 	raidPtr->recon_in_progress = 0;
   2671 	splx(s);
   2672 
   2673 	/* That's all... */
   2674 	kthread_exit(0);	/* does not return */
   2675 }
   2676 
   2677 void
   2678 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2679 {
   2680 	int retcode;
   2681 	int s;
   2682 
   2683 	raidPtr->parity_rewrite_stripes_done = 0;
   2684 	raidPtr->parity_rewrite_in_progress = 1;
   2685 	s = splbio();
   2686 	retcode = rf_RewriteParity(raidPtr);
   2687 	splx(s);
   2688 	if (retcode) {
   2689 		printf("raid%d: Error re-writing parity (%d)!\n",
   2690 		    raidPtr->raidid, retcode);
   2691 	} else {
   2692 		/* set the clean bit!  If we shutdown correctly,
   2693 		   the clean bit on each component label will get
   2694 		   set */
   2695 		raidPtr->parity_good = RF_RAID_CLEAN;
   2696 	}
   2697 	raidPtr->parity_rewrite_in_progress = 0;
   2698 
   2699 	/* Anyone waiting for us to stop?  If so, inform them... */
   2700 	if (raidPtr->waitShutdown) {
   2701 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2702 	}
   2703 
   2704 	/* That's all... */
   2705 	kthread_exit(0);	/* does not return */
   2706 }
   2707 
   2708 
   2709 void
   2710 rf_CopybackThread(RF_Raid_t *raidPtr)
   2711 {
   2712 	int s;
   2713 
   2714 	raidPtr->copyback_in_progress = 1;
   2715 	s = splbio();
   2716 	rf_CopybackReconstructedData(raidPtr);
   2717 	splx(s);
   2718 	raidPtr->copyback_in_progress = 0;
   2719 
   2720 	/* That's all... */
   2721 	kthread_exit(0);	/* does not return */
   2722 }
   2723 
   2724 
   2725 void
   2726 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2727 {
   2728 	int s;
   2729 	RF_Raid_t *raidPtr;
   2730 
   2731 	s = splbio();
   2732 	raidPtr = req->raidPtr;
   2733 	raidPtr->recon_in_progress = 1;
   2734 	rf_ReconstructInPlace(raidPtr, req->col);
   2735 	RF_Free(req, sizeof(*req));
   2736 	raidPtr->recon_in_progress = 0;
   2737 	splx(s);
   2738 
   2739 	/* That's all... */
   2740 	kthread_exit(0);	/* does not return */
   2741 }
   2742 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down the entire list built
		     * so far and give up on auto-configuration */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* prepend the new entry; ownership of vp and clabel
			 * passes to the list */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a RAID component, so release the label
		 * memory and the vnode we were handed */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2800 
/*
 * Scan every disk-class device on the system for RAIDframe component
 * labels and return a list of all components found (NULL if none).
 *
 * The scan is made in two passes: wedges (dk) first, then everything
 * else, so that a wedge covering a whole disk is found before the raw
 * partition of the same disk.  For non-wedge disks, each disklabel
 * partition of type FS_RAID is examined; if none is found, the raw
 * partition itself is checked, since components may be configured on
 * bare disk devices.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /* no RAID partition found on this disk yet */

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept only RAID-type wedges */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the vnode */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /* there is a RAID component on this disk */
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /* no RAID partitions found yet */
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /* at least one RAID partition on this disk */
			}

			/*
			 * If there is no raid component on this disk, either in a
			 * disklabel or inside a wedge, check the raw partition as well,
			 * as it is possible to configure raid components on raw disk
			 * devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3004 
   3005 
   3006 int
   3007 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3008 {
   3009 
   3010 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3011 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3012 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3013 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3014 	    clabel->row >=0 &&
   3015 	    clabel->column >= 0 &&
   3016 	    clabel->num_rows > 0 &&
   3017 	    clabel->num_columns > 0 &&
   3018 	    clabel->row < clabel->num_rows &&
   3019 	    clabel->column < clabel->num_columns &&
   3020 	    clabel->blockSize > 0 &&
   3021 	    /*
   3022 	     * numBlocksHi may contain garbage, but it is ok since
   3023 	     * the type is unsigned.  If it is really garbage,
   3024 	     * rf_fix_old_label_size() will fix it.
   3025 	     */
   3026 	    rf_component_label_numblocks(clabel) > 0) {
   3027 		/*
   3028 		 * label looks reasonable enough...
   3029 		 * let's make sure it has no old garbage.
   3030 		 */
   3031 		if (numsecs)
   3032 			rf_fix_old_label_size(clabel, numsecs);
   3033 		return(1);
   3034 	}
   3035 	return(0);
   3036 }
   3037 
   3038 
   3039 /*
   3040  * For reasons yet unknown, some old component labels have garbage in
   3041  * the newer numBlocksHi region, and this causes lossage.  Since those
   3042  * disks will also have numsecs set to less than 32 bits of sectors,
   3043  * we can determine when this corruption has occurred, and fix it.
   3044  *
   3045  * The exact same problem, with the same unknown reason, happens to
   3046  * the partitionSizeHi member as well.
   3047  */
   3048 static void
   3049 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3050 {
   3051 
   3052 	if (numsecs < ((uint64_t)1 << 32)) {
   3053 		if (clabel->numBlocksHi) {
   3054 			printf("WARNING: total sectors < 32 bits, yet "
   3055 			       "numBlocksHi set\n"
   3056 			       "WARNING: resetting numBlocksHi to zero.\n");
   3057 			clabel->numBlocksHi = 0;
   3058 		}
   3059 
   3060 		if (clabel->partitionSizeHi) {
   3061 			printf("WARNING: total sectors < 32 bits, yet "
   3062 			       "partitionSizeHi set\n"
   3063 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3064 			clabel->partitionSizeHi = 0;
   3065 		}
   3066 	}
   3067 }
   3068 
   3069 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* decode of root_partition: 0=no, 1=force, 2=soft */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so an out-of-range value prints "*invalid*" */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3103 
   3104 RF_ConfigSet_t *
   3105 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3106 {
   3107 	RF_AutoConfig_t *ac;
   3108 	RF_ConfigSet_t *config_sets;
   3109 	RF_ConfigSet_t *cset;
   3110 	RF_AutoConfig_t *ac_next;
   3111 
   3112 
   3113 	config_sets = NULL;
   3114 
   3115 	/* Go through the AutoConfig list, and figure out which components
   3116 	   belong to what sets.  */
   3117 	ac = ac_list;
   3118 	while(ac!=NULL) {
   3119 		/* we're going to putz with ac->next, so save it here
   3120 		   for use at the end of the loop */
   3121 		ac_next = ac->next;
   3122 
   3123 		if (config_sets == NULL) {
   3124 			/* will need at least this one... */
   3125 			config_sets = (RF_ConfigSet_t *)
   3126 				malloc(sizeof(RF_ConfigSet_t),
   3127 				       M_RAIDFRAME, M_NOWAIT);
   3128 			if (config_sets == NULL) {
   3129 				panic("rf_create_auto_sets: No memory!");
   3130 			}
   3131 			/* this one is easy :) */
   3132 			config_sets->ac = ac;
   3133 			config_sets->next = NULL;
   3134 			config_sets->rootable = 0;
   3135 			ac->next = NULL;
   3136 		} else {
   3137 			/* which set does this component fit into? */
   3138 			cset = config_sets;
   3139 			while(cset!=NULL) {
   3140 				if (rf_does_it_fit(cset, ac)) {
   3141 					/* looks like it matches... */
   3142 					ac->next = cset->ac;
   3143 					cset->ac = ac;
   3144 					break;
   3145 				}
   3146 				cset = cset->next;
   3147 			}
   3148 			if (cset==NULL) {
   3149 				/* didn't find a match above... new set..*/
   3150 				cset = (RF_ConfigSet_t *)
   3151 					malloc(sizeof(RF_ConfigSet_t),
   3152 					       M_RAIDFRAME, M_NOWAIT);
   3153 				if (cset == NULL) {
   3154 					panic("rf_create_auto_sets: No memory!");
   3155 				}
   3156 				cset->ac = ac;
   3157 				ac->next = NULL;
   3158 				cset->next = config_sets;
   3159 				cset->rootable = 0;
   3160 				config_sets = cset;
   3161 			}
   3162 		}
   3163 		ac = ac_next;
   3164 	}
   3165 
   3166 
   3167 	return(config_sets);
   3168 }
   3169 
   3170 static int
   3171 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3172 {
   3173 	RF_ComponentLabel_t *clabel1, *clabel2;
   3174 
   3175 	/* If this one matches the *first* one in the set, that's good
   3176 	   enough, since the other members of the set would have been
   3177 	   through here too... */
   3178 	/* note that we are not checking partitionSize here..
   3179 
   3180 	   Note that we are also not checking the mod_counters here.
   3181 	   If everything else matches except the mod_counter, that's
   3182 	   good enough for this test.  We will deal with the mod_counters
   3183 	   a little later in the autoconfiguration process.
   3184 
   3185 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3186 
   3187 	   The reason we don't check for this is that failed disks
   3188 	   will have lower modification counts.  If those disks are
   3189 	   not added to the set they used to belong to, then they will
   3190 	   form their own set, which may result in 2 different sets,
   3191 	   for example, competing to be configured at raid0, and
   3192 	   perhaps competing to be the root filesystem set.  If the
   3193 	   wrong ones get configured, or both attempt to become /,
   3194 	   weird behaviour and or serious lossage will occur.  Thus we
   3195 	   need to bring them into the fold here, and kick them out at
   3196 	   a later point.
   3197 
   3198 	*/
   3199 
   3200 	clabel1 = cset->ac->clabel;
   3201 	clabel2 = ac->clabel;
   3202 	if ((clabel1->version == clabel2->version) &&
   3203 	    (clabel1->serial_number == clabel2->serial_number) &&
   3204 	    (clabel1->num_rows == clabel2->num_rows) &&
   3205 	    (clabel1->num_columns == clabel2->num_columns) &&
   3206 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3207 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3208 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3209 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3210 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3211 	    (clabel1->blockSize == clabel2->blockSize) &&
   3212 	    rf_component_label_numblocks(clabel1) ==
   3213 	    rf_component_label_numblocks(clabel2) &&
   3214 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3215 	    (clabel1->root_partition == clabel2->root_partition) &&
   3216 	    (clabel1->last_unit == clabel2->last_unit) &&
   3217 	    (clabel1->config_order == clabel2->config_order)) {
   3218 		/* if it get's here, it almost *has* to be a match */
   3219 	} else {
   3220 		/* it's not consistent with somebody in the set..
   3221 		   punt */
   3222 		return(0);
   3223 	}
   3224 	/* all was fine.. it must fit... */
   3225 	return(1);
   3226 }
   3227 
/*
 * Decide whether the config set has enough live components for the
 * array to be configured.  Returns 1 if so, 0 if too many components
 * are missing or stale.
 *
 * A component counts as "live" only if its mod_counter equals the
 * highest mod_counter in the set (older counters indicate a disk that
 * failed out of the array at some point).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component that is both present and
	   up-to-date (mod_counter matches the set maximum). */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				/* RAID 1 mirrors components pairwise
				   (even column with the following odd
				   column); only losing *both* halves of
				   a pair is fatal. */
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an odd component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   exactly one.  (RAID 1 was fully handled above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3330 
   3331 void
   3332 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3333 			RF_Raid_t *raidPtr)
   3334 {
   3335 	RF_ComponentLabel_t *clabel;
   3336 	int i;
   3337 
   3338 	clabel = ac->clabel;
   3339 
   3340 	/* 1. Fill in the common stuff */
   3341 	config->numRow = clabel->num_rows = 1;
   3342 	config->numCol = clabel->num_columns;
   3343 	config->numSpare = 0; /* XXX should this be set here? */
   3344 	config->sectPerSU = clabel->sectPerSU;
   3345 	config->SUsPerPU = clabel->SUsPerPU;
   3346 	config->SUsPerRU = clabel->SUsPerRU;
   3347 	config->parityConfig = clabel->parityConfig;
   3348 	/* XXX... */
   3349 	strcpy(config->diskQueueType,"fifo");
   3350 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3351 	config->layoutSpecificSize = 0; /* XXX ?? */
   3352 
   3353 	while(ac!=NULL) {
   3354 		/* row/col values will be in range due to the checks
   3355 		   in reasonable_label() */
   3356 		strcpy(config->devnames[0][ac->clabel->column],
   3357 		       ac->devname);
   3358 		ac = ac->next;
   3359 	}
   3360 
   3361 	for(i=0;i<RF_MAXDBGV;i++) {
   3362 		config->debugVars[i][0] = 0;
   3363 	}
   3364 }
   3365 
   3366 int
   3367 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3368 {
   3369 	RF_ComponentLabel_t *clabel;
   3370 	int column;
   3371 	int sparecol;
   3372 
   3373 	raidPtr->autoconfigure = new_value;
   3374 
   3375 	for(column=0; column<raidPtr->numCol; column++) {
   3376 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3377 			clabel = raidget_component_label(raidPtr, column);
   3378 			clabel->autoconfigure = new_value;
   3379 			raidflush_component_label(raidPtr, column);
   3380 		}
   3381 	}
   3382 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3383 		sparecol = raidPtr->numCol + column;
   3384 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3385 			clabel = raidget_component_label(raidPtr, sparecol);
   3386 			clabel->autoconfigure = new_value;
   3387 			raidflush_component_label(raidPtr, sparecol);
   3388 		}
   3389 	}
   3390 	return(new_value);
   3391 }
   3392 
   3393 int
   3394 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3395 {
   3396 	RF_ComponentLabel_t *clabel;
   3397 	int column;
   3398 	int sparecol;
   3399 
   3400 	raidPtr->root_partition = new_value;
   3401 	for(column=0; column<raidPtr->numCol; column++) {
   3402 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3403 			clabel = raidget_component_label(raidPtr, column);
   3404 			clabel->root_partition = new_value;
   3405 			raidflush_component_label(raidPtr, column);
   3406 		}
   3407 	}
   3408 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3409 		sparecol = raidPtr->numCol + column;
   3410 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3411 			clabel = raidget_component_label(raidPtr, sparecol);
   3412 			clabel->root_partition = new_value;
   3413 			raidflush_component_label(raidPtr, sparecol);
   3414 		}
   3415 	}
   3416 	return(new_value);
   3417 }
   3418 
   3419 void
   3420 rf_release_all_vps(RF_ConfigSet_t *cset)
   3421 {
   3422 	RF_AutoConfig_t *ac;
   3423 
   3424 	ac = cset->ac;
   3425 	while(ac!=NULL) {
   3426 		/* Close the vp, and give it back */
   3427 		if (ac->vp) {
   3428 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3429 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3430 			vput(ac->vp);
   3431 			ac->vp = NULL;
   3432 		}
   3433 		ac = ac->next;
   3434 	}
   3435 }
   3436 
   3437 
   3438 void
   3439 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3440 {
   3441 	RF_AutoConfig_t *ac;
   3442 	RF_AutoConfig_t *next_ac;
   3443 
   3444 	ac = cset->ac;
   3445 	while(ac!=NULL) {
   3446 		next_ac = ac->next;
   3447 		/* nuke the label */
   3448 		free(ac->clabel, M_RAIDFRAME);
   3449 		/* cleanup the config structure */
   3450 		free(ac, M_RAIDFRAME);
   3451 		/* "next.." */
   3452 		ac = next_ac;
   3453 	}
   3454 	/* and, finally, nuke the config set */
   3455 	free(cset, M_RAIDFRAME);
   3456 }
   3457 
   3458 
/*
 * Initialize a component label from the current state of the array.
 * The caller fills in the per-component fields (row/column/etc.); this
 * sets everything that is common to all components of the set.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	/* let the parity map code fill in its portion of the label */
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3491 
/*
 * Autoconfigure one config set: pick a raid unit (preferring the unit
 * the set was last configured as, falling back to the next free one),
 * build an RF_Config_t from the component labels, and configure the
 * array.  Returns the attached softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until a free (or new) unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give back the unit */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3575 
/*
 * Initialize a RAIDframe item pool at IPL_BIO: pre-allocate xmin items
 * (panicking on failure, since the pool is required for operation) and
 * cap the pool at xmax items.  w_chan is the wait-channel name.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3588 
   3589 /*
   3590  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3591  * to see if there is IO pending and if that IO could possibly be done
   3592  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3593  * otherwise.
   3594  *
   3595  */
   3596 int
   3597 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3598 {
   3599 	struct raid_softc *rs;
   3600 	struct dk_softc *dksc;
   3601 
   3602 	rs = raidPtr->softc;
   3603 	dksc = &rs->sc_dksc;
   3604 
   3605 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3606 		return 1;
   3607 
   3608 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3609 		/* there is work to do */
   3610 		return 0;
   3611 	}
   3612 	/* default is nothing to do */
   3613 	return 1;
   3614 }
   3615 
   3616 int
   3617 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3618 {
   3619 	uint64_t numsecs;
   3620 	unsigned secsize;
   3621 	int error;
   3622 
   3623 	error = getdisksize(vp, &numsecs, &secsize);
   3624 	if (error == 0) {
   3625 		diskPtr->blockSize = secsize;
   3626 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3627 		diskPtr->partitionSize = numsecs;
   3628 		return 0;
   3629 	}
   3630 	return error;
   3631 }
   3632 
/*
 * Autoconf match function: the raid pseudo-device always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3638 
/*
 * Autoconf attach function: nothing to do at attach time; the real
 * setup happens when a RAID set is configured on the unit.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3643 
   3644 
/*
 * Autoconf detach function: tear down the RAID set on this unit and
 * free its softc.  Returns 0 on success, ENXIO if there is no softc,
 * or the error from locking/detaching.  `flags` is currently unused.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	/* serialize with other operations on this unit */
	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3671 
/*
 * Construct a synthetic disk geometry for the RAID pseudo-disk from
 * the array parameters and push it to the disk(9) layer.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): the factor 4 appears to be an arbitrary choice
	   for the fabricated geometry -- confirm before changing */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3687 
   3688 /*
   3689  * Get cache info for all the components (including spares).
   3690  * Returns intersection of all the cache flags of all disks, or first
   3691  * error if any encountered.
   3692  * XXXfua feature flags can change as spares are added - lock down somehow
   3693  */
   3694 static int
   3695 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3696 {
   3697 	int c;
   3698 	int error;
   3699 	int dkwhole = 0, dkpart;
   3700 
   3701 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3702 		/*
   3703 		 * Check any non-dead disk, even when currently being
   3704 		 * reconstructed.
   3705 		 */
   3706 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3707 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3708 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3709 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3710 			if (error) {
   3711 				if (error != ENODEV) {
   3712 					printf("raid%d: get cache for component %s failed\n",
   3713 					    raidPtr->raidid,
   3714 					    raidPtr->Disks[c].devname);
   3715 				}
   3716 
   3717 				return error;
   3718 			}
   3719 
   3720 			if (c == 0)
   3721 				dkwhole = dkpart;
   3722 			else
   3723 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3724 		}
   3725 	}
   3726 
   3727 	*data = dkwhole;
   3728 
   3729 	return 0;
   3730 }
   3731 
   3732 /*
   3733  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3734  * We end up returning whatever error was returned by the first cache flush
   3735  * that fails.
   3736  */
   3737 
/*
 * Forward DIOCCACHESYNC to every optimal component and every
 * in-service spare.  All components are flushed even if one fails;
 * the first error encountered is the one returned (0 on success).
 */
int
rf_sync_component_caches(RF_Raid_t *raidPtr)
{
	int c, sparecol;
	int e,error;
	int force = 1;

	error = 0;
	/* flush the data columns */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
					  &force, FWRITE, NOCRED);
			if (e) {
				if (e != ENODEV)
					printf("raid%d: cache flush to component %s failed.\n",
					       raidPtr->raidid, raidPtr->Disks[c].devname);
				/* remember only the first failure */
				if (error == 0) {
					error = e;
				}
			}
		}
	}

	/* flush the spares that are actually in use */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
			if (e) {
				if (e != ENODEV)
					printf("raid%d: cache flush to component %s failed.\n",
					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
				if (error == 0) {
					error = e;
				}
			}
		}
	}
	return error;
}
   3779 
   3780 static void
   3781 raidminphys(struct buf *bp)
   3782 {
   3783 	dev_t dev;
   3784 	int unit;
   3785 	struct raid_softc *rs;
   3786 	RF_Raid_t *raidPtr;
   3787 	long xmax;
   3788 
   3789 	dev = bp->b_dev;
   3790 	unit = raidunit(dev);
   3791 	rs = raidget(unit, false);
   3792 	raidPtr = &(rs->sc_r);
   3793 
   3794 	xmax = raidPtr->Layout.numDataCol * MAXPHYS;
   3795 
   3796 	if (bp->b_bcount > xmax) {
   3797 		bp->b_bcount = xmax;
   3798 	}
   3799 }
   3800 
   3801 /*
   3802  * Module interface
   3803  */
   3804 
/* Declare the raid driver module; requires the dk(4) common subroutines. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* When built as a loadable module, supply the autoconf cfdriver too. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3814 
   3815 static int
   3816 raid_modcmd(modcmd_t cmd, void *data)
   3817 {
   3818 	int error;
   3819 
   3820 	error = 0;
   3821 	switch (cmd) {
   3822 	case MODULE_CMD_INIT:
   3823 		error = raid_modcmd_init();
   3824 		break;
   3825 	case MODULE_CMD_FINI:
   3826 		error = raid_modcmd_fini();
   3827 		break;
   3828 	default:
   3829 		error = ENOTTY;
   3830 		break;
   3831 	}
   3832 	return error;
   3833 }
   3834 
/*
 * Module initialization: set up global locks, attach the device switch
 * and autoconf glue, boot the RAIDframe core, and register the
 * autoconfiguration finalizer.  Each failure path rolls back whatever
 * was attached before it, in reverse order.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	/* Global lock protecting raid unit list and module state. */
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table wait/response queues. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate major numbers dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: devsw may already be attached (builtin). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attachment. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back in reverse order of attachment. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is always 0 here: every failure path above returned. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: autoconfiguration simply will not run. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3905 
/*
 * Module teardown: refuse to unload while any raid unit exists, then
 * detach autoconf glue and the device switch, undo the RAIDframe boot,
 * and destroy module-global synchronization state.  On a mid-sequence
 * failure, the already-detached pieces are re-attached so the module
 * is left in a consistent, loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Re-attach what we already detached and bail. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach everything detached so far, in order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Undo the rf_BootRaidframe(true) done at init time. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3955