Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.326
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.326 2015/12/08 20:36:15 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.326 2015/12/08 20:36:15 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #include "ioconf.h"
    155 
    156 #ifdef DEBUG
    157 int     rf_kdebug_level = 0;
    158 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    159 #else				/* DEBUG */
    160 #define db1_printf(a) { }
    161 #endif				/* DEBUG */
    162 
    163 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    164 static rf_declare_mutex2(rf_sparet_wait_mutex);
    165 static rf_declare_cond2(rf_sparet_wait_cv);
    166 static rf_declare_cond2(rf_sparet_resp_cv);
    167 
    168 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    169 						 * spare table */
    170 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    171 						 * installation process */
    172 #endif
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int, struct proc *);
    181 struct raid_softc;
    182 static void raidinit(struct raid_softc *);
    183 
    184 static int raid_match(device_t, cfdata_t, void *);
    185 static void raid_attach(device_t, device_t, void *);
    186 static int raid_detach(device_t, int);
    187 
    188 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    189     daddr_t, daddr_t);
    190 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    191     daddr_t, daddr_t, int);
    192 
    193 static int raidwrite_component_label(unsigned,
    194     dev_t, struct vnode *, RF_ComponentLabel_t *);
    195 static int raidread_component_label(unsigned,
    196     dev_t, struct vnode *, RF_ComponentLabel_t *);
    197 
    198 
    199 static dev_type_open(raidopen);
    200 static dev_type_close(raidclose);
    201 static dev_type_read(raidread);
    202 static dev_type_write(raidwrite);
    203 static dev_type_ioctl(raidioctl);
    204 static dev_type_strategy(raidstrategy);
    205 static dev_type_dump(raiddump);
    206 static dev_type_size(raidsize);
    207 
/* Block-device switch entries for /dev/raid* block nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    218 
/* Character-device switch entries for the raw /dev/rraid* nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    233 
/* Glue for the generic disk(9) framework (disklabel I/O etc.). */
static struct dkdriver rf_dkdriver = {
	.d_strategy = raidstrategy,
	.d_minphys = minphys
};
    238 
/* Per-unit software state for one RAIDframe device. */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* unit (minor) number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global `raids' list */
};
/* sc_flags (NB: 0x10 and 0x20 are currently unassigned) */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */
    258 
    259 #define	raidunit(x)	DISKUNIT(x)
    260 
    261 extern struct cfdriver raid_cd;
    262 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    263     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    264     DVF_DETACH_SHUTDOWN);
    265 
    266 /*
    267  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    268  * Be aware that large numbers can allow the driver to consume a lot of
    269  * kernel memory, especially on writes, and in degraded mode reads.
    270  *
    271  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    272  * a single 64K write will typically require 64K for the old data,
    273  * 64K for the old parity, and 64K for the new parity, for a total
    274  * of 192K (if the parity buffer is not re-used immediately).
    275  * Even it if is used immediately, that's still 128K, which when multiplied
    276  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    277  *
    278  * Now in degraded mode, for example, a 64K read on the above setup may
    279  * require data reconstruction, which will require *all* of the 4 remaining
    280  * disks to participate -- 4 * 32K/disk == 128K again.
    281  */
    282 
    283 #ifndef RAIDOUTSTANDING
    284 #define RAIDOUTSTANDING   6
    285 #endif
    286 
    287 #define RAIDLABELDEV(dev)	\
    288 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    289 
    290 /* declared here, and made public, for the benefit of KVM stuff.. */
    291 
    292 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    293 				     struct disklabel *);
    294 static void raidgetdisklabel(dev_t);
    295 static void raidmakedisklabel(struct raid_softc *);
    296 
    297 static int raidlock(struct raid_softc *);
    298 static void raidunlock(struct raid_softc *);
    299 
    300 static int raid_detach_unlocked(struct raid_softc *);
    301 
    302 static void rf_markalldirty(RF_Raid_t *);
    303 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    304 
    305 void rf_ReconThread(struct rf_recon_req *);
    306 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    307 void rf_CopybackThread(RF_Raid_t *raidPtr);
    308 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    309 int rf_autoconfig(device_t);
    310 void rf_buildroothack(RF_ConfigSet_t *);
    311 
    312 RF_AutoConfig_t *rf_find_raid_components(void);
    313 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    314 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    315 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    316 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    317 int rf_set_autoconfig(RF_Raid_t *, int);
    318 int rf_set_rootpartition(RF_Raid_t *, int);
    319 void rf_release_all_vps(RF_ConfigSet_t *);
    320 void rf_cleanup_config_set(RF_ConfigSet_t *);
    321 int rf_have_enough_components(RF_ConfigSet_t *);
    322 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    323 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    324 
    325 /*
    326  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    327  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    328  * in the kernel config file.
    329  */
    330 #ifdef RAID_AUTOCONFIG
    331 int raidautoconfig = 1;
    332 #else
    333 int raidautoconfig = 0;
    334 #endif
    335 static bool raidautoconfigdone = false;
    336 
    337 struct RF_Pools_s rf_pools;
    338 
    339 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    340 static kmutex_t raid_lock;
    341 
    342 static struct raid_softc *
    343 raidcreate(int unit) {
    344 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    345 	if (sc == NULL) {
    346 #ifdef DIAGNOSTIC
    347 		printf("%s: out of memory\n", __func__);
    348 #endif
    349 		return NULL;
    350 	}
    351 	sc->sc_unit = unit;
    352 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    353 	return sc;
    354 }
    355 
/*
 * Free a softc created by raidcreate().  The buffer queue must be
 * released before the enclosing softc that holds its pointer.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    361 
    362 static struct raid_softc *
    363 raidget(int unit) {
    364 	struct raid_softc *sc;
    365 	if (unit < 0) {
    366 #ifdef DIAGNOSTIC
    367 		panic("%s: unit %d!", __func__, unit);
    368 #endif
    369 		return NULL;
    370 	}
    371 	mutex_enter(&raid_lock);
    372 	LIST_FOREACH(sc, &raids, sc_link) {
    373 		if (sc->sc_unit == unit) {
    374 			mutex_exit(&raid_lock);
    375 			return sc;
    376 		}
    377 	}
    378 	mutex_exit(&raid_lock);
    379 	if ((sc = raidcreate(unit)) == NULL)
    380 		return NULL;
    381 	mutex_enter(&raid_lock);
    382 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    383 	mutex_exit(&raid_lock);
    384 	return sc;
    385 }
    386 
/* Unlink `sc' from the global list and release its resources. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    394 
/*
 * Pseudo-device attach hook, called once at boot (`num' is unused).
 * Sets up the driver-global lock and RAIDframe core, attaches the
 * autoconf glue, and registers the finalizer that autoconfigures
 * RAID sets after all real hardware has been found.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    427 
    428 int
    429 rf_autoconfig(device_t self)
    430 {
    431 	RF_AutoConfig_t *ac_list;
    432 	RF_ConfigSet_t *config_sets;
    433 
    434 	if (!raidautoconfig || raidautoconfigdone == true)
    435 		return (0);
    436 
    437 	/* XXX This code can only be run once. */
    438 	raidautoconfigdone = true;
    439 
    440 #ifdef __HAVE_CPU_BOOTCONF
    441 	/*
    442 	 * 0. find the boot device if needed first so we can use it later
    443 	 * this needs to be done before we autoconfigure any raid sets,
    444 	 * because if we use wedges we are not going to be able to open
    445 	 * the boot device later
    446 	 */
    447 	if (booted_device == NULL)
    448 		cpu_bootconf();
    449 #endif
    450 	/* 1. locate all RAID components on the system */
    451 	aprint_debug("Searching for RAID components...\n");
    452 	ac_list = rf_find_raid_components();
    453 
    454 	/* 2. Sort them into their respective sets. */
    455 	config_sets = rf_create_auto_sets(ac_list);
    456 
    457 	/*
    458 	 * 3. Evaluate each set and configure the valid ones.
    459 	 * This gets done in rf_buildroothack().
    460 	 */
    461 	rf_buildroothack(config_sets);
    462 
    463 	return 1;
    464 }
    465 
/*
 * Return 1 if the boot device `bdv' is one of the components of the
 * RAID set `r', 0 otherwise.  Wedge components ("dkN") are matched
 * via their parent device's name instead of the wedge name.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* Skip the leading "/dev/" of the stored component path. */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		/*
		 * NOTE(review): this is a prefix comparison, so a boot
		 * device "sd1" would also match a component on "sd10" --
		 * confirm whether an exact-name match is intended.
		 */
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    489 
/*
 * Configure every eligible config set and then decide whether one of
 * the configured sets should become the root device.  Called from
 * rf_autoconfig() with the list produced by rf_create_auto_sets();
 * consumes (cleans up) every set on the list.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure each complete, autoconfigure-enabled set. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				/* Remember the last rootable set seen. */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* otherwise, if we found something bootable, consider making it
	   the root device below... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		/* Take the candidate if nothing else booted, if this set
		   insists on being root, or if it contains the boot
		   device. */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Pass 2: among the configured sets, count only those
		   marked as root AND containing the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    600 
    601 static int
    602 raidsize(dev_t dev)
    603 {
    604 	struct raid_softc *rs;
    605 	struct disklabel *lp;
    606 	int     part, unit, omask, size;
    607 
    608 	unit = raidunit(dev);
    609 	if ((rs = raidget(unit)) == NULL)
    610 		return -1;
    611 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    612 		return (-1);
    613 
    614 	part = DISKPART(dev);
    615 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    616 	lp = rs->sc_dkdev.dk_label;
    617 
    618 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    619 		return (-1);
    620 
    621 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    622 		size = -1;
    623 	else
    624 		size = lp->d_partitions[part].p_size *
    625 		    (lp->d_secsize / DEV_BSIZE);
    626 
    627 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    628 		return (-1);
    629 
    630 	return (size);
    631 
    632 }
    633 
/*
 * Crash-dump entry point (bdevsw d_dump).  Writes `size' bytes from
 * `va' at block `blkno' of the dump partition by forwarding the dump
 * to one live (or spared) component.  Only RAID 1 sets (1 data + 1
 * parity column) are supported.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* The dump must be a whole number of DEV_BSIZE blocks... */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* ...and must lie entirely within the RAID device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Find which column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
    770 
/* ARGSUSED */
/*
 * Open entry point (both block and character device).  Serialized by
 * the unit lock.  Refuses opens while the unit is shutting down or
 * while wedges exist (except RAW_PART), loads the disklabel on the
 * first open of an initialized set, and marks all components dirty
 * on that first open.
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured set: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_nwedges == 0) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
    856 
/* ARGSUSED */
/*
 * Close entry point.  Clears the open-mask bit for this partition
 * and, on the last close of an initialized set, writes final
 * component labels (marking the set clean).  Always returns 0.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
    906 
    907 static void
    908 raidstrategy(struct buf *bp)
    909 {
    910 	unsigned int unit = raidunit(bp->b_dev);
    911 	RF_Raid_t *raidPtr;
    912 	int     wlabel;
    913 	struct raid_softc *rs;
    914 
    915 	if ((rs = raidget(unit)) == NULL) {
    916 		bp->b_error = ENXIO;
    917 		goto done;
    918 	}
    919 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    920 		bp->b_error = ENXIO;
    921 		goto done;
    922 	}
    923 	raidPtr = &rs->sc_r;
    924 	if (!raidPtr->valid) {
    925 		bp->b_error = ENODEV;
    926 		goto done;
    927 	}
    928 	if (bp->b_bcount == 0) {
    929 		db1_printf(("b_bcount is zero..\n"));
    930 		goto done;
    931 	}
    932 
    933 	/*
    934 	 * Do bounds checking and adjust transfer.  If there's an
    935 	 * error, the bounds check will flag that for us.
    936 	 */
    937 
    938 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    939 	if (DISKPART(bp->b_dev) == RAW_PART) {
    940 		uint64_t size; /* device size in DEV_BSIZE unit */
    941 
    942 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    943 			size = raidPtr->totalSectors <<
    944 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    945 		} else {
    946 			size = raidPtr->totalSectors >>
    947 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    948 		}
    949 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    950 			goto done;
    951 		}
    952 	} else {
    953 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    954 			db1_printf(("Bounds check failed!!:%d %d\n",
    955 				(int) bp->b_blkno, (int) wlabel));
    956 			goto done;
    957 		}
    958 	}
    959 
    960 	rf_lock_mutex2(raidPtr->iodone_lock);
    961 
    962 	bp->b_resid = 0;
    963 
    964 	/* stuff it onto our queue */
    965 	bufq_put(rs->buf_queue, bp);
    966 
    967 	/* scheduled the IO to happen at the next convenient time */
    968 	rf_signal_cond2(raidPtr->iodone_cv);
    969 	rf_unlock_mutex2(raidPtr->iodone_lock);
    970 
    971 	return;
    972 
    973 done:
    974 	bp->b_resid = bp->b_bcount;
    975 	biodone(bp);
    976 }
    977 
    978 /* ARGSUSED */
    979 static int
    980 raidread(dev_t dev, struct uio *uio, int flags)
    981 {
    982 	int     unit = raidunit(dev);
    983 	struct raid_softc *rs;
    984 
    985 	if ((rs = raidget(unit)) == NULL)
    986 		return ENXIO;
    987 
    988 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    989 		return (ENXIO);
    990 
    991 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    992 
    993 }
    994 
    995 /* ARGSUSED */
    996 static int
    997 raidwrite(dev_t dev, struct uio *uio, int flags)
    998 {
    999 	int     unit = raidunit(dev);
   1000 	struct raid_softc *rs;
   1001 
   1002 	if ((rs = raidget(unit)) == NULL)
   1003 		return ENXIO;
   1004 
   1005 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1006 		return (ENXIO);
   1007 
   1008 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1009 
   1010 }
   1011 
   1012 static int
   1013 raid_detach_unlocked(struct raid_softc *rs)
   1014 {
   1015 	int error;
   1016 	RF_Raid_t *raidPtr;
   1017 
   1018 	raidPtr = &rs->sc_r;
   1019 
   1020 	/*
   1021 	 * If somebody has a partition mounted, we shouldn't
   1022 	 * shutdown.
   1023 	 */
   1024 	if (rs->sc_dkdev.dk_openmask != 0)
   1025 		return EBUSY;
   1026 
   1027 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1028 		;	/* not initialized: nothing to do */
   1029 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1030 		return error;
   1031 	else
   1032 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1033 
   1034 	/* Detach the disk. */
   1035 	dkwedge_delall(&rs->sc_dkdev);
   1036 	disk_detach(&rs->sc_dkdev);
   1037 	disk_destroy(&rs->sc_dkdev);
   1038 
   1039 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1040 
   1041 	return 0;
   1042 }
   1043 
   1044 static int
   1045 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1046 {
   1047 	int     unit = raidunit(dev);
   1048 	int     error = 0;
   1049 	int     part, pmask, s;
   1050 	cfdata_t cf;
   1051 	struct raid_softc *rs;
   1052 	RF_Config_t *k_cfg, *u_cfg;
   1053 	RF_Raid_t *raidPtr;
   1054 	RF_RaidDisk_t *diskPtr;
   1055 	RF_AccTotals_t *totals;
   1056 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1057 	u_char *specific_buf;
   1058 	int retcode = 0;
   1059 	int column;
   1060 /*	int raidid; */
   1061 	struct rf_recon_req *rrcopy, *rr;
   1062 	RF_ComponentLabel_t *clabel;
   1063 	RF_ComponentLabel_t *ci_label;
   1064 	RF_ComponentLabel_t **clabel_ptr;
   1065 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1066 	RF_SingleComponent_t component;
   1067 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1068 	int i, j, d;
   1069 #ifdef __HAVE_OLD_DISKLABEL
   1070 	struct disklabel newlabel;
   1071 #endif
   1072 
   1073 	if ((rs = raidget(unit)) == NULL)
   1074 		return ENXIO;
   1075 	raidPtr = &rs->sc_r;
   1076 
   1077 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1078 		(int) DISKPART(dev), (int) unit, cmd));
   1079 
   1080 	/* Must be open for writes for these commands... */
   1081 	switch (cmd) {
   1082 #ifdef DIOCGSECTORSIZE
   1083 	case DIOCGSECTORSIZE:
   1084 		*(u_int *)data = raidPtr->bytesPerSector;
   1085 		return 0;
   1086 	case DIOCGMEDIASIZE:
   1087 		*(off_t *)data =
   1088 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1089 		return 0;
   1090 #endif
   1091 	case DIOCSDINFO:
   1092 	case DIOCWDINFO:
   1093 #ifdef __HAVE_OLD_DISKLABEL
   1094 	case ODIOCWDINFO:
   1095 	case ODIOCSDINFO:
   1096 #endif
   1097 	case DIOCWLABEL:
   1098 	case DIOCAWEDGE:
   1099 	case DIOCDWEDGE:
   1100 	case DIOCMWEDGES:
   1101 	case DIOCSSTRATEGY:
   1102 		if ((flag & FWRITE) == 0)
   1103 			return (EBADF);
   1104 	}
   1105 
   1106 	/* Must be initialized for these... */
   1107 	switch (cmd) {
   1108 	case DIOCGDINFO:
   1109 	case DIOCSDINFO:
   1110 	case DIOCWDINFO:
   1111 #ifdef __HAVE_OLD_DISKLABEL
   1112 	case ODIOCGDINFO:
   1113 	case ODIOCWDINFO:
   1114 	case ODIOCSDINFO:
   1115 	case ODIOCGDEFLABEL:
   1116 #endif
   1117 	case DIOCGPARTINFO:
   1118 	case DIOCWLABEL:
   1119 	case DIOCGDEFLABEL:
   1120 	case DIOCAWEDGE:
   1121 	case DIOCDWEDGE:
   1122 	case DIOCLWEDGES:
   1123 	case DIOCMWEDGES:
   1124 	case DIOCCACHESYNC:
   1125 	case RAIDFRAME_SHUTDOWN:
   1126 	case RAIDFRAME_REWRITEPARITY:
   1127 	case RAIDFRAME_GET_INFO:
   1128 	case RAIDFRAME_RESET_ACCTOTALS:
   1129 	case RAIDFRAME_GET_ACCTOTALS:
   1130 	case RAIDFRAME_KEEP_ACCTOTALS:
   1131 	case RAIDFRAME_GET_SIZE:
   1132 	case RAIDFRAME_FAIL_DISK:
   1133 	case RAIDFRAME_COPYBACK:
   1134 	case RAIDFRAME_CHECK_RECON_STATUS:
   1135 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1136 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1137 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1138 	case RAIDFRAME_ADD_HOT_SPARE:
   1139 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1140 	case RAIDFRAME_INIT_LABELS:
   1141 	case RAIDFRAME_REBUILD_IN_PLACE:
   1142 	case RAIDFRAME_CHECK_PARITY:
   1143 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1144 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1145 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1146 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1147 	case RAIDFRAME_SET_AUTOCONFIG:
   1148 	case RAIDFRAME_SET_ROOT:
   1149 	case RAIDFRAME_DELETE_COMPONENT:
   1150 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1151 	case RAIDFRAME_PARITYMAP_STATUS:
   1152 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1153 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1154 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1155 	case DIOCGSTRATEGY:
   1156 	case DIOCSSTRATEGY:
   1157 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1158 			return (ENXIO);
   1159 	}
   1160 
   1161 	switch (cmd) {
   1162 #ifdef COMPAT_50
   1163 	case RAIDFRAME_GET_INFO50:
   1164 		return rf_get_info50(raidPtr, data);
   1165 
   1166 	case RAIDFRAME_CONFIGURE50:
   1167 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1168 			return retcode;
   1169 		goto config;
   1170 #endif
   1171 		/* configure the system */
   1172 	case RAIDFRAME_CONFIGURE:
   1173 
   1174 		if (raidPtr->valid) {
   1175 			/* There is a valid RAID set running on this unit! */
   1176 			printf("raid%d: Device already configured!\n",unit);
   1177 			return(EINVAL);
   1178 		}
   1179 
   1180 		/* copy-in the configuration information */
   1181 		/* data points to a pointer to the configuration structure */
   1182 
   1183 		u_cfg = *((RF_Config_t **) data);
   1184 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1185 		if (k_cfg == NULL) {
   1186 			return (ENOMEM);
   1187 		}
   1188 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1189 		if (retcode) {
   1190 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1191 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1192 				retcode));
   1193 			return (retcode);
   1194 		}
   1195 		goto config;
   1196 	config:
   1197 		/* allocate a buffer for the layout-specific data, and copy it
   1198 		 * in */
   1199 		if (k_cfg->layoutSpecificSize) {
   1200 			if (k_cfg->layoutSpecificSize > 10000) {
   1201 				/* sanity check */
   1202 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1203 				return (EINVAL);
   1204 			}
   1205 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1206 			    (u_char *));
   1207 			if (specific_buf == NULL) {
   1208 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1209 				return (ENOMEM);
   1210 			}
   1211 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1212 			    k_cfg->layoutSpecificSize);
   1213 			if (retcode) {
   1214 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1215 				RF_Free(specific_buf,
   1216 					k_cfg->layoutSpecificSize);
   1217 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1218 					retcode));
   1219 				return (retcode);
   1220 			}
   1221 		} else
   1222 			specific_buf = NULL;
   1223 		k_cfg->layoutSpecific = specific_buf;
   1224 
   1225 		/* should do some kind of sanity check on the configuration.
   1226 		 * Store the sum of all the bytes in the last byte? */
   1227 
   1228 		/* configure the system */
   1229 
   1230 		/*
   1231 		 * Clear the entire RAID descriptor, just to make sure
   1232 		 *  there is no stale data left in the case of a
   1233 		 *  reconfiguration
   1234 		 */
   1235 		memset(raidPtr, 0, sizeof(*raidPtr));
   1236 		raidPtr->softc = rs;
   1237 		raidPtr->raidid = unit;
   1238 
   1239 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1240 
   1241 		if (retcode == 0) {
   1242 
   1243 			/* allow this many simultaneous IO's to
   1244 			   this RAID device */
   1245 			raidPtr->openings = RAIDOUTSTANDING;
   1246 
   1247 			raidinit(rs);
   1248 			rf_markalldirty(raidPtr);
   1249 		}
   1250 		/* free the buffers.  No return code here. */
   1251 		if (k_cfg->layoutSpecificSize) {
   1252 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1253 		}
   1254 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1255 
   1256 		return (retcode);
   1257 
   1258 		/* shutdown the system */
   1259 	case RAIDFRAME_SHUTDOWN:
   1260 
   1261 		part = DISKPART(dev);
   1262 		pmask = (1 << part);
   1263 
   1264 		if ((error = raidlock(rs)) != 0)
   1265 			return (error);
   1266 
   1267 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1268 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1269 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1270 			retcode = EBUSY;
   1271 		else {
   1272 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1273 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1274 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1275 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1276 			retcode = 0;
   1277 		}
   1278 
   1279 		raidunlock(rs);
   1280 
   1281 		if (retcode != 0)
   1282 			return retcode;
   1283 
   1284 		/* free the pseudo device attach bits */
   1285 
   1286 		cf = device_cfdata(rs->sc_dev);
   1287 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1288 			free(cf, M_RAIDFRAME);
   1289 
   1290 		return (retcode);
   1291 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1292 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1293 		/* need to read the component label for the disk indicated
   1294 		   by row,column in clabel */
   1295 
   1296 		/*
   1297 		 * Perhaps there should be an option to skip the in-core
   1298 		 * copy and hit the disk, as with disklabel(8).
   1299 		 */
   1300 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1301 
   1302 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1303 
   1304 		if (retcode) {
   1305 			RF_Free(clabel, sizeof(*clabel));
   1306 			return retcode;
   1307 		}
   1308 
   1309 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1310 
   1311 		column = clabel->column;
   1312 
   1313 		if ((column < 0) || (column >= raidPtr->numCol +
   1314 		    raidPtr->numSpare)) {
   1315 			RF_Free(clabel, sizeof(*clabel));
   1316 			return EINVAL;
   1317 		}
   1318 
   1319 		RF_Free(clabel, sizeof(*clabel));
   1320 
   1321 		clabel = raidget_component_label(raidPtr, column);
   1322 
   1323 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1324 
   1325 #if 0
   1326 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1327 		clabel = (RF_ComponentLabel_t *) data;
   1328 
   1329 		/* XXX check the label for valid stuff... */
   1330 		/* Note that some things *should not* get modified --
   1331 		   the user should be re-initing the labels instead of
   1332 		   trying to patch things.
   1333 		   */
   1334 
   1335 		raidid = raidPtr->raidid;
   1336 #ifdef DEBUG
   1337 		printf("raid%d: Got component label:\n", raidid);
   1338 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1339 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1340 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1341 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1342 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1343 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1344 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1345 #endif
   1346 		clabel->row = 0;
   1347 		column = clabel->column;
   1348 
   1349 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1350 			return(EINVAL);
   1351 		}
   1352 
   1353 		/* XXX this isn't allowed to do anything for now :-) */
   1354 
   1355 		/* XXX and before it is, we need to fill in the rest
   1356 		   of the fields!?!?!?! */
   1357 		memcpy(raidget_component_label(raidPtr, column),
   1358 		    clabel, sizeof(*clabel));
   1359 		raidflush_component_label(raidPtr, column);
   1360 		return (0);
   1361 #endif
   1362 
   1363 	case RAIDFRAME_INIT_LABELS:
   1364 		clabel = (RF_ComponentLabel_t *) data;
   1365 		/*
   1366 		   we only want the serial number from
   1367 		   the above.  We get all the rest of the information
   1368 		   from the config that was used to create this RAID
   1369 		   set.
   1370 		   */
   1371 
   1372 		raidPtr->serial_number = clabel->serial_number;
   1373 
   1374 		for(column=0;column<raidPtr->numCol;column++) {
   1375 			diskPtr = &raidPtr->Disks[column];
   1376 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1377 				ci_label = raidget_component_label(raidPtr,
   1378 				    column);
   1379 				/* Zeroing this is important. */
   1380 				memset(ci_label, 0, sizeof(*ci_label));
   1381 				raid_init_component_label(raidPtr, ci_label);
   1382 				ci_label->serial_number =
   1383 				    raidPtr->serial_number;
   1384 				ci_label->row = 0; /* we dont' pretend to support more */
   1385 				rf_component_label_set_partitionsize(ci_label,
   1386 				    diskPtr->partitionSize);
   1387 				ci_label->column = column;
   1388 				raidflush_component_label(raidPtr, column);
   1389 			}
   1390 			/* XXXjld what about the spares? */
   1391 		}
   1392 
   1393 		return (retcode);
   1394 	case RAIDFRAME_SET_AUTOCONFIG:
   1395 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1396 		printf("raid%d: New autoconfig value is: %d\n",
   1397 		       raidPtr->raidid, d);
   1398 		*(int *) data = d;
   1399 		return (retcode);
   1400 
   1401 	case RAIDFRAME_SET_ROOT:
   1402 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1403 		printf("raid%d: New rootpartition value is: %d\n",
   1404 		       raidPtr->raidid, d);
   1405 		*(int *) data = d;
   1406 		return (retcode);
   1407 
   1408 		/* initialize all parity */
   1409 	case RAIDFRAME_REWRITEPARITY:
   1410 
   1411 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1412 			/* Parity for RAID 0 is trivially correct */
   1413 			raidPtr->parity_good = RF_RAID_CLEAN;
   1414 			return(0);
   1415 		}
   1416 
   1417 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1418 			/* Re-write is already in progress! */
   1419 			return(EINVAL);
   1420 		}
   1421 
   1422 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1423 					   rf_RewriteParityThread,
   1424 					   raidPtr,"raid_parity");
   1425 		return (retcode);
   1426 
   1427 
   1428 	case RAIDFRAME_ADD_HOT_SPARE:
   1429 		sparePtr = (RF_SingleComponent_t *) data;
   1430 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1431 		retcode = rf_add_hot_spare(raidPtr, &component);
   1432 		return(retcode);
   1433 
   1434 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1435 		return(retcode);
   1436 
   1437 	case RAIDFRAME_DELETE_COMPONENT:
   1438 		componentPtr = (RF_SingleComponent_t *)data;
   1439 		memcpy( &component, componentPtr,
   1440 			sizeof(RF_SingleComponent_t));
   1441 		retcode = rf_delete_component(raidPtr, &component);
   1442 		return(retcode);
   1443 
   1444 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1445 		componentPtr = (RF_SingleComponent_t *)data;
   1446 		memcpy( &component, componentPtr,
   1447 			sizeof(RF_SingleComponent_t));
   1448 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1449 		return(retcode);
   1450 
   1451 	case RAIDFRAME_REBUILD_IN_PLACE:
   1452 
   1453 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1454 			/* Can't do this on a RAID 0!! */
   1455 			return(EINVAL);
   1456 		}
   1457 
   1458 		if (raidPtr->recon_in_progress == 1) {
   1459 			/* a reconstruct is already in progress! */
   1460 			return(EINVAL);
   1461 		}
   1462 
   1463 		componentPtr = (RF_SingleComponent_t *) data;
   1464 		memcpy( &component, componentPtr,
   1465 			sizeof(RF_SingleComponent_t));
   1466 		component.row = 0; /* we don't support any more */
   1467 		column = component.column;
   1468 
   1469 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1470 			return(EINVAL);
   1471 		}
   1472 
   1473 		rf_lock_mutex2(raidPtr->mutex);
   1474 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1475 		    (raidPtr->numFailures > 0)) {
   1476 			/* XXX 0 above shouldn't be constant!!! */
   1477 			/* some component other than this has failed.
   1478 			   Let's not make things worse than they already
   1479 			   are... */
   1480 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1481 			       raidPtr->raidid);
   1482 			printf("raid%d:     Col: %d   Too many failures.\n",
   1483 			       raidPtr->raidid, column);
   1484 			rf_unlock_mutex2(raidPtr->mutex);
   1485 			return (EINVAL);
   1486 		}
   1487 		if (raidPtr->Disks[column].status ==
   1488 		    rf_ds_reconstructing) {
   1489 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1490 			       raidPtr->raidid);
   1491 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1492 
   1493 			rf_unlock_mutex2(raidPtr->mutex);
   1494 			return (EINVAL);
   1495 		}
   1496 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1497 			rf_unlock_mutex2(raidPtr->mutex);
   1498 			return (EINVAL);
   1499 		}
   1500 		rf_unlock_mutex2(raidPtr->mutex);
   1501 
   1502 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1503 		if (rrcopy == NULL)
   1504 			return(ENOMEM);
   1505 
   1506 		rrcopy->raidPtr = (void *) raidPtr;
   1507 		rrcopy->col = column;
   1508 
   1509 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1510 					   rf_ReconstructInPlaceThread,
   1511 					   rrcopy,"raid_reconip");
   1512 		return(retcode);
   1513 
   1514 	case RAIDFRAME_GET_INFO:
   1515 		if (!raidPtr->valid)
   1516 			return (ENODEV);
   1517 		ucfgp = (RF_DeviceConfig_t **) data;
   1518 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1519 			  (RF_DeviceConfig_t *));
   1520 		if (d_cfg == NULL)
   1521 			return (ENOMEM);
   1522 		d_cfg->rows = 1; /* there is only 1 row now */
   1523 		d_cfg->cols = raidPtr->numCol;
   1524 		d_cfg->ndevs = raidPtr->numCol;
   1525 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1526 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1527 			return (ENOMEM);
   1528 		}
   1529 		d_cfg->nspares = raidPtr->numSpare;
   1530 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1531 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1532 			return (ENOMEM);
   1533 		}
   1534 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1535 		d = 0;
   1536 		for (j = 0; j < d_cfg->cols; j++) {
   1537 			d_cfg->devs[d] = raidPtr->Disks[j];
   1538 			d++;
   1539 		}
   1540 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1541 			d_cfg->spares[i] = raidPtr->Disks[j];
   1542 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1543 				/* XXX: raidctl(8) expects to see this as a used spare */
   1544 				d_cfg->spares[i].status = rf_ds_used_spare;
   1545 			}
   1546 		}
   1547 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1548 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1549 
   1550 		return (retcode);
   1551 
   1552 	case RAIDFRAME_CHECK_PARITY:
   1553 		*(int *) data = raidPtr->parity_good;
   1554 		return (0);
   1555 
   1556 	case RAIDFRAME_PARITYMAP_STATUS:
   1557 		if (rf_paritymap_ineligible(raidPtr))
   1558 			return EINVAL;
   1559 		rf_paritymap_status(raidPtr->parity_map,
   1560 		    (struct rf_pmstat *)data);
   1561 		return 0;
   1562 
   1563 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1564 		if (rf_paritymap_ineligible(raidPtr))
   1565 			return EINVAL;
   1566 		if (raidPtr->parity_map == NULL)
   1567 			return ENOENT; /* ??? */
   1568 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1569 			(struct rf_pmparams *)data, 1))
   1570 			return EINVAL;
   1571 		return 0;
   1572 
   1573 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1574 		if (rf_paritymap_ineligible(raidPtr))
   1575 			return EINVAL;
   1576 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1577 		return 0;
   1578 
   1579 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1580 		if (rf_paritymap_ineligible(raidPtr))
   1581 			return EINVAL;
   1582 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1583 		/* XXX should errors be passed up? */
   1584 		return 0;
   1585 
   1586 	case RAIDFRAME_RESET_ACCTOTALS:
   1587 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1588 		return (0);
   1589 
   1590 	case RAIDFRAME_GET_ACCTOTALS:
   1591 		totals = (RF_AccTotals_t *) data;
   1592 		*totals = raidPtr->acc_totals;
   1593 		return (0);
   1594 
   1595 	case RAIDFRAME_KEEP_ACCTOTALS:
   1596 		raidPtr->keep_acc_totals = *(int *)data;
   1597 		return (0);
   1598 
   1599 	case RAIDFRAME_GET_SIZE:
   1600 		*(int *) data = raidPtr->totalSectors;
   1601 		return (0);
   1602 
   1603 		/* fail a disk & optionally start reconstruction */
   1604 	case RAIDFRAME_FAIL_DISK:
   1605 
   1606 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1607 			/* Can't do this on a RAID 0!! */
   1608 			return(EINVAL);
   1609 		}
   1610 
   1611 		rr = (struct rf_recon_req *) data;
   1612 		rr->row = 0;
   1613 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1614 			return (EINVAL);
   1615 
   1616 
   1617 		rf_lock_mutex2(raidPtr->mutex);
   1618 		if (raidPtr->status == rf_rs_reconstructing) {
   1619 			/* you can't fail a disk while we're reconstructing! */
   1620 			/* XXX wrong for RAID6 */
   1621 			rf_unlock_mutex2(raidPtr->mutex);
   1622 			return (EINVAL);
   1623 		}
   1624 		if ((raidPtr->Disks[rr->col].status ==
   1625 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1626 			/* some other component has failed.  Let's not make
   1627 			   things worse. XXX wrong for RAID6 */
   1628 			rf_unlock_mutex2(raidPtr->mutex);
   1629 			return (EINVAL);
   1630 		}
   1631 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1632 			/* Can't fail a spared disk! */
   1633 			rf_unlock_mutex2(raidPtr->mutex);
   1634 			return (EINVAL);
   1635 		}
   1636 		rf_unlock_mutex2(raidPtr->mutex);
   1637 
   1638 		/* make a copy of the recon request so that we don't rely on
   1639 		 * the user's buffer */
   1640 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1641 		if (rrcopy == NULL)
   1642 			return(ENOMEM);
   1643 		memcpy(rrcopy, rr, sizeof(*rr));
   1644 		rrcopy->raidPtr = (void *) raidPtr;
   1645 
   1646 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1647 					   rf_ReconThread,
   1648 					   rrcopy,"raid_recon");
   1649 		return (0);
   1650 
   1651 		/* invoke a copyback operation after recon on whatever disk
   1652 		 * needs it, if any */
   1653 	case RAIDFRAME_COPYBACK:
   1654 
   1655 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1656 			/* This makes no sense on a RAID 0!! */
   1657 			return(EINVAL);
   1658 		}
   1659 
   1660 		if (raidPtr->copyback_in_progress == 1) {
   1661 			/* Copyback is already in progress! */
   1662 			return(EINVAL);
   1663 		}
   1664 
   1665 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1666 					   rf_CopybackThread,
   1667 					   raidPtr,"raid_copyback");
   1668 		return (retcode);
   1669 
   1670 		/* return the percentage completion of reconstruction */
   1671 	case RAIDFRAME_CHECK_RECON_STATUS:
   1672 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1673 			/* This makes no sense on a RAID 0, so tell the
   1674 			   user it's done. */
   1675 			*(int *) data = 100;
   1676 			return(0);
   1677 		}
   1678 		if (raidPtr->status != rf_rs_reconstructing)
   1679 			*(int *) data = 100;
   1680 		else {
   1681 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1682 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1683 			} else {
   1684 				*(int *) data = 0;
   1685 			}
   1686 		}
   1687 		return (0);
   1688 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1689 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1690 		if (raidPtr->status != rf_rs_reconstructing) {
   1691 			progressInfo.remaining = 0;
   1692 			progressInfo.completed = 100;
   1693 			progressInfo.total = 100;
   1694 		} else {
   1695 			progressInfo.total =
   1696 				raidPtr->reconControl->numRUsTotal;
   1697 			progressInfo.completed =
   1698 				raidPtr->reconControl->numRUsComplete;
   1699 			progressInfo.remaining = progressInfo.total -
   1700 				progressInfo.completed;
   1701 		}
   1702 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1703 				  sizeof(RF_ProgressInfo_t));
   1704 		return (retcode);
   1705 
   1706 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1707 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1708 			/* This makes no sense on a RAID 0, so tell the
   1709 			   user it's done. */
   1710 			*(int *) data = 100;
   1711 			return(0);
   1712 		}
   1713 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1714 			*(int *) data = 100 *
   1715 				raidPtr->parity_rewrite_stripes_done /
   1716 				raidPtr->Layout.numStripe;
   1717 		} else {
   1718 			*(int *) data = 100;
   1719 		}
   1720 		return (0);
   1721 
   1722 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1723 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1724 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1725 			progressInfo.total = raidPtr->Layout.numStripe;
   1726 			progressInfo.completed =
   1727 				raidPtr->parity_rewrite_stripes_done;
   1728 			progressInfo.remaining = progressInfo.total -
   1729 				progressInfo.completed;
   1730 		} else {
   1731 			progressInfo.remaining = 0;
   1732 			progressInfo.completed = 100;
   1733 			progressInfo.total = 100;
   1734 		}
   1735 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1736 				  sizeof(RF_ProgressInfo_t));
   1737 		return (retcode);
   1738 
   1739 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1740 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1741 			/* This makes no sense on a RAID 0 */
   1742 			*(int *) data = 100;
   1743 			return(0);
   1744 		}
   1745 		if (raidPtr->copyback_in_progress == 1) {
   1746 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1747 				raidPtr->Layout.numStripe;
   1748 		} else {
   1749 			*(int *) data = 100;
   1750 		}
   1751 		return (0);
   1752 
   1753 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1754 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1755 		if (raidPtr->copyback_in_progress == 1) {
   1756 			progressInfo.total = raidPtr->Layout.numStripe;
   1757 			progressInfo.completed =
   1758 				raidPtr->copyback_stripes_done;
   1759 			progressInfo.remaining = progressInfo.total -
   1760 				progressInfo.completed;
   1761 		} else {
   1762 			progressInfo.remaining = 0;
   1763 			progressInfo.completed = 100;
   1764 			progressInfo.total = 100;
   1765 		}
   1766 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1767 				  sizeof(RF_ProgressInfo_t));
   1768 		return (retcode);
   1769 
   1770 		/* the sparetable daemon calls this to wait for the kernel to
   1771 		 * need a spare table. this ioctl does not return until a
   1772 		 * spare table is needed. XXX -- calling mpsleep here in the
   1773 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1774 		 * -- I should either compute the spare table in the kernel,
   1775 		 * or have a different -- XXX XXX -- interface (a different
   1776 		 * character device) for delivering the table     -- XXX */
   1777 #if 0
   1778 	case RAIDFRAME_SPARET_WAIT:
   1779 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1780 		while (!rf_sparet_wait_queue)
   1781 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1782 		waitreq = rf_sparet_wait_queue;
   1783 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1784 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1785 
   1786 		/* structure assignment */
   1787 		*((RF_SparetWait_t *) data) = *waitreq;
   1788 
   1789 		RF_Free(waitreq, sizeof(*waitreq));
   1790 		return (0);
   1791 
   1792 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1793 		 * code in it that will cause the dameon to exit */
   1794 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1795 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1796 		waitreq->fcol = -1;
   1797 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1798 		waitreq->next = rf_sparet_wait_queue;
   1799 		rf_sparet_wait_queue = waitreq;
   1800 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1801 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1802 		return (0);
   1803 
   1804 		/* used by the spare table daemon to deliver a spare table
   1805 		 * into the kernel */
   1806 	case RAIDFRAME_SEND_SPARET:
   1807 
   1808 		/* install the spare table */
   1809 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1810 
   1811 		/* respond to the requestor.  the return status of the spare
   1812 		 * table installation is passed in the "fcol" field */
   1813 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1814 		waitreq->fcol = retcode;
   1815 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1816 		waitreq->next = rf_sparet_resp_queue;
   1817 		rf_sparet_resp_queue = waitreq;
   1818 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1819 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1820 
   1821 		return (retcode);
   1822 #endif
   1823 
   1824 	default:
   1825 		break; /* fall through to the os-specific code below */
   1826 
   1827 	}
   1828 
   1829 	if (!raidPtr->valid)
   1830 		return (EINVAL);
   1831 
   1832 	/*
   1833 	 * Add support for "regular" device ioctls here.
   1834 	 */
   1835 
   1836 	error = disk_ioctl(&rs->sc_dkdev, dev, cmd, data, flag, l);
   1837 	if (error != EPASSTHROUGH)
   1838 		return (error);
   1839 
   1840 	switch (cmd) {
   1841 	case DIOCWDINFO:
   1842 	case DIOCSDINFO:
   1843 #ifdef __HAVE_OLD_DISKLABEL
   1844 	case ODIOCWDINFO:
   1845 	case ODIOCSDINFO:
   1846 #endif
   1847 	{
   1848 		struct disklabel *lp;
   1849 #ifdef __HAVE_OLD_DISKLABEL
   1850 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1851 			memset(&newlabel, 0, sizeof newlabel);
   1852 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1853 			lp = &newlabel;
   1854 		} else
   1855 #endif
   1856 		lp = (struct disklabel *)data;
   1857 
   1858 		if ((error = raidlock(rs)) != 0)
   1859 			return (error);
   1860 
   1861 		rs->sc_flags |= RAIDF_LABELLING;
   1862 
   1863 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1864 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1865 		if (error == 0) {
   1866 			if (cmd == DIOCWDINFO
   1867 #ifdef __HAVE_OLD_DISKLABEL
   1868 			    || cmd == ODIOCWDINFO
   1869 #endif
   1870 			   )
   1871 				error = writedisklabel(RAIDLABELDEV(dev),
   1872 				    raidstrategy, rs->sc_dkdev.dk_label,
   1873 				    rs->sc_dkdev.dk_cpulabel);
   1874 		}
   1875 		rs->sc_flags &= ~RAIDF_LABELLING;
   1876 
   1877 		raidunlock(rs);
   1878 
   1879 		if (error)
   1880 			return (error);
   1881 		break;
   1882 	}
   1883 
   1884 	case DIOCWLABEL:
   1885 		if (*(int *) data != 0)
   1886 			rs->sc_flags |= RAIDF_WLABEL;
   1887 		else
   1888 			rs->sc_flags &= ~RAIDF_WLABEL;
   1889 		break;
   1890 
   1891 	case DIOCGDEFLABEL:
   1892 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1893 		break;
   1894 
   1895 #ifdef __HAVE_OLD_DISKLABEL
   1896 	case ODIOCGDEFLABEL:
   1897 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1898 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1899 			return ENOTTY;
   1900 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1901 		break;
   1902 #endif
   1903 
   1904 	case DIOCCACHESYNC:
   1905 		return rf_sync_component_caches(raidPtr);
   1906 
   1907 	case DIOCGSTRATEGY:
   1908 	    {
   1909 		struct disk_strategy *dks = (void *)data;
   1910 
   1911 		s = splbio();
   1912 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1913 		    sizeof(dks->dks_name));
   1914 		splx(s);
   1915 		dks->dks_paramlen = 0;
   1916 
   1917 		return 0;
   1918 	    }
   1919 
   1920 	case DIOCSSTRATEGY:
   1921 	    {
   1922 		struct disk_strategy *dks = (void *)data;
   1923 		struct bufq_state *new;
   1924 		struct bufq_state *old;
   1925 
   1926 		if (dks->dks_param != NULL) {
   1927 			return EINVAL;
   1928 		}
   1929 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1930 		error = bufq_alloc(&new, dks->dks_name,
   1931 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1932 		if (error) {
   1933 			return error;
   1934 		}
   1935 		s = splbio();
   1936 		old = rs->buf_queue;
   1937 		bufq_move(new, old);
   1938 		rs->buf_queue = new;
   1939 		splx(s);
   1940 		bufq_free(old);
   1941 
   1942 		return 0;
   1943 	    }
   1944 
   1945 	default:
   1946 		retcode = ENOTTY;
   1947 	}
   1948 	return (retcode);
   1949 
   1950 }
   1951 
   1952 
   1953 /* raidinit -- complete the rest of the initialization for the
   1954    RAIDframe device.  */
   1955 
   1956 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	/* Mark initialized up front; cleared again below if the pseudo
	 * device attach fails. */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	/* cfdata is heap-allocated; ownership passes to autoconf on
	 * success, freed here on failure. */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	rf_set_geometry(rs, raidPtr);

	/* Scan the new device for wedges (GPT partitions, etc.). */
	dkwedge_discover(&rs->sc_dkdev);

}
   2007 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2008 /* wake up the daemon & tell it to get us a spare table
   2009  * XXX
   2010  * the entries in the queues should be tagged with the raidPtr
   2011  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   2013  * XXX
   2014  *
   2015  * XXX This code is not currently used. GO
   2016  */
/*
 * Post the request on rf_sparet_wait_queue for the user-space daemon,
 * then block until a response appears on rf_sparet_resp_queue.  The
 * daemon's status is returned via the response's fcol field.
 * Note: the response dequeued here is a different allocation from the
 * request we enqueued; we free the response before returning.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 drops rf_sparet_wait_mutex while sleeping and
	 * reacquires it before returning (condition-variable semantics). */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2040 #endif
   2041 
   2042 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2043  * bp & passes it down.
   2044  * any calls originating in the kernel must use non-blocking I/O
   2045  * do some extra sanity checking to return "appropriate" error values for
   2046  * certain conditions (to make some standard utilities work)
   2047  *
   2048  * Formerly known as: rf_DoAccessKernel
   2049  */
/*
 * Drain the softc's buf queue, translating each buf into a RAIDframe
 * access via rf_DoAccess().  Dispatches until raidPtr->openings is
 * exhausted or the queue is empty.  Bad requests are completed here
 * with biodone() and an error.  raidPtr->mutex is dropped and retaken
 * around anything that may block or call back into this layer.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* label update is done without holding raidPtr->mutex */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			/* NB: returns with raidPtr->mutex NOT held. */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert DEV_BSIZE blocks to RAID sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* "1 ||" makes this debug branch unconditional; db1_printf
		 * may still compile away. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests past the end of the set; the (sum < x)
		 * comparisons also catch unsigned wraparound. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that aren't a whole number of sectors */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* dispatch failed; complete the buf with the error */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2167 
   2168 
   2169 
   2170 
   2171 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2172 
/*
 * Dispatch one RAIDframe disk-queue request to the underlying component
 * device.  Entered with the disk queue locked (see header comment above);
 * the queue lock is dropped around bdev_strategy() because that call can
 * block.  Completion is delivered asynchronously via KernelWakeupFunc().
 * Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal
		 * completion callback; no device I/O is issued. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Fill in the buf; KernelWakeupFunc runs at biodone time
		 * with req as its private argument. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2246 /* this is the callback function associated with a I/O invoked from
   2247    kernel code.
   2248  */
/*
 * I/O completion callback (set as b_iodone by InitBP).  Records the
 * error, possibly marks the component failed, and hands the request to
 * the raidio thread via the raidPtr->iodone queue.  Runs under
 * raidPtr->iodone_lock.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP / the NOP path */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in the
			 * next raidstart() pass */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2315 
   2316 
   2317 /*
   2318  * initialize a buf structure for doing an I/O in the kernel.
   2319  */
   2320 static void
   2321 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2322        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2323        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2324        struct proc *b_proc)
   2325 {
   2326 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2327 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2328 	bp->b_oflags = 0;
   2329 	bp->b_cflags = 0;
   2330 	bp->b_bcount = numSect << logBytesPerSector;
   2331 	bp->b_bufsize = bp->b_bcount;
   2332 	bp->b_error = 0;
   2333 	bp->b_dev = dev;
   2334 	bp->b_data = bf;
   2335 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2336 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2337 	if (bp->b_bcount == 0) {
   2338 		panic("bp->b_bcount is zero in InitBP!!");
   2339 	}
   2340 	bp->b_proc = b_proc;
   2341 	bp->b_iodone = cbFunc;
   2342 	bp->b_private = cbArg;
   2343 }
   2344 
   2345 static void
   2346 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2347 		    struct disklabel *lp)
   2348 {
   2349 	memset(lp, 0, sizeof(*lp));
   2350 
   2351 	/* fabricate a label... */
   2352 	if (raidPtr->totalSectors > UINT32_MAX)
   2353 		lp->d_secperunit = UINT32_MAX;
   2354 	else
   2355 		lp->d_secperunit = raidPtr->totalSectors;
   2356 	lp->d_secsize = raidPtr->bytesPerSector;
   2357 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2358 	lp->d_ntracks = 4 * raidPtr->numCol;
   2359 	lp->d_ncylinders = raidPtr->totalSectors /
   2360 		(lp->d_nsectors * lp->d_ntracks);
   2361 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2362 
   2363 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2364 	lp->d_type = DKTYPE_RAID;
   2365 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2366 	lp->d_rpm = 3600;
   2367 	lp->d_interleave = 1;
   2368 	lp->d_flags = 0;
   2369 
   2370 	lp->d_partitions[RAW_PART].p_offset = 0;
   2371 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2372 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2373 	lp->d_npartitions = RAW_PART + 1;
   2374 
   2375 	lp->d_magic = DISKMAGIC;
   2376 	lp->d_magic2 = DISKMAGIC;
   2377 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2378 
   2379 }
   2380 /*
   2381  * Read the disklabel from the raid device.  If one is not present, fake one
   2382  * up.
   2383  */
/*
 * Load the disklabel for the raid unit backing dev into the softc's
 * dk_label.  A default label is fabricated first; readdisklabel() then
 * overwrites it with the on-disk label if one exists, otherwise
 * raidmakedisklabel() finishes the fake one.  Found labels are sanity
 * checked against the current size of the set.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default label */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no usable on-disk label; finish the fake one */
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		/* d_secperunit saturates at UINT32_MAX (see
		 * raidgetdefaultlabel), hence the two-sided comparison. */
		if (lp->d_secperunit < UINT32_MAX ?
		    lp->d_secperunit != rs->sc_size :
		    lp->d_secperunit > rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%ju) != "
			    "the size of raid (%ju)\n", unit, rs->sc_xname,
			    (uintmax_t)lp->d_secperunit,
			    (uintmax_t)rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ju)\n",
				       unit, rs->sc_xname, 'a' + i,
				       (uintmax_t)rs->sc_size);
		}
	}

}
   2446 /*
   2447  * Take care of things one might want to take care of in the event
   2448  * that a disklabel isn't present.
   2449  */
   2450 static void
   2451 raidmakedisklabel(struct raid_softc *rs)
   2452 {
   2453 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2454 	db1_printf(("Making a label..\n"));
   2455 
   2456 	/*
   2457 	 * For historical reasons, if there's no disklabel present
   2458 	 * the raw partition must be marked FS_BSDFFS.
   2459 	 */
   2460 
   2461 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2462 
   2463 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2464 
   2465 	lp->d_checksum = dkcksum(lp);
   2466 }
   2467 /*
   2468  * Wait interruptibly for an exclusive lock.
   2469  *
   2470  * XXX
   2471  * Several drivers do this; it should be abstracted and made MP-safe.
   2472  * (Hmm... where have we seen this warning before :->  GO )
   2473  */
   2474 static int
   2475 raidlock(struct raid_softc *rs)
   2476 {
   2477 	int     error;
   2478 
   2479 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2480 		rs->sc_flags |= RAIDF_WANTED;
   2481 		if ((error =
   2482 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2483 			return (error);
   2484 	}
   2485 	rs->sc_flags |= RAIDF_LOCKED;
   2486 	return (0);
   2487 }
   2488 /*
   2489  * Unlock and wake up any waiters.
   2490  */
   2491 static void
   2492 raidunlock(struct raid_softc *rs)
   2493 {
   2494 
   2495 	rs->sc_flags &= ~RAIDF_LOCKED;
   2496 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2497 		rs->sc_flags &= ~RAIDF_WANTED;
   2498 		wakeup(rs);
   2499 	}
   2500 }
   2501 
   2502 
   2503 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2504 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2505 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2506 
   2507 static daddr_t
   2508 rf_component_info_offset(void)
   2509 {
   2510 
   2511 	return RF_COMPONENT_INFO_OFFSET;
   2512 }
   2513 
   2514 static daddr_t
   2515 rf_component_info_size(unsigned secsize)
   2516 {
   2517 	daddr_t info_size;
   2518 
   2519 	KASSERT(secsize);
   2520 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2521 		info_size = secsize;
   2522 	else
   2523 		info_size = RF_COMPONENT_INFO_SIZE;
   2524 
   2525 	return info_size;
   2526 }
   2527 
   2528 static daddr_t
   2529 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2530 {
   2531 	daddr_t map_offset;
   2532 
   2533 	KASSERT(raidPtr->bytesPerSector);
   2534 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2535 		map_offset = raidPtr->bytesPerSector;
   2536 	else
   2537 		map_offset = RF_COMPONENT_INFO_SIZE;
   2538 	map_offset += rf_component_info_offset();
   2539 
   2540 	return map_offset;
   2541 }
   2542 
   2543 static daddr_t
   2544 rf_parity_map_size(RF_Raid_t *raidPtr)
   2545 {
   2546 	daddr_t map_size;
   2547 
   2548 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2549 		map_size = raidPtr->bytesPerSector;
   2550 	else
   2551 		map_size = RF_PARITY_MAP_SIZE;
   2552 
   2553 	return map_size;
   2554 }
   2555 
   2556 int
   2557 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2558 {
   2559 	RF_ComponentLabel_t *clabel;
   2560 
   2561 	clabel = raidget_component_label(raidPtr, col);
   2562 	clabel->clean = RF_RAID_CLEAN;
   2563 	raidflush_component_label(raidPtr, col);
   2564 	return(0);
   2565 }
   2566 
   2567 
   2568 int
   2569 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2570 {
   2571 	RF_ComponentLabel_t *clabel;
   2572 
   2573 	clabel = raidget_component_label(raidPtr, col);
   2574 	clabel->clean = RF_RAID_DIRTY;
   2575 	raidflush_component_label(raidPtr, col);
   2576 	return(0);
   2577 }
   2578 
   2579 int
   2580 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2581 {
   2582 	KASSERT(raidPtr->bytesPerSector);
   2583 	return raidread_component_label(raidPtr->bytesPerSector,
   2584 	    raidPtr->Disks[col].dev,
   2585 	    raidPtr->raid_cinfo[col].ci_vp,
   2586 	    &raidPtr->raid_cinfo[col].ci_label);
   2587 }
   2588 
   2589 RF_ComponentLabel_t *
   2590 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2591 {
   2592 	return &raidPtr->raid_cinfo[col].ci_label;
   2593 }
   2594 
   2595 int
   2596 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2597 {
   2598 	RF_ComponentLabel_t *label;
   2599 
   2600 	label = &raidPtr->raid_cinfo[col].ci_label;
   2601 	label->mod_counter = raidPtr->mod_counter;
   2602 #ifndef RF_NO_PARITY_MAP
   2603 	label->parity_map_modcount = label->mod_counter;
   2604 #endif
   2605 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2606 	    raidPtr->Disks[col].dev,
   2607 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2608 }
   2609 
   2610 
   2611 static int
   2612 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2613     RF_ComponentLabel_t *clabel)
   2614 {
   2615 	return raidread_component_area(dev, b_vp, clabel,
   2616 	    sizeof(RF_ComponentLabel_t),
   2617 	    rf_component_info_offset(),
   2618 	    rf_component_info_size(secsize));
   2619 }
   2620 
   2621 /* ARGSUSED */
   2622 static int
   2623 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2624     size_t msize, daddr_t offset, daddr_t dsize)
   2625 {
   2626 	struct buf *bp;
   2627 	const struct bdevsw *bdev;
   2628 	int error;
   2629 
   2630 	/* XXX should probably ensure that we don't try to do this if
   2631 	   someone has changed rf_protected_sectors. */
   2632 
   2633 	if (b_vp == NULL) {
   2634 		/* For whatever reason, this component is not valid.
   2635 		   Don't try to read a component label from it. */
   2636 		return(EINVAL);
   2637 	}
   2638 
   2639 	/* get a block of the appropriate size... */
   2640 	bp = geteblk((int)dsize);
   2641 	bp->b_dev = dev;
   2642 
   2643 	/* get our ducks in a row for the read */
   2644 	bp->b_blkno = offset / DEV_BSIZE;
   2645 	bp->b_bcount = dsize;
   2646 	bp->b_flags |= B_READ;
   2647  	bp->b_resid = dsize;
   2648 
   2649 	bdev = bdevsw_lookup(bp->b_dev);
   2650 	if (bdev == NULL)
   2651 		return (ENXIO);
   2652 	(*bdev->d_strategy)(bp);
   2653 
   2654 	error = biowait(bp);
   2655 
   2656 	if (!error) {
   2657 		memcpy(data, bp->b_data, msize);
   2658 	}
   2659 
   2660 	brelse(bp, 0);
   2661 	return(error);
   2662 }
   2663 
   2664 
   2665 static int
   2666 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2667     RF_ComponentLabel_t *clabel)
   2668 {
   2669 	return raidwrite_component_area(dev, b_vp, clabel,
   2670 	    sizeof(RF_ComponentLabel_t),
   2671 	    rf_component_info_offset(),
   2672 	    rf_component_info_size(secsize), 0);
   2673 }
   2674 
   2675 /* ARGSUSED */
   2676 static int
   2677 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2678     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2679 {
   2680 	struct buf *bp;
   2681 	const struct bdevsw *bdev;
   2682 	int error;
   2683 
   2684 	/* get a block of the appropriate size... */
   2685 	bp = geteblk((int)dsize);
   2686 	bp->b_dev = dev;
   2687 
   2688 	/* get our ducks in a row for the write */
   2689 	bp->b_blkno = offset / DEV_BSIZE;
   2690 	bp->b_bcount = dsize;
   2691 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2692  	bp->b_resid = dsize;
   2693 
   2694 	memset(bp->b_data, 0, dsize);
   2695 	memcpy(bp->b_data, data, msize);
   2696 
   2697 	bdev = bdevsw_lookup(bp->b_dev);
   2698 	if (bdev == NULL)
   2699 		return (ENXIO);
   2700 	(*bdev->d_strategy)(bp);
   2701 	if (asyncp)
   2702 		return 0;
   2703 	error = biowait(bp);
   2704 	brelse(bp, 0);
   2705 	if (error) {
   2706 #if 1
   2707 		printf("Failed to write RAID component info!\n");
   2708 #endif
   2709 	}
   2710 
   2711 	return(error);
   2712 }
   2713 
   2714 void
   2715 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2716 {
   2717 	int c;
   2718 
   2719 	for (c = 0; c < raidPtr->numCol; c++) {
   2720 		/* Skip dead disks. */
   2721 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2722 			continue;
   2723 		/* XXXjld: what if an error occurs here? */
   2724 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2725 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2726 		    RF_PARITYMAP_NBYTE,
   2727 		    rf_parity_map_offset(raidPtr),
   2728 		    rf_parity_map_size(raidPtr), 0);
   2729 	}
   2730 }
   2731 
   2732 void
   2733 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2734 {
   2735 	struct rf_paritymap_ondisk tmp;
   2736 	int c,first;
   2737 
   2738 	first=1;
   2739 	for (c = 0; c < raidPtr->numCol; c++) {
   2740 		/* Skip dead disks. */
   2741 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2742 			continue;
   2743 		raidread_component_area(raidPtr->Disks[c].dev,
   2744 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2745 		    RF_PARITYMAP_NBYTE,
   2746 		    rf_parity_map_offset(raidPtr),
   2747 		    rf_parity_map_size(raidPtr));
   2748 		if (first) {
   2749 			memcpy(map, &tmp, sizeof(*map));
   2750 			first = 0;
   2751 		} else {
   2752 			rf_paritymap_merge(map, &tmp);
   2753 		}
   2754 	}
   2755 }
   2756 
/*
 * Bump the set's modification counter and mark the component label of
 * every live component (and every in-use spare) dirty on disk.
 * Spared-out components and dead disks are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* spares live in columns numCol..numCol+numSpare-1 */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column claims this spare,
			   scol keeps its previous value (initially -1)
			   and is written here -- verify intended. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2816 
   2817 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares.  Bumps the modification counter, records which raid unit
 * the set is configured as, and — on a final update with clean
 * parity — also marks the components clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Spares live at columns [numCol, numCol + numSpare). */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare replaced.
			   NOTE(review): if no column references this spare,
			   scol retains its previous value (-1 initially). */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2892 
   2893 void
   2894 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2895 {
   2896 
   2897 	if (vp != NULL) {
   2898 		if (auto_configured == 1) {
   2899 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2900 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2901 			vput(vp);
   2902 
   2903 		} else {
   2904 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2905 		}
   2906 	}
   2907 }
   2908 
   2909 
   2910 void
   2911 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2912 {
   2913 	int r,c;
   2914 	struct vnode *vp;
   2915 	int acd;
   2916 
   2917 
   2918 	/* We take this opportunity to close the vnodes like we should.. */
   2919 
   2920 	for (c = 0; c < raidPtr->numCol; c++) {
   2921 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2922 		acd = raidPtr->Disks[c].auto_configured;
   2923 		rf_close_component(raidPtr, vp, acd);
   2924 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2925 		raidPtr->Disks[c].auto_configured = 0;
   2926 	}
   2927 
   2928 	for (r = 0; r < raidPtr->numSpare; r++) {
   2929 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2930 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2931 		rf_close_component(raidPtr, vp, acd);
   2932 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2933 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2934 	}
   2935 }
   2936 
   2937 
   2938 void
   2939 rf_ReconThread(struct rf_recon_req *req)
   2940 {
   2941 	int     s;
   2942 	RF_Raid_t *raidPtr;
   2943 
   2944 	s = splbio();
   2945 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2946 	raidPtr->recon_in_progress = 1;
   2947 
   2948 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2949 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2950 
   2951 	RF_Free(req, sizeof(*req));
   2952 
   2953 	raidPtr->recon_in_progress = 0;
   2954 	splx(s);
   2955 
   2956 	/* That's all... */
   2957 	kthread_exit(0);	/* does not return */
   2958 }
   2959 
   2960 void
   2961 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2962 {
   2963 	int retcode;
   2964 	int s;
   2965 
   2966 	raidPtr->parity_rewrite_stripes_done = 0;
   2967 	raidPtr->parity_rewrite_in_progress = 1;
   2968 	s = splbio();
   2969 	retcode = rf_RewriteParity(raidPtr);
   2970 	splx(s);
   2971 	if (retcode) {
   2972 		printf("raid%d: Error re-writing parity (%d)!\n",
   2973 		    raidPtr->raidid, retcode);
   2974 	} else {
   2975 		/* set the clean bit!  If we shutdown correctly,
   2976 		   the clean bit on each component label will get
   2977 		   set */
   2978 		raidPtr->parity_good = RF_RAID_CLEAN;
   2979 	}
   2980 	raidPtr->parity_rewrite_in_progress = 0;
   2981 
   2982 	/* Anyone waiting for us to stop?  If so, inform them... */
   2983 	if (raidPtr->waitShutdown) {
   2984 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2985 	}
   2986 
   2987 	/* That's all... */
   2988 	kthread_exit(0);	/* does not return */
   2989 }
   2990 
   2991 
   2992 void
   2993 rf_CopybackThread(RF_Raid_t *raidPtr)
   2994 {
   2995 	int s;
   2996 
   2997 	raidPtr->copyback_in_progress = 1;
   2998 	s = splbio();
   2999 	rf_CopybackReconstructedData(raidPtr);
   3000 	splx(s);
   3001 	raidPtr->copyback_in_progress = 0;
   3002 
   3003 	/* That's all... */
   3004 	kthread_exit(0);	/* does not return */
   3005 }
   3006 
   3007 
   3008 void
   3009 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3010 {
   3011 	int s;
   3012 	RF_Raid_t *raidPtr;
   3013 
   3014 	s = splbio();
   3015 	raidPtr = req->raidPtr;
   3016 	raidPtr->recon_in_progress = 1;
   3017 	rf_ReconstructInPlace(raidPtr, req->col);
   3018 	RF_Free(req, sizeof(*req));
   3019 	raidPtr->recon_in_progress = 0;
   3020 	splx(s);
   3021 
   3022 	/* That's all... */
   3023 	kthread_exit(0);	/* does not return */
   3024 }
   3025 
/*
 * Try to read a RAIDframe component label from the device behind vp.
 * If the label is present and sane, prepend a new RF_AutoConfig_t to
 * ac_list and keep the vnode; otherwise close/release the vnode and
 * return the list unchanged.  On allocation failure the whole list
 * is torn down and NULL is returned.
 *
 * NOTE(review): the oomem path frees the list but does not close vp —
 * confirm whether that vnode reference is intentionally leaked here.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: unwind the entire list built so far. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so release the label buffer
		   and give the vnode back. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3083 
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a list of the ones found.  Wedges (dk) are checked via their
 * wedge info; disklabel-partitioned disks are checked partition by
 * partition; if neither turns up a RAID component, the raw partition
 * itself is probed, since raid components may live on bare disks.
 *
 * NOTE(review): devices are opened with FREAD (plus FSILENT for the
 * first probe) but closed with FREAD | FWRITE on several paths —
 * confirm whether the mismatched close flags matter here.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		/* FSILENT: don't log an error for devices that aren't there */
		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedges carry their type in the wedge info,
			   not in a disklabel. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3265 
   3266 
   3267 int
   3268 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3269 {
   3270 
   3271 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3272 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3273 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3274 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3275 	    clabel->row >=0 &&
   3276 	    clabel->column >= 0 &&
   3277 	    clabel->num_rows > 0 &&
   3278 	    clabel->num_columns > 0 &&
   3279 	    clabel->row < clabel->num_rows &&
   3280 	    clabel->column < clabel->num_columns &&
   3281 	    clabel->blockSize > 0 &&
   3282 	    /*
   3283 	     * numBlocksHi may contain garbage, but it is ok since
   3284 	     * the type is unsigned.  If it is really garbage,
   3285 	     * rf_fix_old_label_size() will fix it.
   3286 	     */
   3287 	    rf_component_label_numblocks(clabel) > 0) {
   3288 		/*
   3289 		 * label looks reasonable enough...
   3290 		 * let's make sure it has no old garbage.
   3291 		 */
   3292 		if (numsecs)
   3293 			rf_fix_old_label_size(clabel, numsecs);
   3294 		return(1);
   3295 	}
   3296 	return(0);
   3297 }
   3298 
   3299 
   3300 /*
   3301  * For reasons yet unknown, some old component labels have garbage in
   3302  * the newer numBlocksHi region, and this causes lossage.  Since those
   3303  * disks will also have numsecs set to less than 32 bits of sectors,
   3304  * we can determine when this corruption has occurred, and fix it.
   3305  *
   3306  * The exact same problem, with the same unknown reason, happens to
   3307  * the partitionSizeHi member as well.
   3308  */
   3309 static void
   3310 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3311 {
   3312 
   3313 	if (numsecs < ((uint64_t)1 << 32)) {
   3314 		if (clabel->numBlocksHi) {
   3315 			printf("WARNING: total sectors < 32 bits, yet "
   3316 			       "numBlocksHi set\n"
   3317 			       "WARNING: resetting numBlocksHi to zero.\n");
   3318 			clabel->numBlocksHi = 0;
   3319 		}
   3320 
   3321 		if (clabel->partitionSizeHi) {
   3322 			printf("WARNING: total sectors < 32 bits, yet "
   3323 			       "partitionSizeHi set\n"
   3324 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3325 			clabel->partitionSizeHi = 0;
   3326 		}
   3327 	}
   3328 }
   3329 
   3330 
   3331 #ifdef DEBUG
/*
 * Debug helper: dump every interesting field of a component label to
 * the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked to two bits below, so four names
	   cover every possible value. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3363 #endif
   3364 
   3365 RF_ConfigSet_t *
   3366 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3367 {
   3368 	RF_AutoConfig_t *ac;
   3369 	RF_ConfigSet_t *config_sets;
   3370 	RF_ConfigSet_t *cset;
   3371 	RF_AutoConfig_t *ac_next;
   3372 
   3373 
   3374 	config_sets = NULL;
   3375 
   3376 	/* Go through the AutoConfig list, and figure out which components
   3377 	   belong to what sets.  */
   3378 	ac = ac_list;
   3379 	while(ac!=NULL) {
   3380 		/* we're going to putz with ac->next, so save it here
   3381 		   for use at the end of the loop */
   3382 		ac_next = ac->next;
   3383 
   3384 		if (config_sets == NULL) {
   3385 			/* will need at least this one... */
   3386 			config_sets = (RF_ConfigSet_t *)
   3387 				malloc(sizeof(RF_ConfigSet_t),
   3388 				       M_RAIDFRAME, M_NOWAIT);
   3389 			if (config_sets == NULL) {
   3390 				panic("rf_create_auto_sets: No memory!");
   3391 			}
   3392 			/* this one is easy :) */
   3393 			config_sets->ac = ac;
   3394 			config_sets->next = NULL;
   3395 			config_sets->rootable = 0;
   3396 			ac->next = NULL;
   3397 		} else {
   3398 			/* which set does this component fit into? */
   3399 			cset = config_sets;
   3400 			while(cset!=NULL) {
   3401 				if (rf_does_it_fit(cset, ac)) {
   3402 					/* looks like it matches... */
   3403 					ac->next = cset->ac;
   3404 					cset->ac = ac;
   3405 					break;
   3406 				}
   3407 				cset = cset->next;
   3408 			}
   3409 			if (cset==NULL) {
   3410 				/* didn't find a match above... new set..*/
   3411 				cset = (RF_ConfigSet_t *)
   3412 					malloc(sizeof(RF_ConfigSet_t),
   3413 					       M_RAIDFRAME, M_NOWAIT);
   3414 				if (cset == NULL) {
   3415 					panic("rf_create_auto_sets: No memory!");
   3416 				}
   3417 				cset->ac = ac;
   3418 				ac->next = NULL;
   3419 				cset->next = config_sets;
   3420 				cset->rootable = 0;
   3421 				config_sets = cset;
   3422 			}
   3423 		}
   3424 		ac = ac_next;
   3425 	}
   3426 
   3427 
   3428 	return(config_sets);
   3429 }
   3430 
   3431 static int
   3432 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3433 {
   3434 	RF_ComponentLabel_t *clabel1, *clabel2;
   3435 
   3436 	/* If this one matches the *first* one in the set, that's good
   3437 	   enough, since the other members of the set would have been
   3438 	   through here too... */
   3439 	/* note that we are not checking partitionSize here..
   3440 
   3441 	   Note that we are also not checking the mod_counters here.
   3442 	   If everything else matches except the mod_counter, that's
   3443 	   good enough for this test.  We will deal with the mod_counters
   3444 	   a little later in the autoconfiguration process.
   3445 
   3446 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3447 
   3448 	   The reason we don't check for this is that failed disks
   3449 	   will have lower modification counts.  If those disks are
   3450 	   not added to the set they used to belong to, then they will
   3451 	   form their own set, which may result in 2 different sets,
   3452 	   for example, competing to be configured at raid0, and
   3453 	   perhaps competing to be the root filesystem set.  If the
   3454 	   wrong ones get configured, or both attempt to become /,
   3455 	   weird behaviour and or serious lossage will occur.  Thus we
   3456 	   need to bring them into the fold here, and kick them out at
   3457 	   a later point.
   3458 
   3459 	*/
   3460 
   3461 	clabel1 = cset->ac->clabel;
   3462 	clabel2 = ac->clabel;
   3463 	if ((clabel1->version == clabel2->version) &&
   3464 	    (clabel1->serial_number == clabel2->serial_number) &&
   3465 	    (clabel1->num_rows == clabel2->num_rows) &&
   3466 	    (clabel1->num_columns == clabel2->num_columns) &&
   3467 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3468 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3469 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3470 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3471 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3472 	    (clabel1->blockSize == clabel2->blockSize) &&
   3473 	    rf_component_label_numblocks(clabel1) ==
   3474 	    rf_component_label_numblocks(clabel2) &&
   3475 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3476 	    (clabel1->root_partition == clabel2->root_partition) &&
   3477 	    (clabel1->last_unit == clabel2->last_unit) &&
   3478 	    (clabel1->config_order == clabel2->config_order)) {
   3479 		/* if it get's here, it almost *has* to be a match */
   3480 	} else {
   3481 		/* it's not consistent with somebody in the set..
   3482 		   punt */
   3483 		return(0);
   3484 	}
   3485 	/* all was fine.. it must fit... */
   3486 	return(1);
   3487 }
   3488 
/*
 * Decide whether a config set has enough live components (at the
 * newest modification count seen in the set) to be configured.
 * Returns 1 if configuration can be attempted, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The highest counter in the set wins: components with lower
	   counters are stale (e.g. from a failed disk). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For every column, look for a component at the winning
	   mod_counter; count the columns that have none. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   at most one.  RAID 1 was fully handled pair-by-pair above. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3591 
   3592 void
   3593 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3594 			RF_Raid_t *raidPtr)
   3595 {
   3596 	RF_ComponentLabel_t *clabel;
   3597 	int i;
   3598 
   3599 	clabel = ac->clabel;
   3600 
   3601 	/* 1. Fill in the common stuff */
   3602 	config->numRow = clabel->num_rows = 1;
   3603 	config->numCol = clabel->num_columns;
   3604 	config->numSpare = 0; /* XXX should this be set here? */
   3605 	config->sectPerSU = clabel->sectPerSU;
   3606 	config->SUsPerPU = clabel->SUsPerPU;
   3607 	config->SUsPerRU = clabel->SUsPerRU;
   3608 	config->parityConfig = clabel->parityConfig;
   3609 	/* XXX... */
   3610 	strcpy(config->diskQueueType,"fifo");
   3611 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3612 	config->layoutSpecificSize = 0; /* XXX ?? */
   3613 
   3614 	while(ac!=NULL) {
   3615 		/* row/col values will be in range due to the checks
   3616 		   in reasonable_label() */
   3617 		strcpy(config->devnames[0][ac->clabel->column],
   3618 		       ac->devname);
   3619 		ac = ac->next;
   3620 	}
   3621 
   3622 	for(i=0;i<RF_MAXDBGV;i++) {
   3623 		config->debugVars[i][0] = 0;
   3624 	}
   3625 }
   3626 
   3627 int
   3628 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3629 {
   3630 	RF_ComponentLabel_t *clabel;
   3631 	int column;
   3632 	int sparecol;
   3633 
   3634 	raidPtr->autoconfigure = new_value;
   3635 
   3636 	for(column=0; column<raidPtr->numCol; column++) {
   3637 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3638 			clabel = raidget_component_label(raidPtr, column);
   3639 			clabel->autoconfigure = new_value;
   3640 			raidflush_component_label(raidPtr, column);
   3641 		}
   3642 	}
   3643 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3644 		sparecol = raidPtr->numCol + column;
   3645 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3646 			clabel = raidget_component_label(raidPtr, sparecol);
   3647 			clabel->autoconfigure = new_value;
   3648 			raidflush_component_label(raidPtr, sparecol);
   3649 		}
   3650 	}
   3651 	return(new_value);
   3652 }
   3653 
   3654 int
   3655 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3656 {
   3657 	RF_ComponentLabel_t *clabel;
   3658 	int column;
   3659 	int sparecol;
   3660 
   3661 	raidPtr->root_partition = new_value;
   3662 	for(column=0; column<raidPtr->numCol; column++) {
   3663 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3664 			clabel = raidget_component_label(raidPtr, column);
   3665 			clabel->root_partition = new_value;
   3666 			raidflush_component_label(raidPtr, column);
   3667 		}
   3668 	}
   3669 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3670 		sparecol = raidPtr->numCol + column;
   3671 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3672 			clabel = raidget_component_label(raidPtr, sparecol);
   3673 			clabel->root_partition = new_value;
   3674 			raidflush_component_label(raidPtr, sparecol);
   3675 		}
   3676 	}
   3677 	return(new_value);
   3678 }
   3679 
   3680 void
   3681 rf_release_all_vps(RF_ConfigSet_t *cset)
   3682 {
   3683 	RF_AutoConfig_t *ac;
   3684 
   3685 	ac = cset->ac;
   3686 	while(ac!=NULL) {
   3687 		/* Close the vp, and give it back */
   3688 		if (ac->vp) {
   3689 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3690 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3691 			vput(ac->vp);
   3692 			ac->vp = NULL;
   3693 		}
   3694 		ac = ac->next;
   3695 	}
   3696 }
   3697 
   3698 
   3699 void
   3700 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3701 {
   3702 	RF_AutoConfig_t *ac;
   3703 	RF_AutoConfig_t *next_ac;
   3704 
   3705 	ac = cset->ac;
   3706 	while(ac!=NULL) {
   3707 		next_ac = ac->next;
   3708 		/* nuke the label */
   3709 		free(ac->clabel, M_RAIDFRAME);
   3710 		/* cleanup the config structure */
   3711 		free(ac, M_RAIDFRAME);
   3712 		/* "next.." */
   3713 		ac = next_ac;
   3714 	}
   3715 	/* and, finally, nuke the config set */
   3716 	free(cset, M_RAIDFRAME);
   3717 }
   3718 
   3719 
/*
 * Initialize a component label from the current state of the RAID
 * set: version, serial/mod counters, geometry, layout parameters and
 * configuration flags.  Per-component fields (row/column) are NOT set
 * here; callers fill those in afterwards.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3752 
/*
 * rf_auto_config_set(cset) -- bring an auto-detected configuration set
 * to life as a configured RAID device.  Returns the configured softc on
 * success, or NULL if allocation or configuration fails.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/*
	 * Start at the unit recorded in the component label and walk
	 * upward until a free unit is found.  NOTE(review): assumes
	 * raidget() always returns a (possibly freshly allocated) softc,
	 * never NULL -- confirm against raidget()'s definition.
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* Configuration succeeded: attach the disk device. */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the unit we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3826 
   3827 void
   3828 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3829 {
   3830 	struct buf *bp;
   3831 	struct raid_softc *rs;
   3832 
   3833 	bp = (struct buf *)desc->bp;
   3834 	rs = desc->raidPtr->softc;
   3835 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3836 	    (bp->b_flags & B_READ));
   3837 }
   3838 
   3839 void
   3840 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3841 	     size_t xmin, size_t xmax)
   3842 {
   3843 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3844 	pool_sethiwat(p, xmax);
   3845 	pool_prime(p, xmin);
   3846 	pool_setlowat(p, xmin);
   3847 }
   3848 
   3849 /*
   3850  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3851  * if there is IO pending and if that IO could possibly be done for a
   3852  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3853  * otherwise.
   3854  *
   3855  */
   3856 
   3857 int
   3858 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3859 {
   3860 	struct raid_softc *rs = raidPtr->softc;
   3861 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3862 		/* there is work to do */
   3863 		return 0;
   3864 	}
   3865 	/* default is nothing to do */
   3866 	return 1;
   3867 }
   3868 
   3869 int
   3870 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3871 {
   3872 	uint64_t numsecs;
   3873 	unsigned secsize;
   3874 	int error;
   3875 
   3876 	error = getdisksize(vp, &numsecs, &secsize);
   3877 	if (error == 0) {
   3878 		diskPtr->blockSize = secsize;
   3879 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3880 		diskPtr->partitionSize = numsecs;
   3881 		return 0;
   3882 	}
   3883 	return error;
   3884 }
   3885 
   3886 static int
   3887 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3888 {
   3889 	return 1;
   3890 }
   3891 
   3892 static void
   3893 raid_attach(device_t parent, device_t self, void *aux)
   3894 {
   3895 
   3896 }
   3897 
   3898 
   3899 static int
   3900 raid_detach(device_t self, int flags)
   3901 {
   3902 	int error;
   3903 	struct raid_softc *rs = raidget(device_unit(self));
   3904 
   3905 	if (rs == NULL)
   3906 		return ENXIO;
   3907 
   3908 	if ((error = raidlock(rs)) != 0)
   3909 		return (error);
   3910 
   3911 	error = raid_detach_unlocked(rs);
   3912 
   3913 	raidunlock(rs);
   3914 
   3915 	/* XXXkd: raidput(rs) ??? */
   3916 
   3917 	return error;
   3918 }
   3919 
   3920 static void
   3921 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3922 {
   3923 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3924 
   3925 	memset(dg, 0, sizeof(*dg));
   3926 
   3927 	dg->dg_secperunit = raidPtr->totalSectors;
   3928 	dg->dg_secsize = raidPtr->bytesPerSector;
   3929 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3930 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3931 
   3932 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3933 }
   3934 
   3935 /*
   3936  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3937  * We end up returning whatever error was returned by the first cache flush
   3938  * that fails.
   3939  */
   3940 
   3941 int
   3942 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3943 {
   3944 	int c, sparecol;
   3945 	int e,error;
   3946 	int force = 1;
   3947 
   3948 	error = 0;
   3949 	for (c = 0; c < raidPtr->numCol; c++) {
   3950 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3951 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3952 					  &force, FWRITE, NOCRED);
   3953 			if (e) {
   3954 				if (e != ENODEV)
   3955 					printf("raid%d: cache flush to component %s failed.\n",
   3956 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3957 				if (error == 0) {
   3958 					error = e;
   3959 				}
   3960 			}
   3961 		}
   3962 	}
   3963 
   3964 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3965 		sparecol = raidPtr->numCol + c;
   3966 		/* Need to ensure that the reconstruct actually completed! */
   3967 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3968 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3969 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3970 			if (e) {
   3971 				if (e != ENODEV)
   3972 					printf("raid%d: cache flush to component %s failed.\n",
   3973 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3974 				if (error == 0) {
   3975 					error = e;
   3976 				}
   3977 			}
   3978 		}
   3979 	}
   3980 	return error;
   3981 }
   3982