/* rf_netbsdkintf.c, revision 1.298.2.4 (source-browser header removed) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.298.2.4 2014/08/20 00:03:49 tls Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.298.2.4 2014/08/20 00:03:49 tls Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
/* Block device switch: raid(4) presents itself as a disk (D_DISK). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character (raw) device switch; unsupported operations use the no* stubs. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    232 
static void	raidminphys(struct buf *);

/* disk(9) driver glue: strategy and minphys hooks for the dk layer. */
static struct dkdriver rf_dkdriver = { raidstrategy, raidminphys };
    236 
/*
 * Per-unit software state for a raid(4) device.  Instances are kept on
 * the global "raids" list (protected by raid_lock) and are created
 * lazily by raidget().
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */
    256 
    257 #define	raidunit(x)	DISKUNIT(x)
    258 
    259 extern struct cfdriver raid_cd;
    260 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    261     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    262     DVF_DETACH_SHUTDOWN);
    263 
    264 /*
    265  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    266  * Be aware that large numbers can allow the driver to consume a lot of
    267  * kernel memory, especially on writes, and in degraded mode reads.
    268  *
    269  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    270  * a single 64K write will typically require 64K for the old data,
    271  * 64K for the old parity, and 64K for the new parity, for a total
    272  * of 192K (if the parity buffer is not re-used immediately).
    273  * Even it if is used immediately, that's still 128K, which when multiplied
    274  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    275  *
    276  * Now in degraded mode, for example, a 64K read on the above setup may
    277  * require data reconstruction, which will require *all* of the 4 remaining
    278  * disks to participate -- 4 * 32K/disk == 128K again.
    279  */
    280 
    281 #ifndef RAIDOUTSTANDING
    282 #define RAIDOUTSTANDING   6
    283 #endif
    284 
    285 #define RAIDLABELDEV(dev)	\
    286 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    287 
    288 /* declared here, and made public, for the benefit of KVM stuff.. */
    289 
    290 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    291 				     struct disklabel *);
    292 static void raidgetdisklabel(dev_t);
    293 static void raidmakedisklabel(struct raid_softc *);
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 void rf_ReconThread(struct rf_recon_req *);
    304 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    307 int rf_autoconfig(device_t);
    308 void rf_buildroothack(RF_ConfigSet_t *);
    309 
    310 RF_AutoConfig_t *rf_find_raid_components(void);
    311 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    313 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    314 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    315 int rf_set_autoconfig(RF_Raid_t *, int);
    316 int rf_set_rootpartition(RF_Raid_t *, int);
    317 void rf_release_all_vps(RF_ConfigSet_t *);
    318 void rf_cleanup_config_set(RF_ConfigSet_t *);
    319 int rf_have_enough_components(RF_ConfigSet_t *);
    320 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    322 
    323 /*
    324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    326  * in the kernel config file.
    327  */
    328 #ifdef RAID_AUTOCONFIG
    329 int raidautoconfig = 1;
    330 #else
    331 int raidautoconfig = 0;
    332 #endif
    333 static bool raidautoconfigdone = false;
    334 
    335 struct RF_Pools_s rf_pools;
    336 
    337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    338 static kmutex_t raid_lock;
    339 
    340 static struct raid_softc *
    341 raidcreate(int unit) {
    342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    343 	if (sc == NULL) {
    344 #ifdef DIAGNOSTIC
    345 		printf("%s: out of memory\n", __func__);
    346 #endif
    347 		return NULL;
    348 	}
    349 	sc->sc_unit = unit;
    350 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    351 	return sc;
    352 }
    353 
/*
 * Release all resources held by a raid_softc obtained from raidcreate().
 * The caller must already have unlinked it from the global list (see
 * raidput(), which does both).
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    359 
    360 static struct raid_softc *
    361 raidget(int unit) {
    362 	struct raid_softc *sc;
    363 	if (unit < 0) {
    364 #ifdef DIAGNOSTIC
    365 		panic("%s: unit %d!", __func__, unit);
    366 #endif
    367 		return NULL;
    368 	}
    369 	mutex_enter(&raid_lock);
    370 	LIST_FOREACH(sc, &raids, sc_link) {
    371 		if (sc->sc_unit == unit) {
    372 			mutex_exit(&raid_lock);
    373 			return sc;
    374 		}
    375 	}
    376 	mutex_exit(&raid_lock);
    377 	if ((sc = raidcreate(unit)) == NULL)
    378 		return NULL;
    379 	mutex_enter(&raid_lock);
    380 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    381 	mutex_exit(&raid_lock);
    382 	return sc;
    383 }
    384 
/*
 * Unlink a softc from the global list (under raid_lock) and free it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    392 
/*
 * Driver attach entry point, called once during boot.  "num" is not
 * used here; units are created lazily by raidget().  Initializes the
 * global driver state, boots the RAIDframe core (panicking on failure),
 * attaches the autoconf glue, and registers the finalizer that performs
 * RAID autoconfiguration once all real hardware has been found.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table plumbing, only for declustered-DS layouts. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    425 
    426 int
    427 rf_autoconfig(device_t self)
    428 {
    429 	RF_AutoConfig_t *ac_list;
    430 	RF_ConfigSet_t *config_sets;
    431 
    432 	if (!raidautoconfig || raidautoconfigdone == true)
    433 		return (0);
    434 
    435 	/* XXX This code can only be run once. */
    436 	raidautoconfigdone = true;
    437 
    438 #ifdef __HAVE_CPU_BOOTCONF
    439 	/*
    440 	 * 0. find the boot device if needed first so we can use it later
    441 	 * this needs to be done before we autoconfigure any raid sets,
    442 	 * because if we use wedges we are not going to be able to open
    443 	 * the boot device later
    444 	 */
    445 	if (booted_device == NULL)
    446 		cpu_bootconf();
    447 #endif
    448 	/* 1. locate all RAID components on the system */
    449 	aprint_debug("Searching for RAID components...\n");
    450 	ac_list = rf_find_raid_components();
    451 
    452 	/* 2. Sort them into their respective sets. */
    453 	config_sets = rf_create_auto_sets(ac_list);
    454 
    455 	/*
    456 	 * 3. Evaluate each set and configure the valid ones.
    457 	 * This gets done in rf_buildroothack().
    458 	 */
    459 	rf_buildroothack(config_sets);
    460 
    461 	return 1;
    462 }
    463 
/*
 * Return non-zero if the RAID set "r" contains the boot device "bdv"
 * as one of its components.  Comparison is by name prefix, so component
 * "wd0a" matches boot device "wd0"; for wedge (dk*) components the
 * wedge's parent disk name is compared instead.
 * NOTE(review): the prefix compare would also match e.g. component
 * "wd10a" against boot device "wd1" -- presumed acceptable here, but
 * worth confirming.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* Skip the "/dev/" prefix of the stored component path. */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* A wedge: compare against the parent disk name. */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    487 
/*
 * Walk the list of configuration sets, autoconfigure those that are
 * complete and marked for autoconfiguration, and then (unless the user
 * hardwired root via rootspec) try to decide whether one of the newly
 * configured sets should supply the root file system.  All sets are
 * cleaned up regardless of whether they were configured.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				/* remember the (last) root-eligible set */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* decide whether a configured set should become the root device */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		/* override root if forced, or if this set holds the boot dev */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* narrow the candidates to sets containing the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    598 
    599 
    600 int
    601 raidsize(dev_t dev)
    602 {
    603 	struct raid_softc *rs;
    604 	struct disklabel *lp;
    605 	int     part, unit, omask, size;
    606 
    607 	unit = raidunit(dev);
    608 	if ((rs = raidget(unit)) == NULL)
    609 		return -1;
    610 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    611 		return (-1);
    612 
    613 	part = DISKPART(dev);
    614 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    615 	lp = rs->sc_dkdev.dk_label;
    616 
    617 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    618 		return (-1);
    619 
    620 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    621 		size = -1;
    622 	else
    623 		size = lp->d_partitions[part].p_size *
    624 		    (lp->d_secsize / DEV_BSIZE);
    625 
    626 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    627 		return (-1);
    628 
    629 	return (size);
    630 
    631 }
    632 
/*
 * Kernel crash-dump entry point (bdevsw d_dump).  Only RAID 1 sets
 * (one data column, one parity column) are supported.  A live or
 * spared component is chosen in preference order (master, spared
 * master, slave, spared slave) and the dump is forwarded to that
 * component's own d_dump routine, with blkno translated to the
 * underlying component's address space.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* the dump size must be a whole number of DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse dumps that would run past the end of the raid device */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/*
 * Open the raid device (bdevsw/cdevsw d_open).  Validates that the
 * requested partition exists, refuses non-raw opens while wedges are
 * configured, reads the disklabel on first open, and on the first open
 * of an initialized unit marks all components dirty.  Note the "bad"
 * label is also the normal exit path: it drops the unit lock and
 * returns "error", which is 0 on success.
 */
/* ARGSUSED */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is being torn down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of an initialized unit: (re)read the disklabel */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
/*
 * Close the raid device (bdevsw/cdevsw d_close).  Clears the open mask
 * for the partition; on the last close of an initialized unit, the
 * component labels are given a final update to mark the set clean.
 * Always returns 0 once the unit lock has been obtained.
 */
/* ARGSUSED */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
    902 
    903 void
    904 raidstrategy(struct buf *bp)
    905 {
    906 	unsigned int unit = raidunit(bp->b_dev);
    907 	RF_Raid_t *raidPtr;
    908 	int     wlabel;
    909 	struct raid_softc *rs;
    910 
    911 	if ((rs = raidget(unit)) == NULL) {
    912 		bp->b_error = ENXIO;
    913 		goto done;
    914 	}
    915 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    916 		bp->b_error = ENXIO;
    917 		goto done;
    918 	}
    919 	raidPtr = &rs->sc_r;
    920 	if (!raidPtr->valid) {
    921 		bp->b_error = ENODEV;
    922 		goto done;
    923 	}
    924 	if (bp->b_bcount == 0) {
    925 		db1_printf(("b_bcount is zero..\n"));
    926 		goto done;
    927 	}
    928 
    929 	/*
    930 	 * Do bounds checking and adjust transfer.  If there's an
    931 	 * error, the bounds check will flag that for us.
    932 	 */
    933 
    934 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    935 	if (DISKPART(bp->b_dev) == RAW_PART) {
    936 		uint64_t size; /* device size in DEV_BSIZE unit */
    937 
    938 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    939 			size = raidPtr->totalSectors <<
    940 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    941 		} else {
    942 			size = raidPtr->totalSectors >>
    943 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    944 		}
    945 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    946 			goto done;
    947 		}
    948 	} else {
    949 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    950 			db1_printf(("Bounds check failed!!:%d %d\n",
    951 				(int) bp->b_blkno, (int) wlabel));
    952 			goto done;
    953 		}
    954 	}
    955 
    956 	rf_lock_mutex2(raidPtr->iodone_lock);
    957 
    958 	bp->b_resid = 0;
    959 
    960 	/* stuff it onto our queue */
    961 	bufq_put(rs->buf_queue, bp);
    962 
    963 	/* scheduled the IO to happen at the next convenient time */
    964 	rf_signal_cond2(raidPtr->iodone_cv);
    965 	rf_unlock_mutex2(raidPtr->iodone_lock);
    966 
    967 	return;
    968 
    969 done:
    970 	bp->b_resid = bp->b_bcount;
    971 	biodone(bp);
    972 }
    973 /* ARGSUSED */
    974 int
    975 raidread(dev_t dev, struct uio *uio, int flags)
    976 {
    977 	int     unit = raidunit(dev);
    978 	struct raid_softc *rs;
    979 
    980 	if ((rs = raidget(unit)) == NULL)
    981 		return ENXIO;
    982 
    983 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    984 		return (ENXIO);
    985 
    986 	return (physio(raidstrategy, NULL, dev, B_READ, raidminphys, uio));
    987 
    988 }
    989 /* ARGSUSED */
    990 int
    991 raidwrite(dev_t dev, struct uio *uio, int flags)
    992 {
    993 	int     unit = raidunit(dev);
    994 	struct raid_softc *rs;
    995 
    996 	if ((rs = raidget(unit)) == NULL)
    997 		return ENXIO;
    998 
    999 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1000 		return (ENXIO);
   1001 
   1002 	return (physio(raidstrategy, NULL, dev, B_WRITE, raidminphys, uio));
   1003 
   1004 }
   1005 
   1006 static int
   1007 raid_detach_unlocked(struct raid_softc *rs)
   1008 {
   1009 	int error;
   1010 	RF_Raid_t *raidPtr;
   1011 
   1012 	raidPtr = &rs->sc_r;
   1013 
   1014 	/*
   1015 	 * If somebody has a partition mounted, we shouldn't
   1016 	 * shutdown.
   1017 	 */
   1018 	if (rs->sc_dkdev.dk_openmask != 0)
   1019 		return EBUSY;
   1020 
   1021 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1022 		;	/* not initialized: nothing to do */
   1023 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1024 		return error;
   1025 	else
   1026 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1027 
   1028 	/* Detach the disk. */
   1029 	dkwedge_delall(&rs->sc_dkdev);
   1030 	disk_detach(&rs->sc_dkdev);
   1031 	disk_destroy(&rs->sc_dkdev);
   1032 
   1033 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1034 
   1035 	return 0;
   1036 }
   1037 
   1038 int
   1039 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1040 {
   1041 	int     unit = raidunit(dev);
   1042 	int     error = 0;
   1043 	int     part, pmask, s;
   1044 	cfdata_t cf;
   1045 	struct raid_softc *rs;
   1046 	RF_Config_t *k_cfg, *u_cfg;
   1047 	RF_Raid_t *raidPtr;
   1048 	RF_RaidDisk_t *diskPtr;
   1049 	RF_AccTotals_t *totals;
   1050 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1051 	u_char *specific_buf;
   1052 	int retcode = 0;
   1053 	int column;
   1054 /*	int raidid; */
   1055 	struct rf_recon_req *rrcopy, *rr;
   1056 	RF_ComponentLabel_t *clabel;
   1057 	RF_ComponentLabel_t *ci_label;
   1058 	RF_ComponentLabel_t **clabel_ptr;
   1059 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1060 	RF_SingleComponent_t component;
   1061 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1062 	int i, j, d;
   1063 #ifdef __HAVE_OLD_DISKLABEL
   1064 	struct disklabel newlabel;
   1065 #endif
   1066 	struct dkwedge_info *dkw;
   1067 
   1068 	if ((rs = raidget(unit)) == NULL)
   1069 		return ENXIO;
   1070 	raidPtr = &rs->sc_r;
   1071 
   1072 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1073 		(int) DISKPART(dev), (int) unit, cmd));
   1074 
   1075 	/* Must be open for writes for these commands... */
   1076 	switch (cmd) {
   1077 #ifdef DIOCGSECTORSIZE
   1078 	case DIOCGSECTORSIZE:
   1079 		*(u_int *)data = raidPtr->bytesPerSector;
   1080 		return 0;
   1081 	case DIOCGMEDIASIZE:
   1082 		*(off_t *)data =
   1083 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1084 		return 0;
   1085 #endif
   1086 	case DIOCSDINFO:
   1087 	case DIOCWDINFO:
   1088 #ifdef __HAVE_OLD_DISKLABEL
   1089 	case ODIOCWDINFO:
   1090 	case ODIOCSDINFO:
   1091 #endif
   1092 	case DIOCWLABEL:
   1093 	case DIOCAWEDGE:
   1094 	case DIOCDWEDGE:
   1095 	case DIOCSSTRATEGY:
   1096 		if ((flag & FWRITE) == 0)
   1097 			return (EBADF);
   1098 	}
   1099 
   1100 	/* Must be initialized for these... */
   1101 	switch (cmd) {
   1102 	case DIOCGDINFO:
   1103 	case DIOCSDINFO:
   1104 	case DIOCWDINFO:
   1105 #ifdef __HAVE_OLD_DISKLABEL
   1106 	case ODIOCGDINFO:
   1107 	case ODIOCWDINFO:
   1108 	case ODIOCSDINFO:
   1109 	case ODIOCGDEFLABEL:
   1110 #endif
   1111 	case DIOCGPART:
   1112 	case DIOCWLABEL:
   1113 	case DIOCGDEFLABEL:
   1114 	case DIOCAWEDGE:
   1115 	case DIOCDWEDGE:
   1116 	case DIOCLWEDGES:
   1117 	case DIOCCACHESYNC:
   1118 	case RAIDFRAME_SHUTDOWN:
   1119 	case RAIDFRAME_REWRITEPARITY:
   1120 	case RAIDFRAME_GET_INFO:
   1121 	case RAIDFRAME_RESET_ACCTOTALS:
   1122 	case RAIDFRAME_GET_ACCTOTALS:
   1123 	case RAIDFRAME_KEEP_ACCTOTALS:
   1124 	case RAIDFRAME_GET_SIZE:
   1125 	case RAIDFRAME_FAIL_DISK:
   1126 	case RAIDFRAME_COPYBACK:
   1127 	case RAIDFRAME_CHECK_RECON_STATUS:
   1128 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1129 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1130 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1131 	case RAIDFRAME_ADD_HOT_SPARE:
   1132 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1133 	case RAIDFRAME_INIT_LABELS:
   1134 	case RAIDFRAME_REBUILD_IN_PLACE:
   1135 	case RAIDFRAME_CHECK_PARITY:
   1136 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1137 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1138 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1139 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1140 	case RAIDFRAME_SET_AUTOCONFIG:
   1141 	case RAIDFRAME_SET_ROOT:
   1142 	case RAIDFRAME_DELETE_COMPONENT:
   1143 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1144 	case RAIDFRAME_PARITYMAP_STATUS:
   1145 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1146 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1147 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1148 	case DIOCGSTRATEGY:
   1149 	case DIOCSSTRATEGY:
   1150 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1151 			return (ENXIO);
   1152 	}
   1153 
   1154 	switch (cmd) {
   1155 #ifdef COMPAT_50
   1156 	case RAIDFRAME_GET_INFO50:
   1157 		return rf_get_info50(raidPtr, data);
   1158 
   1159 	case RAIDFRAME_CONFIGURE50:
   1160 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1161 			return retcode;
   1162 		goto config;
   1163 #endif
   1164 		/* configure the system */
   1165 	case RAIDFRAME_CONFIGURE:
   1166 
   1167 		if (raidPtr->valid) {
   1168 			/* There is a valid RAID set running on this unit! */
   1169 			printf("raid%d: Device already configured!\n",unit);
   1170 			return(EINVAL);
   1171 		}
   1172 
   1173 		/* copy-in the configuration information */
   1174 		/* data points to a pointer to the configuration structure */
   1175 
   1176 		u_cfg = *((RF_Config_t **) data);
   1177 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1178 		if (k_cfg == NULL) {
   1179 			return (ENOMEM);
   1180 		}
   1181 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1182 		if (retcode) {
   1183 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1184 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1185 				retcode));
   1186 			return (retcode);
   1187 		}
   1188 		goto config;
   1189 	config:
   1190 		/* allocate a buffer for the layout-specific data, and copy it
   1191 		 * in */
   1192 		if (k_cfg->layoutSpecificSize) {
   1193 			if (k_cfg->layoutSpecificSize > 10000) {
   1194 				/* sanity check */
   1195 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1196 				return (EINVAL);
   1197 			}
   1198 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1199 			    (u_char *));
   1200 			if (specific_buf == NULL) {
   1201 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1202 				return (ENOMEM);
   1203 			}
   1204 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1205 			    k_cfg->layoutSpecificSize);
   1206 			if (retcode) {
   1207 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1208 				RF_Free(specific_buf,
   1209 					k_cfg->layoutSpecificSize);
   1210 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1211 					retcode));
   1212 				return (retcode);
   1213 			}
   1214 		} else
   1215 			specific_buf = NULL;
   1216 		k_cfg->layoutSpecific = specific_buf;
   1217 
   1218 		/* should do some kind of sanity check on the configuration.
   1219 		 * Store the sum of all the bytes in the last byte? */
   1220 
   1221 		/* configure the system */
   1222 
   1223 		/*
   1224 		 * Clear the entire RAID descriptor, just to make sure
   1225 		 *  there is no stale data left in the case of a
   1226 		 *  reconfiguration
   1227 		 */
   1228 		memset(raidPtr, 0, sizeof(*raidPtr));
   1229 		raidPtr->softc = rs;
   1230 		raidPtr->raidid = unit;
   1231 
   1232 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1233 
   1234 		if (retcode == 0) {
   1235 
   1236 			/* allow this many simultaneous IO's to
   1237 			   this RAID device */
   1238 			raidPtr->openings = RAIDOUTSTANDING;
   1239 
   1240 			raidinit(rs);
   1241 			rf_markalldirty(raidPtr);
   1242 		}
   1243 		/* free the buffers.  No return code here. */
   1244 		if (k_cfg->layoutSpecificSize) {
   1245 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1246 		}
   1247 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1248 
   1249 		return (retcode);
   1250 
   1251 		/* shutdown the system */
   1252 	case RAIDFRAME_SHUTDOWN:
   1253 
   1254 		part = DISKPART(dev);
   1255 		pmask = (1 << part);
   1256 
   1257 		if ((error = raidlock(rs)) != 0)
   1258 			return (error);
   1259 
   1260 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1261 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1262 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1263 			retcode = EBUSY;
   1264 		else {
   1265 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1266 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1267 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1268 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1269 			retcode = 0;
   1270 		}
   1271 
   1272 		raidunlock(rs);
   1273 
   1274 		if (retcode != 0)
   1275 			return retcode;
   1276 
   1277 		/* free the pseudo device attach bits */
   1278 
   1279 		cf = device_cfdata(rs->sc_dev);
   1280 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1281 			free(cf, M_RAIDFRAME);
   1282 
   1283 		return (retcode);
   1284 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1285 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1286 		/* need to read the component label for the disk indicated
   1287 		   by row,column in clabel */
   1288 
   1289 		/*
   1290 		 * Perhaps there should be an option to skip the in-core
   1291 		 * copy and hit the disk, as with disklabel(8).
   1292 		 */
   1293 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1294 
   1295 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1296 
   1297 		if (retcode) {
   1298 			RF_Free(clabel, sizeof(*clabel));
   1299 			return retcode;
   1300 		}
   1301 
   1302 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1303 
   1304 		column = clabel->column;
   1305 
   1306 		if ((column < 0) || (column >= raidPtr->numCol +
   1307 		    raidPtr->numSpare)) {
   1308 			RF_Free(clabel, sizeof(*clabel));
   1309 			return EINVAL;
   1310 		}
   1311 
   1312 		RF_Free(clabel, sizeof(*clabel));
   1313 
   1314 		clabel = raidget_component_label(raidPtr, column);
   1315 
   1316 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1317 
   1318 #if 0
   1319 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1320 		clabel = (RF_ComponentLabel_t *) data;
   1321 
   1322 		/* XXX check the label for valid stuff... */
   1323 		/* Note that some things *should not* get modified --
   1324 		   the user should be re-initing the labels instead of
   1325 		   trying to patch things.
   1326 		   */
   1327 
   1328 		raidid = raidPtr->raidid;
   1329 #ifdef DEBUG
   1330 		printf("raid%d: Got component label:\n", raidid);
   1331 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1332 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1333 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1334 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1335 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1336 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1337 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1338 #endif
   1339 		clabel->row = 0;
   1340 		column = clabel->column;
   1341 
   1342 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1343 			return(EINVAL);
   1344 		}
   1345 
   1346 		/* XXX this isn't allowed to do anything for now :-) */
   1347 
   1348 		/* XXX and before it is, we need to fill in the rest
   1349 		   of the fields!?!?!?! */
   1350 		memcpy(raidget_component_label(raidPtr, column),
   1351 		    clabel, sizeof(*clabel));
   1352 		raidflush_component_label(raidPtr, column);
   1353 		return (0);
   1354 #endif
   1355 
   1356 	case RAIDFRAME_INIT_LABELS:
   1357 		clabel = (RF_ComponentLabel_t *) data;
   1358 		/*
   1359 		   we only want the serial number from
   1360 		   the above.  We get all the rest of the information
   1361 		   from the config that was used to create this RAID
   1362 		   set.
   1363 		   */
   1364 
   1365 		raidPtr->serial_number = clabel->serial_number;
   1366 
   1367 		for(column=0;column<raidPtr->numCol;column++) {
   1368 			diskPtr = &raidPtr->Disks[column];
   1369 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1370 				ci_label = raidget_component_label(raidPtr,
   1371 				    column);
   1372 				/* Zeroing this is important. */
   1373 				memset(ci_label, 0, sizeof(*ci_label));
   1374 				raid_init_component_label(raidPtr, ci_label);
   1375 				ci_label->serial_number =
   1376 				    raidPtr->serial_number;
   1377 				ci_label->row = 0; /* we dont' pretend to support more */
   1378 				rf_component_label_set_partitionsize(ci_label,
   1379 				    diskPtr->partitionSize);
   1380 				ci_label->column = column;
   1381 				raidflush_component_label(raidPtr, column);
   1382 			}
   1383 			/* XXXjld what about the spares? */
   1384 		}
   1385 
   1386 		return (retcode);
   1387 	case RAIDFRAME_SET_AUTOCONFIG:
   1388 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1389 		printf("raid%d: New autoconfig value is: %d\n",
   1390 		       raidPtr->raidid, d);
   1391 		*(int *) data = d;
   1392 		return (retcode);
   1393 
   1394 	case RAIDFRAME_SET_ROOT:
   1395 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1396 		printf("raid%d: New rootpartition value is: %d\n",
   1397 		       raidPtr->raidid, d);
   1398 		*(int *) data = d;
   1399 		return (retcode);
   1400 
   1401 		/* initialize all parity */
   1402 	case RAIDFRAME_REWRITEPARITY:
   1403 
   1404 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1405 			/* Parity for RAID 0 is trivially correct */
   1406 			raidPtr->parity_good = RF_RAID_CLEAN;
   1407 			return(0);
   1408 		}
   1409 
   1410 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1411 			/* Re-write is already in progress! */
   1412 			return(EINVAL);
   1413 		}
   1414 
   1415 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1416 					   rf_RewriteParityThread,
   1417 					   raidPtr,"raid_parity");
   1418 		return (retcode);
   1419 
   1420 
   1421 	case RAIDFRAME_ADD_HOT_SPARE:
   1422 		sparePtr = (RF_SingleComponent_t *) data;
   1423 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1424 		retcode = rf_add_hot_spare(raidPtr, &component);
   1425 		return(retcode);
   1426 
   1427 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1428 		return(retcode);
   1429 
   1430 	case RAIDFRAME_DELETE_COMPONENT:
   1431 		componentPtr = (RF_SingleComponent_t *)data;
   1432 		memcpy( &component, componentPtr,
   1433 			sizeof(RF_SingleComponent_t));
   1434 		retcode = rf_delete_component(raidPtr, &component);
   1435 		return(retcode);
   1436 
   1437 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1438 		componentPtr = (RF_SingleComponent_t *)data;
   1439 		memcpy( &component, componentPtr,
   1440 			sizeof(RF_SingleComponent_t));
   1441 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1442 		return(retcode);
   1443 
   1444 	case RAIDFRAME_REBUILD_IN_PLACE:
   1445 
   1446 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1447 			/* Can't do this on a RAID 0!! */
   1448 			return(EINVAL);
   1449 		}
   1450 
   1451 		if (raidPtr->recon_in_progress == 1) {
   1452 			/* a reconstruct is already in progress! */
   1453 			return(EINVAL);
   1454 		}
   1455 
   1456 		componentPtr = (RF_SingleComponent_t *) data;
   1457 		memcpy( &component, componentPtr,
   1458 			sizeof(RF_SingleComponent_t));
   1459 		component.row = 0; /* we don't support any more */
   1460 		column = component.column;
   1461 
   1462 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1463 			return(EINVAL);
   1464 		}
   1465 
   1466 		rf_lock_mutex2(raidPtr->mutex);
   1467 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1468 		    (raidPtr->numFailures > 0)) {
   1469 			/* XXX 0 above shouldn't be constant!!! */
   1470 			/* some component other than this has failed.
   1471 			   Let's not make things worse than they already
   1472 			   are... */
   1473 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1474 			       raidPtr->raidid);
   1475 			printf("raid%d:     Col: %d   Too many failures.\n",
   1476 			       raidPtr->raidid, column);
   1477 			rf_unlock_mutex2(raidPtr->mutex);
   1478 			return (EINVAL);
   1479 		}
   1480 		if (raidPtr->Disks[column].status ==
   1481 		    rf_ds_reconstructing) {
   1482 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1483 			       raidPtr->raidid);
   1484 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1485 
   1486 			rf_unlock_mutex2(raidPtr->mutex);
   1487 			return (EINVAL);
   1488 		}
   1489 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1490 			rf_unlock_mutex2(raidPtr->mutex);
   1491 			return (EINVAL);
   1492 		}
   1493 		rf_unlock_mutex2(raidPtr->mutex);
   1494 
   1495 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1496 		if (rrcopy == NULL)
   1497 			return(ENOMEM);
   1498 
   1499 		rrcopy->raidPtr = (void *) raidPtr;
   1500 		rrcopy->col = column;
   1501 
   1502 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1503 					   rf_ReconstructInPlaceThread,
   1504 					   rrcopy,"raid_reconip");
   1505 		return(retcode);
   1506 
   1507 	case RAIDFRAME_GET_INFO:
   1508 		if (!raidPtr->valid)
   1509 			return (ENODEV);
   1510 		ucfgp = (RF_DeviceConfig_t **) data;
   1511 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1512 			  (RF_DeviceConfig_t *));
   1513 		if (d_cfg == NULL)
   1514 			return (ENOMEM);
   1515 		d_cfg->rows = 1; /* there is only 1 row now */
   1516 		d_cfg->cols = raidPtr->numCol;
   1517 		d_cfg->ndevs = raidPtr->numCol;
   1518 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1519 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1520 			return (ENOMEM);
   1521 		}
   1522 		d_cfg->nspares = raidPtr->numSpare;
   1523 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1524 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1525 			return (ENOMEM);
   1526 		}
   1527 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1528 		d = 0;
   1529 		for (j = 0; j < d_cfg->cols; j++) {
   1530 			d_cfg->devs[d] = raidPtr->Disks[j];
   1531 			d++;
   1532 		}
   1533 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1534 			d_cfg->spares[i] = raidPtr->Disks[j];
   1535 		}
   1536 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1537 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1538 
   1539 		return (retcode);
   1540 
   1541 	case RAIDFRAME_CHECK_PARITY:
   1542 		*(int *) data = raidPtr->parity_good;
   1543 		return (0);
   1544 
   1545 	case RAIDFRAME_PARITYMAP_STATUS:
   1546 		if (rf_paritymap_ineligible(raidPtr))
   1547 			return EINVAL;
   1548 		rf_paritymap_status(raidPtr->parity_map,
   1549 		    (struct rf_pmstat *)data);
   1550 		return 0;
   1551 
   1552 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1553 		if (rf_paritymap_ineligible(raidPtr))
   1554 			return EINVAL;
   1555 		if (raidPtr->parity_map == NULL)
   1556 			return ENOENT; /* ??? */
   1557 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1558 			(struct rf_pmparams *)data, 1))
   1559 			return EINVAL;
   1560 		return 0;
   1561 
   1562 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1563 		if (rf_paritymap_ineligible(raidPtr))
   1564 			return EINVAL;
   1565 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1566 		return 0;
   1567 
   1568 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1569 		if (rf_paritymap_ineligible(raidPtr))
   1570 			return EINVAL;
   1571 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1572 		/* XXX should errors be passed up? */
   1573 		return 0;
   1574 
   1575 	case RAIDFRAME_RESET_ACCTOTALS:
   1576 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1577 		return (0);
   1578 
   1579 	case RAIDFRAME_GET_ACCTOTALS:
   1580 		totals = (RF_AccTotals_t *) data;
   1581 		*totals = raidPtr->acc_totals;
   1582 		return (0);
   1583 
   1584 	case RAIDFRAME_KEEP_ACCTOTALS:
   1585 		raidPtr->keep_acc_totals = *(int *)data;
   1586 		return (0);
   1587 
   1588 	case RAIDFRAME_GET_SIZE:
   1589 		*(int *) data = raidPtr->totalSectors;
   1590 		return (0);
   1591 
   1592 		/* fail a disk & optionally start reconstruction */
   1593 	case RAIDFRAME_FAIL_DISK:
   1594 
   1595 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1596 			/* Can't do this on a RAID 0!! */
   1597 			return(EINVAL);
   1598 		}
   1599 
   1600 		rr = (struct rf_recon_req *) data;
   1601 		rr->row = 0;
   1602 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1603 			return (EINVAL);
   1604 
   1605 
   1606 		rf_lock_mutex2(raidPtr->mutex);
   1607 		if (raidPtr->status == rf_rs_reconstructing) {
   1608 			/* you can't fail a disk while we're reconstructing! */
   1609 			/* XXX wrong for RAID6 */
   1610 			rf_unlock_mutex2(raidPtr->mutex);
   1611 			return (EINVAL);
   1612 		}
   1613 		if ((raidPtr->Disks[rr->col].status ==
   1614 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1615 			/* some other component has failed.  Let's not make
   1616 			   things worse. XXX wrong for RAID6 */
   1617 			rf_unlock_mutex2(raidPtr->mutex);
   1618 			return (EINVAL);
   1619 		}
   1620 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1621 			/* Can't fail a spared disk! */
   1622 			rf_unlock_mutex2(raidPtr->mutex);
   1623 			return (EINVAL);
   1624 		}
   1625 		rf_unlock_mutex2(raidPtr->mutex);
   1626 
   1627 		/* make a copy of the recon request so that we don't rely on
   1628 		 * the user's buffer */
   1629 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1630 		if (rrcopy == NULL)
   1631 			return(ENOMEM);
   1632 		memcpy(rrcopy, rr, sizeof(*rr));
   1633 		rrcopy->raidPtr = (void *) raidPtr;
   1634 
   1635 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1636 					   rf_ReconThread,
   1637 					   rrcopy,"raid_recon");
   1638 		return (0);
   1639 
   1640 		/* invoke a copyback operation after recon on whatever disk
   1641 		 * needs it, if any */
   1642 	case RAIDFRAME_COPYBACK:
   1643 
   1644 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1645 			/* This makes no sense on a RAID 0!! */
   1646 			return(EINVAL);
   1647 		}
   1648 
   1649 		if (raidPtr->copyback_in_progress == 1) {
   1650 			/* Copyback is already in progress! */
   1651 			return(EINVAL);
   1652 		}
   1653 
   1654 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1655 					   rf_CopybackThread,
   1656 					   raidPtr,"raid_copyback");
   1657 		return (retcode);
   1658 
   1659 		/* return the percentage completion of reconstruction */
   1660 	case RAIDFRAME_CHECK_RECON_STATUS:
   1661 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1662 			/* This makes no sense on a RAID 0, so tell the
   1663 			   user it's done. */
   1664 			*(int *) data = 100;
   1665 			return(0);
   1666 		}
   1667 		if (raidPtr->status != rf_rs_reconstructing)
   1668 			*(int *) data = 100;
   1669 		else {
   1670 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1671 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1672 			} else {
   1673 				*(int *) data = 0;
   1674 			}
   1675 		}
   1676 		return (0);
   1677 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1678 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1679 		if (raidPtr->status != rf_rs_reconstructing) {
   1680 			progressInfo.remaining = 0;
   1681 			progressInfo.completed = 100;
   1682 			progressInfo.total = 100;
   1683 		} else {
   1684 			progressInfo.total =
   1685 				raidPtr->reconControl->numRUsTotal;
   1686 			progressInfo.completed =
   1687 				raidPtr->reconControl->numRUsComplete;
   1688 			progressInfo.remaining = progressInfo.total -
   1689 				progressInfo.completed;
   1690 		}
   1691 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1692 				  sizeof(RF_ProgressInfo_t));
   1693 		return (retcode);
   1694 
   1695 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1696 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1697 			/* This makes no sense on a RAID 0, so tell the
   1698 			   user it's done. */
   1699 			*(int *) data = 100;
   1700 			return(0);
   1701 		}
   1702 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1703 			*(int *) data = 100 *
   1704 				raidPtr->parity_rewrite_stripes_done /
   1705 				raidPtr->Layout.numStripe;
   1706 		} else {
   1707 			*(int *) data = 100;
   1708 		}
   1709 		return (0);
   1710 
   1711 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1712 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1713 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1714 			progressInfo.total = raidPtr->Layout.numStripe;
   1715 			progressInfo.completed =
   1716 				raidPtr->parity_rewrite_stripes_done;
   1717 			progressInfo.remaining = progressInfo.total -
   1718 				progressInfo.completed;
   1719 		} else {
   1720 			progressInfo.remaining = 0;
   1721 			progressInfo.completed = 100;
   1722 			progressInfo.total = 100;
   1723 		}
   1724 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1725 				  sizeof(RF_ProgressInfo_t));
   1726 		return (retcode);
   1727 
   1728 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1729 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1730 			/* This makes no sense on a RAID 0 */
   1731 			*(int *) data = 100;
   1732 			return(0);
   1733 		}
   1734 		if (raidPtr->copyback_in_progress == 1) {
   1735 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1736 				raidPtr->Layout.numStripe;
   1737 		} else {
   1738 			*(int *) data = 100;
   1739 		}
   1740 		return (0);
   1741 
   1742 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1743 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1744 		if (raidPtr->copyback_in_progress == 1) {
   1745 			progressInfo.total = raidPtr->Layout.numStripe;
   1746 			progressInfo.completed =
   1747 				raidPtr->copyback_stripes_done;
   1748 			progressInfo.remaining = progressInfo.total -
   1749 				progressInfo.completed;
   1750 		} else {
   1751 			progressInfo.remaining = 0;
   1752 			progressInfo.completed = 100;
   1753 			progressInfo.total = 100;
   1754 		}
   1755 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1756 				  sizeof(RF_ProgressInfo_t));
   1757 		return (retcode);
   1758 
   1759 		/* the sparetable daemon calls this to wait for the kernel to
   1760 		 * need a spare table. this ioctl does not return until a
   1761 		 * spare table is needed. XXX -- calling mpsleep here in the
   1762 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1763 		 * -- I should either compute the spare table in the kernel,
   1764 		 * or have a different -- XXX XXX -- interface (a different
   1765 		 * character device) for delivering the table     -- XXX */
   1766 #if 0
   1767 	case RAIDFRAME_SPARET_WAIT:
   1768 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1769 		while (!rf_sparet_wait_queue)
   1770 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1771 		waitreq = rf_sparet_wait_queue;
   1772 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1773 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1774 
   1775 		/* structure assignment */
   1776 		*((RF_SparetWait_t *) data) = *waitreq;
   1777 
   1778 		RF_Free(waitreq, sizeof(*waitreq));
   1779 		return (0);
   1780 
   1781 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1782 		 * code in it that will cause the dameon to exit */
   1783 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1784 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1785 		waitreq->fcol = -1;
   1786 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1787 		waitreq->next = rf_sparet_wait_queue;
   1788 		rf_sparet_wait_queue = waitreq;
   1789 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1790 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1791 		return (0);
   1792 
   1793 		/* used by the spare table daemon to deliver a spare table
   1794 		 * into the kernel */
   1795 	case RAIDFRAME_SEND_SPARET:
   1796 
   1797 		/* install the spare table */
   1798 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1799 
   1800 		/* respond to the requestor.  the return status of the spare
   1801 		 * table installation is passed in the "fcol" field */
   1802 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1803 		waitreq->fcol = retcode;
   1804 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1805 		waitreq->next = rf_sparet_resp_queue;
   1806 		rf_sparet_resp_queue = waitreq;
   1807 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1808 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1809 
   1810 		return (retcode);
   1811 #endif
   1812 
   1813 	default:
   1814 		break; /* fall through to the os-specific code below */
   1815 
   1816 	}
   1817 
   1818 	if (!raidPtr->valid)
   1819 		return (EINVAL);
   1820 
   1821 	/*
   1822 	 * Add support for "regular" device ioctls here.
   1823 	 */
   1824 
   1825 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1826 	if (error != EPASSTHROUGH)
   1827 		return (error);
   1828 
   1829 	switch (cmd) {
   1830 	case DIOCGDINFO:
   1831 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1832 		break;
   1833 #ifdef __HAVE_OLD_DISKLABEL
   1834 	case ODIOCGDINFO:
   1835 		newlabel = *(rs->sc_dkdev.dk_label);
   1836 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1837 			return ENOTTY;
   1838 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1839 		break;
   1840 #endif
   1841 
   1842 	case DIOCGPART:
   1843 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1844 		((struct partinfo *) data)->part =
   1845 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1846 		break;
   1847 
   1848 	case DIOCWDINFO:
   1849 	case DIOCSDINFO:
   1850 #ifdef __HAVE_OLD_DISKLABEL
   1851 	case ODIOCWDINFO:
   1852 	case ODIOCSDINFO:
   1853 #endif
   1854 	{
   1855 		struct disklabel *lp;
   1856 #ifdef __HAVE_OLD_DISKLABEL
   1857 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1858 			memset(&newlabel, 0, sizeof newlabel);
   1859 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1860 			lp = &newlabel;
   1861 		} else
   1862 #endif
   1863 		lp = (struct disklabel *)data;
   1864 
   1865 		if ((error = raidlock(rs)) != 0)
   1866 			return (error);
   1867 
   1868 		rs->sc_flags |= RAIDF_LABELLING;
   1869 
   1870 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1871 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1872 		if (error == 0) {
   1873 			if (cmd == DIOCWDINFO
   1874 #ifdef __HAVE_OLD_DISKLABEL
   1875 			    || cmd == ODIOCWDINFO
   1876 #endif
   1877 			   )
   1878 				error = writedisklabel(RAIDLABELDEV(dev),
   1879 				    raidstrategy, rs->sc_dkdev.dk_label,
   1880 				    rs->sc_dkdev.dk_cpulabel);
   1881 		}
   1882 		rs->sc_flags &= ~RAIDF_LABELLING;
   1883 
   1884 		raidunlock(rs);
   1885 
   1886 		if (error)
   1887 			return (error);
   1888 		break;
   1889 	}
   1890 
   1891 	case DIOCWLABEL:
   1892 		if (*(int *) data != 0)
   1893 			rs->sc_flags |= RAIDF_WLABEL;
   1894 		else
   1895 			rs->sc_flags &= ~RAIDF_WLABEL;
   1896 		break;
   1897 
   1898 	case DIOCGDEFLABEL:
   1899 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1900 		break;
   1901 
   1902 #ifdef __HAVE_OLD_DISKLABEL
   1903 	case ODIOCGDEFLABEL:
   1904 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1905 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1906 			return ENOTTY;
   1907 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1908 		break;
   1909 #endif
   1910 
   1911 	case DIOCAWEDGE:
   1912 	case DIOCDWEDGE:
   1913 	    	dkw = (void *)data;
   1914 
   1915 		/* If the ioctl happens here, the parent is us. */
   1916 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1917 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1918 
   1919 	case DIOCLWEDGES:
   1920 		return dkwedge_list(&rs->sc_dkdev,
   1921 		    (struct dkwedge_list *)data, l);
   1922 	case DIOCCACHESYNC:
   1923 		return rf_sync_component_caches(raidPtr);
   1924 
   1925 	case DIOCGSTRATEGY:
   1926 	    {
   1927 		struct disk_strategy *dks = (void *)data;
   1928 
   1929 		s = splbio();
   1930 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1931 		    sizeof(dks->dks_name));
   1932 		splx(s);
   1933 		dks->dks_paramlen = 0;
   1934 
   1935 		return 0;
   1936 	    }
   1937 
   1938 	case DIOCSSTRATEGY:
   1939 	    {
   1940 		struct disk_strategy *dks = (void *)data;
   1941 		struct bufq_state *new;
   1942 		struct bufq_state *old;
   1943 
   1944 		if (dks->dks_param != NULL) {
   1945 			return EINVAL;
   1946 		}
   1947 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1948 		error = bufq_alloc(&new, dks->dks_name,
   1949 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1950 		if (error) {
   1951 			return error;
   1952 		}
   1953 		s = splbio();
   1954 		old = rs->buf_queue;
   1955 		bufq_move(new, old);
   1956 		rs->buf_queue = new;
   1957 		splx(s);
   1958 		bufq_free(old);
   1959 
   1960 		return 0;
   1961 	    }
   1962 
   1963 	default:
   1964 		retcode = ENOTTY;
   1965 	}
   1966 	return (retcode);
   1967 
   1968 }
   1969 
   1970 
   1971 /* raidinit -- complete the rest of the initialization for the
   1972    RAIDframe device.  */
   1973 
   1974 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device: fabricate a cfdata entry so
	 * config_attach_pseudo() can create the device_t for this unit */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* attach failed: undo the RAIDF_INITED set above and
		 * release the cfdata we allocated */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* probe the new disk for wedges (dk(4) partitions) */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_geometry(rs, raidPtr);

}
   2026 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2027 /* wake up the daemon & tell it to get us a spare table
   2028  * XXX
   2029  * the entries in the queues should be tagged with the raidPtr
   2030  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   2032  * XXX
   2033  *
   2034  * XXX This code is not currently used. GO
   2035  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* queue the request and wake the user-space sparetable daemon,
	 * which is blocked in the RAIDFRAME_SPARET_WAIT ioctl */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* wait for the daemon's response; rf_wait_cond2() releases the
	 * mutex while sleeping and re-acquires it on wakeup */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* note: req is repointed at the response entry here, so the
	 * structure freed below is the daemon's, not the caller's */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon passes the installation status back in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2059 #endif
   2060 
   2061 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2062  * bp & passes it down.
   2063  * any calls originating in the kernel must use non-blocking I/O
   2064  * do some extra sanity checking to return "appropriate" error values for
   2065  * certain conditions (to make some standard utilities work)
   2066  *
   2067  * Formerly known as: rf_DoAccessKernel
   2068  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* component labels must be rewritten without the raid
		 * mutex held */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* each iteration consumes one "opening" and dispatches one buf;
	 * raidPtr->mutex is held at the top of every iteration */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert DEV_BSIZE blocks to raid sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): "1 ||" forces the debug branch; looks like
		 * leftover debugging (harmless if db1_printf compiles out) */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* reject I/O past the end of the set; the extra (sum < x)
		 * tests also catch wraparound in the sum above */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that aren't a multiple of the sector size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* dispatch failed outright; complete the buf with
			 * the error and keep going */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2186 
   2187 
   2188 
   2189 
   2190 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2191 
   2192 int
   2193 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2194 {
   2195 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2196 	struct buf *bp;
   2197 
   2198 	req->queue = queue;
   2199 	bp = req->bp;
   2200 
   2201 	switch (req->type) {
   2202 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2203 		/* XXX need to do something extra here.. */
   2204 		/* I'm leaving this in, as I've never actually seen it used,
   2205 		 * and I'd like folks to report it... GO */
   2206 		printf(("WAKEUP CALLED\n"));
   2207 		queue->numOutstanding++;
   2208 
   2209 		bp->b_flags = 0;
   2210 		bp->b_private = req;
   2211 
   2212 		KernelWakeupFunc(bp);
   2213 		break;
   2214 
   2215 	case RF_IO_TYPE_READ:
   2216 	case RF_IO_TYPE_WRITE:
   2217 #if RF_ACC_TRACE > 0
   2218 		if (req->tracerec) {
   2219 			RF_ETIMER_START(req->tracerec->timer);
   2220 		}
   2221 #endif
   2222 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2223 		    op, queue->rf_cinfo->ci_dev,
   2224 		    req->sectorOffset, req->numSector,
   2225 		    req->buf, KernelWakeupFunc, (void *) req,
   2226 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2227 
   2228 		if (rf_debugKernelAccess) {
   2229 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2230 				(long) bp->b_blkno));
   2231 		}
   2232 		queue->numOutstanding++;
   2233 		queue->last_deq_sector = req->sectorOffset;
   2234 		/* acc wouldn't have been let in if there were any pending
   2235 		 * reqs at any other priority */
   2236 		queue->curPriority = req->priority;
   2237 
   2238 		db1_printf(("Going for %c to unit %d col %d\n",
   2239 			    req->type, queue->raidPtr->raidid,
   2240 			    queue->col));
   2241 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2242 			(int) req->sectorOffset, (int) req->numSector,
   2243 			(int) (req->numSector <<
   2244 			    queue->raidPtr->logBytesPerSector),
   2245 			(int) queue->raidPtr->logBytesPerSector));
   2246 
   2247 		/*
   2248 		 * XXX: drop lock here since this can block at
   2249 		 * least with backing SCSI devices.  Retake it
   2250 		 * to minimize fuss with calling interfaces.
   2251 		 */
   2252 
   2253 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2254 		bdev_strategy(bp);
   2255 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2256 		break;
   2257 
   2258 	default:
   2259 		panic("bad req->type in rf_DispatchKernelIO");
   2260 	}
   2261 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2262 
   2263 	return (0);
   2264 }
   2265 /* this is the callback function associated with a I/O invoked from
   2266    kernel code.
   2267  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP()/DispatchKernelIO */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* charge the elapsed physical I/O time to this access's trace record */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is noticed by raidstart(), which
			 * triggers a component-label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2333 
   2334 
   2335 /*
   2336  * initialize a buf structure for doing an I/O in the kernel.
   2337  */
   2338 static void
   2339 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2340        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2341        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2342        struct proc *b_proc)
   2343 {
   2344 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2345 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2346 	bp->b_oflags = 0;
   2347 	bp->b_cflags = 0;
   2348 	bp->b_bcount = numSect << logBytesPerSector;
   2349 	bp->b_bufsize = bp->b_bcount;
   2350 	bp->b_error = 0;
   2351 	bp->b_dev = dev;
   2352 	bp->b_data = bf;
   2353 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2354 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2355 	if (bp->b_bcount == 0) {
   2356 		panic("bp->b_bcount is zero in InitBP!!");
   2357 	}
   2358 	bp->b_proc = b_proc;
   2359 	bp->b_iodone = cbFunc;
   2360 	bp->b_private = cbArg;
   2361 }
   2362 
   2363 static void
   2364 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2365 		    struct disklabel *lp)
   2366 {
   2367 	memset(lp, 0, sizeof(*lp));
   2368 
   2369 	/* fabricate a label... */
   2370 	lp->d_secperunit = raidPtr->totalSectors;
   2371 	lp->d_secsize = raidPtr->bytesPerSector;
   2372 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2373 	lp->d_ntracks = 4 * raidPtr->numCol;
   2374 	lp->d_ncylinders = raidPtr->totalSectors /
   2375 		(lp->d_nsectors * lp->d_ntracks);
   2376 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2377 
   2378 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2379 	lp->d_type = DTYPE_RAID;
   2380 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2381 	lp->d_rpm = 3600;
   2382 	lp->d_interleave = 1;
   2383 	lp->d_flags = 0;
   2384 
   2385 	lp->d_partitions[RAW_PART].p_offset = 0;
   2386 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2387 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2388 	lp->d_npartitions = RAW_PART + 1;
   2389 
   2390 	lp->d_magic = DISKMAGIC;
   2391 	lp->d_magic2 = DISKMAGIC;
   2392 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2393 
   2394 }
   2395 /*
   2396  * Read the disklabel from the raid device.  If one is not present, fake one
   2397  * up.
   2398  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default; readdisklabel() overwrites it
	 * if a real label is found on disk */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no usable on-disk label; install the default one */
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
   2457 /*
   2458  * Take care of things one might want to take care of in the event
   2459  * that a disklabel isn't present.
   2460  */
   2461 static void
   2462 raidmakedisklabel(struct raid_softc *rs)
   2463 {
   2464 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2465 	db1_printf(("Making a label..\n"));
   2466 
   2467 	/*
   2468 	 * For historical reasons, if there's no disklabel present
   2469 	 * the raw partition must be marked FS_BSDFFS.
   2470 	 */
   2471 
   2472 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2473 
   2474 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2475 
   2476 	lp->d_checksum = dkcksum(lp);
   2477 }
   2478 /*
   2479  * Wait interruptibly for an exclusive lock.
   2480  *
   2481  * XXX
   2482  * Several drivers do this; it should be abstracted and made MP-safe.
   2483  * (Hmm... where have we seen this warning before :->  GO )
   2484  */
   2485 static int
   2486 raidlock(struct raid_softc *rs)
   2487 {
   2488 	int     error;
   2489 
   2490 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2491 		rs->sc_flags |= RAIDF_WANTED;
   2492 		if ((error =
   2493 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2494 			return (error);
   2495 	}
   2496 	rs->sc_flags |= RAIDF_LOCKED;
   2497 	return (0);
   2498 }
   2499 /*
   2500  * Unlock and wake up any waiters.
   2501  */
   2502 static void
   2503 raidunlock(struct raid_softc *rs)
   2504 {
   2505 
   2506 	rs->sc_flags &= ~RAIDF_LOCKED;
   2507 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2508 		rs->sc_flags &= ~RAIDF_WANTED;
   2509 		wakeup(rs);
   2510 	}
   2511 }
   2512 
   2513 
   2514 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2515 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2516 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2517 
   2518 static daddr_t
   2519 rf_component_info_offset(void)
   2520 {
   2521 
   2522 	return RF_COMPONENT_INFO_OFFSET;
   2523 }
   2524 
   2525 static daddr_t
   2526 rf_component_info_size(unsigned secsize)
   2527 {
   2528 	daddr_t info_size;
   2529 
   2530 	KASSERT(secsize);
   2531 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2532 		info_size = secsize;
   2533 	else
   2534 		info_size = RF_COMPONENT_INFO_SIZE;
   2535 
   2536 	return info_size;
   2537 }
   2538 
   2539 static daddr_t
   2540 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2541 {
   2542 	daddr_t map_offset;
   2543 
   2544 	KASSERT(raidPtr->bytesPerSector);
   2545 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2546 		map_offset = raidPtr->bytesPerSector;
   2547 	else
   2548 		map_offset = RF_COMPONENT_INFO_SIZE;
   2549 	map_offset += rf_component_info_offset();
   2550 
   2551 	return map_offset;
   2552 }
   2553 
   2554 static daddr_t
   2555 rf_parity_map_size(RF_Raid_t *raidPtr)
   2556 {
   2557 	daddr_t map_size;
   2558 
   2559 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2560 		map_size = raidPtr->bytesPerSector;
   2561 	else
   2562 		map_size = RF_PARITY_MAP_SIZE;
   2563 
   2564 	return map_size;
   2565 }
   2566 
   2567 int
   2568 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2569 {
   2570 	RF_ComponentLabel_t *clabel;
   2571 
   2572 	clabel = raidget_component_label(raidPtr, col);
   2573 	clabel->clean = RF_RAID_CLEAN;
   2574 	raidflush_component_label(raidPtr, col);
   2575 	return(0);
   2576 }
   2577 
   2578 
   2579 int
   2580 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2581 {
   2582 	RF_ComponentLabel_t *clabel;
   2583 
   2584 	clabel = raidget_component_label(raidPtr, col);
   2585 	clabel->clean = RF_RAID_DIRTY;
   2586 	raidflush_component_label(raidPtr, col);
   2587 	return(0);
   2588 }
   2589 
   2590 int
   2591 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2592 {
   2593 	KASSERT(raidPtr->bytesPerSector);
   2594 	return raidread_component_label(raidPtr->bytesPerSector,
   2595 	    raidPtr->Disks[col].dev,
   2596 	    raidPtr->raid_cinfo[col].ci_vp,
   2597 	    &raidPtr->raid_cinfo[col].ci_label);
   2598 }
   2599 
   2600 RF_ComponentLabel_t *
   2601 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2602 {
   2603 	return &raidPtr->raid_cinfo[col].ci_label;
   2604 }
   2605 
   2606 int
   2607 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2608 {
   2609 	RF_ComponentLabel_t *label;
   2610 
   2611 	label = &raidPtr->raid_cinfo[col].ci_label;
   2612 	label->mod_counter = raidPtr->mod_counter;
   2613 #ifndef RF_NO_PARITY_MAP
   2614 	label->parity_map_modcount = label->mod_counter;
   2615 #endif
   2616 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2617 	    raidPtr->Disks[col].dev,
   2618 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2619 }
   2620 
   2621 
   2622 static int
   2623 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2624     RF_ComponentLabel_t *clabel)
   2625 {
   2626 	return raidread_component_area(dev, b_vp, clabel,
   2627 	    sizeof(RF_ComponentLabel_t),
   2628 	    rf_component_info_offset(),
   2629 	    rf_component_info_size(secsize));
   2630 }
   2631 
   2632 /* ARGSUSED */
   2633 static int
   2634 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2635     size_t msize, daddr_t offset, daddr_t dsize)
   2636 {
   2637 	struct buf *bp;
   2638 	const struct bdevsw *bdev;
   2639 	int error;
   2640 
   2641 	/* XXX should probably ensure that we don't try to do this if
   2642 	   someone has changed rf_protected_sectors. */
   2643 
   2644 	if (b_vp == NULL) {
   2645 		/* For whatever reason, this component is not valid.
   2646 		   Don't try to read a component label from it. */
   2647 		return(EINVAL);
   2648 	}
   2649 
   2650 	/* get a block of the appropriate size... */
   2651 	bp = geteblk((int)dsize);
   2652 	bp->b_dev = dev;
   2653 
   2654 	/* get our ducks in a row for the read */
   2655 	bp->b_blkno = offset / DEV_BSIZE;
   2656 	bp->b_bcount = dsize;
   2657 	bp->b_flags |= B_READ;
   2658  	bp->b_resid = dsize;
   2659 
   2660 	bdev = bdevsw_lookup(bp->b_dev);
   2661 	if (bdev == NULL)
   2662 		return (ENXIO);
   2663 	(*bdev->d_strategy)(bp);
   2664 
   2665 	error = biowait(bp);
   2666 
   2667 	if (!error) {
   2668 		memcpy(data, bp->b_data, msize);
   2669 	}
   2670 
   2671 	brelse(bp, 0);
   2672 	return(error);
   2673 }
   2674 
   2675 
   2676 static int
   2677 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2678     RF_ComponentLabel_t *clabel)
   2679 {
   2680 	return raidwrite_component_area(dev, b_vp, clabel,
   2681 	    sizeof(RF_ComponentLabel_t),
   2682 	    rf_component_info_offset(),
   2683 	    rf_component_info_size(secsize), 0);
   2684 }
   2685 
   2686 /* ARGSUSED */
   2687 static int
   2688 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2689     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2690 {
   2691 	struct buf *bp;
   2692 	const struct bdevsw *bdev;
   2693 	int error;
   2694 
   2695 	/* get a block of the appropriate size... */
   2696 	bp = geteblk((int)dsize);
   2697 	bp->b_dev = dev;
   2698 
   2699 	/* get our ducks in a row for the write */
   2700 	bp->b_blkno = offset / DEV_BSIZE;
   2701 	bp->b_bcount = dsize;
   2702 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2703  	bp->b_resid = dsize;
   2704 
   2705 	memset(bp->b_data, 0, dsize);
   2706 	memcpy(bp->b_data, data, msize);
   2707 
   2708 	bdev = bdevsw_lookup(bp->b_dev);
   2709 	if (bdev == NULL)
   2710 		return (ENXIO);
   2711 	(*bdev->d_strategy)(bp);
   2712 	if (asyncp)
   2713 		return 0;
   2714 	error = biowait(bp);
   2715 	brelse(bp, 0);
   2716 	if (error) {
   2717 #if 1
   2718 		printf("Failed to write RAID component info!\n");
   2719 #endif
   2720 	}
   2721 
   2722 	return(error);
   2723 }
   2724 
   2725 void
   2726 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2727 {
   2728 	int c;
   2729 
   2730 	for (c = 0; c < raidPtr->numCol; c++) {
   2731 		/* Skip dead disks. */
   2732 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2733 			continue;
   2734 		/* XXXjld: what if an error occurs here? */
   2735 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2736 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2737 		    RF_PARITYMAP_NBYTE,
   2738 		    rf_parity_map_offset(raidPtr),
   2739 		    rf_parity_map_size(raidPtr), 0);
   2740 	}
   2741 }
   2742 
   2743 void
   2744 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2745 {
   2746 	struct rf_paritymap_ondisk tmp;
   2747 	int c,first;
   2748 
   2749 	first=1;
   2750 	for (c = 0; c < raidPtr->numCol; c++) {
   2751 		/* Skip dead disks. */
   2752 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2753 			continue;
   2754 		raidread_component_area(raidPtr->Disks[c].dev,
   2755 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2756 		    RF_PARITYMAP_NBYTE,
   2757 		    rf_parity_map_offset(raidPtr),
   2758 		    rf_parity_map_size(raidPtr));
   2759 		if (first) {
   2760 			memcpy(map, &tmp, sizeof(*map));
   2761 			first = 0;
   2762 		} else {
   2763 			rf_paritymap_merge(map, &tmp);
   2764 		}
   2765 	}
   2766 }
   2767 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* bump the mod counter so all labels written below agree */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now handle the spares: an in-use spare gets a label describing
	 * the column it stands in for */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is substituting for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			 * scol stays -1 (or a stale value from a previous
			 * iteration) and is stored in clabel->column below
			 * -- confirm this cannot happen for used spares */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2827 
   2828 
/*
 * Rewrite the component labels of all optimal components (and in-use
 * spares) with an incremented mod_counter.  If this is the final update
 * (RF_FINAL_COMPONENT_UPDATE, e.g. at shutdown) and the parity is known
 * good, also set the clean bit on each label.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced.
			   NOTE(review): if no Disks[j].spareCol matches,
			   scol stays -1 and is stored in clabel->column —
			   confirm this cannot happen for a used spare. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2903 
   2904 void
   2905 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2906 {
   2907 
   2908 	if (vp != NULL) {
   2909 		if (auto_configured == 1) {
   2910 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2911 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2912 			vput(vp);
   2913 
   2914 		} else {
   2915 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2916 		}
   2917 	}
   2918 }
   2919 
   2920 
   2921 void
   2922 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2923 {
   2924 	int r,c;
   2925 	struct vnode *vp;
   2926 	int acd;
   2927 
   2928 
   2929 	/* We take this opportunity to close the vnodes like we should.. */
   2930 
   2931 	for (c = 0; c < raidPtr->numCol; c++) {
   2932 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2933 		acd = raidPtr->Disks[c].auto_configured;
   2934 		rf_close_component(raidPtr, vp, acd);
   2935 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2936 		raidPtr->Disks[c].auto_configured = 0;
   2937 	}
   2938 
   2939 	for (r = 0; r < raidPtr->numSpare; r++) {
   2940 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2941 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2942 		rf_close_component(raidPtr, vp, acd);
   2943 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2944 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2945 	}
   2946 }
   2947 
   2948 
/*
 * Kernel thread body: fail the component named in the request (passing
 * 1 for the reconstruct argument of rf_FailDisk() iff RF_FDFLAGS_RECON
 * was set), then exit.  The request structure is freed here.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();		/* block bio interrupts for the duration */
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* Request is consumed; caller must not reference it again. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2970 
/*
 * Kernel thread body: rewrite the array's parity.  On success the
 * in-core parity state is marked clean (the on-disk clean bits are set
 * later, at shutdown).  Wakes any thread blocked in shutdown waiting
 * for the rewrite to finish, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3001 
   3002 
/*
 * Kernel thread body: copy reconstructed data back from the spare to
 * the replaced component, then exit.  copyback_in_progress flags the
 * operation for the rest of the driver.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3017 
   3018 
/*
 * Kernel thread body: reconstruct the requested column in place (onto
 * the same component slot), free the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* Request is consumed here. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3036 
/*
 * Try to read a RAIDframe component label from the open vnode vp.  If a
 * reasonable label is found, prepend a new RF_AutoConfig_t describing
 * the component to ac_list and keep the vnode open; otherwise close and
 * release the vnode.  Returns the (possibly new) list head.  On memory
 * exhaustion the entire accumulated list is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Tear down everything collected so far: each entry's
		       label, then the entry itself. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no component here, so release the label and
		   the vnode we were handed. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3094 
/*
 * Walk every disk device in the system looking for RAIDframe
 * components: wedges of type DKW_PTYPE_RAIDFRAME, disklabel partitions
 * of type FS_RAID, and (failing both) the raw partition itself.  Each
 * candidate is handed to rf_get_component(), which validates its
 * component label.  Returns the accumulated RF_AutoConfig_t list
 * (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedges carry their type in the wedge info; only
			   DKW_PTYPE_RAIDFRAME wedges are candidates. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3276 
   3277 
   3278 int
   3279 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3280 {
   3281 
   3282 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3283 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3284 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3285 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3286 	    clabel->row >=0 &&
   3287 	    clabel->column >= 0 &&
   3288 	    clabel->num_rows > 0 &&
   3289 	    clabel->num_columns > 0 &&
   3290 	    clabel->row < clabel->num_rows &&
   3291 	    clabel->column < clabel->num_columns &&
   3292 	    clabel->blockSize > 0 &&
   3293 	    /*
   3294 	     * numBlocksHi may contain garbage, but it is ok since
   3295 	     * the type is unsigned.  If it is really garbage,
   3296 	     * rf_fix_old_label_size() will fix it.
   3297 	     */
   3298 	    rf_component_label_numblocks(clabel) > 0) {
   3299 		/*
   3300 		 * label looks reasonable enough...
   3301 		 * let's make sure it has no old garbage.
   3302 		 */
   3303 		if (numsecs)
   3304 			rf_fix_old_label_size(clabel, numsecs);
   3305 		return(1);
   3306 	}
   3307 	return(0);
   3308 }
   3309 
   3310 
   3311 /*
   3312  * For reasons yet unknown, some old component labels have garbage in
   3313  * the newer numBlocksHi region, and this causes lossage.  Since those
   3314  * disks will also have numsecs set to less than 32 bits of sectors,
   3315  * we can determine when this corruption has occurred, and fix it.
   3316  *
   3317  * The exact same problem, with the same unknown reason, happens to
   3318  * the partitionSizeHi member as well.
   3319  */
   3320 static void
   3321 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3322 {
   3323 
   3324 	if (numsecs < ((uint64_t)1 << 32)) {
   3325 		if (clabel->numBlocksHi) {
   3326 			printf("WARNING: total sectors < 32 bits, yet "
   3327 			       "numBlocksHi set\n"
   3328 			       "WARNING: resetting numBlocksHi to zero.\n");
   3329 			clabel->numBlocksHi = 0;
   3330 		}
   3331 
   3332 		if (clabel->partitionSizeHi) {
   3333 			printf("WARNING: total sectors < 32 bits, yet "
   3334 			       "partitionSizeHi set\n"
   3335 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3336 			clabel->partitionSizeHi = 0;
   3337 		}
   3338 	}
   3339 }
   3340 
   3341 
#ifdef DEBUG
/* Dump a component label to the console (DEBUG kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Text for the two-bit root_partition field; index 3 is unused. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3375 
   3376 RF_ConfigSet_t *
   3377 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3378 {
   3379 	RF_AutoConfig_t *ac;
   3380 	RF_ConfigSet_t *config_sets;
   3381 	RF_ConfigSet_t *cset;
   3382 	RF_AutoConfig_t *ac_next;
   3383 
   3384 
   3385 	config_sets = NULL;
   3386 
   3387 	/* Go through the AutoConfig list, and figure out which components
   3388 	   belong to what sets.  */
   3389 	ac = ac_list;
   3390 	while(ac!=NULL) {
   3391 		/* we're going to putz with ac->next, so save it here
   3392 		   for use at the end of the loop */
   3393 		ac_next = ac->next;
   3394 
   3395 		if (config_sets == NULL) {
   3396 			/* will need at least this one... */
   3397 			config_sets = (RF_ConfigSet_t *)
   3398 				malloc(sizeof(RF_ConfigSet_t),
   3399 				       M_RAIDFRAME, M_NOWAIT);
   3400 			if (config_sets == NULL) {
   3401 				panic("rf_create_auto_sets: No memory!");
   3402 			}
   3403 			/* this one is easy :) */
   3404 			config_sets->ac = ac;
   3405 			config_sets->next = NULL;
   3406 			config_sets->rootable = 0;
   3407 			ac->next = NULL;
   3408 		} else {
   3409 			/* which set does this component fit into? */
   3410 			cset = config_sets;
   3411 			while(cset!=NULL) {
   3412 				if (rf_does_it_fit(cset, ac)) {
   3413 					/* looks like it matches... */
   3414 					ac->next = cset->ac;
   3415 					cset->ac = ac;
   3416 					break;
   3417 				}
   3418 				cset = cset->next;
   3419 			}
   3420 			if (cset==NULL) {
   3421 				/* didn't find a match above... new set..*/
   3422 				cset = (RF_ConfigSet_t *)
   3423 					malloc(sizeof(RF_ConfigSet_t),
   3424 					       M_RAIDFRAME, M_NOWAIT);
   3425 				if (cset == NULL) {
   3426 					panic("rf_create_auto_sets: No memory!");
   3427 				}
   3428 				cset->ac = ac;
   3429 				ac->next = NULL;
   3430 				cset->next = config_sets;
   3431 				cset->rootable = 0;
   3432 				config_sets = cset;
   3433 			}
   3434 		}
   3435 		ac = ac_next;
   3436 	}
   3437 
   3438 
   3439 	return(config_sets);
   3440 }
   3441 
   3442 static int
   3443 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3444 {
   3445 	RF_ComponentLabel_t *clabel1, *clabel2;
   3446 
   3447 	/* If this one matches the *first* one in the set, that's good
   3448 	   enough, since the other members of the set would have been
   3449 	   through here too... */
   3450 	/* note that we are not checking partitionSize here..
   3451 
   3452 	   Note that we are also not checking the mod_counters here.
   3453 	   If everything else matches except the mod_counter, that's
   3454 	   good enough for this test.  We will deal with the mod_counters
   3455 	   a little later in the autoconfiguration process.
   3456 
   3457 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3458 
   3459 	   The reason we don't check for this is that failed disks
   3460 	   will have lower modification counts.  If those disks are
   3461 	   not added to the set they used to belong to, then they will
   3462 	   form their own set, which may result in 2 different sets,
   3463 	   for example, competing to be configured at raid0, and
   3464 	   perhaps competing to be the root filesystem set.  If the
   3465 	   wrong ones get configured, or both attempt to become /,
   3466 	   weird behaviour and or serious lossage will occur.  Thus we
   3467 	   need to bring them into the fold here, and kick them out at
   3468 	   a later point.
   3469 
   3470 	*/
   3471 
   3472 	clabel1 = cset->ac->clabel;
   3473 	clabel2 = ac->clabel;
   3474 	if ((clabel1->version == clabel2->version) &&
   3475 	    (clabel1->serial_number == clabel2->serial_number) &&
   3476 	    (clabel1->num_rows == clabel2->num_rows) &&
   3477 	    (clabel1->num_columns == clabel2->num_columns) &&
   3478 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3479 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3480 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3481 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3482 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3483 	    (clabel1->blockSize == clabel2->blockSize) &&
   3484 	    rf_component_label_numblocks(clabel1) ==
   3485 	    rf_component_label_numblocks(clabel2) &&
   3486 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3487 	    (clabel1->root_partition == clabel2->root_partition) &&
   3488 	    (clabel1->last_unit == clabel2->last_unit) &&
   3489 	    (clabel1->config_order == clabel2->config_order)) {
   3490 		/* if it get's here, it almost *has* to be a match */
   3491 	} else {
   3492 		/* it's not consistent with somebody in the set..
   3493 		   punt */
   3494 		return(0);
   3495 	}
   3496 	/* all was fine.. it must fit... */
   3497 	return(1);
   3498 }
   3499 
/*
 * Decide whether the config set has enough live components (at the
 * highest mod_counter seen) to be configured.  For RAID 1 adjacent
 * even/odd columns are treated as mirror pairs: the set is only
 * rejected when both halves of a pair are missing.  For RAID 0 any
 * missing component is fatal; for RAID 4/5 more than one is.
 * Returns 1 if configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The "true" mod_counter is the maximum over all members;
	   components with a lower count are stale (e.g. failed earlier). */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (mod_counter matches) component
		   for column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3602 
/*
 * Build an RF_Config_t from the component labels of an auto-config
 * set.  Walks the whole set to fill in the per-column device names.
 * Side effect: forces the first label's num_rows to 1.
 * NOTE(review): raidPtr is unreferenced in this function — confirm it
 * is kept only for interface compatibility.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Record each member's device name under its column. */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. No debug variables. */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3637 
   3638 int
   3639 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3640 {
   3641 	RF_ComponentLabel_t *clabel;
   3642 	int column;
   3643 	int sparecol;
   3644 
   3645 	raidPtr->autoconfigure = new_value;
   3646 
   3647 	for(column=0; column<raidPtr->numCol; column++) {
   3648 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3649 			clabel = raidget_component_label(raidPtr, column);
   3650 			clabel->autoconfigure = new_value;
   3651 			raidflush_component_label(raidPtr, column);
   3652 		}
   3653 	}
   3654 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3655 		sparecol = raidPtr->numCol + column;
   3656 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3657 			clabel = raidget_component_label(raidPtr, sparecol);
   3658 			clabel->autoconfigure = new_value;
   3659 			raidflush_component_label(raidPtr, sparecol);
   3660 		}
   3661 	}
   3662 	return(new_value);
   3663 }
   3664 
   3665 int
   3666 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3667 {
   3668 	RF_ComponentLabel_t *clabel;
   3669 	int column;
   3670 	int sparecol;
   3671 
   3672 	raidPtr->root_partition = new_value;
   3673 	for(column=0; column<raidPtr->numCol; column++) {
   3674 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3675 			clabel = raidget_component_label(raidPtr, column);
   3676 			clabel->root_partition = new_value;
   3677 			raidflush_component_label(raidPtr, column);
   3678 		}
   3679 	}
   3680 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3681 		sparecol = raidPtr->numCol + column;
   3682 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3683 			clabel = raidget_component_label(raidPtr, sparecol);
   3684 			clabel->root_partition = new_value;
   3685 			raidflush_component_label(raidPtr, sparecol);
   3686 		}
   3687 	}
   3688 	return(new_value);
   3689 }
   3690 
   3691 void
   3692 rf_release_all_vps(RF_ConfigSet_t *cset)
   3693 {
   3694 	RF_AutoConfig_t *ac;
   3695 
   3696 	ac = cset->ac;
   3697 	while(ac!=NULL) {
   3698 		/* Close the vp, and give it back */
   3699 		if (ac->vp) {
   3700 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3701 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3702 			vput(ac->vp);
   3703 			ac->vp = NULL;
   3704 		}
   3705 		ac = ac->next;
   3706 	}
   3707 }
   3708 
   3709 
   3710 void
   3711 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3712 {
   3713 	RF_AutoConfig_t *ac;
   3714 	RF_AutoConfig_t *next_ac;
   3715 
   3716 	ac = cset->ac;
   3717 	while(ac!=NULL) {
   3718 		next_ac = ac->next;
   3719 		/* nuke the label */
   3720 		free(ac->clabel, M_RAIDFRAME);
   3721 		/* cleanup the config structure */
   3722 		free(ac, M_RAIDFRAME);
   3723 		/* "next.." */
   3724 		ac = next_ac;
   3725 	}
   3726 	/* and, finally, nuke the config set */
   3727 	free(cset, M_RAIDFRAME);
   3728 }
   3729 
   3730 
/*
 * Initialize a component label from the array's current in-core state:
 * geometry, serial/mod counters, layout parameters, and configuration
 * preferences.  The label is marked dirty and rf_ds_optimal; column
 * and row are NOT set here — callers fill those in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3763 
/*
 * Bring one autoconfigured RAID set to life: build an RF_Config_t from
 * the set's auto-config data, configure the array, and return its
 * softc.  Returns NULL if the config structure cannot be allocated or
 * rf_Configure() fails.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/*
	 * Scan upward from the unit recorded in the component label until
	 * an unconfigured softc is found.
	 * NOTE(review): sc is dereferenced without a NULL check, so this
	 * relies on raidget() always returning a softc — confirm against
	 * raidget()'s definition.
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the softc we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3837 
   3838 void
   3839 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3840 {
   3841 	struct buf *bp;
   3842 	struct raid_softc *rs;
   3843 
   3844 	bp = (struct buf *)desc->bp;
   3845 	rs = desc->raidPtr->softc;
   3846 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3847 	    (bp->b_flags & B_READ));
   3848 }
   3849 
   3850 void
   3851 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3852 	     size_t xmin, size_t xmax)
   3853 {
   3854 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3855 	pool_sethiwat(p, xmax);
   3856 	pool_prime(p, xmin);
   3857 	pool_setlowat(p, xmin);
   3858 }
   3859 
   3860 /*
   3861  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3862  * if there is IO pending and if that IO could possibly be done for a
   3863  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3864  * otherwise.
   3865  *
   3866  */
   3867 
   3868 int
   3869 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3870 {
   3871 	struct raid_softc *rs = raidPtr->softc;
   3872 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3873 		/* there is work to do */
   3874 		return 0;
   3875 	}
   3876 	/* default is nothing to do */
   3877 	return 1;
   3878 }
   3879 
   3880 int
   3881 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3882 {
   3883 	uint64_t numsecs;
   3884 	unsigned secsize;
   3885 	int error;
   3886 
   3887 	error = getdisksize(vp, &numsecs, &secsize);
   3888 	if (error == 0) {
   3889 		diskPtr->blockSize = secsize;
   3890 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3891 		diskPtr->partitionSize = numsecs;
   3892 		return 0;
   3893 	}
   3894 	return error;
   3895 }
   3896 
   3897 static int
   3898 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3899 {
   3900 	return 1;
   3901 }
   3902 
   3903 static void
   3904 raid_attach(device_t parent, device_t self, void *aux)
   3905 {
   3906 
   3907 }
   3908 
   3909 
   3910 static int
   3911 raid_detach(device_t self, int flags)
   3912 {
   3913 	int error;
   3914 	struct raid_softc *rs = raidget(device_unit(self));
   3915 
   3916 	if (rs == NULL)
   3917 		return ENXIO;
   3918 
   3919 	if ((error = raidlock(rs)) != 0)
   3920 		return (error);
   3921 
   3922 	error = raid_detach_unlocked(rs);
   3923 
   3924 	raidunlock(rs);
   3925 
   3926 	/* XXXkd: raidput(rs) ??? */
   3927 
   3928 	return error;
   3929 }
   3930 
   3931 static void
   3932 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3933 {
   3934 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3935 
   3936 	memset(dg, 0, sizeof(*dg));
   3937 
   3938 	dg->dg_secperunit = raidPtr->totalSectors;
   3939 	dg->dg_secsize = raidPtr->bytesPerSector;
   3940 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3941 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3942 
   3943 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3944 }
   3945 
   3946 /*
   3947  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3948  * We end up returning whatever error was returned by the first cache flush
   3949  * that fails.
   3950  */
   3951 
   3952 int
   3953 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3954 {
   3955 	int c, sparecol;
   3956 	int e,error;
   3957 	int force = 1;
   3958 
   3959 	error = 0;
   3960 	for (c = 0; c < raidPtr->numCol; c++) {
   3961 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3962 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3963 					  &force, FWRITE, NOCRED);
   3964 			if (e) {
   3965 				if (e != ENODEV)
   3966 					printf("raid%d: cache flush to component %s failed.\n",
   3967 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3968 				if (error == 0) {
   3969 					error = e;
   3970 				}
   3971 			}
   3972 		}
   3973 	}
   3974 
   3975 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3976 		sparecol = raidPtr->numCol + c;
   3977 		/* Need to ensure that the reconstruct actually completed! */
   3978 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3979 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3980 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3981 			if (e) {
   3982 				if (e != ENODEV)
   3983 					printf("raid%d: cache flush to component %s failed.\n",
   3984 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3985 				if (error == 0) {
   3986 					error = e;
   3987 				}
   3988 			}
   3989 		}
   3990 	}
   3991 	return error;
   3992 }
   3993 
   3994 static void
   3995 raidminphys(struct buf *bp)
   3996 {
   3997 	dev_t dev;
   3998 	int unit;
   3999 	struct raid_softc *rs;
   4000 	RF_Raid_t *raidPtr;
   4001 	long xmax;
   4002 
   4003 	dev = bp->b_dev;
   4004 	unit = raidunit(dev);
   4005 	rs = raidget(unit);
   4006 	raidPtr = &(rs->sc_r);
   4007 
   4008 	xmax = raidPtr->Layout.numDataCol * MAXPHYS;
   4009 
   4010 	if (bp->b_bcount > xmax) {
   4011 		bp->b_bcount = xmax;
   4012 	}
   4013 }
   4014