      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.324 2015/07/10 09:49:56 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.324 2015/07/10 09:49:56 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 static dev_type_open(raidopen);
    199 static dev_type_close(raidclose);
    200 static dev_type_read(raidread);
    201 static dev_type_write(raidwrite);
    202 static dev_type_ioctl(raidioctl);
    203 static dev_type_strategy(raidstrategy);
    204 static dev_type_dump(raiddump);
    205 static dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	.d_open = raidopen,
    209 	.d_close = raidclose,
    210 	.d_strategy = raidstrategy,
    211 	.d_ioctl = raidioctl,
    212 	.d_dump = raiddump,
    213 	.d_psize = raidsize,
    214 	.d_discard = nodiscard,
    215 	.d_flag = D_DISK
    216 };
    217 
    218 const struct cdevsw raid_cdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_read = raidread,
    222 	.d_write = raidwrite,
    223 	.d_ioctl = raidioctl,
    224 	.d_stop = nostop,
    225 	.d_tty = notty,
    226 	.d_poll = nopoll,
    227 	.d_mmap = nommap,
    228 	.d_kqfilter = nokqfilter,
    229 	.d_discard = nodiscard,
    230 	.d_flag = D_DISK
    231 };
    232 
    233 static struct dkdriver rf_dkdriver = {
    234 	.d_strategy = raidstrategy,
    235 	.d_minphys = minphys
    236 };
    237 
    238 struct raid_softc {
    239 	device_t sc_dev;
    240 	int	sc_unit;
    241 	int     sc_flags;	/* flags */
    242 	int     sc_cflags;	/* configuration flags */
    243 	uint64_t sc_size;	/* size of the raid device */
    244 	char    sc_xname[20];	/* XXX external name */
    245 	struct disk sc_dkdev;	/* generic disk device info */
    246 	struct bufq_state *buf_queue;	/* used for the device queue */
    247 	RF_Raid_t sc_r;
    248 	LIST_ENTRY(raid_softc) sc_link;
    249 };
    250 /* sc_flags */
    251 #define RAIDF_INITED	0x01	/* unit has been initialized */
    252 #define RAIDF_WLABEL	0x02	/* label area is writable */
    253 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    254 #define RAIDF_SHUTDOWN	0x08	/* unit is being shut down */
    255 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    256 #define RAIDF_LOCKED	0x80	/* unit is locked */
    257 
    258 #define	raidunit(x)	DISKUNIT(x)
    259 
    260 extern struct cfdriver raid_cd;
    261 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    262     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    263     DVF_DETACH_SHUTDOWN);
    264 
    265 /*
    266  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    267  * Be aware that large numbers can allow the driver to consume a lot of
    268  * kernel memory, especially on writes, and in degraded mode reads.
    269  *
    270  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    271  * a single 64K write will typically require 64K for the old data,
    272  * 64K for the old parity, and 64K for the new parity, for a total
    273  * of 192K (if the parity buffer is not re-used immediately).
    274  * Even if it is used immediately, that's still 128K, which when multiplied
    275  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    276  *
    277  * Now in degraded mode, for example, a 64K read on the above setup may
    278  * require data reconstruction, which will require *all* of the 4 remaining
    279  * disks to participate -- 4 * 32K/disk == 128K again.
    280  */
    281 
    282 #ifndef RAIDOUTSTANDING
    283 #define RAIDOUTSTANDING   6
    284 #endif
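/*
 * Worked example of the arithmetic in the comment above, using the
 * default of 6: in the 5-disk, 32k-stripe-width case the worst case
 * is roughly 6 * 192K = 1152K of old-data/parity buffers on top of
 * 6 * 64K = 384K of incoming data.  The #ifndef lets a kernel build
 * override the default if more (or fewer) outstanding I/Os are wanted.
 */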
    285 
    286 #define RAIDLABELDEV(dev)	\
    287 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    288 
    289 /* declared here, and made public, for the benefit of KVM stuff.. */
    290 
    291 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    292 				     struct disklabel *);
    293 static void raidgetdisklabel(dev_t);
    294 static void raidmakedisklabel(struct raid_softc *);
    295 
    296 static int raidlock(struct raid_softc *);
    297 static void raidunlock(struct raid_softc *);
    298 
    299 static int raid_detach_unlocked(struct raid_softc *);
    300 
    301 static void rf_markalldirty(RF_Raid_t *);
    302 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    303 
    304 void rf_ReconThread(struct rf_recon_req *);
    305 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    306 void rf_CopybackThread(RF_Raid_t *raidPtr);
    307 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    308 int rf_autoconfig(device_t);
    309 void rf_buildroothack(RF_ConfigSet_t *);
    310 
    311 RF_AutoConfig_t *rf_find_raid_components(void);
    312 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    314 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    315 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    316 int rf_set_autoconfig(RF_Raid_t *, int);
    317 int rf_set_rootpartition(RF_Raid_t *, int);
    318 void rf_release_all_vps(RF_ConfigSet_t *);
    319 void rf_cleanup_config_set(RF_ConfigSet_t *);
    320 int rf_have_enough_components(RF_ConfigSet_t *);
    321 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    322 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    323 
    324 /*
    325  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    326  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    327  * in the kernel config file.
    328  */
    329 #ifdef RAID_AUTOCONFIG
    330 int raidautoconfig = 1;
    331 #else
    332 int raidautoconfig = 0;
    333 #endif
    334 static bool raidautoconfigdone = false;
    335 
    336 struct RF_Pools_s rf_pools;
    337 
    338 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    339 static kmutex_t raid_lock;
    340 
    341 static struct raid_softc *
    342 raidcreate(int unit) {
    343 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    344 	if (sc == NULL) {
    345 #ifdef DIAGNOSTIC
    346 		printf("%s: out of memory\n", __func__);
    347 #endif
    348 		return NULL;
    349 	}
    350 	sc->sc_unit = unit;
    351 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    352 	return sc;
    353 }
    354 
    355 static void
    356 raiddestroy(struct raid_softc *sc) {
    357 	bufq_free(sc->buf_queue);
    358 	kmem_free(sc, sizeof(*sc));
    359 }
    360 
    361 static struct raid_softc *
    362 raidget(int unit) {
    363 	struct raid_softc *sc;
    364 	if (unit < 0) {
    365 #ifdef DIAGNOSTIC
    366 		panic("%s: unit %d!", __func__, unit);
    367 #endif
    368 		return NULL;
    369 	}
    370 	mutex_enter(&raid_lock);
    371 	LIST_FOREACH(sc, &raids, sc_link) {
    372 		if (sc->sc_unit == unit) {
    373 			mutex_exit(&raid_lock);
    374 			return sc;
    375 		}
    376 	}
    377 	mutex_exit(&raid_lock);
    378 	if ((sc = raidcreate(unit)) == NULL)
    379 		return NULL;
    380 	mutex_enter(&raid_lock);
    381 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    382 	mutex_exit(&raid_lock);
    383 	return sc;
    384 }
    385 
    386 static void
    387 raidput(struct raid_softc *sc) {
    388 	mutex_enter(&raid_lock);
    389 	LIST_REMOVE(sc, sc_link);
    390 	mutex_exit(&raid_lock);
    391 	raiddestroy(sc);
    392 }
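/*
 * Usage sketch (this mirrors the open/close/ioctl paths below):
 * raidget() returns the softc for a unit, lazily creating and linking
 * a new one if needed; raidput() unlinks it and frees it again.
 *
 *	struct raid_softc *rs;
 *
 *	if ((rs = raidget(unit)) == NULL)
 *		return ENXIO;
 *	...
 *	raidput(rs);	(only once the unit is really going away)
 */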
    393 
    394 void
    395 raidattach(int num)
    396 {
    397 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
    398 	/* This is where all the initialization stuff gets done. */
    399 
    400 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    401 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    402 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    403 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    404 
    405 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    406 #endif
    407 
    408 	if (rf_BootRaidframe() == 0)
    409 		aprint_verbose("Kernelized RAIDframe activated\n");
    410 	else
    411 		panic("Serious error booting RAID!!");
    412 
    413 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    414 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    415 	}
    416 
    417 	raidautoconfigdone = false;
    418 
    419 	/*
    420 	 * Register a finalizer which will be used to auto-config RAID
    421 	 * sets once all real hardware devices have been found.
    422 	 */
    423 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    424 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    425 }
    426 
    427 int
    428 rf_autoconfig(device_t self)
    429 {
    430 	RF_AutoConfig_t *ac_list;
    431 	RF_ConfigSet_t *config_sets;
    432 
    433 	if (!raidautoconfig || raidautoconfigdone == true)
    434 		return (0);
    435 
    436 	/* XXX This code can only be run once. */
    437 	raidautoconfigdone = true;
    438 
    439 #ifdef __HAVE_CPU_BOOTCONF
    440 	/*
    441 	 * 0. Find the boot device first, if needed, so we can use it later.
    442 	 *    This needs to be done before we autoconfigure any RAID sets,
    443 	 *    because if we use wedges we are not going to be able to open
    444 	 *    the boot device later.
    445 	 */
    446 	if (booted_device == NULL)
    447 		cpu_bootconf();
    448 #endif
    449 	/* 1. locate all RAID components on the system */
    450 	aprint_debug("Searching for RAID components...\n");
    451 	ac_list = rf_find_raid_components();
    452 
    453 	/* 2. Sort them into their respective sets. */
    454 	config_sets = rf_create_auto_sets(ac_list);
    455 
    456 	/*
    457 	 * 3. Evaluate each set and configure the valid ones.
    458 	 * This gets done in rf_buildroothack().
    459 	 */
    460 	rf_buildroothack(config_sets);
    461 
    462 	return 1;
    463 }
    464 
    465 static int
    466 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    467 	const char *bootname = device_xname(bdv);
    468 	size_t len = strlen(bootname);
    469 
    470 	for (int col = 0; col < r->numCol; col++) {
    471 		const char *devname = r->Disks[col].devname;
    472 		devname += sizeof("/dev/") - 1;
    473 		if (strncmp(devname, "dk", 2) == 0) {
    474 			const char *parent =
    475 			    dkwedge_get_parent_name(r->Disks[col].dev);
    476 			if (parent != NULL)
    477 				devname = parent;
    478 		}
    479 		if (strncmp(devname, bootname, len) == 0) {
    480 			struct raid_softc *sc = r->softc;
    481 			aprint_debug("raid%d includes boot device %s\n",
    482 			    sc->sc_unit, devname);
    483 			return 1;
    484 		}
    485 	}
    486 	return 0;
    487 }
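/*
 * Example of the matching done above (component names are
 * illustrative): a component recorded as "/dev/wd0e" is compared
 * against a boot device named "wd0" after the "/dev/" prefix is
 * skipped; for a wedge component such as "/dev/dk3" the parent
 * disk's name, as reported by dkwedge_get_parent_name(), is used
 * for the comparison instead.
 */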
    488 
    489 void
    490 rf_buildroothack(RF_ConfigSet_t *config_sets)
    491 {
    492 	RF_ConfigSet_t *cset;
    493 	RF_ConfigSet_t *next_cset;
    494 	int num_root;
    495 	struct raid_softc *sc, *rsc;
    496 
    497 	sc = rsc = NULL;
    498 	num_root = 0;
    499 	cset = config_sets;
    500 	while (cset != NULL) {
    501 		next_cset = cset->next;
    502 		if (rf_have_enough_components(cset) &&
    503 		    cset->ac->clabel->autoconfigure == 1) {
    504 			sc = rf_auto_config_set(cset);
    505 			if (sc != NULL) {
    506 				aprint_debug("raid%d: configured ok\n",
    507 				    sc->sc_unit);
    508 				if (cset->rootable) {
    509 					rsc = sc;
    510 					num_root++;
    511 				}
    512 			} else {
    513 				/* The autoconfig didn't work :( */
    514 				aprint_debug("Autoconfig failed\n");
    515 				rf_release_all_vps(cset);
    516 			}
    517 		} else {
    518 			/* we're not autoconfiguring this set...
    519 			   release the associated resources */
    520 			rf_release_all_vps(cset);
    521 		}
    522 		/* cleanup */
    523 		rf_cleanup_config_set(cset);
    524 		cset = next_cset;
    525 	}
    526 
    527 	/* if the user has specified what the root device should be
    528 	   then we don't touch booted_device or boothowto... */
    529 
    530 	if (rootspec != NULL)
    531 		return;
    532 
    533 	/* we found something bootable... */
    534 
    535 	/*
    536 	 * XXX: The following code assumes that the root raid
    537 	 * is the first ('a') partition. This is about the best
    538 	 * we can do with a BSD disklabel, but we might be able
    539 	 * to do better with a GPT label, by setting a specified
    540 	 * attribute to indicate the root partition. We can then
    541 	 * stash the partition number in the r->root_partition
    542 	 * high bits (the bottom 2 bits are already used). For
    543 	 * now we just set booted_partition to 0 when we override
    544 	 * root.
    545 	 */
    546 	if (num_root == 1) {
    547 		device_t candidate_root;
    548 		if (rsc->sc_dkdev.dk_nwedges != 0) {
    549 			char cname[sizeof(cset->ac->devname)];
    550 			/* XXX: assume 'a' */
    551 			snprintf(cname, sizeof(cname), "%s%c",
    552 			    device_xname(rsc->sc_dev), 'a');
    553 			candidate_root = dkwedge_find_by_wname(cname);
    554 		} else
    555 			candidate_root = rsc->sc_dev;
    556 		if (booted_device == NULL ||
    557 		    rsc->sc_r.root_partition == 1 ||
    558 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    559 			booted_device = candidate_root;
    560 			booted_partition = 0;	/* XXX assume 'a' */
    561 		}
    562 	} else if (num_root > 1) {
    563 
    564 		/*
    565 		 * Maybe the MD code can help. If it cannot, then
    566 		 * setroot() will discover that we have no
    567 		 * booted_device and will ask the user if nothing was
    568 		 * hardwired in the kernel config file
    569 		 */
    570 		if (booted_device == NULL)
    571 			return;
    572 
    573 		num_root = 0;
    574 		mutex_enter(&raid_lock);
    575 		LIST_FOREACH(sc, &raids, sc_link) {
    576 			RF_Raid_t *r = &sc->sc_r;
    577 			if (r->valid == 0)
    578 				continue;
    579 
    580 			if (r->root_partition == 0)
    581 				continue;
    582 
    583 			if (rf_containsboot(r, booted_device)) {
    584 				num_root++;
    585 				rsc = sc;
    586 			}
    587 		}
    588 		mutex_exit(&raid_lock);
    589 
    590 		if (num_root == 1) {
    591 			booted_device = rsc->sc_dev;
    592 			booted_partition = 0;	/* XXX assume 'a' */
    593 		} else {
    594 			/* we can't guess.. require the user to answer... */
    595 			boothowto |= RB_ASKNAME;
    596 		}
    597 	}
    598 }
    599 
    600 static int
    601 raidsize(dev_t dev)
    602 {
    603 	struct raid_softc *rs;
    604 	struct disklabel *lp;
    605 	int     part, unit, omask, size;
    606 
    607 	unit = raidunit(dev);
    608 	if ((rs = raidget(unit)) == NULL)
    609 		return -1;
    610 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    611 		return (-1);
    612 
    613 	part = DISKPART(dev);
    614 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    615 	lp = rs->sc_dkdev.dk_label;
    616 
    617 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    618 		return (-1);
    619 
    620 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    621 		size = -1;
    622 	else
    623 		size = lp->d_partitions[part].p_size *
    624 		    (lp->d_secsize / DEV_BSIZE);
    625 
    626 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    627 		return (-1);
    628 
    629 	return (size);
    630 
    631 }
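/*
 * Note: raidsize() only reports a size for FS_SWAP partitions (it is
 * the bdevsw d_psize hook used when sizing swap/dump devices), and the
 * result is expressed in DEV_BSIZE units.
 */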
    632 
    633 static int
    634 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    635 {
    636 	int     unit = raidunit(dev);
    637 	struct raid_softc *rs;
    638 	const struct bdevsw *bdev;
    639 	struct disklabel *lp;
    640 	RF_Raid_t *raidPtr;
    641 	daddr_t offset;
    642 	int     part, c, sparecol, j, scol, dumpto;
    643 	int     error = 0;
    644 
    645 	if ((rs = raidget(unit)) == NULL)
    646 		return ENXIO;
    647 
    648 	raidPtr = &rs->sc_r;
    649 
    650 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    651 		return ENXIO;
    652 
    653 	/* we only support dumping to RAID 1 sets */
    654 	if (raidPtr->Layout.numDataCol != 1 ||
    655 	    raidPtr->Layout.numParityCol != 1)
    656 		return EINVAL;
    657 
    658 
    659 	if ((error = raidlock(rs)) != 0)
    660 		return error;
    661 
    662 	if (size % DEV_BSIZE != 0) {
    663 		error = EINVAL;
    664 		goto out;
    665 	}
    666 
    667 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    668 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    669 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    670 		    size / DEV_BSIZE, rs->sc_size);
    671 		error = EINVAL;
    672 		goto out;
    673 	}
    674 
    675 	part = DISKPART(dev);
    676 	lp = rs->sc_dkdev.dk_label;
    677 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    678 
    679 	/* figure out what device is alive.. */
    680 
    681 	/*
    682 	   Look for a component to dump to.  The preference for the
    683 	   component to dump to is as follows:
    684 	   1) the master
    685 	   2) a used_spare of the master
    686 	   3) the slave
    687 	   4) a used_spare of the slave
    688 	*/
    689 
    690 	dumpto = -1;
    691 	for (c = 0; c < raidPtr->numCol; c++) {
    692 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    693 			/* this might be the one */
    694 			dumpto = c;
    695 			break;
    696 		}
    697 	}
    698 
    699 	/*
    700 	   At this point we have possibly selected a live master or a
    701 	   live slave.  We now check to see if there is a spared
    702 	   master (or a spared slave), if we didn't find a live master
    703 	   or a live slave.
    704 	*/
    705 
    706 	for (c = 0; c < raidPtr->numSpare; c++) {
    707 		sparecol = raidPtr->numCol + c;
    708 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    709 			/* How about this one? */
    710 			scol = -1;
    711 			for(j=0;j<raidPtr->numCol;j++) {
    712 				if (raidPtr->Disks[j].spareCol == sparecol) {
    713 					scol = j;
    714 					break;
    715 				}
    716 			}
    717 			if (scol == 0) {
    718 				/*
    719 				   We must have found a spared master!
    720 				   We'll take that over anything else
    721 				   found so far.  (We couldn't have
    722 				   found a real master before, since
    723 				   this is a used spare, and it's
    724 				   saying that it's replacing the
    725 				   master.)  On reboot (with
    726 				   autoconfiguration turned on)
    727 				   sparecol will become the 1st
    728 				   component (component0) of this set.
    729 				*/
    730 				dumpto = sparecol;
    731 				break;
    732 			} else if (scol != -1) {
    733 				/*
    734 				   Must be a spared slave.  We'll dump
    735 				   to that if we haven't found anything
    736 				   else so far.
    737 				*/
    738 				if (dumpto == -1)
    739 					dumpto = sparecol;
    740 			}
    741 		}
    742 	}
    743 
    744 	if (dumpto == -1) {
    745 		/* we couldn't find any live components to dump to!?!?
    746 		 */
    747 		error = EINVAL;
    748 		goto out;
    749 	}
    750 
    751 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    752 
    753 	/*
    754 	   Note that blkno is relative to this particular partition.
    755 	   By adding the offset of this partition in the RAID
    756 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    757 	   value that is relative to the partition used for the
    758 	   underlying component.
    759 	*/
    760 
    761 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    762 				blkno + offset, va, size);
    763 
    764 out:
    765 	raidunlock(rs);
    766 
    767 	return error;
    768 }
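/*
 * Example of the offset arithmetic above (numbers are hypothetical):
 * a dump to block 100 of a partition that starts at sector 2048 of
 * the RAID device is issued to the selected component at block
 * 100 + 2048 + RF_PROTECTED_SECTORS.
 */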
    769 
    770 /* ARGSUSED */
    771 static int
    772 raidopen(dev_t dev, int flags, int fmt,
    773     struct lwp *l)
    774 {
    775 	int     unit = raidunit(dev);
    776 	struct raid_softc *rs;
    777 	struct disklabel *lp;
    778 	int     part, pmask;
    779 	int     error = 0;
    780 
    781 	if ((rs = raidget(unit)) == NULL)
    782 		return ENXIO;
    783 	if ((error = raidlock(rs)) != 0)
    784 		return (error);
    785 
    786 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    787 		error = EBUSY;
    788 		goto bad;
    789 	}
    790 
    791 	lp = rs->sc_dkdev.dk_label;
    792 
    793 	part = DISKPART(dev);
    794 
    795 	/*
    796 	 * If there are wedges, and this is not RAW_PART, then we
    797 	 * need to fail.
    798 	 */
    799 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    800 		error = EBUSY;
    801 		goto bad;
    802 	}
    803 	pmask = (1 << part);
    804 
    805 	if ((rs->sc_flags & RAIDF_INITED) &&
    806 	    (rs->sc_dkdev.dk_nwedges == 0) &&
    807 	    (rs->sc_dkdev.dk_openmask == 0))
    808 		raidgetdisklabel(dev);
    809 
    810 	/* make sure that this partition exists */
    811 
    812 	if (part != RAW_PART) {
    813 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    814 		    ((part >= lp->d_npartitions) ||
    815 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    816 			error = ENXIO;
    817 			goto bad;
    818 		}
    819 	}
    820 	/* Prevent this unit from being unconfigured while open. */
    821 	switch (fmt) {
    822 	case S_IFCHR:
    823 		rs->sc_dkdev.dk_copenmask |= pmask;
    824 		break;
    825 
    826 	case S_IFBLK:
    827 		rs->sc_dkdev.dk_bopenmask |= pmask;
    828 		break;
    829 	}
    830 
    831 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    832 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    833 		/* First one... mark things as dirty... Note that we *MUST*
    834 		 have done a configure before this.  I DO NOT WANT TO BE
    835 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    836 		 THAT THEY BELONG TOGETHER!!!!! */
    837 		/* XXX should check to see if we're only open for reading
    838 		   here... If so, we needn't do this, but then need some
    839 		   other way of keeping track of what's happened.. */
    840 
    841 		rf_markalldirty(&rs->sc_r);
    842 	}
    843 
    844 
    845 	rs->sc_dkdev.dk_openmask =
    846 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    847 
    848 bad:
    849 	raidunlock(rs);
    850 
    851 	return (error);
    852 
    853 
    854 }
    855 
    856 /* ARGSUSED */
    857 static int
    858 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    859 {
    860 	int     unit = raidunit(dev);
    861 	struct raid_softc *rs;
    862 	int     error = 0;
    863 	int     part;
    864 
    865 	if ((rs = raidget(unit)) == NULL)
    866 		return ENXIO;
    867 
    868 	if ((error = raidlock(rs)) != 0)
    869 		return (error);
    870 
    871 	part = DISKPART(dev);
    872 
    873 	/* ...that much closer to allowing unconfiguration... */
    874 	switch (fmt) {
    875 	case S_IFCHR:
    876 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    877 		break;
    878 
    879 	case S_IFBLK:
    880 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    881 		break;
    882 	}
    883 	rs->sc_dkdev.dk_openmask =
    884 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    885 
    886 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    887 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    888 		/* Last one... device is not unconfigured yet.
    889 		   Device shutdown has taken care of setting the
    890 		   clean bits if RAIDF_INITED is not set;
    891 		   mark things as clean... */
    892 
    893 		rf_update_component_labels(&rs->sc_r,
    894 						 RF_FINAL_COMPONENT_UPDATE);
    895 
    896 		/* If the kernel is shutting down, it will detach
    897 		 * this RAID set soon enough.
    898 		 */
    899 	}
    900 
    901 	raidunlock(rs);
    902 	return (0);
    903 
    904 }
    905 
    906 static void
    907 raidstrategy(struct buf *bp)
    908 {
    909 	unsigned int unit = raidunit(bp->b_dev);
    910 	RF_Raid_t *raidPtr;
    911 	int     wlabel;
    912 	struct raid_softc *rs;
    913 
    914 	if ((rs = raidget(unit)) == NULL) {
    915 		bp->b_error = ENXIO;
    916 		goto done;
    917 	}
    918 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    919 		bp->b_error = ENXIO;
    920 		goto done;
    921 	}
    922 	raidPtr = &rs->sc_r;
    923 	if (!raidPtr->valid) {
    924 		bp->b_error = ENODEV;
    925 		goto done;
    926 	}
    927 	if (bp->b_bcount == 0) {
    928 		db1_printf(("b_bcount is zero..\n"));
    929 		goto done;
    930 	}
    931 
    932 	/*
    933 	 * Do bounds checking and adjust transfer.  If there's an
    934 	 * error, the bounds check will flag that for us.
    935 	 */
    936 
    937 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    938 	if (DISKPART(bp->b_dev) == RAW_PART) {
    939 		uint64_t size; /* device size in DEV_BSIZE unit */
    940 
    941 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    942 			size = raidPtr->totalSectors <<
    943 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    944 		} else {
    945 			size = raidPtr->totalSectors >>
    946 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    947 		}
    948 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    949 			goto done;
    950 		}
    951 	} else {
    952 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    953 			db1_printf(("Bounds check failed!!:%d %d\n",
    954 				(int) bp->b_blkno, (int) wlabel));
    955 			goto done;
    956 		}
    957 	}
    958 
    959 	rf_lock_mutex2(raidPtr->iodone_lock);
    960 
    961 	bp->b_resid = 0;
    962 
    963 	/* stuff it onto our queue */
    964 	bufq_put(rs->buf_queue, bp);
    965 
    966 	/* schedule the IO to happen at the next convenient time */
    967 	rf_signal_cond2(raidPtr->iodone_cv);
    968 	rf_unlock_mutex2(raidPtr->iodone_lock);
    969 
    970 	return;
    971 
    972 done:
    973 	bp->b_resid = bp->b_bcount;
    974 	biodone(bp);
    975 }
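/*
 * Note on the size computation in raidstrategy() above: with the
 * usual 512-byte sectors logBytesPerSector equals DEV_BSHIFT, so the
 * shift is by zero and size == totalSectors; with, say, 4096-byte
 * sectors the total is shifted left by 3 to express it in DEV_BSIZE
 * units.
 */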
    976 
    977 /* ARGSUSED */
    978 static int
    979 raidread(dev_t dev, struct uio *uio, int flags)
    980 {
    981 	int     unit = raidunit(dev);
    982 	struct raid_softc *rs;
    983 
    984 	if ((rs = raidget(unit)) == NULL)
    985 		return ENXIO;
    986 
    987 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    988 		return (ENXIO);
    989 
    990 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    991 
    992 }
    993 
    994 /* ARGSUSED */
    995 static int
    996 raidwrite(dev_t dev, struct uio *uio, int flags)
    997 {
    998 	int     unit = raidunit(dev);
    999 	struct raid_softc *rs;
   1000 
   1001 	if ((rs = raidget(unit)) == NULL)
   1002 		return ENXIO;
   1003 
   1004 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1005 		return (ENXIO);
   1006 
   1007 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1008 
   1009 }
   1010 
   1011 static int
   1012 raid_detach_unlocked(struct raid_softc *rs)
   1013 {
   1014 	int error;
   1015 	RF_Raid_t *raidPtr;
   1016 
   1017 	raidPtr = &rs->sc_r;
   1018 
   1019 	/*
   1020 	 * If somebody has a partition mounted, we shouldn't
   1021 	 * shutdown.
   1022 	 */
   1023 	if (rs->sc_dkdev.dk_openmask != 0)
   1024 		return EBUSY;
   1025 
   1026 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1027 		;	/* not initialized: nothing to do */
   1028 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1029 		return error;
   1030 	else
   1031 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1032 
   1033 	/* Detach the disk. */
   1034 	dkwedge_delall(&rs->sc_dkdev);
   1035 	disk_detach(&rs->sc_dkdev);
   1036 	disk_destroy(&rs->sc_dkdev);
   1037 
   1038 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1039 
   1040 	return 0;
   1041 }
   1042 
   1043 static int
   1044 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1045 {
   1046 	int     unit = raidunit(dev);
   1047 	int     error = 0;
   1048 	int     part, pmask, s;
   1049 	cfdata_t cf;
   1050 	struct raid_softc *rs;
   1051 	RF_Config_t *k_cfg, *u_cfg;
   1052 	RF_Raid_t *raidPtr;
   1053 	RF_RaidDisk_t *diskPtr;
   1054 	RF_AccTotals_t *totals;
   1055 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1056 	u_char *specific_buf;
   1057 	int retcode = 0;
   1058 	int column;
   1059 /*	int raidid; */
   1060 	struct rf_recon_req *rrcopy, *rr;
   1061 	RF_ComponentLabel_t *clabel;
   1062 	RF_ComponentLabel_t *ci_label;
   1063 	RF_ComponentLabel_t **clabel_ptr;
   1064 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1065 	RF_SingleComponent_t component;
   1066 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1067 	int i, j, d;
   1068 #ifdef __HAVE_OLD_DISKLABEL
   1069 	struct disklabel newlabel;
   1070 #endif
   1071 
   1072 	if ((rs = raidget(unit)) == NULL)
   1073 		return ENXIO;
   1074 	raidPtr = &rs->sc_r;
   1075 
   1076 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1077 		(int) DISKPART(dev), (int) unit, cmd));
   1078 
   1079 	/* Must be open for writes for these commands... */
   1080 	switch (cmd) {
   1081 #ifdef DIOCGSECTORSIZE
   1082 	case DIOCGSECTORSIZE:
   1083 		*(u_int *)data = raidPtr->bytesPerSector;
   1084 		return 0;
   1085 	case DIOCGMEDIASIZE:
   1086 		*(off_t *)data =
   1087 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1088 		return 0;
   1089 #endif
   1090 	case DIOCSDINFO:
   1091 	case DIOCWDINFO:
   1092 #ifdef __HAVE_OLD_DISKLABEL
   1093 	case ODIOCWDINFO:
   1094 	case ODIOCSDINFO:
   1095 #endif
   1096 	case DIOCWLABEL:
   1097 	case DIOCAWEDGE:
   1098 	case DIOCDWEDGE:
   1099 	case DIOCMWEDGES:
   1100 	case DIOCSSTRATEGY:
   1101 		if ((flag & FWRITE) == 0)
   1102 			return (EBADF);
   1103 	}
   1104 
   1105 	/* Must be initialized for these... */
   1106 	switch (cmd) {
   1107 	case DIOCGDINFO:
   1108 	case DIOCSDINFO:
   1109 	case DIOCWDINFO:
   1110 #ifdef __HAVE_OLD_DISKLABEL
   1111 	case ODIOCGDINFO:
   1112 	case ODIOCWDINFO:
   1113 	case ODIOCSDINFO:
   1114 	case ODIOCGDEFLABEL:
   1115 #endif
   1116 	case DIOCGPART:
   1117 	case DIOCWLABEL:
   1118 	case DIOCGDEFLABEL:
   1119 	case DIOCAWEDGE:
   1120 	case DIOCDWEDGE:
   1121 	case DIOCLWEDGES:
   1122 	case DIOCMWEDGES:
   1123 	case DIOCCACHESYNC:
   1124 	case RAIDFRAME_SHUTDOWN:
   1125 	case RAIDFRAME_REWRITEPARITY:
   1126 	case RAIDFRAME_GET_INFO:
   1127 	case RAIDFRAME_RESET_ACCTOTALS:
   1128 	case RAIDFRAME_GET_ACCTOTALS:
   1129 	case RAIDFRAME_KEEP_ACCTOTALS:
   1130 	case RAIDFRAME_GET_SIZE:
   1131 	case RAIDFRAME_FAIL_DISK:
   1132 	case RAIDFRAME_COPYBACK:
   1133 	case RAIDFRAME_CHECK_RECON_STATUS:
   1134 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1135 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1136 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1137 	case RAIDFRAME_ADD_HOT_SPARE:
   1138 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1139 	case RAIDFRAME_INIT_LABELS:
   1140 	case RAIDFRAME_REBUILD_IN_PLACE:
   1141 	case RAIDFRAME_CHECK_PARITY:
   1142 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1143 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1144 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1145 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1146 	case RAIDFRAME_SET_AUTOCONFIG:
   1147 	case RAIDFRAME_SET_ROOT:
   1148 	case RAIDFRAME_DELETE_COMPONENT:
   1149 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1150 	case RAIDFRAME_PARITYMAP_STATUS:
   1151 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1152 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1153 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1154 	case DIOCGSTRATEGY:
   1155 	case DIOCSSTRATEGY:
   1156 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1157 			return (ENXIO);
   1158 	}
   1159 
   1160 	switch (cmd) {
   1161 #ifdef COMPAT_50
   1162 	case RAIDFRAME_GET_INFO50:
   1163 		return rf_get_info50(raidPtr, data);
   1164 
   1165 	case RAIDFRAME_CONFIGURE50:
   1166 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1167 			return retcode;
   1168 		goto config;
   1169 #endif
   1170 		/* configure the system */
   1171 	case RAIDFRAME_CONFIGURE:
   1172 
   1173 		if (raidPtr->valid) {
   1174 			/* There is a valid RAID set running on this unit! */
   1175 			printf("raid%d: Device already configured!\n",unit);
   1176 			return(EINVAL);
   1177 		}
   1178 
   1179 		/* copy-in the configuration information */
   1180 		/* data points to a pointer to the configuration structure */
   1181 
   1182 		u_cfg = *((RF_Config_t **) data);
   1183 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1184 		if (k_cfg == NULL) {
   1185 			return (ENOMEM);
   1186 		}
   1187 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1188 		if (retcode) {
   1189 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1190 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1191 				retcode));
   1192 			return (retcode);
   1193 		}
   1194 		goto config;
   1195 	config:
   1196 		/* allocate a buffer for the layout-specific data, and copy it
   1197 		 * in */
   1198 		if (k_cfg->layoutSpecificSize) {
   1199 			if (k_cfg->layoutSpecificSize > 10000) {
   1200 				/* sanity check */
   1201 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1202 				return (EINVAL);
   1203 			}
   1204 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1205 			    (u_char *));
   1206 			if (specific_buf == NULL) {
   1207 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1208 				return (ENOMEM);
   1209 			}
   1210 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1211 			    k_cfg->layoutSpecificSize);
   1212 			if (retcode) {
   1213 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1214 				RF_Free(specific_buf,
   1215 					k_cfg->layoutSpecificSize);
   1216 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1217 					retcode));
   1218 				return (retcode);
   1219 			}
   1220 		} else
   1221 			specific_buf = NULL;
   1222 		k_cfg->layoutSpecific = specific_buf;
   1223 
   1224 		/* should do some kind of sanity check on the configuration.
   1225 		 * Store the sum of all the bytes in the last byte? */
   1226 
   1227 		/* configure the system */
   1228 
   1229 		/*
   1230 		 * Clear the entire RAID descriptor, just to make sure
   1231 		 *  there is no stale data left in the case of a
   1232 		 *  reconfiguration
   1233 		 */
   1234 		memset(raidPtr, 0, sizeof(*raidPtr));
   1235 		raidPtr->softc = rs;
   1236 		raidPtr->raidid = unit;
   1237 
   1238 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1239 
   1240 		if (retcode == 0) {
   1241 
   1242 			/* allow this many simultaneous IO's to
   1243 			   this RAID device */
   1244 			raidPtr->openings = RAIDOUTSTANDING;
   1245 
   1246 			raidinit(rs);
   1247 			rf_markalldirty(raidPtr);
   1248 		}
   1249 		/* free the buffers.  No return code here. */
   1250 		if (k_cfg->layoutSpecificSize) {
   1251 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1252 		}
   1253 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1254 
   1255 		return (retcode);
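		/*
		 * Userland sketch (illustrative only; raidctl(8) is the real
		 * consumer): since the argument is copied in as a pointer and
		 * dereferenced above, a caller hands in the address of its
		 * RF_Config_t pointer:
		 *
		 *	RF_Config_t cfg;		(filled in beforehand)
		 *	RF_Config_t *cfgp = &cfg;
		 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp);
		 */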
   1256 
   1257 		/* shutdown the system */
   1258 	case RAIDFRAME_SHUTDOWN:
   1259 
   1260 		part = DISKPART(dev);
   1261 		pmask = (1 << part);
   1262 
   1263 		if ((error = raidlock(rs)) != 0)
   1264 			return (error);
   1265 
   1266 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1267 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1268 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1269 			retcode = EBUSY;
   1270 		else {
   1271 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1272 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1273 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1274 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1275 			retcode = 0;
   1276 		}
   1277 
   1278 		raidunlock(rs);
   1279 
   1280 		if (retcode != 0)
   1281 			return retcode;
   1282 
   1283 		/* free the pseudo device attach bits */
   1284 
   1285 		cf = device_cfdata(rs->sc_dev);
   1286 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1287 			free(cf, M_RAIDFRAME);
   1288 
   1289 		return (retcode);
   1290 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1291 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1292 		/* need to read the component label for the disk indicated
   1293 		   by row,column in clabel */
   1294 
   1295 		/*
   1296 		 * Perhaps there should be an option to skip the in-core
   1297 		 * copy and hit the disk, as with disklabel(8).
   1298 		 */
   1299 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1300 
   1301 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1302 
   1303 		if (retcode) {
   1304 			RF_Free(clabel, sizeof(*clabel));
   1305 			return retcode;
   1306 		}
   1307 
   1308 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1309 
   1310 		column = clabel->column;
   1311 
   1312 		if ((column < 0) || (column >= raidPtr->numCol +
   1313 		    raidPtr->numSpare)) {
   1314 			RF_Free(clabel, sizeof(*clabel));
   1315 			return EINVAL;
   1316 		}
   1317 
   1318 		RF_Free(clabel, sizeof(*clabel));
   1319 
   1320 		clabel = raidget_component_label(raidPtr, column);
   1321 
   1322 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1323 
   1324 #if 0
   1325 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1326 		clabel = (RF_ComponentLabel_t *) data;
   1327 
   1328 		/* XXX check the label for valid stuff... */
   1329 		/* Note that some things *should not* get modified --
   1330 		   the user should be re-initing the labels instead of
   1331 		   trying to patch things.
   1332 		   */
   1333 
   1334 		raidid = raidPtr->raidid;
   1335 #ifdef DEBUG
   1336 		printf("raid%d: Got component label:\n", raidid);
   1337 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1338 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1339 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1340 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1341 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1342 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1343 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1344 #endif
   1345 		clabel->row = 0;
   1346 		column = clabel->column;
   1347 
   1348 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1349 			return(EINVAL);
   1350 		}
   1351 
   1352 		/* XXX this isn't allowed to do anything for now :-) */
   1353 
   1354 		/* XXX and before it is, we need to fill in the rest
   1355 		   of the fields!?!?!?! */
   1356 		memcpy(raidget_component_label(raidPtr, column),
   1357 		    clabel, sizeof(*clabel));
   1358 		raidflush_component_label(raidPtr, column);
   1359 		return (0);
   1360 #endif
   1361 
   1362 	case RAIDFRAME_INIT_LABELS:
   1363 		clabel = (RF_ComponentLabel_t *) data;
   1364 		/*
   1365 		   we only want the serial number from
   1366 		   the above.  We get all the rest of the information
   1367 		   from the config that was used to create this RAID
   1368 		   set.
   1369 		   */
   1370 
   1371 		raidPtr->serial_number = clabel->serial_number;
   1372 
   1373 		for(column=0;column<raidPtr->numCol;column++) {
   1374 			diskPtr = &raidPtr->Disks[column];
   1375 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1376 				ci_label = raidget_component_label(raidPtr,
   1377 				    column);
   1378 				/* Zeroing this is important. */
   1379 				memset(ci_label, 0, sizeof(*ci_label));
   1380 				raid_init_component_label(raidPtr, ci_label);
   1381 				ci_label->serial_number =
   1382 				    raidPtr->serial_number;
   1383 				ci_label->row = 0; /* we don't pretend to support more */
   1384 				rf_component_label_set_partitionsize(ci_label,
   1385 				    diskPtr->partitionSize);
   1386 				ci_label->column = column;
   1387 				raidflush_component_label(raidPtr, column);
   1388 			}
   1389 			/* XXXjld what about the spares? */
   1390 		}
   1391 
   1392 		return (retcode);
   1393 	case RAIDFRAME_SET_AUTOCONFIG:
   1394 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1395 		printf("raid%d: New autoconfig value is: %d\n",
   1396 		       raidPtr->raidid, d);
   1397 		*(int *) data = d;
   1398 		return (retcode);
   1399 
   1400 	case RAIDFRAME_SET_ROOT:
   1401 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1402 		printf("raid%d: New rootpartition value is: %d\n",
   1403 		       raidPtr->raidid, d);
   1404 		*(int *) data = d;
   1405 		return (retcode);
   1406 
   1407 		/* initialize all parity */
   1408 	case RAIDFRAME_REWRITEPARITY:
   1409 
   1410 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1411 			/* Parity for RAID 0 is trivially correct */
   1412 			raidPtr->parity_good = RF_RAID_CLEAN;
   1413 			return(0);
   1414 		}
   1415 
   1416 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1417 			/* Re-write is already in progress! */
   1418 			return(EINVAL);
   1419 		}
   1420 
   1421 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1422 					   rf_RewriteParityThread,
   1423 					   raidPtr,"raid_parity");
   1424 		return (retcode);
   1425 
   1426 
   1427 	case RAIDFRAME_ADD_HOT_SPARE:
   1428 		sparePtr = (RF_SingleComponent_t *) data;
   1429 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1430 		retcode = rf_add_hot_spare(raidPtr, &component);
   1431 		return(retcode);
   1432 
   1433 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1434 		return(retcode);
   1435 
   1436 	case RAIDFRAME_DELETE_COMPONENT:
   1437 		componentPtr = (RF_SingleComponent_t *)data;
   1438 		memcpy( &component, componentPtr,
   1439 			sizeof(RF_SingleComponent_t));
   1440 		retcode = rf_delete_component(raidPtr, &component);
   1441 		return(retcode);
   1442 
   1443 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1444 		componentPtr = (RF_SingleComponent_t *)data;
   1445 		memcpy( &component, componentPtr,
   1446 			sizeof(RF_SingleComponent_t));
   1447 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1448 		return(retcode);
   1449 
   1450 	case RAIDFRAME_REBUILD_IN_PLACE:
   1451 
   1452 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1453 			/* Can't do this on a RAID 0!! */
   1454 			return(EINVAL);
   1455 		}
   1456 
   1457 		if (raidPtr->recon_in_progress == 1) {
   1458 			/* a reconstruct is already in progress! */
   1459 			return(EINVAL);
   1460 		}
   1461 
   1462 		componentPtr = (RF_SingleComponent_t *) data;
   1463 		memcpy( &component, componentPtr,
   1464 			sizeof(RF_SingleComponent_t));
   1465 		component.row = 0; /* we don't support any more */
   1466 		column = component.column;
   1467 
   1468 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1469 			return(EINVAL);
   1470 		}
   1471 
   1472 		rf_lock_mutex2(raidPtr->mutex);
   1473 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1474 		    (raidPtr->numFailures > 0)) {
   1475 			/* XXX 0 above shouldn't be constant!!! */
   1476 			/* some component other than this has failed.
   1477 			   Let's not make things worse than they already
   1478 			   are... */
   1479 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1480 			       raidPtr->raidid);
   1481 			printf("raid%d:     Col: %d   Too many failures.\n",
   1482 			       raidPtr->raidid, column);
   1483 			rf_unlock_mutex2(raidPtr->mutex);
   1484 			return (EINVAL);
   1485 		}
   1486 		if (raidPtr->Disks[column].status ==
   1487 		    rf_ds_reconstructing) {
   1488 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1489 			       raidPtr->raidid);
   1490 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1491 
   1492 			rf_unlock_mutex2(raidPtr->mutex);
   1493 			return (EINVAL);
   1494 		}
   1495 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1496 			rf_unlock_mutex2(raidPtr->mutex);
   1497 			return (EINVAL);
   1498 		}
   1499 		rf_unlock_mutex2(raidPtr->mutex);
   1500 
   1501 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1502 		if (rrcopy == NULL)
   1503 			return(ENOMEM);
   1504 
   1505 		rrcopy->raidPtr = (void *) raidPtr;
   1506 		rrcopy->col = column;
   1507 
   1508 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1509 					   rf_ReconstructInPlaceThread,
   1510 					   rrcopy,"raid_reconip");
   1511 		return(retcode);
   1512 
   1513 	case RAIDFRAME_GET_INFO:
   1514 		if (!raidPtr->valid)
   1515 			return (ENODEV);
   1516 		ucfgp = (RF_DeviceConfig_t **) data;
   1517 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1518 			  (RF_DeviceConfig_t *));
   1519 		if (d_cfg == NULL)
   1520 			return (ENOMEM);
   1521 		d_cfg->rows = 1; /* there is only 1 row now */
   1522 		d_cfg->cols = raidPtr->numCol;
   1523 		d_cfg->ndevs = raidPtr->numCol;
   1524 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1525 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1526 			return (ENOMEM);
   1527 		}
   1528 		d_cfg->nspares = raidPtr->numSpare;
   1529 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1530 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1531 			return (ENOMEM);
   1532 		}
   1533 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1534 		d = 0;
   1535 		for (j = 0; j < d_cfg->cols; j++) {
   1536 			d_cfg->devs[d] = raidPtr->Disks[j];
   1537 			d++;
   1538 		}
   1539 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1540 			d_cfg->spares[i] = raidPtr->Disks[j];
   1541 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1542 				/* XXX: raidctl(8) expects to see this as a used spare */
   1543 				d_cfg->spares[i].status = rf_ds_used_spare;
   1544 			}
   1545 		}
   1546 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1547 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1548 
   1549 		return (retcode);
   1550 
   1551 	case RAIDFRAME_CHECK_PARITY:
   1552 		*(int *) data = raidPtr->parity_good;
   1553 		return (0);
   1554 
   1555 	case RAIDFRAME_PARITYMAP_STATUS:
   1556 		if (rf_paritymap_ineligible(raidPtr))
   1557 			return EINVAL;
   1558 		rf_paritymap_status(raidPtr->parity_map,
   1559 		    (struct rf_pmstat *)data);
   1560 		return 0;
   1561 
   1562 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1563 		if (rf_paritymap_ineligible(raidPtr))
   1564 			return EINVAL;
   1565 		if (raidPtr->parity_map == NULL)
   1566 			return ENOENT; /* ??? */
   1567 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1568 			(struct rf_pmparams *)data, 1))
   1569 			return EINVAL;
   1570 		return 0;
   1571 
   1572 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1573 		if (rf_paritymap_ineligible(raidPtr))
   1574 			return EINVAL;
   1575 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1576 		return 0;
   1577 
   1578 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1579 		if (rf_paritymap_ineligible(raidPtr))
   1580 			return EINVAL;
   1581 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1582 		/* XXX should errors be passed up? */
   1583 		return 0;
   1584 
   1585 	case RAIDFRAME_RESET_ACCTOTALS:
   1586 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1587 		return (0);
   1588 
   1589 	case RAIDFRAME_GET_ACCTOTALS:
   1590 		totals = (RF_AccTotals_t *) data;
   1591 		*totals = raidPtr->acc_totals;
   1592 		return (0);
   1593 
   1594 	case RAIDFRAME_KEEP_ACCTOTALS:
   1595 		raidPtr->keep_acc_totals = *(int *)data;
   1596 		return (0);
   1597 
   1598 	case RAIDFRAME_GET_SIZE:
   1599 		*(int *) data = raidPtr->totalSectors;
   1600 		return (0);
   1601 
   1602 		/* fail a disk & optionally start reconstruction */
   1603 	case RAIDFRAME_FAIL_DISK:
   1604 
   1605 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1606 			/* Can't do this on a RAID 0!! */
   1607 			return(EINVAL);
   1608 		}
   1609 
   1610 		rr = (struct rf_recon_req *) data;
   1611 		rr->row = 0;
   1612 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1613 			return (EINVAL);
   1614 
   1615 
   1616 		rf_lock_mutex2(raidPtr->mutex);
   1617 		if (raidPtr->status == rf_rs_reconstructing) {
   1618 			/* you can't fail a disk while we're reconstructing! */
   1619 			/* XXX wrong for RAID6 */
   1620 			rf_unlock_mutex2(raidPtr->mutex);
   1621 			return (EINVAL);
   1622 		}
   1623 		if ((raidPtr->Disks[rr->col].status ==
   1624 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1625 			/* some other component has failed.  Let's not make
   1626 			   things worse. XXX wrong for RAID6 */
   1627 			rf_unlock_mutex2(raidPtr->mutex);
   1628 			return (EINVAL);
   1629 		}
   1630 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1631 			/* Can't fail a spared disk! */
   1632 			rf_unlock_mutex2(raidPtr->mutex);
   1633 			return (EINVAL);
   1634 		}
   1635 		rf_unlock_mutex2(raidPtr->mutex);
   1636 
   1637 		/* make a copy of the recon request so that we don't rely on
   1638 		 * the user's buffer */
   1639 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1640 		if (rrcopy == NULL)
   1641 			return(ENOMEM);
   1642 		memcpy(rrcopy, rr, sizeof(*rr));
   1643 		rrcopy->raidPtr = (void *) raidPtr;
   1644 
   1645 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1646 					   rf_ReconThread,
    1647 					   rrcopy, "raid_recon");
    1648 		return (retcode);
   1649 
   1650 		/* invoke a copyback operation after recon on whatever disk
   1651 		 * needs it, if any */
   1652 	case RAIDFRAME_COPYBACK:
   1653 
   1654 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1655 			/* This makes no sense on a RAID 0!! */
   1656 			return(EINVAL);
   1657 		}
   1658 
   1659 		if (raidPtr->copyback_in_progress == 1) {
   1660 			/* Copyback is already in progress! */
   1661 			return(EINVAL);
   1662 		}
   1663 
   1664 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1665 					   rf_CopybackThread,
   1666 					   raidPtr,"raid_copyback");
   1667 		return (retcode);
   1668 
   1669 		/* return the percentage completion of reconstruction */
   1670 	case RAIDFRAME_CHECK_RECON_STATUS:
   1671 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1672 			/* This makes no sense on a RAID 0, so tell the
   1673 			   user it's done. */
   1674 			*(int *) data = 100;
   1675 			return(0);
   1676 		}
   1677 		if (raidPtr->status != rf_rs_reconstructing)
   1678 			*(int *) data = 100;
   1679 		else {
   1680 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1681 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1682 			} else {
   1683 				*(int *) data = 0;
   1684 			}
   1685 		}
   1686 		return (0);
   1687 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1688 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1689 		if (raidPtr->status != rf_rs_reconstructing) {
   1690 			progressInfo.remaining = 0;
   1691 			progressInfo.completed = 100;
   1692 			progressInfo.total = 100;
   1693 		} else {
   1694 			progressInfo.total =
   1695 				raidPtr->reconControl->numRUsTotal;
   1696 			progressInfo.completed =
   1697 				raidPtr->reconControl->numRUsComplete;
   1698 			progressInfo.remaining = progressInfo.total -
   1699 				progressInfo.completed;
   1700 		}
   1701 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1702 				  sizeof(RF_ProgressInfo_t));
   1703 		return (retcode);
   1704 
   1705 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1706 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1707 			/* This makes no sense on a RAID 0, so tell the
   1708 			   user it's done. */
   1709 			*(int *) data = 100;
   1710 			return(0);
   1711 		}
   1712 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1713 			*(int *) data = 100 *
   1714 				raidPtr->parity_rewrite_stripes_done /
   1715 				raidPtr->Layout.numStripe;
   1716 		} else {
   1717 			*(int *) data = 100;
   1718 		}
   1719 		return (0);
   1720 
   1721 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1722 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1723 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1724 			progressInfo.total = raidPtr->Layout.numStripe;
   1725 			progressInfo.completed =
   1726 				raidPtr->parity_rewrite_stripes_done;
   1727 			progressInfo.remaining = progressInfo.total -
   1728 				progressInfo.completed;
   1729 		} else {
   1730 			progressInfo.remaining = 0;
   1731 			progressInfo.completed = 100;
   1732 			progressInfo.total = 100;
   1733 		}
   1734 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1735 				  sizeof(RF_ProgressInfo_t));
   1736 		return (retcode);
   1737 
   1738 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1739 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1740 			/* This makes no sense on a RAID 0 */
   1741 			*(int *) data = 100;
   1742 			return(0);
   1743 		}
   1744 		if (raidPtr->copyback_in_progress == 1) {
   1745 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1746 				raidPtr->Layout.numStripe;
   1747 		} else {
   1748 			*(int *) data = 100;
   1749 		}
   1750 		return (0);
   1751 
   1752 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1753 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1754 		if (raidPtr->copyback_in_progress == 1) {
   1755 			progressInfo.total = raidPtr->Layout.numStripe;
   1756 			progressInfo.completed =
   1757 				raidPtr->copyback_stripes_done;
   1758 			progressInfo.remaining = progressInfo.total -
   1759 				progressInfo.completed;
   1760 		} else {
   1761 			progressInfo.remaining = 0;
   1762 			progressInfo.completed = 100;
   1763 			progressInfo.total = 100;
   1764 		}
   1765 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1766 				  sizeof(RF_ProgressInfo_t));
   1767 		return (retcode);
   1768 
   1769 		/* the sparetable daemon calls this to wait for the kernel to
   1770 		 * need a spare table. this ioctl does not return until a
   1771 		 * spare table is needed. XXX -- calling mpsleep here in the
   1772 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1773 		 * -- I should either compute the spare table in the kernel,
   1774 		 * or have a different -- XXX XXX -- interface (a different
   1775 		 * character device) for delivering the table     -- XXX */
   1776 #if 0
   1777 	case RAIDFRAME_SPARET_WAIT:
   1778 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1779 		while (!rf_sparet_wait_queue)
   1780 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1781 		waitreq = rf_sparet_wait_queue;
   1782 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1783 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1784 
   1785 		/* structure assignment */
   1786 		*((RF_SparetWait_t *) data) = *waitreq;
   1787 
   1788 		RF_Free(waitreq, sizeof(*waitreq));
   1789 		return (0);
   1790 
   1791 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1792 		 * code in it that will cause the daemon to exit */
   1793 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1794 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1795 		waitreq->fcol = -1;
   1796 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1797 		waitreq->next = rf_sparet_wait_queue;
   1798 		rf_sparet_wait_queue = waitreq;
    1799 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1800 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1801 		return (0);
   1802 
   1803 		/* used by the spare table daemon to deliver a spare table
   1804 		 * into the kernel */
   1805 	case RAIDFRAME_SEND_SPARET:
   1806 
   1807 		/* install the spare table */
   1808 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1809 
   1810 		/* respond to the requestor.  the return status of the spare
   1811 		 * table installation is passed in the "fcol" field */
   1812 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1813 		waitreq->fcol = retcode;
   1814 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1815 		waitreq->next = rf_sparet_resp_queue;
   1816 		rf_sparet_resp_queue = waitreq;
   1817 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1818 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1819 
   1820 		return (retcode);
   1821 #endif
   1822 
   1823 	default:
   1824 		break; /* fall through to the os-specific code below */
   1825 
   1826 	}
   1827 
   1828 	if (!raidPtr->valid)
   1829 		return (EINVAL);
   1830 
   1831 	/*
   1832 	 * Add support for "regular" device ioctls here.
   1833 	 */
   1834 
   1835 	error = disk_ioctl(&rs->sc_dkdev, dev, cmd, data, flag, l);
   1836 	if (error != EPASSTHROUGH)
   1837 		return (error);
   1838 
   1839 	switch (cmd) {
   1840 	case DIOCWDINFO:
   1841 	case DIOCSDINFO:
   1842 #ifdef __HAVE_OLD_DISKLABEL
   1843 	case ODIOCWDINFO:
   1844 	case ODIOCSDINFO:
   1845 #endif
   1846 	{
   1847 		struct disklabel *lp;
   1848 #ifdef __HAVE_OLD_DISKLABEL
   1849 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1850 			memset(&newlabel, 0, sizeof newlabel);
   1851 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1852 			lp = &newlabel;
   1853 		} else
   1854 #endif
   1855 		lp = (struct disklabel *)data;
   1856 
   1857 		if ((error = raidlock(rs)) != 0)
   1858 			return (error);
   1859 
   1860 		rs->sc_flags |= RAIDF_LABELLING;
   1861 
   1862 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1863 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1864 		if (error == 0) {
   1865 			if (cmd == DIOCWDINFO
   1866 #ifdef __HAVE_OLD_DISKLABEL
   1867 			    || cmd == ODIOCWDINFO
   1868 #endif
   1869 			   )
   1870 				error = writedisklabel(RAIDLABELDEV(dev),
   1871 				    raidstrategy, rs->sc_dkdev.dk_label,
   1872 				    rs->sc_dkdev.dk_cpulabel);
   1873 		}
   1874 		rs->sc_flags &= ~RAIDF_LABELLING;
   1875 
   1876 		raidunlock(rs);
   1877 
   1878 		if (error)
   1879 			return (error);
   1880 		break;
   1881 	}
   1882 
   1883 	case DIOCWLABEL:
   1884 		if (*(int *) data != 0)
   1885 			rs->sc_flags |= RAIDF_WLABEL;
   1886 		else
   1887 			rs->sc_flags &= ~RAIDF_WLABEL;
   1888 		break;
   1889 
   1890 	case DIOCGDEFLABEL:
   1891 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1892 		break;
   1893 
   1894 #ifdef __HAVE_OLD_DISKLABEL
   1895 	case ODIOCGDEFLABEL:
   1896 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1897 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1898 			return ENOTTY;
   1899 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1900 		break;
   1901 #endif
   1902 
   1903 	case DIOCCACHESYNC:
   1904 		return rf_sync_component_caches(raidPtr);
   1905 
   1906 	case DIOCGSTRATEGY:
   1907 	    {
   1908 		struct disk_strategy *dks = (void *)data;
   1909 
   1910 		s = splbio();
   1911 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1912 		    sizeof(dks->dks_name));
   1913 		splx(s);
   1914 		dks->dks_paramlen = 0;
   1915 
   1916 		return 0;
   1917 	    }
   1918 
   1919 	case DIOCSSTRATEGY:
   1920 	    {
   1921 		struct disk_strategy *dks = (void *)data;
   1922 		struct bufq_state *new;
   1923 		struct bufq_state *old;
   1924 
   1925 		if (dks->dks_param != NULL) {
   1926 			return EINVAL;
   1927 		}
   1928 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1929 		error = bufq_alloc(&new, dks->dks_name,
   1930 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1931 		if (error) {
   1932 			return error;
   1933 		}
   1934 		s = splbio();
   1935 		old = rs->buf_queue;
   1936 		bufq_move(new, old);
   1937 		rs->buf_queue = new;
   1938 		splx(s);
   1939 		bufq_free(old);
   1940 
   1941 		return 0;
   1942 	    }
   1943 
   1944 	default:
   1945 		retcode = ENOTTY;
   1946 	}
   1947 	return (retcode);
   1948 
   1949 }
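/*
 * Illustrative sketch, not part of the driver: how a userland consumer
 * in the style of raidctl(8) might use the *_STATUS_EXT ioctls handled
 * above.  The handler treats the ioctl argument as a pointer to a
 * pointer and copyout()s the RF_ProgressInfo_t through it, so the
 * caller passes the address of a pointer to its own buffer.  The header
 * path below is an assumption about where the ioctl definitions live.
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>

#include <stdint.h>
#include <stdio.h>

#include <dev/raidframe/raidframeio.h>	/* assumed ioctl definitions */

static int
print_recon_progress(int fd)
{
	RF_ProgressInfo_t info, *infop = &info;

	/* The kernel copies the progress info out through *infop. */
	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS_EXT, &infop) == -1)
		return -1;
	printf("reconstruction: %ju of %ju RUs complete\n",
	    (uintmax_t)info.completed, (uintmax_t)info.total);
	return 0;
}
#endif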
   1950 
   1951 
   1952 /* raidinit -- complete the rest of the initialization for the
   1953    RAIDframe device.  */
   1954 
   1955 
   1956 static void
   1957 raidinit(struct raid_softc *rs)
   1958 {
   1959 	cfdata_t cf;
   1960 	int     unit;
   1961 	RF_Raid_t *raidPtr = &rs->sc_r;
   1962 
   1963 	unit = raidPtr->raidid;
   1964 
   1965 
   1966 	/* XXX should check return code first... */
   1967 	rs->sc_flags |= RAIDF_INITED;
   1968 
   1969 	/* XXX doesn't check bounds. */
   1970 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1971 
   1972 	/* attach the pseudo device */
   1973 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1974 	cf->cf_name = raid_cd.cd_name;
   1975 	cf->cf_atname = raid_cd.cd_name;
   1976 	cf->cf_unit = unit;
   1977 	cf->cf_fstate = FSTATE_STAR;
   1978 
   1979 	rs->sc_dev = config_attach_pseudo(cf);
   1980 
   1981 	if (rs->sc_dev == NULL) {
   1982 		printf("raid%d: config_attach_pseudo failed\n",
   1983 		    raidPtr->raidid);
   1984 		rs->sc_flags &= ~RAIDF_INITED;
   1985 		free(cf, M_RAIDFRAME);
   1986 		return;
   1987 	}
   1988 
   1989 	/* disk_attach actually creates space for the CPU disklabel, among
   1990 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1991 	 * with disklabels. */
   1992 
   1993 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1994 	disk_attach(&rs->sc_dkdev);
   1995 
   1996 	/* XXX There may be a weird interaction here between this, and
   1997 	 * protectedSectors, as used in RAIDframe.  */
   1998 
   1999 	rs->sc_size = raidPtr->totalSectors;
   2000 
   2001 	rf_set_geometry(rs, raidPtr);
   2002 
   2003 	dkwedge_discover(&rs->sc_dkdev);
   2004 
   2005 }
   2006 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2007 /* wake up the daemon & tell it to get us a spare table
   2008  * XXX
   2009  * the entries in the queues should be tagged with the raidPtr
   2010  * so that in the extremely rare case that two recons happen at once,
    2011  * we know for which device we're requesting a spare table
   2012  * XXX
   2013  *
   2014  * XXX This code is not currently used. GO
   2015  */
   2016 int
   2017 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   2018 {
   2019 	int     retcode;
   2020 
   2021 	rf_lock_mutex2(rf_sparet_wait_mutex);
   2022 	req->next = rf_sparet_wait_queue;
   2023 	rf_sparet_wait_queue = req;
   2024 	rf_broadcast_cond2(rf_sparet_wait_cv);
   2025 
    2026 	/* rf_wait_cond2() drops the mutex while we sleep */
   2027 	while (!rf_sparet_resp_queue) {
   2028 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   2029 	}
   2030 	req = rf_sparet_resp_queue;
   2031 	rf_sparet_resp_queue = req->next;
   2032 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   2033 
   2034 	retcode = req->fcol;
   2035 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2036 					 * alloc'd */
   2037 	return (retcode);
   2038 }
   2039 #endif
   2040 
   2041 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2042  * bp & passes it down.
    2043  * Any calls originating in the kernel must use non-blocking I/O.
    2044  * We do some extra sanity checking to return "appropriate" error values
    2045  * for certain conditions (to make some standard utilities work).
   2046  *
   2047  * Formerly known as: rf_DoAccessKernel
   2048  */
   2049 void
   2050 raidstart(RF_Raid_t *raidPtr)
   2051 {
   2052 	RF_SectorCount_t num_blocks, pb, sum;
   2053 	RF_RaidAddr_t raid_addr;
   2054 	struct partition *pp;
   2055 	daddr_t blocknum;
   2056 	struct raid_softc *rs;
   2057 	int     do_async;
   2058 	struct buf *bp;
   2059 	int rc;
   2060 
   2061 	rs = raidPtr->softc;
   2062 	/* quick check to see if anything has died recently */
   2063 	rf_lock_mutex2(raidPtr->mutex);
   2064 	if (raidPtr->numNewFailures > 0) {
   2065 		rf_unlock_mutex2(raidPtr->mutex);
   2066 		rf_update_component_labels(raidPtr,
   2067 					   RF_NORMAL_COMPONENT_UPDATE);
   2068 		rf_lock_mutex2(raidPtr->mutex);
   2069 		raidPtr->numNewFailures--;
   2070 	}
   2071 
   2072 	/* Check to see if we're at the limit... */
   2073 	while (raidPtr->openings > 0) {
   2074 		rf_unlock_mutex2(raidPtr->mutex);
   2075 
   2076 		/* get the next item, if any, from the queue */
   2077 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2078 			/* nothing more to do */
   2079 			return;
   2080 		}
   2081 
   2082 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2083 		 * partition.. Need to make it absolute to the underlying
   2084 		 * device.. */
   2085 
   2086 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2087 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2088 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2089 			blocknum += pp->p_offset;
   2090 		}
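		/*
		 * The shifts above convert the DEV_BSIZE-unit b_blkno into
		 * RAID sectors: e.g. assuming DEV_BSHIFT is 9 (512-byte
		 * blocks) and 4096-byte RAID sectors (logBytesPerSector of
		 * 12), block 80 becomes sector 10.  The partition offset,
		 * which the fabricated disklabel keeps in RAID sectors as
		 * well, is then added for non-raw partitions.
		 */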
   2091 
   2092 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2093 			    (int) blocknum));
   2094 
   2095 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2096 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2097 
   2098 		/* *THIS* is where we adjust what block we're going to...
   2099 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2100 		raid_addr = blocknum;
   2101 
   2102 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2103 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2104 		sum = raid_addr + num_blocks + pb;
   2105 		if (1 || rf_debugKernelAccess) {
   2106 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2107 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2108 				    (int) pb, (int) bp->b_resid));
   2109 		}
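		/*
		 * Reject requests that run past the end of the set.  The
		 * additional (sum < ...) comparisons catch unsigned
		 * wraparound in the addition above, so an absurdly large
		 * request cannot slip past the totalSectors check.
		 */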
   2110 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2111 		    || (sum < num_blocks) || (sum < pb)) {
   2112 			bp->b_error = ENOSPC;
   2113 			bp->b_resid = bp->b_bcount;
   2114 			biodone(bp);
   2115 			rf_lock_mutex2(raidPtr->mutex);
   2116 			continue;
   2117 		}
   2118 		/*
   2119 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2120 		 */
   2121 
   2122 		if (bp->b_bcount & raidPtr->sectorMask) {
   2123 			bp->b_error = EINVAL;
   2124 			bp->b_resid = bp->b_bcount;
   2125 			biodone(bp);
   2126 			rf_lock_mutex2(raidPtr->mutex);
   2127 			continue;
   2128 
   2129 		}
   2130 		db1_printf(("Calling DoAccess..\n"));
   2131 
   2132 
   2133 		rf_lock_mutex2(raidPtr->mutex);
   2134 		raidPtr->openings--;
   2135 		rf_unlock_mutex2(raidPtr->mutex);
   2136 
   2137 		/*
   2138 		 * Everything is async.
   2139 		 */
   2140 		do_async = 1;
   2141 
   2142 		disk_busy(&rs->sc_dkdev);
   2143 
   2144 		/* XXX we're still at splbio() here... do we *really*
   2145 		   need to be? */
   2146 
   2147 		/* don't ever condition on bp->b_flags & B_WRITE.
   2148 		 * always condition on B_READ instead */
   2149 
   2150 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2151 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2152 				 do_async, raid_addr, num_blocks,
   2153 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2154 
   2155 		if (rc) {
   2156 			bp->b_error = rc;
   2157 			bp->b_resid = bp->b_bcount;
   2158 			biodone(bp);
   2159 			/* continue loop */
   2160 		}
   2161 
   2162 		rf_lock_mutex2(raidPtr->mutex);
   2163 	}
   2164 	rf_unlock_mutex2(raidPtr->mutex);
   2165 }
   2166 
   2167 
   2168 
   2169 
   2170 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2171 
   2172 int
   2173 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2174 {
   2175 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2176 	struct buf *bp;
   2177 
   2178 	req->queue = queue;
   2179 	bp = req->bp;
   2180 
   2181 	switch (req->type) {
   2182 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2183 		/* XXX need to do something extra here.. */
   2184 		/* I'm leaving this in, as I've never actually seen it used,
   2185 		 * and I'd like folks to report it... GO */
    2186 		printf("WAKEUP CALLED\n");
   2187 		queue->numOutstanding++;
   2188 
   2189 		bp->b_flags = 0;
   2190 		bp->b_private = req;
   2191 
   2192 		KernelWakeupFunc(bp);
   2193 		break;
   2194 
   2195 	case RF_IO_TYPE_READ:
   2196 	case RF_IO_TYPE_WRITE:
   2197 #if RF_ACC_TRACE > 0
   2198 		if (req->tracerec) {
   2199 			RF_ETIMER_START(req->tracerec->timer);
   2200 		}
   2201 #endif
   2202 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2203 		    op, queue->rf_cinfo->ci_dev,
   2204 		    req->sectorOffset, req->numSector,
   2205 		    req->buf, KernelWakeupFunc, (void *) req,
   2206 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2207 
   2208 		if (rf_debugKernelAccess) {
   2209 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2210 				(long) bp->b_blkno));
   2211 		}
   2212 		queue->numOutstanding++;
   2213 		queue->last_deq_sector = req->sectorOffset;
   2214 		/* acc wouldn't have been let in if there were any pending
   2215 		 * reqs at any other priority */
   2216 		queue->curPriority = req->priority;
   2217 
   2218 		db1_printf(("Going for %c to unit %d col %d\n",
   2219 			    req->type, queue->raidPtr->raidid,
   2220 			    queue->col));
   2221 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2222 			(int) req->sectorOffset, (int) req->numSector,
   2223 			(int) (req->numSector <<
   2224 			    queue->raidPtr->logBytesPerSector),
   2225 			(int) queue->raidPtr->logBytesPerSector));
   2226 
   2227 		/*
   2228 		 * XXX: drop lock here since this can block at
   2229 		 * least with backing SCSI devices.  Retake it
   2230 		 * to minimize fuss with calling interfaces.
   2231 		 */
   2232 
   2233 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2234 		bdev_strategy(bp);
   2235 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2236 		break;
   2237 
   2238 	default:
   2239 		panic("bad req->type in rf_DispatchKernelIO");
   2240 	}
   2241 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2242 
   2243 	return (0);
   2244 }
    2245 /* this is the callback function associated with an I/O invoked from
   2246    kernel code.
   2247  */
   2248 static void
   2249 KernelWakeupFunc(struct buf *bp)
   2250 {
   2251 	RF_DiskQueueData_t *req = NULL;
   2252 	RF_DiskQueue_t *queue;
   2253 
   2254 	db1_printf(("recovering the request queue:\n"));
   2255 
   2256 	req = bp->b_private;
   2257 
   2258 	queue = (RF_DiskQueue_t *) req->queue;
   2259 
   2260 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2261 
   2262 #if RF_ACC_TRACE > 0
   2263 	if (req->tracerec) {
   2264 		RF_ETIMER_STOP(req->tracerec->timer);
   2265 		RF_ETIMER_EVAL(req->tracerec->timer);
   2266 		rf_lock_mutex2(rf_tracing_mutex);
   2267 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2268 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2269 		req->tracerec->num_phys_ios++;
   2270 		rf_unlock_mutex2(rf_tracing_mutex);
   2271 	}
   2272 #endif
   2273 
   2274 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2275 	 * ballistic, and mark the component as hosed... */
   2276 
   2277 	if (bp->b_error != 0) {
   2278 		/* Mark the disk as dead */
   2279 		/* but only mark it once... */
   2280 		/* and only if it wouldn't leave this RAID set
   2281 		   completely broken */
   2282 		if (((queue->raidPtr->Disks[queue->col].status ==
   2283 		      rf_ds_optimal) ||
   2284 		     (queue->raidPtr->Disks[queue->col].status ==
   2285 		      rf_ds_used_spare)) &&
   2286 		     (queue->raidPtr->numFailures <
   2287 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2288 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2289 			       queue->raidPtr->raidid,
   2290 			       bp->b_error,
   2291 			       queue->raidPtr->Disks[queue->col].devname);
   2292 			queue->raidPtr->Disks[queue->col].status =
   2293 			    rf_ds_failed;
   2294 			queue->raidPtr->status = rf_rs_degraded;
   2295 			queue->raidPtr->numFailures++;
   2296 			queue->raidPtr->numNewFailures++;
   2297 		} else {	/* Disk is already dead... */
   2298 			/* printf("Disk already marked as dead!\n"); */
   2299 		}
   2300 
   2301 	}
   2302 
   2303 	/* Fill in the error value */
   2304 	req->error = bp->b_error;
   2305 
   2306 	/* Drop this one on the "finished" queue... */
   2307 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2308 
   2309 	/* Let the raidio thread know there is work to be done. */
   2310 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2311 
   2312 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2313 }
   2314 
   2315 
   2316 /*
   2317  * initialize a buf structure for doing an I/O in the kernel.
   2318  */
   2319 static void
   2320 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2321        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2322        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2323        struct proc *b_proc)
   2324 {
   2325 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2326 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2327 	bp->b_oflags = 0;
   2328 	bp->b_cflags = 0;
   2329 	bp->b_bcount = numSect << logBytesPerSector;
   2330 	bp->b_bufsize = bp->b_bcount;
   2331 	bp->b_error = 0;
   2332 	bp->b_dev = dev;
   2333 	bp->b_data = bf;
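	/*
	 * b_blkno is kept in DEV_BSIZE units: convert the RAID sector
	 * number to bytes and back down to DEV_BSIZE blocks.  E.g.
	 * assuming DEV_BSHIFT is 9 and 2048-byte RAID sectors
	 * (logBytesPerSector of 11), sector 10 becomes block 40.
	 */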
   2334 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2335 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2336 	if (bp->b_bcount == 0) {
   2337 		panic("bp->b_bcount is zero in InitBP!!");
   2338 	}
   2339 	bp->b_proc = b_proc;
   2340 	bp->b_iodone = cbFunc;
   2341 	bp->b_private = cbArg;
   2342 }
   2343 
   2344 static void
   2345 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2346 		    struct disklabel *lp)
   2347 {
   2348 	memset(lp, 0, sizeof(*lp));
   2349 
   2350 	/* fabricate a label... */
   2351 	if (raidPtr->totalSectors > UINT32_MAX)
   2352 		lp->d_secperunit = UINT32_MAX;
   2353 	else
   2354 		lp->d_secperunit = raidPtr->totalSectors;
   2355 	lp->d_secsize = raidPtr->bytesPerSector;
   2356 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2357 	lp->d_ntracks = 4 * raidPtr->numCol;
   2358 	lp->d_ncylinders = raidPtr->totalSectors /
   2359 		(lp->d_nsectors * lp->d_ntracks);
   2360 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2361 
   2362 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2363 	lp->d_type = DKTYPE_RAID;
   2364 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2365 	lp->d_rpm = 3600;
   2366 	lp->d_interleave = 1;
   2367 	lp->d_flags = 0;
   2368 
   2369 	lp->d_partitions[RAW_PART].p_offset = 0;
   2370 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2371 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2372 	lp->d_npartitions = RAW_PART + 1;
   2373 
   2374 	lp->d_magic = DISKMAGIC;
   2375 	lp->d_magic2 = DISKMAGIC;
    2376 	lp->d_checksum = dkcksum(lp);	/* checksum the label we just built */
   2377 
   2378 }
   2379 /*
   2380  * Read the disklabel from the raid device.  If one is not present, fake one
   2381  * up.
   2382  */
   2383 static void
   2384 raidgetdisklabel(dev_t dev)
   2385 {
   2386 	int     unit = raidunit(dev);
   2387 	struct raid_softc *rs;
   2388 	const char   *errstring;
   2389 	struct disklabel *lp;
   2390 	struct cpu_disklabel *clp;
   2391 	RF_Raid_t *raidPtr;
   2392 
   2393 	if ((rs = raidget(unit)) == NULL)
   2394 		return;
   2395 
   2396 	lp = rs->sc_dkdev.dk_label;
   2397 	clp = rs->sc_dkdev.dk_cpulabel;
   2398 
   2399 	db1_printf(("Getting the disklabel...\n"));
   2400 
   2401 	memset(clp, 0, sizeof(*clp));
   2402 
   2403 	raidPtr = &rs->sc_r;
   2404 
   2405 	raidgetdefaultlabel(raidPtr, rs, lp);
   2406 
   2407 	/*
   2408 	 * Call the generic disklabel extraction routine.
   2409 	 */
   2410 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2411 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2412 	if (errstring)
   2413 		raidmakedisklabel(rs);
   2414 	else {
   2415 		int     i;
   2416 		struct partition *pp;
   2417 
   2418 		/*
   2419 		 * Sanity check whether the found disklabel is valid.
   2420 		 *
    2421 		 * This is necessary since the total size of the raid device
    2422 		 * may vary when the interleave is changed, even though exactly
    2423 		 * the same components are used, and an old disklabel may be
    2424 		 * used if one is found.
   2425 		 */
   2426 		if (lp->d_secperunit < UINT32_MAX ?
   2427 		    lp->d_secperunit != rs->sc_size :
   2428 		    lp->d_secperunit > rs->sc_size)
   2429 			printf("raid%d: WARNING: %s: "
   2430 			    "total sector size in disklabel (%ju) != "
   2431 			    "the size of raid (%ju)\n", unit, rs->sc_xname,
   2432 			    (uintmax_t)lp->d_secperunit,
   2433 			    (uintmax_t)rs->sc_size);
   2434 		for (i = 0; i < lp->d_npartitions; i++) {
   2435 			pp = &lp->d_partitions[i];
   2436 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2437 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2438 				       "exceeds the size of raid (%ju)\n",
   2439 				       unit, rs->sc_xname, 'a' + i,
   2440 				       (uintmax_t)rs->sc_size);
   2441 		}
   2442 	}
   2443 
   2444 }
   2445 /*
   2446  * Take care of things one might want to take care of in the event
   2447  * that a disklabel isn't present.
   2448  */
   2449 static void
   2450 raidmakedisklabel(struct raid_softc *rs)
   2451 {
   2452 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2453 	db1_printf(("Making a label..\n"));
   2454 
   2455 	/*
   2456 	 * For historical reasons, if there's no disklabel present
   2457 	 * the raw partition must be marked FS_BSDFFS.
   2458 	 */
   2459 
   2460 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2461 
   2462 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2463 
   2464 	lp->d_checksum = dkcksum(lp);
   2465 }
   2466 /*
   2467  * Wait interruptibly for an exclusive lock.
   2468  *
   2469  * XXX
   2470  * Several drivers do this; it should be abstracted and made MP-safe.
   2471  * (Hmm... where have we seen this warning before :->  GO )
   2472  */
   2473 static int
   2474 raidlock(struct raid_softc *rs)
   2475 {
   2476 	int     error;
   2477 
   2478 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2479 		rs->sc_flags |= RAIDF_WANTED;
   2480 		if ((error =
   2481 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2482 			return (error);
   2483 	}
   2484 	rs->sc_flags |= RAIDF_LOCKED;
   2485 	return (0);
   2486 }
   2487 /*
   2488  * Unlock and wake up any waiters.
   2489  */
   2490 static void
   2491 raidunlock(struct raid_softc *rs)
   2492 {
   2493 
   2494 	rs->sc_flags &= ~RAIDF_LOCKED;
   2495 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2496 		rs->sc_flags &= ~RAIDF_WANTED;
   2497 		wakeup(rs);
   2498 	}
   2499 }
   2500 
   2501 
   2502 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2503 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2504 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2505 
   2506 static daddr_t
   2507 rf_component_info_offset(void)
   2508 {
   2509 
   2510 	return RF_COMPONENT_INFO_OFFSET;
   2511 }
   2512 
   2513 static daddr_t
   2514 rf_component_info_size(unsigned secsize)
   2515 {
   2516 	daddr_t info_size;
   2517 
   2518 	KASSERT(secsize);
   2519 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2520 		info_size = secsize;
   2521 	else
   2522 		info_size = RF_COMPONENT_INFO_SIZE;
   2523 
   2524 	return info_size;
   2525 }
   2526 
   2527 static daddr_t
   2528 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2529 {
   2530 	daddr_t map_offset;
   2531 
   2532 	KASSERT(raidPtr->bytesPerSector);
   2533 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2534 		map_offset = raidPtr->bytesPerSector;
   2535 	else
   2536 		map_offset = RF_COMPONENT_INFO_SIZE;
   2537 	map_offset += rf_component_info_offset();
   2538 
   2539 	return map_offset;
   2540 }
   2541 
   2542 static daddr_t
   2543 rf_parity_map_size(RF_Raid_t *raidPtr)
   2544 {
   2545 	daddr_t map_size;
   2546 
   2547 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2548 		map_size = raidPtr->bytesPerSector;
   2549 	else
   2550 		map_size = RF_PARITY_MAP_SIZE;
   2551 
   2552 	return map_size;
   2553 }
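/*
 * The helpers above imply the following per-component metadata layout
 * (each area is rounded up to a full sector when the sector size
 * exceeds the nominal value):
 *
 *	byte 16384 (RF_COMPONENT_INFO_OFFSET):
 *		component label, max(RF_COMPONENT_INFO_SIZE, sector size)
 *	immediately following:
 *		parity map, max(RF_PARITYMAP_NBYTE, sector size)
 *
 * For example, with 512-byte sectors the label area occupies bytes
 * 16384..17407 and the parity map begins at byte 17408; with 4096-byte
 * sectors the label area grows to a full sector and the parity map
 * begins at byte 20480.
 */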
   2554 
   2555 int
   2556 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2557 {
   2558 	RF_ComponentLabel_t *clabel;
   2559 
   2560 	clabel = raidget_component_label(raidPtr, col);
   2561 	clabel->clean = RF_RAID_CLEAN;
   2562 	raidflush_component_label(raidPtr, col);
   2563 	return(0);
   2564 }
   2565 
   2566 
   2567 int
   2568 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2569 {
   2570 	RF_ComponentLabel_t *clabel;
   2571 
   2572 	clabel = raidget_component_label(raidPtr, col);
   2573 	clabel->clean = RF_RAID_DIRTY;
   2574 	raidflush_component_label(raidPtr, col);
   2575 	return(0);
   2576 }
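/*
 * The clean/dirty bit is the unclean-shutdown indicator: components are
 * marked dirty when the set is placed in service (rf_markalldirty()
 * below) and marked clean again only on an orderly shutdown once parity
 * is known to be good (the RF_FINAL_COMPONENT_UPDATE case in
 * rf_update_component_labels()).  A label that is still dirty at
 * configuration time therefore means the parity may be stale.
 */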
   2577 
   2578 int
   2579 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2580 {
   2581 	KASSERT(raidPtr->bytesPerSector);
   2582 	return raidread_component_label(raidPtr->bytesPerSector,
   2583 	    raidPtr->Disks[col].dev,
   2584 	    raidPtr->raid_cinfo[col].ci_vp,
   2585 	    &raidPtr->raid_cinfo[col].ci_label);
   2586 }
   2587 
   2588 RF_ComponentLabel_t *
   2589 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2590 {
   2591 	return &raidPtr->raid_cinfo[col].ci_label;
   2592 }
   2593 
   2594 int
   2595 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2596 {
   2597 	RF_ComponentLabel_t *label;
   2598 
   2599 	label = &raidPtr->raid_cinfo[col].ci_label;
   2600 	label->mod_counter = raidPtr->mod_counter;
   2601 #ifndef RF_NO_PARITY_MAP
   2602 	label->parity_map_modcount = label->mod_counter;
   2603 #endif
   2604 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2605 	    raidPtr->Disks[col].dev,
   2606 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2607 }
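/*
 * Every label flushed out via raidflush_component_label() is stamped
 * with the current mod_counter, which rf_markalldirty() and
 * rf_update_component_labels() bump each time the state of the set
 * changes.  Autoconfiguration can then prefer the labels carrying the
 * highest mod_counter, which is how a component with stale contents
 * (e.g. one that was absent for a while) is recognized.
 */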
   2608 
   2609 
   2610 static int
   2611 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2612     RF_ComponentLabel_t *clabel)
   2613 {
   2614 	return raidread_component_area(dev, b_vp, clabel,
   2615 	    sizeof(RF_ComponentLabel_t),
   2616 	    rf_component_info_offset(),
   2617 	    rf_component_info_size(secsize));
   2618 }
   2619 
   2620 /* ARGSUSED */
   2621 static int
   2622 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2623     size_t msize, daddr_t offset, daddr_t dsize)
   2624 {
   2625 	struct buf *bp;
   2626 	const struct bdevsw *bdev;
   2627 	int error;
   2628 
   2629 	/* XXX should probably ensure that we don't try to do this if
   2630 	   someone has changed rf_protected_sectors. */
   2631 
   2632 	if (b_vp == NULL) {
   2633 		/* For whatever reason, this component is not valid.
   2634 		   Don't try to read a component label from it. */
   2635 		return(EINVAL);
   2636 	}
   2637 
   2638 	/* get a block of the appropriate size... */
   2639 	bp = geteblk((int)dsize);
   2640 	bp->b_dev = dev;
   2641 
   2642 	/* get our ducks in a row for the read */
   2643 	bp->b_blkno = offset / DEV_BSIZE;
   2644 	bp->b_bcount = dsize;
   2645 	bp->b_flags |= B_READ;
   2646  	bp->b_resid = dsize;
   2647 
   2648 	bdev = bdevsw_lookup(bp->b_dev);
    2649 	if (bdev == NULL) {
		/* don't leak the buffer we allocated above */
		brelse(bp, 0);
		return (ENXIO);
	}
   2651 	(*bdev->d_strategy)(bp);
   2652 
   2653 	error = biowait(bp);
   2654 
   2655 	if (!error) {
   2656 		memcpy(data, bp->b_data, msize);
   2657 	}
   2658 
   2659 	brelse(bp, 0);
   2660 	return(error);
   2661 }
   2662 
   2663 
   2664 static int
   2665 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2666     RF_ComponentLabel_t *clabel)
   2667 {
   2668 	return raidwrite_component_area(dev, b_vp, clabel,
   2669 	    sizeof(RF_ComponentLabel_t),
   2670 	    rf_component_info_offset(),
   2671 	    rf_component_info_size(secsize), 0);
   2672 }
   2673 
   2674 /* ARGSUSED */
   2675 static int
   2676 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2677     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2678 {
   2679 	struct buf *bp;
   2680 	const struct bdevsw *bdev;
   2681 	int error;
   2682 
   2683 	/* get a block of the appropriate size... */
   2684 	bp = geteblk((int)dsize);
   2685 	bp->b_dev = dev;
   2686 
   2687 	/* get our ducks in a row for the write */
   2688 	bp->b_blkno = offset / DEV_BSIZE;
   2689 	bp->b_bcount = dsize;
   2690 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2691  	bp->b_resid = dsize;
   2692 
   2693 	memset(bp->b_data, 0, dsize);
   2694 	memcpy(bp->b_data, data, msize);
   2695 
   2696 	bdev = bdevsw_lookup(bp->b_dev);
    2697 	if (bdev == NULL) {
		/* don't leak the buffer we allocated above */
		brelse(bp, 0);
		return (ENXIO);
	}
   2699 	(*bdev->d_strategy)(bp);
   2700 	if (asyncp)
   2701 		return 0;
   2702 	error = biowait(bp);
   2703 	brelse(bp, 0);
   2704 	if (error) {
   2705 #if 1
   2706 		printf("Failed to write RAID component info!\n");
   2707 #endif
   2708 	}
   2709 
   2710 	return(error);
   2711 }
   2712 
   2713 void
   2714 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2715 {
   2716 	int c;
   2717 
   2718 	for (c = 0; c < raidPtr->numCol; c++) {
   2719 		/* Skip dead disks. */
   2720 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2721 			continue;
   2722 		/* XXXjld: what if an error occurs here? */
   2723 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2724 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2725 		    RF_PARITYMAP_NBYTE,
   2726 		    rf_parity_map_offset(raidPtr),
   2727 		    rf_parity_map_size(raidPtr), 0);
   2728 	}
   2729 }
   2730 
   2731 void
   2732 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2733 {
   2734 	struct rf_paritymap_ondisk tmp;
   2735 	int c,first;
   2736 
   2737 	first=1;
   2738 	for (c = 0; c < raidPtr->numCol; c++) {
   2739 		/* Skip dead disks. */
   2740 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2741 			continue;
   2742 		raidread_component_area(raidPtr->Disks[c].dev,
   2743 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2744 		    RF_PARITYMAP_NBYTE,
   2745 		    rf_parity_map_offset(raidPtr),
   2746 		    rf_parity_map_size(raidPtr));
   2747 		if (first) {
   2748 			memcpy(map, &tmp, sizeof(*map));
   2749 			first = 0;
   2750 		} else {
   2751 			rf_paritymap_merge(map, &tmp);
   2752 		}
   2753 	}
   2754 }
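/*
 * rf_paritymap_kern_read() merges the maps found on all live
 * components, effectively taking the union of the regions marked as
 * possibly having stale parity.  Trusting any copy that says "dirty"
 * only costs an extra parity rewrite for that region, whereas missing a
 * dirty region could leave bad parity undetected.
 */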
   2755 
   2756 void
   2757 rf_markalldirty(RF_Raid_t *raidPtr)
   2758 {
   2759 	RF_ComponentLabel_t *clabel;
   2760 	int sparecol;
   2761 	int c;
   2762 	int j;
   2763 	int scol = -1;
   2764 
   2765 	raidPtr->mod_counter++;
   2766 	for (c = 0; c < raidPtr->numCol; c++) {
   2767 		/* we don't want to touch (at all) a disk that has
   2768 		   failed */
   2769 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2770 			clabel = raidget_component_label(raidPtr, c);
   2771 			if (clabel->status == rf_ds_spared) {
   2772 				/* XXX do something special...
   2773 				   but whatever you do, don't
   2774 				   try to access it!! */
   2775 			} else {
   2776 				raidmarkdirty(raidPtr, c);
   2777 			}
   2778 		}
   2779 	}
   2780 
   2781 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2782 		sparecol = raidPtr->numCol + c;
   2783 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2784 			/*
   2785 
   2786 			   we claim this disk is "optimal" if it's
   2787 			   rf_ds_used_spare, as that means it should be
   2788 			   directly substitutable for the disk it replaced.
   2789 			   We note that too...
   2790 
   2791 			 */
   2792 
   2793 			for(j=0;j<raidPtr->numCol;j++) {
   2794 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2795 					scol = j;
   2796 					break;
   2797 				}
   2798 			}
   2799 
   2800 			clabel = raidget_component_label(raidPtr, sparecol);
   2801 			/* make sure status is noted */
   2802 
   2803 			raid_init_component_label(raidPtr, clabel);
   2804 
   2805 			clabel->row = 0;
   2806 			clabel->column = scol;
   2807 			/* Note: we *don't* change status from rf_ds_used_spare
   2808 			   to rf_ds_optimal */
   2809 			/* clabel.status = rf_ds_optimal; */
   2810 
   2811 			raidmarkdirty(raidPtr, sparecol);
   2812 		}
   2813 	}
   2814 }
   2815 
   2816 
   2817 void
   2818 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2819 {
   2820 	RF_ComponentLabel_t *clabel;
   2821 	int sparecol;
   2822 	int c;
   2823 	int j;
   2824 	int scol;
   2825 
   2826 	scol = -1;
   2827 
   2828 	/* XXX should do extra checks to make sure things really are clean,
   2829 	   rather than blindly setting the clean bit... */
   2830 
   2831 	raidPtr->mod_counter++;
   2832 
   2833 	for (c = 0; c < raidPtr->numCol; c++) {
   2834 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2835 			clabel = raidget_component_label(raidPtr, c);
   2836 			/* make sure status is noted */
   2837 			clabel->status = rf_ds_optimal;
   2838 
   2839 			/* note what unit we are configured as */
   2840 			clabel->last_unit = raidPtr->raidid;
   2841 
   2842 			raidflush_component_label(raidPtr, c);
   2843 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2844 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2845 					raidmarkclean(raidPtr, c);
   2846 				}
   2847 			}
   2848 		}
   2849 		/* else we don't touch it.. */
   2850 	}
   2851 
   2852 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2853 		sparecol = raidPtr->numCol + c;
   2854 		/* Need to ensure that the reconstruct actually completed! */
   2855 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2856 			/*
   2857 
   2858 			   we claim this disk is "optimal" if it's
   2859 			   rf_ds_used_spare, as that means it should be
   2860 			   directly substitutable for the disk it replaced.
   2861 			   We note that too...
   2862 
   2863 			 */
   2864 
   2865 			for(j=0;j<raidPtr->numCol;j++) {
   2866 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2867 					scol = j;
   2868 					break;
   2869 				}
   2870 			}
   2871 
   2872 			/* XXX shouldn't *really* need this... */
   2873 			clabel = raidget_component_label(raidPtr, sparecol);
   2874 			/* make sure status is noted */
   2875 
   2876 			raid_init_component_label(raidPtr, clabel);
   2877 
   2878 			clabel->column = scol;
   2879 			clabel->status = rf_ds_optimal;
   2880 			clabel->last_unit = raidPtr->raidid;
   2881 
   2882 			raidflush_component_label(raidPtr, sparecol);
   2883 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2884 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2885 					raidmarkclean(raidPtr, sparecol);
   2886 				}
   2887 			}
   2888 		}
   2889 	}
   2890 }
   2891 
   2892 void
   2893 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2894 {
   2895 
   2896 	if (vp != NULL) {
   2897 		if (auto_configured == 1) {
   2898 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2899 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2900 			vput(vp);
   2901 
   2902 		} else {
   2903 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2904 		}
   2905 	}
   2906 }
   2907 
   2908 
   2909 void
   2910 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2911 {
   2912 	int r,c;
   2913 	struct vnode *vp;
   2914 	int acd;
   2915 
   2916 
   2917 	/* We take this opportunity to close the vnodes like we should.. */
   2918 
   2919 	for (c = 0; c < raidPtr->numCol; c++) {
   2920 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2921 		acd = raidPtr->Disks[c].auto_configured;
   2922 		rf_close_component(raidPtr, vp, acd);
   2923 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2924 		raidPtr->Disks[c].auto_configured = 0;
   2925 	}
   2926 
   2927 	for (r = 0; r < raidPtr->numSpare; r++) {
   2928 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2929 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2930 		rf_close_component(raidPtr, vp, acd);
   2931 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2932 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2933 	}
   2934 }
   2935 
   2936 
   2937 void
   2938 rf_ReconThread(struct rf_recon_req *req)
   2939 {
   2940 	int     s;
   2941 	RF_Raid_t *raidPtr;
   2942 
   2943 	s = splbio();
   2944 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2945 	raidPtr->recon_in_progress = 1;
   2946 
   2947 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2948 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2949 
   2950 	RF_Free(req, sizeof(*req));
   2951 
   2952 	raidPtr->recon_in_progress = 0;
   2953 	splx(s);
   2954 
   2955 	/* That's all... */
   2956 	kthread_exit(0);	/* does not return */
   2957 }
   2958 
   2959 void
   2960 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2961 {
   2962 	int retcode;
   2963 	int s;
   2964 
   2965 	raidPtr->parity_rewrite_stripes_done = 0;
   2966 	raidPtr->parity_rewrite_in_progress = 1;
   2967 	s = splbio();
   2968 	retcode = rf_RewriteParity(raidPtr);
   2969 	splx(s);
   2970 	if (retcode) {
   2971 		printf("raid%d: Error re-writing parity (%d)!\n",
   2972 		    raidPtr->raidid, retcode);
   2973 	} else {
   2974 		/* set the clean bit!  If we shutdown correctly,
   2975 		   the clean bit on each component label will get
   2976 		   set */
   2977 		raidPtr->parity_good = RF_RAID_CLEAN;
   2978 	}
   2979 	raidPtr->parity_rewrite_in_progress = 0;
   2980 
   2981 	/* Anyone waiting for us to stop?  If so, inform them... */
   2982 	if (raidPtr->waitShutdown) {
   2983 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2984 	}
   2985 
   2986 	/* That's all... */
   2987 	kthread_exit(0);	/* does not return */
   2988 }
   2989 
   2990 
   2991 void
   2992 rf_CopybackThread(RF_Raid_t *raidPtr)
   2993 {
   2994 	int s;
   2995 
   2996 	raidPtr->copyback_in_progress = 1;
   2997 	s = splbio();
   2998 	rf_CopybackReconstructedData(raidPtr);
   2999 	splx(s);
   3000 	raidPtr->copyback_in_progress = 0;
   3001 
   3002 	/* That's all... */
   3003 	kthread_exit(0);	/* does not return */
   3004 }
   3005 
   3006 
   3007 void
   3008 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3009 {
   3010 	int s;
   3011 	RF_Raid_t *raidPtr;
   3012 
   3013 	s = splbio();
   3014 	raidPtr = req->raidPtr;
   3015 	raidPtr->recon_in_progress = 1;
   3016 	rf_ReconstructInPlace(raidPtr, req->col);
   3017 	RF_Free(req, sizeof(*req));
   3018 	raidPtr->recon_in_progress = 0;
   3019 	splx(s);
   3020 
   3021 	/* That's all... */
   3022 	kthread_exit(0);	/* does not return */
   3023 }
   3024 
   3025 static RF_AutoConfig_t *
   3026 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   3027     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   3028     unsigned secsize)
   3029 {
   3030 	int good_one = 0;
   3031 	RF_ComponentLabel_t *clabel;
   3032 	RF_AutoConfig_t *ac;
   3033 
   3034 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   3035 	if (clabel == NULL) {
   3036 oomem:
   3037 		    while(ac_list) {
   3038 			    ac = ac_list;
   3039 			    if (ac->clabel)
   3040 				    free(ac->clabel, M_RAIDFRAME);
   3041 			    ac_list = ac_list->next;
   3042 			    free(ac, M_RAIDFRAME);
   3043 		    }
   3044 		    printf("RAID auto config: out of memory!\n");
   3045 		    return NULL; /* XXX probably should panic? */
   3046 	}
   3047 
   3048 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3049 		/* Got the label.  Does it look reasonable? */
   3050 		if (rf_reasonable_label(clabel, numsecs) &&
   3051 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3052 #ifdef DEBUG
   3053 			printf("Component on: %s: %llu\n",
   3054 				cname, (unsigned long long)size);
   3055 			rf_print_component_label(clabel);
   3056 #endif
   3057 			/* if it's reasonable, add it, else ignore it. */
   3058 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3059 				M_NOWAIT);
   3060 			if (ac == NULL) {
   3061 				free(clabel, M_RAIDFRAME);
   3062 				goto oomem;
   3063 			}
   3064 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3065 			ac->dev = dev;
   3066 			ac->vp = vp;
   3067 			ac->clabel = clabel;
   3068 			ac->next = ac_list;
   3069 			ac_list = ac;
   3070 			good_one = 1;
   3071 		}
   3072 	}
   3073 	if (!good_one) {
   3074 		/* cleanup */
   3075 		free(clabel, M_RAIDFRAME);
   3076 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3077 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3078 		vput(vp);
   3079 	}
   3080 	return ac_list;
   3081 }
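/*
 * Note that rf_get_component() takes ownership of the open vnode on
 * success: the reference is stashed in the new RF_AutoConfig_t entry
 * for later use by the configuration code.  Only when no reasonable
 * component label is found is the vnode closed and released here.
 */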
   3082 
   3083 RF_AutoConfig_t *
   3084 rf_find_raid_components(void)
   3085 {
   3086 	struct vnode *vp;
   3087 	struct disklabel label;
   3088 	device_t dv;
   3089 	deviter_t di;
   3090 	dev_t dev;
   3091 	int bmajor, bminor, wedge, rf_part_found;
   3092 	int error;
   3093 	int i;
   3094 	RF_AutoConfig_t *ac_list;
   3095 	uint64_t numsecs;
   3096 	unsigned secsize;
   3097 
   3098 	/* initialize the AutoConfig list */
   3099 	ac_list = NULL;
   3100 
   3101 	/* we begin by trolling through *all* the devices on the system */
   3102 
   3103 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3104 	     dv = deviter_next(&di)) {
   3105 
   3106 		/* we are only interested in disks... */
   3107 		if (device_class(dv) != DV_DISK)
   3108 			continue;
   3109 
   3110 		/* we don't care about floppies... */
   3111 		if (device_is_a(dv, "fd")) {
   3112 			continue;
   3113 		}
   3114 
   3115 		/* we don't care about CD's... */
   3116 		if (device_is_a(dv, "cd")) {
   3117 			continue;
   3118 		}
   3119 
   3120 		/* we don't care about md's... */
   3121 		if (device_is_a(dv, "md")) {
   3122 			continue;
   3123 		}
   3124 
   3125 		/* hdfd is the Atari/Hades floppy driver */
   3126 		if (device_is_a(dv, "hdfd")) {
   3127 			continue;
   3128 		}
   3129 
   3130 		/* fdisa is the Atari/Milan floppy driver */
   3131 		if (device_is_a(dv, "fdisa")) {
   3132 			continue;
   3133 		}
   3134 
   3135 		/* need to find the device_name_to_block_device_major stuff */
   3136 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3137 
    3138 		rf_part_found = 0; /* No raid partition as yet */
   3139 
   3140 		/* get a vnode for the raw partition of this disk */
   3141 
   3142 		wedge = device_is_a(dv, "dk");
   3143 		bminor = minor(device_unit(dv));
   3144 		dev = wedge ? makedev(bmajor, bminor) :
   3145 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3146 		if (bdevvp(dev, &vp))
   3147 			panic("RAID can't alloc vnode");
   3148 
   3149 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3150 
   3151 		if (error) {
   3152 			/* "Who cares."  Continue looking
    3153 			   for something that exists */
   3154 			vput(vp);
   3155 			continue;
   3156 		}
   3157 
   3158 		error = getdisksize(vp, &numsecs, &secsize);
   3159 		if (error) {
   3160 			vput(vp);
   3161 			continue;
   3162 		}
   3163 		if (wedge) {
   3164 			struct dkwedge_info dkw;
   3165 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3166 			    NOCRED);
   3167 			if (error) {
   3168 				printf("RAIDframe: can't get wedge info for "
   3169 				    "dev %s (%d)\n", device_xname(dv), error);
   3170 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3171 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3172 				vput(vp);
   3173 				continue;
   3174 			}
   3175 
   3176 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3177 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3178 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3179 				vput(vp);
   3180 				continue;
   3181 			}
   3182 
   3183 			ac_list = rf_get_component(ac_list, dev, vp,
   3184 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
    3185 			rf_part_found = 1; /* There is a raid component on this disk */
   3186 			continue;
   3187 		}
   3188 
   3189 		/* Ok, the disk exists.  Go get the disklabel. */
   3190 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3191 		if (error) {
   3192 			/*
   3193 			 * XXX can't happen - open() would
   3194 			 * have errored out (or faked up one)
   3195 			 */
   3196 			if (error != ENOTTY)
   3197 				printf("RAIDframe: can't get label for dev "
   3198 				    "%s (%d)\n", device_xname(dv), error);
   3199 		}
   3200 
   3201 		/* don't need this any more.  We'll allocate it again
   3202 		   a little later if we really do... */
   3203 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3204 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3205 		vput(vp);
   3206 
   3207 		if (error)
   3208 			continue;
   3209 
    3210 		rf_part_found = 0; /* No raid partitions yet */
   3211 		for (i = 0; i < label.d_npartitions; i++) {
   3212 			char cname[sizeof(ac_list->devname)];
   3213 
   3214 			/* We only support partitions marked as RAID */
   3215 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3216 				continue;
   3217 
   3218 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3219 			if (bdevvp(dev, &vp))
   3220 				panic("RAID can't alloc vnode");
   3221 
   3222 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3223 			if (error) {
   3224 				/* Whatever... */
   3225 				vput(vp);
   3226 				continue;
   3227 			}
   3228 			snprintf(cname, sizeof(cname), "%s%c",
   3229 			    device_xname(dv), 'a' + i);
   3230 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3231 				label.d_partitions[i].p_size, numsecs, secsize);
    3232 			rf_part_found = 1; /* There is at least one raid partition on this disk */
   3233 		}
   3234 
   3235 		/*
    3236 		 * If there is no raid component on this disk, either in a
    3237 		 * disklabel or inside a wedge, check the raw partition as well,
    3238 		 * as it is possible to configure raid components on raw disk
    3239 		 * devices.
   3240 		 */
   3241 
   3242 		if (!rf_part_found) {
   3243 			char cname[sizeof(ac_list->devname)];
   3244 
   3245 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3246 			if (bdevvp(dev, &vp))
   3247 				panic("RAID can't alloc vnode");
   3248 
   3249 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3250 			if (error) {
   3251 				/* Whatever... */
   3252 				vput(vp);
   3253 				continue;
   3254 			}
   3255 			snprintf(cname, sizeof(cname), "%s%c",
   3256 			    device_xname(dv), 'a' + RAW_PART);
   3257 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3258 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3259 		}
   3260 	}
   3261 	deviter_release(&di);
   3262 	return ac_list;
   3263 }
   3264 
   3265 
   3266 int
   3267 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3268 {
   3269 
   3270 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3271 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3272 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3273 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3274 	    clabel->row >=0 &&
   3275 	    clabel->column >= 0 &&
   3276 	    clabel->num_rows > 0 &&
   3277 	    clabel->num_columns > 0 &&
   3278 	    clabel->row < clabel->num_rows &&
   3279 	    clabel->column < clabel->num_columns &&
   3280 	    clabel->blockSize > 0 &&
   3281 	    /*
   3282 	     * numBlocksHi may contain garbage, but it is ok since
   3283 	     * the type is unsigned.  If it is really garbage,
   3284 	     * rf_fix_old_label_size() will fix it.
   3285 	     */
   3286 	    rf_component_label_numblocks(clabel) > 0) {
   3287 		/*
   3288 		 * label looks reasonable enough...
   3289 		 * let's make sure it has no old garbage.
   3290 		 */
   3291 		if (numsecs)
   3292 			rf_fix_old_label_size(clabel, numsecs);
   3293 		return(1);
   3294 	}
   3295 	return(0);
   3296 }
   3297 
   3298 
   3299 /*
   3300  * For reasons yet unknown, some old component labels have garbage in
   3301  * the newer numBlocksHi region, and this causes lossage.  Since those
   3302  * disks will also have numsecs set to less than 32 bits of sectors,
   3303  * we can determine when this corruption has occurred, and fix it.
   3304  *
   3305  * The exact same problem, with the same unknown reason, happens to
   3306  * the partitionSizeHi member as well.
   3307  */
   3308 static void
   3309 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3310 {
   3311 
   3312 	if (numsecs < ((uint64_t)1 << 32)) {
   3313 		if (clabel->numBlocksHi) {
   3314 			printf("WARNING: total sectors < 32 bits, yet "
   3315 			       "numBlocksHi set\n"
   3316 			       "WARNING: resetting numBlocksHi to zero.\n");
   3317 			clabel->numBlocksHi = 0;
   3318 		}
   3319 
   3320 		if (clabel->partitionSizeHi) {
   3321 			printf("WARNING: total sectors < 32 bits, yet "
   3322 			       "partitionSizeHi set\n"
   3323 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3324 			clabel->partitionSizeHi = 0;
   3325 		}
   3326 	}
   3327 }
   3328 
   3329 
   3330 #ifdef DEBUG
   3331 void
   3332 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3333 {
   3334 	uint64_t numBlocks;
   3335 	static const char *rp[] = {
   3336 	    "No", "Force", "Soft", "*invalid*"
   3337 	};
   3338 
   3339 
   3340 	numBlocks = rf_component_label_numblocks(clabel);
   3341 
   3342 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3343 	       clabel->row, clabel->column,
   3344 	       clabel->num_rows, clabel->num_columns);
   3345 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3346 	       clabel->version, clabel->serial_number,
   3347 	       clabel->mod_counter);
   3348 	printf("   Clean: %s Status: %d\n",
   3349 	       clabel->clean ? "Yes" : "No", clabel->status);
   3350 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3351 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3352 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3353 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3354 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3355 	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
   3356 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3357 #if 0
   3358 	   printf("   Config order: %d\n", clabel->config_order);
   3359 #endif
   3360 
   3361 }
   3362 #endif
   3363 
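         /*
          * Walk the list of components found by autoconfiguration and group
          * them into configuration sets: each component is added to the first
          * existing set whose labels it matches (see rf_does_it_fit()), or a
          * new set is created for it.  The list of sets is returned.
          */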
   3364 RF_ConfigSet_t *
   3365 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3366 {
   3367 	RF_AutoConfig_t *ac;
   3368 	RF_ConfigSet_t *config_sets;
   3369 	RF_ConfigSet_t *cset;
   3370 	RF_AutoConfig_t *ac_next;
   3371 
   3372 
   3373 	config_sets = NULL;
   3374 
   3375 	/* Go through the AutoConfig list, and figure out which components
   3376 	   belong to what sets.  */
   3377 	ac = ac_list;
   3378 	while(ac!=NULL) {
   3379 		/* we're going to putz with ac->next, so save it here
   3380 		   for use at the end of the loop */
   3381 		ac_next = ac->next;
   3382 
   3383 		if (config_sets == NULL) {
   3384 			/* will need at least this one... */
   3385 			config_sets = (RF_ConfigSet_t *)
   3386 				malloc(sizeof(RF_ConfigSet_t),
   3387 				       M_RAIDFRAME, M_NOWAIT);
   3388 			if (config_sets == NULL) {
   3389 				panic("rf_create_auto_sets: No memory!");
   3390 			}
   3391 			/* this one is easy :) */
   3392 			config_sets->ac = ac;
   3393 			config_sets->next = NULL;
   3394 			config_sets->rootable = 0;
   3395 			ac->next = NULL;
   3396 		} else {
   3397 			/* which set does this component fit into? */
   3398 			cset = config_sets;
   3399 			while(cset!=NULL) {
   3400 				if (rf_does_it_fit(cset, ac)) {
   3401 					/* looks like it matches... */
   3402 					ac->next = cset->ac;
   3403 					cset->ac = ac;
   3404 					break;
   3405 				}
   3406 				cset = cset->next;
   3407 			}
   3408 			if (cset==NULL) {
    3409 				/* didn't find a match above... start a new set */
   3410 				cset = (RF_ConfigSet_t *)
   3411 					malloc(sizeof(RF_ConfigSet_t),
   3412 					       M_RAIDFRAME, M_NOWAIT);
   3413 				if (cset == NULL) {
   3414 					panic("rf_create_auto_sets: No memory!");
   3415 				}
   3416 				cset->ac = ac;
   3417 				ac->next = NULL;
   3418 				cset->next = config_sets;
   3419 				cset->rootable = 0;
   3420 				config_sets = cset;
   3421 			}
   3422 		}
   3423 		ac = ac_next;
   3424 	}
   3425 
   3426 
   3427 	return(config_sets);
   3428 }
   3429 
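         /*
          * Decide whether component 'ac' belongs in configuration set 'cset'
          * by comparing its label against the label of the first member of
          * the set.  Return 1 on a match, 0 otherwise.
          */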
   3430 static int
   3431 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3432 {
   3433 	RF_ComponentLabel_t *clabel1, *clabel2;
   3434 
   3435 	/* If this one matches the *first* one in the set, that's good
   3436 	   enough, since the other members of the set would have been
   3437 	   through here too... */
    3438 	/* Note that we are not checking partitionSize here.
   3439 
   3440 	   Note that we are also not checking the mod_counters here.
   3441 	   If everything else matches except the mod_counter, that's
   3442 	   good enough for this test.  We will deal with the mod_counters
   3443 	   a little later in the autoconfiguration process.
   3444 
   3445 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3446 
   3447 	   The reason we don't check for this is that failed disks
   3448 	   will have lower modification counts.  If those disks are
   3449 	   not added to the set they used to belong to, then they will
   3450 	   form their own set, which may result in 2 different sets,
   3451 	   for example, competing to be configured at raid0, and
   3452 	   perhaps competing to be the root filesystem set.  If the
   3453 	   wrong ones get configured, or both attempt to become /,
    3454 	   weird behaviour and/or serious lossage will occur.  Thus we
   3455 	   need to bring them into the fold here, and kick them out at
   3456 	   a later point.
   3457 
   3458 	*/
   3459 
   3460 	clabel1 = cset->ac->clabel;
   3461 	clabel2 = ac->clabel;
   3462 	if ((clabel1->version == clabel2->version) &&
   3463 	    (clabel1->serial_number == clabel2->serial_number) &&
   3464 	    (clabel1->num_rows == clabel2->num_rows) &&
   3465 	    (clabel1->num_columns == clabel2->num_columns) &&
   3466 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3467 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3468 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3469 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3470 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3471 	    (clabel1->blockSize == clabel2->blockSize) &&
   3472 	    rf_component_label_numblocks(clabel1) ==
   3473 	    rf_component_label_numblocks(clabel2) &&
   3474 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3475 	    (clabel1->root_partition == clabel2->root_partition) &&
   3476 	    (clabel1->last_unit == clabel2->last_unit) &&
   3477 	    (clabel1->config_order == clabel2->config_order)) {
    3478 		/* if it gets here, it almost *has* to be a match */
   3479 	} else {
   3480 		/* it's not consistent with somebody in the set..
   3481 		   punt */
   3482 		return(0);
   3483 	}
   3484 	/* all was fine.. it must fit... */
   3485 	return(1);
   3486 }
   3487 
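         /*
          * Determine whether the configuration set has enough live components
          * (those carrying the newest mod_counter) to be configured.  RAID 1
          * sets are special-cased so that the set only fails when both
          * members of a mirror pair are missing.  Return 1 if the set can be
          * configured, 0 if too many components are missing.
          */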
   3488 int
   3489 rf_have_enough_components(RF_ConfigSet_t *cset)
   3490 {
   3491 	RF_AutoConfig_t *ac;
   3492 	RF_AutoConfig_t *auto_config;
   3493 	RF_ComponentLabel_t *clabel;
   3494 	int c;
   3495 	int num_cols;
   3496 	int num_missing;
   3497 	int mod_counter;
   3498 	int mod_counter_found;
   3499 	int even_pair_failed;
   3500 	char parity_type;
   3501 
   3502 
   3503 	/* check to see that we have enough 'live' components
   3504 	   of this set.  If so, we can configure it if necessary */
   3505 
   3506 	num_cols = cset->ac->clabel->num_columns;
   3507 	parity_type = cset->ac->clabel->parityConfig;
   3508 
   3509 	/* XXX Check for duplicate components!?!?!? */
   3510 
   3511 	/* Determine what the mod_counter is supposed to be for this set. */
   3512 
   3513 	mod_counter_found = 0;
   3514 	mod_counter = 0;
   3515 	ac = cset->ac;
   3516 	while(ac!=NULL) {
   3517 		if (mod_counter_found==0) {
   3518 			mod_counter = ac->clabel->mod_counter;
   3519 			mod_counter_found = 1;
   3520 		} else {
   3521 			if (ac->clabel->mod_counter > mod_counter) {
   3522 				mod_counter = ac->clabel->mod_counter;
   3523 			}
   3524 		}
   3525 		ac = ac->next;
   3526 	}
   3527 
   3528 	num_missing = 0;
   3529 	auto_config = cset->ac;
   3530 
   3531 	even_pair_failed = 0;
   3532 	for(c=0; c<num_cols; c++) {
   3533 		ac = auto_config;
   3534 		while(ac!=NULL) {
   3535 			if ((ac->clabel->column == c) &&
   3536 			    (ac->clabel->mod_counter == mod_counter)) {
   3537 				/* it's this one... */
   3538 #ifdef DEBUG
   3539 				printf("Found: %s at %d\n",
   3540 				       ac->devname,c);
   3541 #endif
   3542 				break;
   3543 			}
   3544 			ac=ac->next;
   3545 		}
   3546 		if (ac==NULL) {
   3547 				/* Didn't find one here! */
   3548 				/* special case for RAID 1, especially
   3549 				   where there are more than 2
   3550 				   components (where RAIDframe treats
   3551 				   things a little differently :( ) */
   3552 			if (parity_type == '1') {
   3553 				if (c%2 == 0) { /* even component */
   3554 					even_pair_failed = 1;
   3555 				} else { /* odd component.  If
   3556 					    we're failed, and
   3557 					    so is the even
   3558 					    component, it's
   3559 					    "Good Night, Charlie" */
   3560 					if (even_pair_failed == 1) {
   3561 						return(0);
   3562 					}
   3563 				}
   3564 			} else {
   3565 				/* normal accounting */
   3566 				num_missing++;
   3567 			}
   3568 		}
   3569 		if ((parity_type == '1') && (c%2 == 1)) {
    3570 				/* Just did the odd component of a pair, and
    3571 				   we didn't bail above... reset the
    3572 				   even_pair_failed flag for the next pair. */
   3573 			even_pair_failed = 0;
   3574 		}
   3575 	}
   3576 
   3577 	clabel = cset->ac->clabel;
   3578 
   3579 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3580 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3581 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3582 		/* XXX this needs to be made *much* more general */
   3583 		/* Too many failures */
   3584 		return(0);
   3585 	}
   3586 	/* otherwise, all is well, and we've got enough to take a kick
   3587 	   at autoconfiguring this set */
   3588 	return(1);
   3589 }
   3590 
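         /*
          * Build an RF_Config_t for the given autoconfig set, filling in the
          * geometry and queueing parameters from the component labels and the
          * device names of the individual components.
          */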
   3591 void
   3592 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3593 			RF_Raid_t *raidPtr)
   3594 {
   3595 	RF_ComponentLabel_t *clabel;
   3596 	int i;
   3597 
   3598 	clabel = ac->clabel;
   3599 
   3600 	/* 1. Fill in the common stuff */
   3601 	config->numRow = clabel->num_rows = 1;
   3602 	config->numCol = clabel->num_columns;
   3603 	config->numSpare = 0; /* XXX should this be set here? */
   3604 	config->sectPerSU = clabel->sectPerSU;
   3605 	config->SUsPerPU = clabel->SUsPerPU;
   3606 	config->SUsPerRU = clabel->SUsPerRU;
   3607 	config->parityConfig = clabel->parityConfig;
   3608 	/* XXX... */
   3609 	strcpy(config->diskQueueType,"fifo");
   3610 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3611 	config->layoutSpecificSize = 0; /* XXX ?? */
   3612 
   3613 	while(ac!=NULL) {
   3614 		/* row/col values will be in range due to the checks
    3615 		   in rf_reasonable_label() */
   3616 		strcpy(config->devnames[0][ac->clabel->column],
   3617 		       ac->devname);
   3618 		ac = ac->next;
   3619 	}
   3620 
   3621 	for(i=0;i<RF_MAXDBGV;i++) {
   3622 		config->debugVars[i][0] = 0;
   3623 	}
   3624 }
   3625 
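         /*
          * Set the autoconfigure flag in the component labels of all optimal
          * components and used spares, and write the updated labels back to
          * disk.  Return the new value.
          */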
   3626 int
   3627 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3628 {
   3629 	RF_ComponentLabel_t *clabel;
   3630 	int column;
   3631 	int sparecol;
   3632 
   3633 	raidPtr->autoconfigure = new_value;
   3634 
   3635 	for(column=0; column<raidPtr->numCol; column++) {
   3636 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3637 			clabel = raidget_component_label(raidPtr, column);
   3638 			clabel->autoconfigure = new_value;
   3639 			raidflush_component_label(raidPtr, column);
   3640 		}
   3641 	}
   3642 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3643 		sparecol = raidPtr->numCol + column;
   3644 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3645 			clabel = raidget_component_label(raidPtr, sparecol);
   3646 			clabel->autoconfigure = new_value;
   3647 			raidflush_component_label(raidPtr, sparecol);
   3648 		}
   3649 	}
   3650 	return(new_value);
   3651 }
   3652 
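         /*
          * As rf_set_autoconfig(), but for the root_partition flag.
          */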
   3653 int
   3654 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3655 {
   3656 	RF_ComponentLabel_t *clabel;
   3657 	int column;
   3658 	int sparecol;
   3659 
   3660 	raidPtr->root_partition = new_value;
   3661 	for(column=0; column<raidPtr->numCol; column++) {
   3662 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3663 			clabel = raidget_component_label(raidPtr, column);
   3664 			clabel->root_partition = new_value;
   3665 			raidflush_component_label(raidPtr, column);
   3666 		}
   3667 	}
   3668 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3669 		sparecol = raidPtr->numCol + column;
   3670 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3671 			clabel = raidget_component_label(raidPtr, sparecol);
   3672 			clabel->root_partition = new_value;
   3673 			raidflush_component_label(raidPtr, sparecol);
   3674 		}
   3675 	}
   3676 	return(new_value);
   3677 }
   3678 
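         /*
          * Close and release the vnodes of all components in the set.
          */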
   3679 void
   3680 rf_release_all_vps(RF_ConfigSet_t *cset)
   3681 {
   3682 	RF_AutoConfig_t *ac;
   3683 
   3684 	ac = cset->ac;
   3685 	while(ac!=NULL) {
   3686 		/* Close the vp, and give it back */
   3687 		if (ac->vp) {
   3688 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3689 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3690 			vput(ac->vp);
   3691 			ac->vp = NULL;
   3692 		}
   3693 		ac = ac->next;
   3694 	}
   3695 }
   3696 
   3697 
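         /*
          * Free the component labels, the autoconfig structures, and finally
          * the configuration set itself.
          */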
   3698 void
   3699 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3700 {
   3701 	RF_AutoConfig_t *ac;
   3702 	RF_AutoConfig_t *next_ac;
   3703 
   3704 	ac = cset->ac;
   3705 	while(ac!=NULL) {
   3706 		next_ac = ac->next;
   3707 		/* nuke the label */
   3708 		free(ac->clabel, M_RAIDFRAME);
   3709 		/* cleanup the config structure */
   3710 		free(ac, M_RAIDFRAME);
   3711 		/* "next.." */
   3712 		ac = next_ac;
   3713 	}
   3714 	/* and, finally, nuke the config set */
   3715 	free(cset, M_RAIDFRAME);
   3716 }
   3717 
   3718 
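         /*
          * Initialize a component label from the current state of the RAID
          * set: version, serial number, mod counter, geometry, and the
          * autoconfigure/root settings.
          */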
   3719 void
   3720 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3721 {
   3722 	/* current version number */
   3723 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3724 	clabel->serial_number = raidPtr->serial_number;
   3725 	clabel->mod_counter = raidPtr->mod_counter;
   3726 
   3727 	clabel->num_rows = 1;
   3728 	clabel->num_columns = raidPtr->numCol;
   3729 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3730 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3731 
   3732 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3733 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3734 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3735 
   3736 	clabel->blockSize = raidPtr->bytesPerSector;
   3737 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3738 
   3739 	/* XXX not portable */
   3740 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3741 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3742 	clabel->autoconfigure = raidPtr->autoconfigure;
   3743 	clabel->root_partition = raidPtr->root_partition;
   3744 	clabel->last_unit = raidPtr->raidid;
   3745 	clabel->config_order = raidPtr->config_order;
   3746 
   3747 #ifndef RF_NO_PARITY_MAP
   3748 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3749 #endif
   3750 }
   3751 
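         /*
          * Configure a RAID set from an autoconfiguration set: allocate a
          * config structure, find a free unit (preferring the unit the set
          * was last configured as), build the configuration and hand it to
          * rf_Configure().  On success the softc is returned and the set's
          * root eligibility is noted; on failure NULL is returned.
          */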
   3752 struct raid_softc *
   3753 rf_auto_config_set(RF_ConfigSet_t *cset)
   3754 {
   3755 	RF_Raid_t *raidPtr;
   3756 	RF_Config_t *config;
   3757 	int raidID;
   3758 	struct raid_softc *sc;
   3759 
   3760 #ifdef DEBUG
   3761 	printf("RAID autoconfigure\n");
   3762 #endif
   3763 
   3764 	/* 1. Create a config structure */
   3765 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3766 	if (config == NULL) {
   3767 		printf("Out of mem!?!?\n");
   3768 				/* XXX do something more intelligent here. */
   3769 		return NULL;
   3770 	}
   3771 
   3772 	/*
    3773 	   2. Figure out what RAID ID this one is supposed to live at.
    3774 	   See if we can get the same RAID device that it was configured
    3775 	   on last time.
   3776 	*/
   3777 
   3778 	raidID = cset->ac->clabel->last_unit;
   3779 	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
   3780 		continue;
   3781 #ifdef DEBUG
   3782 	printf("Configuring raid%d:\n",raidID);
   3783 #endif
   3784 
   3785 	raidPtr = &sc->sc_r;
   3786 
   3787 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3788 	raidPtr->softc = sc;
   3789 	raidPtr->raidid = raidID;
   3790 	raidPtr->openings = RAIDOUTSTANDING;
   3791 
   3792 	/* 3. Build the configuration structure */
   3793 	rf_create_configuration(cset->ac, config, raidPtr);
   3794 
   3795 	/* 4. Do the configuration */
   3796 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3797 		raidinit(sc);
   3798 
   3799 		rf_markalldirty(raidPtr);
   3800 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3801 		switch (cset->ac->clabel->root_partition) {
   3802 		case 1:	/* Force Root */
   3803 		case 2:	/* Soft Root: root when boot partition part of raid */
   3804 			/*
   3805 			 * everything configured just fine.  Make a note
   3806 			 * that this set is eligible to be root,
   3807 			 * or forced to be root
   3808 			 */
   3809 			cset->rootable = cset->ac->clabel->root_partition;
   3810 			/* XXX do this here? */
   3811 			raidPtr->root_partition = cset->rootable;
   3812 			break;
   3813 		default:
   3814 			break;
   3815 		}
   3816 	} else {
   3817 		raidput(sc);
   3818 		sc = NULL;
   3819 	}
   3820 
   3821 	/* 5. Cleanup */
   3822 	free(config, M_RAIDFRAME);
   3823 	return sc;
   3824 }
   3825 
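         /*
          * Note completion of the I/O described by 'desc' in the disk
          * statistics of the RAID pseudo-disk.
          */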
   3826 void
   3827 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3828 {
   3829 	struct buf *bp;
   3830 	struct raid_softc *rs;
   3831 
   3832 	bp = (struct buf *)desc->bp;
   3833 	rs = desc->raidPtr->softc;
   3834 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3835 	    (bp->b_flags & B_READ));
   3836 }
   3837 
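         /*
          * Initialize a pool at IPL_BIO, prime it with 'xmin' items, and set
          * the low and high watermarks to 'xmin' and 'xmax' respectively.
          */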
   3838 void
   3839 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3840 	     size_t xmin, size_t xmax)
   3841 {
   3842 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3843 	pool_sethiwat(p, xmax);
   3844 	pool_prime(p, xmin);
   3845 	pool_setlowat(p, xmin);
   3846 }
   3847 
   3848 /*
    3849  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buf_queue to see
   3850  * if there is IO pending and if that IO could possibly be done for a
   3851  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3852  * otherwise.
   3853  *
   3854  */
   3855 
   3856 int
   3857 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3858 {
   3859 	struct raid_softc *rs = raidPtr->softc;
   3860 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3861 		/* there is work to do */
   3862 		return 0;
   3863 	}
   3864 	/* default is nothing to do */
   3865 	return 1;
   3866 }
   3867 
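         /*
          * Query the size and sector size of a component via getdisksize()
          * and record them in 'diskPtr', subtracting the rf_protectedSectors
          * reserved area from the usable block count.  Returns 0 on success
          * or an error number.
          */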
   3868 int
   3869 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3870 {
   3871 	uint64_t numsecs;
   3872 	unsigned secsize;
   3873 	int error;
   3874 
   3875 	error = getdisksize(vp, &numsecs, &secsize);
   3876 	if (error == 0) {
   3877 		diskPtr->blockSize = secsize;
   3878 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3879 		diskPtr->partitionSize = numsecs;
   3880 		return 0;
   3881 	}
   3882 	return error;
   3883 }
   3884 
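         /*
          * Autoconf glue for the raid pseudo-device: matching always succeeds
          * and attach does nothing, since the real setup happens when a set
          * is configured.
          */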
   3885 static int
   3886 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3887 {
   3888 	return 1;
   3889 }
   3890 
   3891 static void
   3892 raid_attach(device_t parent, device_t self, void *aux)
   3893 {
   3894 
   3895 }
   3896 
   3897 
   3898 static int
   3899 raid_detach(device_t self, int flags)
   3900 {
   3901 	int error;
   3902 	struct raid_softc *rs = raidget(device_unit(self));
   3903 
   3904 	if (rs == NULL)
   3905 		return ENXIO;
   3906 
   3907 	if ((error = raidlock(rs)) != 0)
   3908 		return (error);
   3909 
   3910 	error = raid_detach_unlocked(rs);
   3911 
   3912 	raidunlock(rs);
   3913 
   3914 	/* XXXkd: raidput(rs) ??? */
   3915 
   3916 	return error;
   3917 }
   3918 
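         /*
          * Fill in a synthetic geometry for the RAID pseudo-disk based on the
          * set's total size, sector size and layout, and register it with
          * disk_set_info().
          */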
   3919 static void
   3920 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3921 {
   3922 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3923 
   3924 	memset(dg, 0, sizeof(*dg));
   3925 
   3926 	dg->dg_secperunit = raidPtr->totalSectors;
   3927 	dg->dg_secsize = raidPtr->bytesPerSector;
   3928 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3929 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3930 
   3931 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3932 }
   3933 
   3934 /*
   3935  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3936  * We end up returning whatever error was returned by the first cache flush
   3937  * that fails.
   3938  */
   3939 
   3940 int
   3941 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3942 {
   3943 	int c, sparecol;
   3944 	int e,error;
   3945 	int force = 1;
   3946 
   3947 	error = 0;
   3948 	for (c = 0; c < raidPtr->numCol; c++) {
   3949 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3950 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3951 					  &force, FWRITE, NOCRED);
   3952 			if (e) {
   3953 				if (e != ENODEV)
   3954 					printf("raid%d: cache flush to component %s failed.\n",
   3955 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3956 				if (error == 0) {
   3957 					error = e;
   3958 				}
   3959 			}
   3960 		}
   3961 	}
   3962 
   3963 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3964 		sparecol = raidPtr->numCol + c;
   3965 		/* Need to ensure that the reconstruct actually completed! */
   3966 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3967 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3968 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3969 			if (e) {
   3970 				if (e != ENODEV)
   3971 					printf("raid%d: cache flush to component %s failed.\n",
   3972 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3973 				if (error == 0) {
   3974 					error = e;
   3975 				}
   3976 			}
   3977 		}
   3978 	}
   3979 	return error;
   3980 }
   3981