Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.323
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.323 2015/04/26 15:15:20 mlelstv Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.323 2015/04/26 15:15:20 mlelstv Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
/*
 * Block device switch: entry points invoked when a raid unit is
 * accessed through its block special file.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/*
 * Character device switch: raw (character) access shares the same
 * open/close/ioctl paths as the block device.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Hooks handed to the generic disk(9) layer for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_strategy = raidstrategy,
	.d_minphys = minphys
};
    237 
/*
 * Per-unit software state.  One of these exists for each configured
 * (or being-configured) raid unit; they are linked on the global
 * 'raids' list, protected by raid_lock.
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle for this unit */
	int	sc_unit;	/* raid unit number (key on the raids list) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* Extract the raid unit number from a dev_t. */
#define	raidunit(x)	DISKUNIT(x)
    259 
    260 extern struct cfdriver raid_cd;
    261 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    262     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    263     DVF_DETACH_SHUTDOWN);
    264 
    265 /*
    266  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    267  * Be aware that large numbers can allow the driver to consume a lot of
    268  * kernel memory, especially on writes, and in degraded mode reads.
    269  *
    270  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    271  * a single 64K write will typically require 64K for the old data,
    272  * 64K for the old parity, and 64K for the new parity, for a total
    273  * of 192K (if the parity buffer is not re-used immediately).
    274  * Even it if is used immediately, that's still 128K, which when multiplied
    275  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    276  *
    277  * Now in degraded mode, for example, a 64K read on the above setup may
    278  * require data reconstruction, which will require *all* of the 4 remaining
    279  * disks to participate -- 4 * 32K/disk == 128K again.
    280  */
    281 
    282 #ifndef RAIDOUTSTANDING
    283 #define RAIDOUTSTANDING   6
    284 #endif
    285 
    286 #define RAIDLABELDEV(dev)	\
    287 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    288 
    289 /* declared here, and made public, for the benefit of KVM stuff.. */
    290 
    291 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    292 				     struct disklabel *);
    293 static void raidgetdisklabel(dev_t);
    294 static void raidmakedisklabel(struct raid_softc *);
    295 
    296 static int raidlock(struct raid_softc *);
    297 static void raidunlock(struct raid_softc *);
    298 
    299 static int raid_detach_unlocked(struct raid_softc *);
    300 
    301 static void rf_markalldirty(RF_Raid_t *);
    302 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    303 
    304 void rf_ReconThread(struct rf_recon_req *);
    305 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    306 void rf_CopybackThread(RF_Raid_t *raidPtr);
    307 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    308 int rf_autoconfig(device_t);
    309 void rf_buildroothack(RF_ConfigSet_t *);
    310 
    311 RF_AutoConfig_t *rf_find_raid_components(void);
    312 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    314 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    315 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    316 int rf_set_autoconfig(RF_Raid_t *, int);
    317 int rf_set_rootpartition(RF_Raid_t *, int);
    318 void rf_release_all_vps(RF_ConfigSet_t *);
    319 void rf_cleanup_config_set(RF_ConfigSet_t *);
    320 int rf_have_enough_components(RF_ConfigSet_t *);
    321 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    322 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    323 
    324 /*
    325  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    326  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    327  * in the kernel config file.
    328  */
    329 #ifdef RAID_AUTOCONFIG
    330 int raidautoconfig = 1;
    331 #else
    332 int raidautoconfig = 0;
    333 #endif
    334 static bool raidautoconfigdone = false;
    335 
    336 struct RF_Pools_s rf_pools;
    337 
    338 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    339 static kmutex_t raid_lock;
    340 
    341 static struct raid_softc *
    342 raidcreate(int unit) {
    343 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    344 	if (sc == NULL) {
    345 #ifdef DIAGNOSTIC
    346 		printf("%s: out of memory\n", __func__);
    347 #endif
    348 		return NULL;
    349 	}
    350 	sc->sc_unit = unit;
    351 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    352 	return sc;
    353 }
    354 
/*
 * Release a softc created by raidcreate(): free the buffer queue
 * first, then the softc itself.  The caller (raidput) unlinks sc
 * from the global 'raids' list before calling this.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    360 
    361 static struct raid_softc *
    362 raidget(int unit) {
    363 	struct raid_softc *sc;
    364 	if (unit < 0) {
    365 #ifdef DIAGNOSTIC
    366 		panic("%s: unit %d!", __func__, unit);
    367 #endif
    368 		return NULL;
    369 	}
    370 	mutex_enter(&raid_lock);
    371 	LIST_FOREACH(sc, &raids, sc_link) {
    372 		if (sc->sc_unit == unit) {
    373 			mutex_exit(&raid_lock);
    374 			return sc;
    375 		}
    376 	}
    377 	mutex_exit(&raid_lock);
    378 	if ((sc = raidcreate(unit)) == NULL)
    379 		return NULL;
    380 	mutex_enter(&raid_lock);
    381 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    382 	mutex_exit(&raid_lock);
    383 	return sc;
    384 }
    385 
/*
 * Unregister and destroy a softc: remove it from the global list
 * (under raid_lock) and free its resources via raiddestroy().
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    393 
/*
 * One-time driver initialization, called at boot via the pseudo-device
 * attach mechanism.  Sets up global locks and RAIDframe internals,
 * registers the autoconf attachment, and schedules RAID autoconfig to
 * run (via the config finalizer) after all real hardware has attached.
 * The 'num' argument (requested unit count) is not used here.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Bring up the RAIDframe core; failure here is unrecoverable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    426 
    427 int
    428 rf_autoconfig(device_t self)
    429 {
    430 	RF_AutoConfig_t *ac_list;
    431 	RF_ConfigSet_t *config_sets;
    432 
    433 	if (!raidautoconfig || raidautoconfigdone == true)
    434 		return (0);
    435 
    436 	/* XXX This code can only be run once. */
    437 	raidautoconfigdone = true;
    438 
    439 #ifdef __HAVE_CPU_BOOTCONF
    440 	/*
    441 	 * 0. find the boot device if needed first so we can use it later
    442 	 * this needs to be done before we autoconfigure any raid sets,
    443 	 * because if we use wedges we are not going to be able to open
    444 	 * the boot device later
    445 	 */
    446 	if (booted_device == NULL)
    447 		cpu_bootconf();
    448 #endif
    449 	/* 1. locate all RAID components on the system */
    450 	aprint_debug("Searching for RAID components...\n");
    451 	ac_list = rf_find_raid_components();
    452 
    453 	/* 2. Sort them into their respective sets. */
    454 	config_sets = rf_create_auto_sets(ac_list);
    455 
    456 	/*
    457 	 * 3. Evaluate each set and configure the valid ones.
    458 	 * This gets done in rf_buildroothack().
    459 	 */
    460 	rf_buildroothack(config_sets);
    461 
    462 	return 1;
    463 }
    464 
    465 static int
    466 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    467 	const char *bootname = device_xname(bdv);
    468 	size_t len = strlen(bootname);
    469 
    470 	for (int col = 0; col < r->numCol; col++) {
    471 		const char *devname = r->Disks[col].devname;
    472 		devname += sizeof("/dev/") - 1;
    473 		if (strncmp(devname, "dk", 2) == 0) {
    474 			const char *parent =
    475 			    dkwedge_get_parent_name(r->Disks[col].dev);
    476 			if (parent != NULL)
    477 				devname = parent;
    478 		}
    479 		if (strncmp(devname, bootname, len) == 0) {
    480 			struct raid_softc *sc = r->softc;
    481 			aprint_debug("raid%d includes boot device %s\n",
    482 			    sc->sc_unit, devname);
    483 			return 1;
    484 		}
    485 	}
    486 	return 0;
    487 }
    488 
/*
 * Configure every autoconfigurable set in 'config_sets', then try to
 * decide whether the root filesystem lives on one of the configured
 * raid units, overriding booted_device accordingly.  Sets that are
 * not configured have their resources released; every set is cleaned
 * up before return.  If the user hardwired root (rootspec), or more
 * than one candidate remains and it cannot be disambiguated, root
 * selection is left to setroot()/the user.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure eligible sets, counting rootable ones. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		/* Exactly one rootable set: take it if it is forced
		 * (root_partition == 1), contains the boot device, or
		 * no boot device was identified at all. */
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Pass 2: narrow the candidates to configured sets
		 * that are marked rootable AND contain the boot
		 * device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    599 
    600 
    601 int
    602 raidsize(dev_t dev)
    603 {
    604 	struct raid_softc *rs;
    605 	struct disklabel *lp;
    606 	int     part, unit, omask, size;
    607 
    608 	unit = raidunit(dev);
    609 	if ((rs = raidget(unit)) == NULL)
    610 		return -1;
    611 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    612 		return (-1);
    613 
    614 	part = DISKPART(dev);
    615 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    616 	lp = rs->sc_dkdev.dk_label;
    617 
    618 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    619 		return (-1);
    620 
    621 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    622 		size = -1;
    623 	else
    624 		size = lp->d_partitions[part].p_size *
    625 		    (lp->d_secsize / DEV_BSIZE);
    626 
    627 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    628 		return (-1);
    629 
    630 	return (size);
    631 
    632 }
    633 
    634 int
    635 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    636 {
    637 	int     unit = raidunit(dev);
    638 	struct raid_softc *rs;
    639 	const struct bdevsw *bdev;
    640 	struct disklabel *lp;
    641 	RF_Raid_t *raidPtr;
    642 	daddr_t offset;
    643 	int     part, c, sparecol, j, scol, dumpto;
    644 	int     error = 0;
    645 
    646 	if ((rs = raidget(unit)) == NULL)
    647 		return ENXIO;
    648 
    649 	raidPtr = &rs->sc_r;
    650 
    651 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    652 		return ENXIO;
    653 
    654 	/* we only support dumping to RAID 1 sets */
    655 	if (raidPtr->Layout.numDataCol != 1 ||
    656 	    raidPtr->Layout.numParityCol != 1)
    657 		return EINVAL;
    658 
    659 
    660 	if ((error = raidlock(rs)) != 0)
    661 		return error;
    662 
    663 	if (size % DEV_BSIZE != 0) {
    664 		error = EINVAL;
    665 		goto out;
    666 	}
    667 
    668 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    669 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    670 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    671 		    size / DEV_BSIZE, rs->sc_size);
    672 		error = EINVAL;
    673 		goto out;
    674 	}
    675 
    676 	part = DISKPART(dev);
    677 	lp = rs->sc_dkdev.dk_label;
    678 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    679 
    680 	/* figure out what device is alive.. */
    681 
    682 	/*
    683 	   Look for a component to dump to.  The preference for the
    684 	   component to dump to is as follows:
    685 	   1) the master
    686 	   2) a used_spare of the master
    687 	   3) the slave
    688 	   4) a used_spare of the slave
    689 	*/
    690 
    691 	dumpto = -1;
    692 	for (c = 0; c < raidPtr->numCol; c++) {
    693 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    694 			/* this might be the one */
    695 			dumpto = c;
    696 			break;
    697 		}
    698 	}
    699 
    700 	/*
    701 	   At this point we have possibly selected a live master or a
    702 	   live slave.  We now check to see if there is a spared
    703 	   master (or a spared slave), if we didn't find a live master
    704 	   or a live slave.
    705 	*/
    706 
    707 	for (c = 0; c < raidPtr->numSpare; c++) {
    708 		sparecol = raidPtr->numCol + c;
    709 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    710 			/* How about this one? */
    711 			scol = -1;
    712 			for(j=0;j<raidPtr->numCol;j++) {
    713 				if (raidPtr->Disks[j].spareCol == sparecol) {
    714 					scol = j;
    715 					break;
    716 				}
    717 			}
    718 			if (scol == 0) {
    719 				/*
    720 				   We must have found a spared master!
    721 				   We'll take that over anything else
    722 				   found so far.  (We couldn't have
    723 				   found a real master before, since
    724 				   this is a used spare, and it's
    725 				   saying that it's replacing the
    726 				   master.)  On reboot (with
    727 				   autoconfiguration turned on)
    728 				   sparecol will become the 1st
    729 				   component (component0) of this set.
    730 				*/
    731 				dumpto = sparecol;
    732 				break;
    733 			} else if (scol != -1) {
    734 				/*
    735 				   Must be a spared slave.  We'll dump
    736 				   to that if we havn't found anything
    737 				   else so far.
    738 				*/
    739 				if (dumpto == -1)
    740 					dumpto = sparecol;
    741 			}
    742 		}
    743 	}
    744 
    745 	if (dumpto == -1) {
    746 		/* we couldn't find any live components to dump to!?!?
    747 		 */
    748 		error = EINVAL;
    749 		goto out;
    750 	}
    751 
    752 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    753 
    754 	/*
    755 	   Note that blkno is relative to this particular partition.
    756 	   By adding the offset of this partition in the RAID
    757 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    758 	   value that is relative to the partition used for the
    759 	   underlying component.
    760 	*/
    761 
    762 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    763 				blkno + offset, va, size);
    764 
    765 out:
    766 	raidunlock(rs);
    767 
    768 	return error;
    769 }
    770 /* ARGSUSED */
/*
 * d_open entry point for both block and character devices.  Validates
 * the unit and partition, reads the disklabel on first open of an
 * un-wedged unit, marks components dirty on the very first open of a
 * configured set, and records the open in the per-format open masks
 * so the unit cannot be unconfigured while in use.  Returns 0 on
 * success; the success path deliberately falls through the 'bad:'
 * label (error == 0) to release the unit lock.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured, un-wedged unit: (re)read the
	 * disklabel so partition data below is valid. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_nwedges == 0) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	/* Common exit: reached with error == 0 on success. */
	raidunlock(rs);

	return (error);


}
    855 /* ARGSUSED */
/*
 * d_close entry point.  Clears this partition's bit from the open
 * masks; on the last close of a configured unit the component labels
 * are given a final "clean" update.  Always returns 0 once the unit
 * lock has been obtained.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
    904 
/*
 * raidstrategy: strategy entry point for the RAIDframe pseudo-device.
 *
 * Validates the request against the softc and RAID state, bounds-checks
 * it (against the raw media size for RAW_PART, otherwise against the
 * disklabel), then queues the buf on rs->buf_queue and signals
 * iodone_cv so the consumer of that queue picks it up.  On any early
 * exit, bp->b_error is set (where applicable) and the buf is completed
 * via biodone() with b_resid == b_bcount.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	/* No such unit, or unit not configured: fail with ENXIO. */
	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfer: complete immediately, no error. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/*
		 * Convert totalSectors (in native sectors of
		 * 2^logBytesPerSector bytes) to DEV_BSIZE units,
		 * shifting whichever way the sector size requires.
		 */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		/* Partition I/O: check against the disklabel instead;
		   wlabel permits writes over the label area. */
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* schedule the I/O to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    975 /* ARGSUSED */
    976 int
    977 raidread(dev_t dev, struct uio *uio, int flags)
    978 {
    979 	int     unit = raidunit(dev);
    980 	struct raid_softc *rs;
    981 
    982 	if ((rs = raidget(unit)) == NULL)
    983 		return ENXIO;
    984 
    985 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    986 		return (ENXIO);
    987 
    988 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    989 
    990 }
    991 /* ARGSUSED */
    992 int
    993 raidwrite(dev_t dev, struct uio *uio, int flags)
    994 {
    995 	int     unit = raidunit(dev);
    996 	struct raid_softc *rs;
    997 
    998 	if ((rs = raidget(unit)) == NULL)
    999 		return ENXIO;
   1000 
   1001 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1002 		return (ENXIO);
   1003 
   1004 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1005 
   1006 }
   1007 
   1008 static int
   1009 raid_detach_unlocked(struct raid_softc *rs)
   1010 {
   1011 	int error;
   1012 	RF_Raid_t *raidPtr;
   1013 
   1014 	raidPtr = &rs->sc_r;
   1015 
   1016 	/*
   1017 	 * If somebody has a partition mounted, we shouldn't
   1018 	 * shutdown.
   1019 	 */
   1020 	if (rs->sc_dkdev.dk_openmask != 0)
   1021 		return EBUSY;
   1022 
   1023 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1024 		;	/* not initialized: nothing to do */
   1025 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1026 		return error;
   1027 	else
   1028 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1029 
   1030 	/* Detach the disk. */
   1031 	dkwedge_delall(&rs->sc_dkdev);
   1032 	disk_detach(&rs->sc_dkdev);
   1033 	disk_destroy(&rs->sc_dkdev);
   1034 
   1035 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1036 
   1037 	return 0;
   1038 }
   1039 
   1040 int
   1041 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1042 {
   1043 	int     unit = raidunit(dev);
   1044 	int     error = 0;
   1045 	int     part, pmask, s;
   1046 	cfdata_t cf;
   1047 	struct raid_softc *rs;
   1048 	RF_Config_t *k_cfg, *u_cfg;
   1049 	RF_Raid_t *raidPtr;
   1050 	RF_RaidDisk_t *diskPtr;
   1051 	RF_AccTotals_t *totals;
   1052 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1053 	u_char *specific_buf;
   1054 	int retcode = 0;
   1055 	int column;
   1056 /*	int raidid; */
   1057 	struct rf_recon_req *rrcopy, *rr;
   1058 	RF_ComponentLabel_t *clabel;
   1059 	RF_ComponentLabel_t *ci_label;
   1060 	RF_ComponentLabel_t **clabel_ptr;
   1061 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1062 	RF_SingleComponent_t component;
   1063 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1064 	int i, j, d;
   1065 #ifdef __HAVE_OLD_DISKLABEL
   1066 	struct disklabel newlabel;
   1067 #endif
   1068 
   1069 	if ((rs = raidget(unit)) == NULL)
   1070 		return ENXIO;
   1071 	raidPtr = &rs->sc_r;
   1072 
   1073 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1074 		(int) DISKPART(dev), (int) unit, cmd));
   1075 
   1076 	/* Must be open for writes for these commands... */
   1077 	switch (cmd) {
   1078 #ifdef DIOCGSECTORSIZE
   1079 	case DIOCGSECTORSIZE:
   1080 		*(u_int *)data = raidPtr->bytesPerSector;
   1081 		return 0;
   1082 	case DIOCGMEDIASIZE:
   1083 		*(off_t *)data =
   1084 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1085 		return 0;
   1086 #endif
   1087 	case DIOCSDINFO:
   1088 	case DIOCWDINFO:
   1089 #ifdef __HAVE_OLD_DISKLABEL
   1090 	case ODIOCWDINFO:
   1091 	case ODIOCSDINFO:
   1092 #endif
   1093 	case DIOCWLABEL:
   1094 	case DIOCAWEDGE:
   1095 	case DIOCDWEDGE:
   1096 	case DIOCMWEDGES:
   1097 	case DIOCSSTRATEGY:
   1098 		if ((flag & FWRITE) == 0)
   1099 			return (EBADF);
   1100 	}
   1101 
   1102 	/* Must be initialized for these... */
   1103 	switch (cmd) {
   1104 	case DIOCGDINFO:
   1105 	case DIOCSDINFO:
   1106 	case DIOCWDINFO:
   1107 #ifdef __HAVE_OLD_DISKLABEL
   1108 	case ODIOCGDINFO:
   1109 	case ODIOCWDINFO:
   1110 	case ODIOCSDINFO:
   1111 	case ODIOCGDEFLABEL:
   1112 #endif
   1113 	case DIOCGPART:
   1114 	case DIOCWLABEL:
   1115 	case DIOCGDEFLABEL:
   1116 	case DIOCAWEDGE:
   1117 	case DIOCDWEDGE:
   1118 	case DIOCLWEDGES:
   1119 	case DIOCMWEDGES:
   1120 	case DIOCCACHESYNC:
   1121 	case RAIDFRAME_SHUTDOWN:
   1122 	case RAIDFRAME_REWRITEPARITY:
   1123 	case RAIDFRAME_GET_INFO:
   1124 	case RAIDFRAME_RESET_ACCTOTALS:
   1125 	case RAIDFRAME_GET_ACCTOTALS:
   1126 	case RAIDFRAME_KEEP_ACCTOTALS:
   1127 	case RAIDFRAME_GET_SIZE:
   1128 	case RAIDFRAME_FAIL_DISK:
   1129 	case RAIDFRAME_COPYBACK:
   1130 	case RAIDFRAME_CHECK_RECON_STATUS:
   1131 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1132 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1133 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1134 	case RAIDFRAME_ADD_HOT_SPARE:
   1135 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1136 	case RAIDFRAME_INIT_LABELS:
   1137 	case RAIDFRAME_REBUILD_IN_PLACE:
   1138 	case RAIDFRAME_CHECK_PARITY:
   1139 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1140 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1141 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1142 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1143 	case RAIDFRAME_SET_AUTOCONFIG:
   1144 	case RAIDFRAME_SET_ROOT:
   1145 	case RAIDFRAME_DELETE_COMPONENT:
   1146 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1147 	case RAIDFRAME_PARITYMAP_STATUS:
   1148 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1149 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1150 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1151 	case DIOCGSTRATEGY:
   1152 	case DIOCSSTRATEGY:
   1153 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1154 			return (ENXIO);
   1155 	}
   1156 
   1157 	switch (cmd) {
   1158 #ifdef COMPAT_50
   1159 	case RAIDFRAME_GET_INFO50:
   1160 		return rf_get_info50(raidPtr, data);
   1161 
   1162 	case RAIDFRAME_CONFIGURE50:
   1163 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1164 			return retcode;
   1165 		goto config;
   1166 #endif
   1167 		/* configure the system */
   1168 	case RAIDFRAME_CONFIGURE:
   1169 
   1170 		if (raidPtr->valid) {
   1171 			/* There is a valid RAID set running on this unit! */
   1172 			printf("raid%d: Device already configured!\n",unit);
   1173 			return(EINVAL);
   1174 		}
   1175 
   1176 		/* copy-in the configuration information */
   1177 		/* data points to a pointer to the configuration structure */
   1178 
   1179 		u_cfg = *((RF_Config_t **) data);
   1180 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1181 		if (k_cfg == NULL) {
   1182 			return (ENOMEM);
   1183 		}
   1184 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1185 		if (retcode) {
   1186 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1187 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1188 				retcode));
   1189 			return (retcode);
   1190 		}
   1191 		goto config;
   1192 	config:
   1193 		/* allocate a buffer for the layout-specific data, and copy it
   1194 		 * in */
   1195 		if (k_cfg->layoutSpecificSize) {
   1196 			if (k_cfg->layoutSpecificSize > 10000) {
   1197 				/* sanity check */
   1198 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1199 				return (EINVAL);
   1200 			}
   1201 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1202 			    (u_char *));
   1203 			if (specific_buf == NULL) {
   1204 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1205 				return (ENOMEM);
   1206 			}
   1207 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1208 			    k_cfg->layoutSpecificSize);
   1209 			if (retcode) {
   1210 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1211 				RF_Free(specific_buf,
   1212 					k_cfg->layoutSpecificSize);
   1213 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1214 					retcode));
   1215 				return (retcode);
   1216 			}
   1217 		} else
   1218 			specific_buf = NULL;
   1219 		k_cfg->layoutSpecific = specific_buf;
   1220 
   1221 		/* should do some kind of sanity check on the configuration.
   1222 		 * Store the sum of all the bytes in the last byte? */
   1223 
   1224 		/* configure the system */
   1225 
   1226 		/*
   1227 		 * Clear the entire RAID descriptor, just to make sure
   1228 		 *  there is no stale data left in the case of a
   1229 		 *  reconfiguration
   1230 		 */
   1231 		memset(raidPtr, 0, sizeof(*raidPtr));
   1232 		raidPtr->softc = rs;
   1233 		raidPtr->raidid = unit;
   1234 
   1235 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1236 
   1237 		if (retcode == 0) {
   1238 
   1239 			/* allow this many simultaneous IO's to
   1240 			   this RAID device */
   1241 			raidPtr->openings = RAIDOUTSTANDING;
   1242 
   1243 			raidinit(rs);
   1244 			rf_markalldirty(raidPtr);
   1245 		}
   1246 		/* free the buffers.  No return code here. */
   1247 		if (k_cfg->layoutSpecificSize) {
   1248 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1249 		}
   1250 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1251 
   1252 		return (retcode);
   1253 
   1254 		/* shutdown the system */
   1255 	case RAIDFRAME_SHUTDOWN:
   1256 
   1257 		part = DISKPART(dev);
   1258 		pmask = (1 << part);
   1259 
   1260 		if ((error = raidlock(rs)) != 0)
   1261 			return (error);
   1262 
   1263 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1264 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1265 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1266 			retcode = EBUSY;
   1267 		else {
   1268 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1269 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1270 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1271 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1272 			retcode = 0;
   1273 		}
   1274 
   1275 		raidunlock(rs);
   1276 
   1277 		if (retcode != 0)
   1278 			return retcode;
   1279 
   1280 		/* free the pseudo device attach bits */
   1281 
   1282 		cf = device_cfdata(rs->sc_dev);
   1283 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1284 			free(cf, M_RAIDFRAME);
   1285 
   1286 		return (retcode);
   1287 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1288 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1289 		/* need to read the component label for the disk indicated
   1290 		   by row,column in clabel */
   1291 
   1292 		/*
   1293 		 * Perhaps there should be an option to skip the in-core
   1294 		 * copy and hit the disk, as with disklabel(8).
   1295 		 */
   1296 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1297 
   1298 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1299 
   1300 		if (retcode) {
   1301 			RF_Free(clabel, sizeof(*clabel));
   1302 			return retcode;
   1303 		}
   1304 
   1305 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1306 
   1307 		column = clabel->column;
   1308 
   1309 		if ((column < 0) || (column >= raidPtr->numCol +
   1310 		    raidPtr->numSpare)) {
   1311 			RF_Free(clabel, sizeof(*clabel));
   1312 			return EINVAL;
   1313 		}
   1314 
   1315 		RF_Free(clabel, sizeof(*clabel));
   1316 
   1317 		clabel = raidget_component_label(raidPtr, column);
   1318 
   1319 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1320 
   1321 #if 0
   1322 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1323 		clabel = (RF_ComponentLabel_t *) data;
   1324 
   1325 		/* XXX check the label for valid stuff... */
   1326 		/* Note that some things *should not* get modified --
   1327 		   the user should be re-initing the labels instead of
   1328 		   trying to patch things.
   1329 		   */
   1330 
   1331 		raidid = raidPtr->raidid;
   1332 #ifdef DEBUG
   1333 		printf("raid%d: Got component label:\n", raidid);
   1334 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1335 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1336 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1337 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1338 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1339 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1340 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1341 #endif
   1342 		clabel->row = 0;
   1343 		column = clabel->column;
   1344 
   1345 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1346 			return(EINVAL);
   1347 		}
   1348 
   1349 		/* XXX this isn't allowed to do anything for now :-) */
   1350 
   1351 		/* XXX and before it is, we need to fill in the rest
   1352 		   of the fields!?!?!?! */
   1353 		memcpy(raidget_component_label(raidPtr, column),
   1354 		    clabel, sizeof(*clabel));
   1355 		raidflush_component_label(raidPtr, column);
   1356 		return (0);
   1357 #endif
   1358 
   1359 	case RAIDFRAME_INIT_LABELS:
   1360 		clabel = (RF_ComponentLabel_t *) data;
   1361 		/*
   1362 		   we only want the serial number from
   1363 		   the above.  We get all the rest of the information
   1364 		   from the config that was used to create this RAID
   1365 		   set.
   1366 		   */
   1367 
   1368 		raidPtr->serial_number = clabel->serial_number;
   1369 
   1370 		for(column=0;column<raidPtr->numCol;column++) {
   1371 			diskPtr = &raidPtr->Disks[column];
   1372 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1373 				ci_label = raidget_component_label(raidPtr,
   1374 				    column);
   1375 				/* Zeroing this is important. */
   1376 				memset(ci_label, 0, sizeof(*ci_label));
   1377 				raid_init_component_label(raidPtr, ci_label);
   1378 				ci_label->serial_number =
   1379 				    raidPtr->serial_number;
   1380 				ci_label->row = 0; /* we dont' pretend to support more */
   1381 				rf_component_label_set_partitionsize(ci_label,
   1382 				    diskPtr->partitionSize);
   1383 				ci_label->column = column;
   1384 				raidflush_component_label(raidPtr, column);
   1385 			}
   1386 			/* XXXjld what about the spares? */
   1387 		}
   1388 
   1389 		return (retcode);
   1390 	case RAIDFRAME_SET_AUTOCONFIG:
   1391 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1392 		printf("raid%d: New autoconfig value is: %d\n",
   1393 		       raidPtr->raidid, d);
   1394 		*(int *) data = d;
   1395 		return (retcode);
   1396 
   1397 	case RAIDFRAME_SET_ROOT:
   1398 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1399 		printf("raid%d: New rootpartition value is: %d\n",
   1400 		       raidPtr->raidid, d);
   1401 		*(int *) data = d;
   1402 		return (retcode);
   1403 
   1404 		/* initialize all parity */
   1405 	case RAIDFRAME_REWRITEPARITY:
   1406 
   1407 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1408 			/* Parity for RAID 0 is trivially correct */
   1409 			raidPtr->parity_good = RF_RAID_CLEAN;
   1410 			return(0);
   1411 		}
   1412 
   1413 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1414 			/* Re-write is already in progress! */
   1415 			return(EINVAL);
   1416 		}
   1417 
   1418 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1419 					   rf_RewriteParityThread,
   1420 					   raidPtr,"raid_parity");
   1421 		return (retcode);
   1422 
   1423 
   1424 	case RAIDFRAME_ADD_HOT_SPARE:
   1425 		sparePtr = (RF_SingleComponent_t *) data;
   1426 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1427 		retcode = rf_add_hot_spare(raidPtr, &component);
   1428 		return(retcode);
   1429 
   1430 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1431 		return(retcode);
   1432 
   1433 	case RAIDFRAME_DELETE_COMPONENT:
   1434 		componentPtr = (RF_SingleComponent_t *)data;
   1435 		memcpy( &component, componentPtr,
   1436 			sizeof(RF_SingleComponent_t));
   1437 		retcode = rf_delete_component(raidPtr, &component);
   1438 		return(retcode);
   1439 
   1440 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1441 		componentPtr = (RF_SingleComponent_t *)data;
   1442 		memcpy( &component, componentPtr,
   1443 			sizeof(RF_SingleComponent_t));
   1444 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1445 		return(retcode);
   1446 
   1447 	case RAIDFRAME_REBUILD_IN_PLACE:
   1448 
   1449 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1450 			/* Can't do this on a RAID 0!! */
   1451 			return(EINVAL);
   1452 		}
   1453 
   1454 		if (raidPtr->recon_in_progress == 1) {
   1455 			/* a reconstruct is already in progress! */
   1456 			return(EINVAL);
   1457 		}
   1458 
   1459 		componentPtr = (RF_SingleComponent_t *) data;
   1460 		memcpy( &component, componentPtr,
   1461 			sizeof(RF_SingleComponent_t));
   1462 		component.row = 0; /* we don't support any more */
   1463 		column = component.column;
   1464 
   1465 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1466 			return(EINVAL);
   1467 		}
   1468 
   1469 		rf_lock_mutex2(raidPtr->mutex);
   1470 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1471 		    (raidPtr->numFailures > 0)) {
   1472 			/* XXX 0 above shouldn't be constant!!! */
   1473 			/* some component other than this has failed.
   1474 			   Let's not make things worse than they already
   1475 			   are... */
   1476 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1477 			       raidPtr->raidid);
   1478 			printf("raid%d:     Col: %d   Too many failures.\n",
   1479 			       raidPtr->raidid, column);
   1480 			rf_unlock_mutex2(raidPtr->mutex);
   1481 			return (EINVAL);
   1482 		}
   1483 		if (raidPtr->Disks[column].status ==
   1484 		    rf_ds_reconstructing) {
   1485 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1486 			       raidPtr->raidid);
   1487 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1488 
   1489 			rf_unlock_mutex2(raidPtr->mutex);
   1490 			return (EINVAL);
   1491 		}
   1492 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1493 			rf_unlock_mutex2(raidPtr->mutex);
   1494 			return (EINVAL);
   1495 		}
   1496 		rf_unlock_mutex2(raidPtr->mutex);
   1497 
   1498 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1499 		if (rrcopy == NULL)
   1500 			return(ENOMEM);
   1501 
   1502 		rrcopy->raidPtr = (void *) raidPtr;
   1503 		rrcopy->col = column;
   1504 
   1505 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1506 					   rf_ReconstructInPlaceThread,
   1507 					   rrcopy,"raid_reconip");
   1508 		return(retcode);
   1509 
   1510 	case RAIDFRAME_GET_INFO:
   1511 		if (!raidPtr->valid)
   1512 			return (ENODEV);
   1513 		ucfgp = (RF_DeviceConfig_t **) data;
   1514 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1515 			  (RF_DeviceConfig_t *));
   1516 		if (d_cfg == NULL)
   1517 			return (ENOMEM);
   1518 		d_cfg->rows = 1; /* there is only 1 row now */
   1519 		d_cfg->cols = raidPtr->numCol;
   1520 		d_cfg->ndevs = raidPtr->numCol;
   1521 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1522 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1523 			return (ENOMEM);
   1524 		}
   1525 		d_cfg->nspares = raidPtr->numSpare;
   1526 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1527 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1528 			return (ENOMEM);
   1529 		}
   1530 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1531 		d = 0;
   1532 		for (j = 0; j < d_cfg->cols; j++) {
   1533 			d_cfg->devs[d] = raidPtr->Disks[j];
   1534 			d++;
   1535 		}
   1536 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1537 			d_cfg->spares[i] = raidPtr->Disks[j];
   1538 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1539 				/* XXX: raidctl(8) expects to see this as a used spare */
   1540 				d_cfg->spares[i].status = rf_ds_used_spare;
   1541 			}
   1542 		}
   1543 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1544 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1545 
   1546 		return (retcode);
   1547 
   1548 	case RAIDFRAME_CHECK_PARITY:
   1549 		*(int *) data = raidPtr->parity_good;
   1550 		return (0);
   1551 
   1552 	case RAIDFRAME_PARITYMAP_STATUS:
   1553 		if (rf_paritymap_ineligible(raidPtr))
   1554 			return EINVAL;
   1555 		rf_paritymap_status(raidPtr->parity_map,
   1556 		    (struct rf_pmstat *)data);
   1557 		return 0;
   1558 
   1559 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1560 		if (rf_paritymap_ineligible(raidPtr))
   1561 			return EINVAL;
   1562 		if (raidPtr->parity_map == NULL)
   1563 			return ENOENT; /* ??? */
   1564 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1565 			(struct rf_pmparams *)data, 1))
   1566 			return EINVAL;
   1567 		return 0;
   1568 
   1569 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1570 		if (rf_paritymap_ineligible(raidPtr))
   1571 			return EINVAL;
   1572 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1573 		return 0;
   1574 
   1575 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1576 		if (rf_paritymap_ineligible(raidPtr))
   1577 			return EINVAL;
   1578 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1579 		/* XXX should errors be passed up? */
   1580 		return 0;
   1581 
   1582 	case RAIDFRAME_RESET_ACCTOTALS:
   1583 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1584 		return (0);
   1585 
   1586 	case RAIDFRAME_GET_ACCTOTALS:
   1587 		totals = (RF_AccTotals_t *) data;
   1588 		*totals = raidPtr->acc_totals;
   1589 		return (0);
   1590 
   1591 	case RAIDFRAME_KEEP_ACCTOTALS:
   1592 		raidPtr->keep_acc_totals = *(int *)data;
   1593 		return (0);
   1594 
   1595 	case RAIDFRAME_GET_SIZE:
   1596 		*(int *) data = raidPtr->totalSectors;
   1597 		return (0);
   1598 
   1599 		/* fail a disk & optionally start reconstruction */
   1600 	case RAIDFRAME_FAIL_DISK:
   1601 
   1602 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1603 			/* Can't do this on a RAID 0!! */
   1604 			return(EINVAL);
   1605 		}
   1606 
   1607 		rr = (struct rf_recon_req *) data;
   1608 		rr->row = 0;
   1609 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1610 			return (EINVAL);
   1611 
   1612 
   1613 		rf_lock_mutex2(raidPtr->mutex);
   1614 		if (raidPtr->status == rf_rs_reconstructing) {
   1615 			/* you can't fail a disk while we're reconstructing! */
   1616 			/* XXX wrong for RAID6 */
   1617 			rf_unlock_mutex2(raidPtr->mutex);
   1618 			return (EINVAL);
   1619 		}
   1620 		if ((raidPtr->Disks[rr->col].status ==
   1621 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1622 			/* some other component has failed.  Let's not make
   1623 			   things worse. XXX wrong for RAID6 */
   1624 			rf_unlock_mutex2(raidPtr->mutex);
   1625 			return (EINVAL);
   1626 		}
   1627 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1628 			/* Can't fail a spared disk! */
   1629 			rf_unlock_mutex2(raidPtr->mutex);
   1630 			return (EINVAL);
   1631 		}
   1632 		rf_unlock_mutex2(raidPtr->mutex);
   1633 
   1634 		/* make a copy of the recon request so that we don't rely on
   1635 		 * the user's buffer */
   1636 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1637 		if (rrcopy == NULL)
   1638 			return(ENOMEM);
   1639 		memcpy(rrcopy, rr, sizeof(*rr));
   1640 		rrcopy->raidPtr = (void *) raidPtr;
   1641 
   1642 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1643 					   rf_ReconThread,
   1644 					   rrcopy,"raid_recon");
   1645 		return (0);
   1646 
   1647 		/* invoke a copyback operation after recon on whatever disk
   1648 		 * needs it, if any */
   1649 	case RAIDFRAME_COPYBACK:
   1650 
   1651 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1652 			/* This makes no sense on a RAID 0!! */
   1653 			return(EINVAL);
   1654 		}
   1655 
   1656 		if (raidPtr->copyback_in_progress == 1) {
   1657 			/* Copyback is already in progress! */
   1658 			return(EINVAL);
   1659 		}
   1660 
   1661 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1662 					   rf_CopybackThread,
   1663 					   raidPtr,"raid_copyback");
   1664 		return (retcode);
   1665 
   1666 		/* return the percentage completion of reconstruction */
   1667 	case RAIDFRAME_CHECK_RECON_STATUS:
   1668 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1669 			/* This makes no sense on a RAID 0, so tell the
   1670 			   user it's done. */
   1671 			*(int *) data = 100;
   1672 			return(0);
   1673 		}
   1674 		if (raidPtr->status != rf_rs_reconstructing)
   1675 			*(int *) data = 100;
   1676 		else {
   1677 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1678 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1679 			} else {
   1680 				*(int *) data = 0;
   1681 			}
   1682 		}
   1683 		return (0);
   1684 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1685 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1686 		if (raidPtr->status != rf_rs_reconstructing) {
   1687 			progressInfo.remaining = 0;
   1688 			progressInfo.completed = 100;
   1689 			progressInfo.total = 100;
   1690 		} else {
   1691 			progressInfo.total =
   1692 				raidPtr->reconControl->numRUsTotal;
   1693 			progressInfo.completed =
   1694 				raidPtr->reconControl->numRUsComplete;
   1695 			progressInfo.remaining = progressInfo.total -
   1696 				progressInfo.completed;
   1697 		}
   1698 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1699 				  sizeof(RF_ProgressInfo_t));
   1700 		return (retcode);
   1701 
   1702 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1703 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1704 			/* This makes no sense on a RAID 0, so tell the
   1705 			   user it's done. */
   1706 			*(int *) data = 100;
   1707 			return(0);
   1708 		}
   1709 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1710 			*(int *) data = 100 *
   1711 				raidPtr->parity_rewrite_stripes_done /
   1712 				raidPtr->Layout.numStripe;
   1713 		} else {
   1714 			*(int *) data = 100;
   1715 		}
   1716 		return (0);
   1717 
   1718 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1719 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1720 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1721 			progressInfo.total = raidPtr->Layout.numStripe;
   1722 			progressInfo.completed =
   1723 				raidPtr->parity_rewrite_stripes_done;
   1724 			progressInfo.remaining = progressInfo.total -
   1725 				progressInfo.completed;
   1726 		} else {
   1727 			progressInfo.remaining = 0;
   1728 			progressInfo.completed = 100;
   1729 			progressInfo.total = 100;
   1730 		}
   1731 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1732 				  sizeof(RF_ProgressInfo_t));
   1733 		return (retcode);
   1734 
   1735 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1736 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1737 			/* This makes no sense on a RAID 0 */
   1738 			*(int *) data = 100;
   1739 			return(0);
   1740 		}
   1741 		if (raidPtr->copyback_in_progress == 1) {
   1742 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1743 				raidPtr->Layout.numStripe;
   1744 		} else {
   1745 			*(int *) data = 100;
   1746 		}
   1747 		return (0);
   1748 
   1749 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1750 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1751 		if (raidPtr->copyback_in_progress == 1) {
   1752 			progressInfo.total = raidPtr->Layout.numStripe;
   1753 			progressInfo.completed =
   1754 				raidPtr->copyback_stripes_done;
   1755 			progressInfo.remaining = progressInfo.total -
   1756 				progressInfo.completed;
   1757 		} else {
   1758 			progressInfo.remaining = 0;
   1759 			progressInfo.completed = 100;
   1760 			progressInfo.total = 100;
   1761 		}
   1762 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1763 				  sizeof(RF_ProgressInfo_t));
   1764 		return (retcode);
   1765 
   1766 		/* the sparetable daemon calls this to wait for the kernel to
   1767 		 * need a spare table. this ioctl does not return until a
   1768 		 * spare table is needed. XXX -- calling mpsleep here in the
   1769 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1770 		 * -- I should either compute the spare table in the kernel,
   1771 		 * or have a different -- XXX XXX -- interface (a different
   1772 		 * character device) for delivering the table     -- XXX */
   1773 #if 0
   1774 	case RAIDFRAME_SPARET_WAIT:
   1775 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1776 		while (!rf_sparet_wait_queue)
   1777 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1778 		waitreq = rf_sparet_wait_queue;
   1779 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1780 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1781 
   1782 		/* structure assignment */
   1783 		*((RF_SparetWait_t *) data) = *waitreq;
   1784 
   1785 		RF_Free(waitreq, sizeof(*waitreq));
   1786 		return (0);
   1787 
   1788 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1789 		 * code in it that will cause the dameon to exit */
   1790 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1791 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1792 		waitreq->fcol = -1;
   1793 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1794 		waitreq->next = rf_sparet_wait_queue;
   1795 		rf_sparet_wait_queue = waitreq;
   1796 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1797 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1798 		return (0);
   1799 
   1800 		/* used by the spare table daemon to deliver a spare table
   1801 		 * into the kernel */
   1802 	case RAIDFRAME_SEND_SPARET:
   1803 
   1804 		/* install the spare table */
   1805 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1806 
   1807 		/* respond to the requestor.  the return status of the spare
   1808 		 * table installation is passed in the "fcol" field */
   1809 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1810 		waitreq->fcol = retcode;
   1811 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1812 		waitreq->next = rf_sparet_resp_queue;
   1813 		rf_sparet_resp_queue = waitreq;
   1814 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1815 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1816 
   1817 		return (retcode);
   1818 #endif
   1819 
   1820 	default:
   1821 		break; /* fall through to the os-specific code below */
   1822 
   1823 	}
   1824 
   1825 	if (!raidPtr->valid)
   1826 		return (EINVAL);
   1827 
   1828 	/*
   1829 	 * Add support for "regular" device ioctls here.
   1830 	 */
   1831 
   1832 	error = disk_ioctl(&rs->sc_dkdev, dev, cmd, data, flag, l);
   1833 	if (error != EPASSTHROUGH)
   1834 		return (error);
   1835 
   1836 	switch (cmd) {
   1837 	case DIOCWDINFO:
   1838 	case DIOCSDINFO:
   1839 #ifdef __HAVE_OLD_DISKLABEL
   1840 	case ODIOCWDINFO:
   1841 	case ODIOCSDINFO:
   1842 #endif
   1843 	{
   1844 		struct disklabel *lp;
   1845 #ifdef __HAVE_OLD_DISKLABEL
   1846 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1847 			memset(&newlabel, 0, sizeof newlabel);
   1848 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1849 			lp = &newlabel;
   1850 		} else
   1851 #endif
   1852 		lp = (struct disklabel *)data;
   1853 
   1854 		if ((error = raidlock(rs)) != 0)
   1855 			return (error);
   1856 
   1857 		rs->sc_flags |= RAIDF_LABELLING;
   1858 
   1859 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1860 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1861 		if (error == 0) {
   1862 			if (cmd == DIOCWDINFO
   1863 #ifdef __HAVE_OLD_DISKLABEL
   1864 			    || cmd == ODIOCWDINFO
   1865 #endif
   1866 			   )
   1867 				error = writedisklabel(RAIDLABELDEV(dev),
   1868 				    raidstrategy, rs->sc_dkdev.dk_label,
   1869 				    rs->sc_dkdev.dk_cpulabel);
   1870 		}
   1871 		rs->sc_flags &= ~RAIDF_LABELLING;
   1872 
   1873 		raidunlock(rs);
   1874 
   1875 		if (error)
   1876 			return (error);
   1877 		break;
   1878 	}
   1879 
   1880 	case DIOCWLABEL:
   1881 		if (*(int *) data != 0)
   1882 			rs->sc_flags |= RAIDF_WLABEL;
   1883 		else
   1884 			rs->sc_flags &= ~RAIDF_WLABEL;
   1885 		break;
   1886 
   1887 	case DIOCGDEFLABEL:
   1888 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1889 		break;
   1890 
   1891 #ifdef __HAVE_OLD_DISKLABEL
   1892 	case ODIOCGDEFLABEL:
   1893 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1894 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1895 			return ENOTTY;
   1896 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1897 		break;
   1898 #endif
   1899 
   1900 	case DIOCCACHESYNC:
   1901 		return rf_sync_component_caches(raidPtr);
   1902 
   1903 	case DIOCGSTRATEGY:
   1904 	    {
   1905 		struct disk_strategy *dks = (void *)data;
   1906 
   1907 		s = splbio();
   1908 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1909 		    sizeof(dks->dks_name));
   1910 		splx(s);
   1911 		dks->dks_paramlen = 0;
   1912 
   1913 		return 0;
   1914 	    }
   1915 
   1916 	case DIOCSSTRATEGY:
   1917 	    {
   1918 		struct disk_strategy *dks = (void *)data;
   1919 		struct bufq_state *new;
   1920 		struct bufq_state *old;
   1921 
   1922 		if (dks->dks_param != NULL) {
   1923 			return EINVAL;
   1924 		}
   1925 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1926 		error = bufq_alloc(&new, dks->dks_name,
   1927 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1928 		if (error) {
   1929 			return error;
   1930 		}
   1931 		s = splbio();
   1932 		old = rs->buf_queue;
   1933 		bufq_move(new, old);
   1934 		rs->buf_queue = new;
   1935 		splx(s);
   1936 		bufq_free(old);
   1937 
   1938 		return 0;
   1939 	    }
   1940 
   1941 	default:
   1942 		retcode = ENOTTY;
   1943 	}
   1944 	return (retcode);
   1945 
   1946 }
   1947 
   1948 
   1949 /* raidinit -- complete the rest of the initialization for the
   1950    RAIDframe device.  */
   1951 
   1952 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	/* M_WAITOK: the allocation may sleep, so this must not be
	 * called from interrupt context */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* attach failed: undo the INITED flag set above and
		 * release the cfdata we allocated */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	/* total usable size of the RAID set, in sectors */
	rs->sc_size = raidPtr->totalSectors;

	rf_set_geometry(rs, raidPtr);

	/* probe the new device for wedges (autodiscovered partitions) */
	dkwedge_discover(&rs->sc_dkdev);

}
   2003 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
   2013 int
   2014 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   2015 {
   2016 	int     retcode;
   2017 
   2018 	rf_lock_mutex2(rf_sparet_wait_mutex);
   2019 	req->next = rf_sparet_wait_queue;
   2020 	rf_sparet_wait_queue = req;
   2021 	rf_broadcast_cond2(rf_sparet_wait_cv);
   2022 
   2023 	/* mpsleep unlocks the mutex */
   2024 	while (!rf_sparet_resp_queue) {
   2025 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   2026 	}
   2027 	req = rf_sparet_resp_queue;
   2028 	rf_sparet_resp_queue = req->next;
   2029 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   2030 
   2031 	retcode = req->fcol;
   2032 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2033 					 * alloc'd */
   2034 	return (retcode);
   2035 }
   2036 #endif
   2037 
   2038 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2039  * bp & passes it down.
   2040  * any calls originating in the kernel must use non-blocking I/O
   2041  * do some extra sanity checking to return "appropriate" error values for
   2042  * certain conditions (to make some standard utilities work)
   2043  *
   2044  * Formerly known as: rf_DoAccessKernel
   2045  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex: rf_update_component_labels() takes
		 * its own locks and may perform I/O */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held when the while
	 * condition is evaluated and released inside the body. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert from DEV_BSIZE units to RAID sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		/* pb accounts for a trailing partial sector; the "sum < x"
		 * comparisons below also catch arithmetic wrap-around */
		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* reject transfers that run off the end of the array
		 * (or that overflowed the sum above) */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not a whole number of sectors */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening; it is returned when the access
		 * completes */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* the access was rejected outright; fail the buf
			 * here since no completion callback will run */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2163 
   2164 
   2165 
   2166 
   2167 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2168 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* remember which queue this request came from so that
	 * KernelWakeupFunc() can find it at completion time */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* fake an immediate completion: no real I/O is issued,
		 * just run the completion callback directly */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf to target the component device;
		 * KernelWakeupFunc() will be called from biodone() */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* always reports success; errors are delivered asynchronously
	 * via the completion callback */
	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP() /
	 * rf_DispatchKernelIO() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers the component
			 * label update in raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2311 
   2312 
   2313 /*
   2314  * initialize a buf structure for doing an I/O in the kernel.
   2315  */
   2316 static void
   2317 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2318        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2319        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2320        struct proc *b_proc)
   2321 {
   2322 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2323 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2324 	bp->b_oflags = 0;
   2325 	bp->b_cflags = 0;
   2326 	bp->b_bcount = numSect << logBytesPerSector;
   2327 	bp->b_bufsize = bp->b_bcount;
   2328 	bp->b_error = 0;
   2329 	bp->b_dev = dev;
   2330 	bp->b_data = bf;
   2331 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2332 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2333 	if (bp->b_bcount == 0) {
   2334 		panic("bp->b_bcount is zero in InitBP!!");
   2335 	}
   2336 	bp->b_proc = b_proc;
   2337 	bp->b_iodone = cbFunc;
   2338 	bp->b_private = cbArg;
   2339 }
   2340 
   2341 static void
   2342 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2343 		    struct disklabel *lp)
   2344 {
   2345 	memset(lp, 0, sizeof(*lp));
   2346 
   2347 	/* fabricate a label... */
   2348 	if (raidPtr->totalSectors > UINT32_MAX)
   2349 		lp->d_secperunit = UINT32_MAX;
   2350 	else
   2351 		lp->d_secperunit = raidPtr->totalSectors;
   2352 	lp->d_secsize = raidPtr->bytesPerSector;
   2353 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2354 	lp->d_ntracks = 4 * raidPtr->numCol;
   2355 	lp->d_ncylinders = raidPtr->totalSectors /
   2356 		(lp->d_nsectors * lp->d_ntracks);
   2357 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2358 
   2359 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2360 	lp->d_type = DKTYPE_RAID;
   2361 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2362 	lp->d_rpm = 3600;
   2363 	lp->d_interleave = 1;
   2364 	lp->d_flags = 0;
   2365 
   2366 	lp->d_partitions[RAW_PART].p_offset = 0;
   2367 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2368 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2369 	lp->d_npartitions = RAW_PART + 1;
   2370 
   2371 	lp->d_magic = DISKMAGIC;
   2372 	lp->d_magic2 = DISKMAGIC;
   2373 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2374 
   2375 }
   2376 /*
   2377  * Read the disklabel from the raid device.  If one is not present, fake one
   2378  * up.
   2379  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default; readdisklabel() below will
	 * overwrite it if an on-disk label is found */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		/* d_secperunit is 32-bit and is clamped to UINT32_MAX by
		 * raidgetdefaultlabel() for huge sets, so a clamped value
		 * only has to be <= the true size to be acceptable */
		if (lp->d_secperunit < UINT32_MAX ?
		    lp->d_secperunit != rs->sc_size :
		    lp->d_secperunit > rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%ju) != "
			    "the size of raid (%ju)\n", unit, rs->sc_xname,
			    (uintmax_t)lp->d_secperunit,
			    (uintmax_t)rs->sc_size);
		/* warn about (but do not reject) partitions that extend
		 * past the end of the set */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ju)\n",
				       unit, rs->sc_xname, 'a' + i,
				       (uintmax_t)rs->sc_size);
		}
	}

}
   2442 /*
   2443  * Take care of things one might want to take care of in the event
   2444  * that a disklabel isn't present.
   2445  */
   2446 static void
   2447 raidmakedisklabel(struct raid_softc *rs)
   2448 {
   2449 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2450 	db1_printf(("Making a label..\n"));
   2451 
   2452 	/*
   2453 	 * For historical reasons, if there's no disklabel present
   2454 	 * the raw partition must be marked FS_BSDFFS.
   2455 	 */
   2456 
   2457 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2458 
   2459 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2460 
   2461 	lp->d_checksum = dkcksum(lp);
   2462 }
   2463 /*
   2464  * Wait interruptibly for an exclusive lock.
   2465  *
   2466  * XXX
   2467  * Several drivers do this; it should be abstracted and made MP-safe.
   2468  * (Hmm... where have we seen this warning before :->  GO )
   2469  */
   2470 static int
   2471 raidlock(struct raid_softc *rs)
   2472 {
   2473 	int     error;
   2474 
   2475 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2476 		rs->sc_flags |= RAIDF_WANTED;
   2477 		if ((error =
   2478 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2479 			return (error);
   2480 	}
   2481 	rs->sc_flags |= RAIDF_LOCKED;
   2482 	return (0);
   2483 }
   2484 /*
   2485  * Unlock and wake up any waiters.
   2486  */
   2487 static void
   2488 raidunlock(struct raid_softc *rs)
   2489 {
   2490 
   2491 	rs->sc_flags &= ~RAIDF_LOCKED;
   2492 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2493 		rs->sc_flags &= ~RAIDF_WANTED;
   2494 		wakeup(rs);
   2495 	}
   2496 }
   2497 
   2498 
   2499 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2500 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2501 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2502 
   2503 static daddr_t
   2504 rf_component_info_offset(void)
   2505 {
   2506 
   2507 	return RF_COMPONENT_INFO_OFFSET;
   2508 }
   2509 
   2510 static daddr_t
   2511 rf_component_info_size(unsigned secsize)
   2512 {
   2513 	daddr_t info_size;
   2514 
   2515 	KASSERT(secsize);
   2516 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2517 		info_size = secsize;
   2518 	else
   2519 		info_size = RF_COMPONENT_INFO_SIZE;
   2520 
   2521 	return info_size;
   2522 }
   2523 
   2524 static daddr_t
   2525 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2526 {
   2527 	daddr_t map_offset;
   2528 
   2529 	KASSERT(raidPtr->bytesPerSector);
   2530 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2531 		map_offset = raidPtr->bytesPerSector;
   2532 	else
   2533 		map_offset = RF_COMPONENT_INFO_SIZE;
   2534 	map_offset += rf_component_info_offset();
   2535 
   2536 	return map_offset;
   2537 }
   2538 
   2539 static daddr_t
   2540 rf_parity_map_size(RF_Raid_t *raidPtr)
   2541 {
   2542 	daddr_t map_size;
   2543 
   2544 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2545 		map_size = raidPtr->bytesPerSector;
   2546 	else
   2547 		map_size = RF_PARITY_MAP_SIZE;
   2548 
   2549 	return map_size;
   2550 }
   2551 
   2552 int
   2553 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2554 {
   2555 	RF_ComponentLabel_t *clabel;
   2556 
   2557 	clabel = raidget_component_label(raidPtr, col);
   2558 	clabel->clean = RF_RAID_CLEAN;
   2559 	raidflush_component_label(raidPtr, col);
   2560 	return(0);
   2561 }
   2562 
   2563 
   2564 int
   2565 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2566 {
   2567 	RF_ComponentLabel_t *clabel;
   2568 
   2569 	clabel = raidget_component_label(raidPtr, col);
   2570 	clabel->clean = RF_RAID_DIRTY;
   2571 	raidflush_component_label(raidPtr, col);
   2572 	return(0);
   2573 }
   2574 
   2575 int
   2576 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2577 {
   2578 	KASSERT(raidPtr->bytesPerSector);
   2579 	return raidread_component_label(raidPtr->bytesPerSector,
   2580 	    raidPtr->Disks[col].dev,
   2581 	    raidPtr->raid_cinfo[col].ci_vp,
   2582 	    &raidPtr->raid_cinfo[col].ci_label);
   2583 }
   2584 
   2585 RF_ComponentLabel_t *
   2586 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2587 {
   2588 	return &raidPtr->raid_cinfo[col].ci_label;
   2589 }
   2590 
   2591 int
   2592 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2593 {
   2594 	RF_ComponentLabel_t *label;
   2595 
   2596 	label = &raidPtr->raid_cinfo[col].ci_label;
   2597 	label->mod_counter = raidPtr->mod_counter;
   2598 #ifndef RF_NO_PARITY_MAP
   2599 	label->parity_map_modcount = label->mod_counter;
   2600 #endif
   2601 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2602 	    raidPtr->Disks[col].dev,
   2603 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2604 }
   2605 
   2606 
   2607 static int
   2608 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2609     RF_ComponentLabel_t *clabel)
   2610 {
   2611 	return raidread_component_area(dev, b_vp, clabel,
   2612 	    sizeof(RF_ComponentLabel_t),
   2613 	    rf_component_info_offset(),
   2614 	    rf_component_info_size(secsize));
   2615 }
   2616 
   2617 /* ARGSUSED */
   2618 static int
   2619 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2620     size_t msize, daddr_t offset, daddr_t dsize)
   2621 {
   2622 	struct buf *bp;
   2623 	const struct bdevsw *bdev;
   2624 	int error;
   2625 
   2626 	/* XXX should probably ensure that we don't try to do this if
   2627 	   someone has changed rf_protected_sectors. */
   2628 
   2629 	if (b_vp == NULL) {
   2630 		/* For whatever reason, this component is not valid.
   2631 		   Don't try to read a component label from it. */
   2632 		return(EINVAL);
   2633 	}
   2634 
   2635 	/* get a block of the appropriate size... */
   2636 	bp = geteblk((int)dsize);
   2637 	bp->b_dev = dev;
   2638 
   2639 	/* get our ducks in a row for the read */
   2640 	bp->b_blkno = offset / DEV_BSIZE;
   2641 	bp->b_bcount = dsize;
   2642 	bp->b_flags |= B_READ;
   2643  	bp->b_resid = dsize;
   2644 
   2645 	bdev = bdevsw_lookup(bp->b_dev);
   2646 	if (bdev == NULL)
   2647 		return (ENXIO);
   2648 	(*bdev->d_strategy)(bp);
   2649 
   2650 	error = biowait(bp);
   2651 
   2652 	if (!error) {
   2653 		memcpy(data, bp->b_data, msize);
   2654 	}
   2655 
   2656 	brelse(bp, 0);
   2657 	return(error);
   2658 }
   2659 
   2660 
   2661 static int
   2662 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2663     RF_ComponentLabel_t *clabel)
   2664 {
   2665 	return raidwrite_component_area(dev, b_vp, clabel,
   2666 	    sizeof(RF_ComponentLabel_t),
   2667 	    rf_component_info_offset(),
   2668 	    rf_component_info_size(secsize), 0);
   2669 }
   2670 
   2671 /* ARGSUSED */
   2672 static int
   2673 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2674     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2675 {
   2676 	struct buf *bp;
   2677 	const struct bdevsw *bdev;
   2678 	int error;
   2679 
   2680 	/* get a block of the appropriate size... */
   2681 	bp = geteblk((int)dsize);
   2682 	bp->b_dev = dev;
   2683 
   2684 	/* get our ducks in a row for the write */
   2685 	bp->b_blkno = offset / DEV_BSIZE;
   2686 	bp->b_bcount = dsize;
   2687 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2688  	bp->b_resid = dsize;
   2689 
   2690 	memset(bp->b_data, 0, dsize);
   2691 	memcpy(bp->b_data, data, msize);
   2692 
   2693 	bdev = bdevsw_lookup(bp->b_dev);
   2694 	if (bdev == NULL)
   2695 		return (ENXIO);
   2696 	(*bdev->d_strategy)(bp);
   2697 	if (asyncp)
   2698 		return 0;
   2699 	error = biowait(bp);
   2700 	brelse(bp, 0);
   2701 	if (error) {
   2702 #if 1
   2703 		printf("Failed to write RAID component info!\n");
   2704 #endif
   2705 	}
   2706 
   2707 	return(error);
   2708 }
   2709 
   2710 void
   2711 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2712 {
   2713 	int c;
   2714 
   2715 	for (c = 0; c < raidPtr->numCol; c++) {
   2716 		/* Skip dead disks. */
   2717 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2718 			continue;
   2719 		/* XXXjld: what if an error occurs here? */
   2720 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2721 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2722 		    RF_PARITYMAP_NBYTE,
   2723 		    rf_parity_map_offset(raidPtr),
   2724 		    rf_parity_map_size(raidPtr), 0);
   2725 	}
   2726 }
   2727 
   2728 void
   2729 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2730 {
   2731 	struct rf_paritymap_ondisk tmp;
   2732 	int c,first;
   2733 
   2734 	first=1;
   2735 	for (c = 0; c < raidPtr->numCol; c++) {
   2736 		/* Skip dead disks. */
   2737 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2738 			continue;
   2739 		raidread_component_area(raidPtr->Disks[c].dev,
   2740 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2741 		    RF_PARITYMAP_NBYTE,
   2742 		    rf_parity_map_offset(raidPtr),
   2743 		    rf_parity_map_size(raidPtr));
   2744 		if (first) {
   2745 			memcpy(map, &tmp, sizeof(*map));
   2746 			first = 0;
   2747 		} else {
   2748 			rf_paritymap_merge(map, &tmp);
   2749 		}
   2750 	}
   2751 }
   2752 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* bump the mod counter so these label writes supersede older ones */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now do the same for any spares that are in active use */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this spare,
			 * scol keeps its previous value (initially -1) and
			 * is written to clabel->column below -- confirm
			 * that cannot happen for rf_ds_used_spare disks. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2812 
   2813 
/*
 * Write up-to-date component labels to every optimal component and every
 * in-use spare.  The modification counter is bumped first so the new
 * labels supersede older copies.  When this is the final update before
 * shutdown (final == RF_FINAL_COMPONENT_UPDATE) and parity is known
 * good, the components are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2888 
   2889 void
   2890 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2891 {
   2892 
   2893 	if (vp != NULL) {
   2894 		if (auto_configured == 1) {
   2895 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2896 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2897 			vput(vp);
   2898 
   2899 		} else {
   2900 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2901 		}
   2902 	}
   2903 }
   2904 
   2905 
   2906 void
   2907 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2908 {
   2909 	int r,c;
   2910 	struct vnode *vp;
   2911 	int acd;
   2912 
   2913 
   2914 	/* We take this opportunity to close the vnodes like we should.. */
   2915 
   2916 	for (c = 0; c < raidPtr->numCol; c++) {
   2917 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2918 		acd = raidPtr->Disks[c].auto_configured;
   2919 		rf_close_component(raidPtr, vp, acd);
   2920 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2921 		raidPtr->Disks[c].auto_configured = 0;
   2922 	}
   2923 
   2924 	for (r = 0; r < raidPtr->numSpare; r++) {
   2925 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2926 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2927 		rf_close_component(raidPtr, vp, acd);
   2928 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2929 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2930 	}
   2931 }
   2932 
   2933 
   2934 void
   2935 rf_ReconThread(struct rf_recon_req *req)
   2936 {
   2937 	int     s;
   2938 	RF_Raid_t *raidPtr;
   2939 
   2940 	s = splbio();
   2941 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2942 	raidPtr->recon_in_progress = 1;
   2943 
   2944 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2945 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2946 
   2947 	RF_Free(req, sizeof(*req));
   2948 
   2949 	raidPtr->recon_in_progress = 0;
   2950 	splx(s);
   2951 
   2952 	/* That's all... */
   2953 	kthread_exit(0);	/* does not return */
   2954 }
   2955 
   2956 void
   2957 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2958 {
   2959 	int retcode;
   2960 	int s;
   2961 
   2962 	raidPtr->parity_rewrite_stripes_done = 0;
   2963 	raidPtr->parity_rewrite_in_progress = 1;
   2964 	s = splbio();
   2965 	retcode = rf_RewriteParity(raidPtr);
   2966 	splx(s);
   2967 	if (retcode) {
   2968 		printf("raid%d: Error re-writing parity (%d)!\n",
   2969 		    raidPtr->raidid, retcode);
   2970 	} else {
   2971 		/* set the clean bit!  If we shutdown correctly,
   2972 		   the clean bit on each component label will get
   2973 		   set */
   2974 		raidPtr->parity_good = RF_RAID_CLEAN;
   2975 	}
   2976 	raidPtr->parity_rewrite_in_progress = 0;
   2977 
   2978 	/* Anyone waiting for us to stop?  If so, inform them... */
   2979 	if (raidPtr->waitShutdown) {
   2980 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2981 	}
   2982 
   2983 	/* That's all... */
   2984 	kthread_exit(0);	/* does not return */
   2985 }
   2986 
   2987 
   2988 void
   2989 rf_CopybackThread(RF_Raid_t *raidPtr)
   2990 {
   2991 	int s;
   2992 
   2993 	raidPtr->copyback_in_progress = 1;
   2994 	s = splbio();
   2995 	rf_CopybackReconstructedData(raidPtr);
   2996 	splx(s);
   2997 	raidPtr->copyback_in_progress = 0;
   2998 
   2999 	/* That's all... */
   3000 	kthread_exit(0);	/* does not return */
   3001 }
   3002 
   3003 
   3004 void
   3005 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3006 {
   3007 	int s;
   3008 	RF_Raid_t *raidPtr;
   3009 
   3010 	s = splbio();
   3011 	raidPtr = req->raidPtr;
   3012 	raidPtr->recon_in_progress = 1;
   3013 	rf_ReconstructInPlace(raidPtr, req->col);
   3014 	RF_Free(req, sizeof(*req));
   3015 	raidPtr->recon_in_progress = 0;
   3016 	splx(s);
   3017 
   3018 	/* That's all... */
   3019 	kthread_exit(0);	/* does not return */
   3020 }
   3021 
/*
 * Read the component label from the device backing vp.  If the label
 * looks reasonable and fits within the partition size, prepend a new
 * RF_AutoConfig_t entry to ac_list and return the new list head (the
 * vnode is kept open and referenced by the entry).  Otherwise close and
 * release the vnode and return ac_list unchanged.  On memory exhaustion
 * the entire accumulated list is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and drop the vnode reference */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3079 
/*
 * Scan all disk-class devices in the system for RAIDframe components
 * and return a linked list of RF_AutoConfig_t entries for the ones
 * found (NULL if none).  Wedges ("dk" devices) are identified by their
 * wedge partition type; other disks are examined partition-by-partition
 * through the disklabel, and the raw partition is tried as a fallback
 * when no labelled RAID partition was found on the disk.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their partition type in the wedge
			   info rather than in a disklabel */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes over the open vnode */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3261 
   3262 
   3263 int
   3264 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3265 {
   3266 
   3267 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3268 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3269 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3270 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3271 	    clabel->row >=0 &&
   3272 	    clabel->column >= 0 &&
   3273 	    clabel->num_rows > 0 &&
   3274 	    clabel->num_columns > 0 &&
   3275 	    clabel->row < clabel->num_rows &&
   3276 	    clabel->column < clabel->num_columns &&
   3277 	    clabel->blockSize > 0 &&
   3278 	    /*
   3279 	     * numBlocksHi may contain garbage, but it is ok since
   3280 	     * the type is unsigned.  If it is really garbage,
   3281 	     * rf_fix_old_label_size() will fix it.
   3282 	     */
   3283 	    rf_component_label_numblocks(clabel) > 0) {
   3284 		/*
   3285 		 * label looks reasonable enough...
   3286 		 * let's make sure it has no old garbage.
   3287 		 */
   3288 		if (numsecs)
   3289 			rf_fix_old_label_size(clabel, numsecs);
   3290 		return(1);
   3291 	}
   3292 	return(0);
   3293 }
   3294 
   3295 
   3296 /*
   3297  * For reasons yet unknown, some old component labels have garbage in
   3298  * the newer numBlocksHi region, and this causes lossage.  Since those
   3299  * disks will also have numsecs set to less than 32 bits of sectors,
   3300  * we can determine when this corruption has occurred, and fix it.
   3301  *
   3302  * The exact same problem, with the same unknown reason, happens to
   3303  * the partitionSizeHi member as well.
   3304  */
   3305 static void
   3306 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3307 {
   3308 
   3309 	if (numsecs < ((uint64_t)1 << 32)) {
   3310 		if (clabel->numBlocksHi) {
   3311 			printf("WARNING: total sectors < 32 bits, yet "
   3312 			       "numBlocksHi set\n"
   3313 			       "WARNING: resetting numBlocksHi to zero.\n");
   3314 			clabel->numBlocksHi = 0;
   3315 		}
   3316 
   3317 		if (clabel->partitionSizeHi) {
   3318 			printf("WARNING: total sectors < 32 bits, yet "
   3319 			       "partitionSizeHi set\n"
   3320 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3321 			clabel->partitionSizeHi = 0;
   3322 		}
   3323 	}
   3324 }
   3325 
   3326 
   3327 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console.  Debug builds
 * only (compiled under #ifdef DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* names for the root_partition codes 0..2; index 3 is invalid */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3359 #endif
   3360 
   3361 RF_ConfigSet_t *
   3362 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3363 {
   3364 	RF_AutoConfig_t *ac;
   3365 	RF_ConfigSet_t *config_sets;
   3366 	RF_ConfigSet_t *cset;
   3367 	RF_AutoConfig_t *ac_next;
   3368 
   3369 
   3370 	config_sets = NULL;
   3371 
   3372 	/* Go through the AutoConfig list, and figure out which components
   3373 	   belong to what sets.  */
   3374 	ac = ac_list;
   3375 	while(ac!=NULL) {
   3376 		/* we're going to putz with ac->next, so save it here
   3377 		   for use at the end of the loop */
   3378 		ac_next = ac->next;
   3379 
   3380 		if (config_sets == NULL) {
   3381 			/* will need at least this one... */
   3382 			config_sets = (RF_ConfigSet_t *)
   3383 				malloc(sizeof(RF_ConfigSet_t),
   3384 				       M_RAIDFRAME, M_NOWAIT);
   3385 			if (config_sets == NULL) {
   3386 				panic("rf_create_auto_sets: No memory!");
   3387 			}
   3388 			/* this one is easy :) */
   3389 			config_sets->ac = ac;
   3390 			config_sets->next = NULL;
   3391 			config_sets->rootable = 0;
   3392 			ac->next = NULL;
   3393 		} else {
   3394 			/* which set does this component fit into? */
   3395 			cset = config_sets;
   3396 			while(cset!=NULL) {
   3397 				if (rf_does_it_fit(cset, ac)) {
   3398 					/* looks like it matches... */
   3399 					ac->next = cset->ac;
   3400 					cset->ac = ac;
   3401 					break;
   3402 				}
   3403 				cset = cset->next;
   3404 			}
   3405 			if (cset==NULL) {
   3406 				/* didn't find a match above... new set..*/
   3407 				cset = (RF_ConfigSet_t *)
   3408 					malloc(sizeof(RF_ConfigSet_t),
   3409 					       M_RAIDFRAME, M_NOWAIT);
   3410 				if (cset == NULL) {
   3411 					panic("rf_create_auto_sets: No memory!");
   3412 				}
   3413 				cset->ac = ac;
   3414 				ac->next = NULL;
   3415 				cset->next = config_sets;
   3416 				cset->rootable = 0;
   3417 				config_sets = cset;
   3418 			}
   3419 		}
   3420 		ac = ac_next;
   3421 	}
   3422 
   3423 
   3424 	return(config_sets);
   3425 }
   3426 
   3427 static int
   3428 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3429 {
   3430 	RF_ComponentLabel_t *clabel1, *clabel2;
   3431 
   3432 	/* If this one matches the *first* one in the set, that's good
   3433 	   enough, since the other members of the set would have been
   3434 	   through here too... */
   3435 	/* note that we are not checking partitionSize here..
   3436 
   3437 	   Note that we are also not checking the mod_counters here.
   3438 	   If everything else matches except the mod_counter, that's
   3439 	   good enough for this test.  We will deal with the mod_counters
   3440 	   a little later in the autoconfiguration process.
   3441 
   3442 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3443 
   3444 	   The reason we don't check for this is that failed disks
   3445 	   will have lower modification counts.  If those disks are
   3446 	   not added to the set they used to belong to, then they will
   3447 	   form their own set, which may result in 2 different sets,
   3448 	   for example, competing to be configured at raid0, and
   3449 	   perhaps competing to be the root filesystem set.  If the
   3450 	   wrong ones get configured, or both attempt to become /,
   3451 	   weird behaviour and or serious lossage will occur.  Thus we
   3452 	   need to bring them into the fold here, and kick them out at
   3453 	   a later point.
   3454 
   3455 	*/
   3456 
   3457 	clabel1 = cset->ac->clabel;
   3458 	clabel2 = ac->clabel;
   3459 	if ((clabel1->version == clabel2->version) &&
   3460 	    (clabel1->serial_number == clabel2->serial_number) &&
   3461 	    (clabel1->num_rows == clabel2->num_rows) &&
   3462 	    (clabel1->num_columns == clabel2->num_columns) &&
   3463 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3464 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3465 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3466 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3467 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3468 	    (clabel1->blockSize == clabel2->blockSize) &&
   3469 	    rf_component_label_numblocks(clabel1) ==
   3470 	    rf_component_label_numblocks(clabel2) &&
   3471 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3472 	    (clabel1->root_partition == clabel2->root_partition) &&
   3473 	    (clabel1->last_unit == clabel2->last_unit) &&
   3474 	    (clabel1->config_order == clabel2->config_order)) {
   3475 		/* if it get's here, it almost *has* to be a match */
   3476 	} else {
   3477 		/* it's not consistent with somebody in the set..
   3478 		   punt */
   3479 		return(0);
   3480 	}
   3481 	/* all was fine.. it must fit... */
   3482 	return(1);
   3483 }
   3484 
/*
 * Decide whether a config set has enough live components to be worth
 * configuring.  A component counts as present for a column only if its
 * mod_counter matches the highest mod_counter seen in the set (stale
 * components are ignored).  RAID 1 is special-cased: components are
 * treated as even/odd mirror pairs, and the set is rejected only when
 * both halves of a pair are missing.  For the other levels, RAID 0
 * tolerates no missing components and RAID 4/5 tolerate one.
 * Returns 1 if the set is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, search the set for a current component */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3587 
   3588 void
   3589 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3590 			RF_Raid_t *raidPtr)
   3591 {
   3592 	RF_ComponentLabel_t *clabel;
   3593 	int i;
   3594 
   3595 	clabel = ac->clabel;
   3596 
   3597 	/* 1. Fill in the common stuff */
   3598 	config->numRow = clabel->num_rows = 1;
   3599 	config->numCol = clabel->num_columns;
   3600 	config->numSpare = 0; /* XXX should this be set here? */
   3601 	config->sectPerSU = clabel->sectPerSU;
   3602 	config->SUsPerPU = clabel->SUsPerPU;
   3603 	config->SUsPerRU = clabel->SUsPerRU;
   3604 	config->parityConfig = clabel->parityConfig;
   3605 	/* XXX... */
   3606 	strcpy(config->diskQueueType,"fifo");
   3607 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3608 	config->layoutSpecificSize = 0; /* XXX ?? */
   3609 
   3610 	while(ac!=NULL) {
   3611 		/* row/col values will be in range due to the checks
   3612 		   in reasonable_label() */
   3613 		strcpy(config->devnames[0][ac->clabel->column],
   3614 		       ac->devname);
   3615 		ac = ac->next;
   3616 	}
   3617 
   3618 	for(i=0;i<RF_MAXDBGV;i++) {
   3619 		config->debugVars[i][0] = 0;
   3620 	}
   3621 }
   3622 
   3623 int
   3624 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3625 {
   3626 	RF_ComponentLabel_t *clabel;
   3627 	int column;
   3628 	int sparecol;
   3629 
   3630 	raidPtr->autoconfigure = new_value;
   3631 
   3632 	for(column=0; column<raidPtr->numCol; column++) {
   3633 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3634 			clabel = raidget_component_label(raidPtr, column);
   3635 			clabel->autoconfigure = new_value;
   3636 			raidflush_component_label(raidPtr, column);
   3637 		}
   3638 	}
   3639 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3640 		sparecol = raidPtr->numCol + column;
   3641 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3642 			clabel = raidget_component_label(raidPtr, sparecol);
   3643 			clabel->autoconfigure = new_value;
   3644 			raidflush_component_label(raidPtr, sparecol);
   3645 		}
   3646 	}
   3647 	return(new_value);
   3648 }
   3649 
   3650 int
   3651 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3652 {
   3653 	RF_ComponentLabel_t *clabel;
   3654 	int column;
   3655 	int sparecol;
   3656 
   3657 	raidPtr->root_partition = new_value;
   3658 	for(column=0; column<raidPtr->numCol; column++) {
   3659 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3660 			clabel = raidget_component_label(raidPtr, column);
   3661 			clabel->root_partition = new_value;
   3662 			raidflush_component_label(raidPtr, column);
   3663 		}
   3664 	}
   3665 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3666 		sparecol = raidPtr->numCol + column;
   3667 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3668 			clabel = raidget_component_label(raidPtr, sparecol);
   3669 			clabel->root_partition = new_value;
   3670 			raidflush_component_label(raidPtr, sparecol);
   3671 		}
   3672 	}
   3673 	return(new_value);
   3674 }
   3675 
   3676 void
   3677 rf_release_all_vps(RF_ConfigSet_t *cset)
   3678 {
   3679 	RF_AutoConfig_t *ac;
   3680 
   3681 	ac = cset->ac;
   3682 	while(ac!=NULL) {
   3683 		/* Close the vp, and give it back */
   3684 		if (ac->vp) {
   3685 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3686 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3687 			vput(ac->vp);
   3688 			ac->vp = NULL;
   3689 		}
   3690 		ac = ac->next;
   3691 	}
   3692 }
   3693 
   3694 
   3695 void
   3696 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3697 {
   3698 	RF_AutoConfig_t *ac;
   3699 	RF_AutoConfig_t *next_ac;
   3700 
   3701 	ac = cset->ac;
   3702 	while(ac!=NULL) {
   3703 		next_ac = ac->next;
   3704 		/* nuke the label */
   3705 		free(ac->clabel, M_RAIDFRAME);
   3706 		/* cleanup the config structure */
   3707 		free(ac, M_RAIDFRAME);
   3708 		/* "next.." */
   3709 		ac = next_ac;
   3710 	}
   3711 	/* and, finally, nuke the config set */
   3712 	free(cset, M_RAIDFRAME);
   3713 }
   3714 
   3715 
/*
 * Initialize a component label from the array's current in-core state:
 * version, serial/mod counters, geometry, layout parameters, and the
 * autoconfigure/root/ordering settings.  The label is marked dirty and
 * its status set to optimal; callers adjust row/column/status for the
 * specific component afterwards as needed.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3748 
/*
 * Configure one auto-detected RAID set.
 *
 * Builds an RF_Config_t from the set's component labels, picks a unit
 * number (preferring the unit recorded in the label), and configures
 * the array.  Returns the attached softc on success, or NULL if memory
 * could not be allocated or rf_Configure() failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/*
	 * Start at the unit recorded in the label; walk forward until a
	 * softc whose RAID is not already configured (valid == 0) is found.
	 * NOTE(review): assumes raidget() never returns NULL here — it
	 * allocates a softc on demand; verify against raidget()'s contract.
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* attach the disk and bring the unit fully to life */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the softc we grabbed above */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup: the config structure is only needed during setup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3822 
   3823 void
   3824 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3825 {
   3826 	struct buf *bp;
   3827 	struct raid_softc *rs;
   3828 
   3829 	bp = (struct buf *)desc->bp;
   3830 	rs = desc->raidPtr->softc;
   3831 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3832 	    (bp->b_flags & B_READ));
   3833 }
   3834 
   3835 void
   3836 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3837 	     size_t xmin, size_t xmax)
   3838 {
   3839 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3840 	pool_sethiwat(p, xmax);
   3841 	pool_prime(p, xmin);
   3842 	pool_setlowat(p, xmin);
   3843 }
   3844 
   3845 /*
   3846  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3847  * if there is IO pending and if that IO could possibly be done for a
   3848  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3849  * otherwise.
   3850  *
   3851  */
   3852 
   3853 int
   3854 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3855 {
   3856 	struct raid_softc *rs = raidPtr->softc;
   3857 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3858 		/* there is work to do */
   3859 		return 0;
   3860 	}
   3861 	/* default is nothing to do */
   3862 	return 1;
   3863 }
   3864 
   3865 int
   3866 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3867 {
   3868 	uint64_t numsecs;
   3869 	unsigned secsize;
   3870 	int error;
   3871 
   3872 	error = getdisksize(vp, &numsecs, &secsize);
   3873 	if (error == 0) {
   3874 		diskPtr->blockSize = secsize;
   3875 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3876 		diskPtr->partitionSize = numsecs;
   3877 		return 0;
   3878 	}
   3879 	return error;
   3880 }
   3881 
/* Autoconf match: pseudo-device, always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3887 
/* Autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3893 
   3894 
   3895 static int
   3896 raid_detach(device_t self, int flags)
   3897 {
   3898 	int error;
   3899 	struct raid_softc *rs = raidget(device_unit(self));
   3900 
   3901 	if (rs == NULL)
   3902 		return ENXIO;
   3903 
   3904 	if ((error = raidlock(rs)) != 0)
   3905 		return (error);
   3906 
   3907 	error = raid_detach_unlocked(rs);
   3908 
   3909 	raidunlock(rs);
   3910 
   3911 	/* XXXkd: raidput(rs) ??? */
   3912 
   3913 	return error;
   3914 }
   3915 
   3916 static void
   3917 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3918 {
   3919 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3920 
   3921 	memset(dg, 0, sizeof(*dg));
   3922 
   3923 	dg->dg_secperunit = raidPtr->totalSectors;
   3924 	dg->dg_secsize = raidPtr->bytesPerSector;
   3925 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3926 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3927 
   3928 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3929 }
   3930 
   3931 /*
   3932  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3933  * We end up returning whatever error was returned by the first cache flush
   3934  * that fails.
   3935  */
   3936 
   3937 int
   3938 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3939 {
   3940 	int c, sparecol;
   3941 	int e,error;
   3942 	int force = 1;
   3943 
   3944 	error = 0;
   3945 	for (c = 0; c < raidPtr->numCol; c++) {
   3946 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3947 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3948 					  &force, FWRITE, NOCRED);
   3949 			if (e) {
   3950 				if (e != ENODEV)
   3951 					printf("raid%d: cache flush to component %s failed.\n",
   3952 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3953 				if (error == 0) {
   3954 					error = e;
   3955 				}
   3956 			}
   3957 		}
   3958 	}
   3959 
   3960 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3961 		sparecol = raidPtr->numCol + c;
   3962 		/* Need to ensure that the reconstruct actually completed! */
   3963 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3964 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3965 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3966 			if (e) {
   3967 				if (e != ENODEV)
   3968 					printf("raid%d: cache flush to component %s failed.\n",
   3969 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3970 				if (error == 0) {
   3971 					error = e;
   3972 				}
   3973 			}
   3974 		}
   3975 	}
   3976 	return error;
   3977 }
   3978