Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.309
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.309 2014/05/08 20:36:15 jakllsch Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.309 2014/05/08 20:36:15 jakllsch Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
/* Block device entry points: raid presents itself as a disk (D_DISK). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_flag = D_DISK
};

/* Character (raw) device entry points; tty/poll/mmap do not apply. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_flag = D_DISK
};

/* Hooks handed to the generic disk(9) layer. */
static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    232 
/* Per-unit software state for one RAID pseudo-device. */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* unit number (raid<N>) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* Map a dev_t to its raid unit number. */
#define	raidunit(x)	DISKUNIT(x)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
    260 /*
    261  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    262  * Be aware that large numbers can allow the driver to consume a lot of
    263  * kernel memory, especially on writes, and in degraded mode reads.
    264  *
    265  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    266  * a single 64K write will typically require 64K for the old data,
    267  * 64K for the old parity, and 64K for the new parity, for a total
    268  * of 192K (if the parity buffer is not re-used immediately).
    269  * Even it if is used immediately, that's still 128K, which when multiplied
    270  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    271  *
    272  * Now in degraded mode, for example, a 64K read on the above setup may
    273  * require data reconstruction, which will require *all* of the 4 remaining
    274  * disks to participate -- 4 * 32K/disk == 128K again.
    275  */
    276 
    277 #ifndef RAIDOUTSTANDING
    278 #define RAIDOUTSTANDING   6
    279 #endif
    280 
    281 #define RAIDLABELDEV(dev)	\
    282 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    283 
    284 /* declared here, and made public, for the benefit of KVM stuff.. */
    285 
    286 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    287 				     struct disklabel *);
    288 static void raidgetdisklabel(dev_t);
    289 static void raidmakedisklabel(struct raid_softc *);
    290 
    291 static int raidlock(struct raid_softc *);
    292 static void raidunlock(struct raid_softc *);
    293 
    294 static int raid_detach_unlocked(struct raid_softc *);
    295 
    296 static void rf_markalldirty(RF_Raid_t *);
    297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    298 
    299 void rf_ReconThread(struct rf_recon_req *);
    300 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    301 void rf_CopybackThread(RF_Raid_t *raidPtr);
    302 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    303 int rf_autoconfig(device_t);
    304 void rf_buildroothack(RF_ConfigSet_t *);
    305 
    306 RF_AutoConfig_t *rf_find_raid_components(void);
    307 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    308 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    309 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    310 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    311 int rf_set_autoconfig(RF_Raid_t *, int);
    312 int rf_set_rootpartition(RF_Raid_t *, int);
    313 void rf_release_all_vps(RF_ConfigSet_t *);
    314 void rf_cleanup_config_set(RF_ConfigSet_t *);
    315 int rf_have_enough_components(RF_ConfigSet_t *);
    316 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    318 
    319 /*
    320  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    321  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    322  * in the kernel config file.
    323  */
    324 #ifdef RAID_AUTOCONFIG
    325 int raidautoconfig = 1;
    326 #else
    327 int raidautoconfig = 0;
    328 #endif
    329 static bool raidautoconfigdone = false;
    330 
    331 struct RF_Pools_s rf_pools;
    332 
    333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    334 static kmutex_t raid_lock;
    335 
    336 static struct raid_softc *
    337 raidcreate(int unit) {
    338 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    339 	if (sc == NULL) {
    340 #ifdef DIAGNOSTIC
    341 		printf("%s: out of memory\n", __func__);
    342 #endif
    343 		return NULL;
    344 	}
    345 	sc->sc_unit = unit;
    346 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    347 	return sc;
    348 }
    349 
    350 static void
    351 raiddestroy(struct raid_softc *sc) {
    352 	bufq_free(sc->buf_queue);
    353 	kmem_free(sc, sizeof(*sc));
    354 }
    355 
    356 static struct raid_softc *
    357 raidget(int unit) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
/*
 * Driver attach: one-time initialization of global RAIDframe state.
 * "num" (the number of pseudo-devices from config) is unused here;
 * softcs are created lazily by raidget().
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for the spare-table installation handshake. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    421 
    422 int
    423 rf_autoconfig(device_t self)
    424 {
    425 	RF_AutoConfig_t *ac_list;
    426 	RF_ConfigSet_t *config_sets;
    427 
    428 	if (!raidautoconfig || raidautoconfigdone == true)
    429 		return (0);
    430 
    431 	/* XXX This code can only be run once. */
    432 	raidautoconfigdone = true;
    433 
    434 #ifdef __HAVE_CPU_BOOTCONF
    435 	/*
    436 	 * 0. find the boot device if needed first so we can use it later
    437 	 * this needs to be done before we autoconfigure any raid sets,
    438 	 * because if we use wedges we are not going to be able to open
    439 	 * the boot device later
    440 	 */
    441 	if (booted_device == NULL)
    442 		cpu_bootconf();
    443 #endif
    444 	/* 1. locate all RAID components on the system */
    445 	aprint_debug("Searching for RAID components...\n");
    446 	ac_list = rf_find_raid_components();
    447 
    448 	/* 2. Sort them into their respective sets. */
    449 	config_sets = rf_create_auto_sets(ac_list);
    450 
    451 	/*
    452 	 * 3. Evaluate each set and configure the valid ones.
    453 	 * This gets done in rf_buildroothack().
    454 	 */
    455 	rf_buildroothack(config_sets);
    456 
    457 	return 1;
    458 }
    459 
    460 static int
    461 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    462 	const char *bootname = device_xname(bdv);
    463 	size_t len = strlen(bootname);
    464 
    465 	for (int col = 0; col < r->numCol; col++) {
    466 		const char *devname = r->Disks[col].devname;
    467 		devname += sizeof("/dev/") - 1;
    468 		if (strncmp(devname, "dk", 2) == 0) {
    469 			const char *parent =
    470 			    dkwedge_get_parent_name(r->Disks[col].dev);
    471 			if (parent != NULL)
    472 				devname = parent;
    473 		}
    474 		if (strncmp(devname, bootname, len) == 0) {
    475 			struct raid_softc *sc = r->softc;
    476 			aprint_debug("raid%d includes boot device %s\n",
    477 			    sc->sc_unit, devname);
    478 			return 1;
    479 		}
    480 	}
    481 	return 0;
    482 }
    483 
/*
 * Evaluate the candidate configuration sets produced by rf_autoconfig():
 * configure every set that is complete and marked for autoconfiguration,
 * then try to decide whether one of the configured sets should become
 * the root device (booted_device).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable configured sets */
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			char cname[sizeof(cset->ac->devname)];
			/* Guess the "a" wedge of the sole rootable set. */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device))
			booted_device = candidate_root;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Several sets claim rootability: keep only those that
		   actually contain the device we booted from. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    580 
    581 
    582 int
    583 raidsize(dev_t dev)
    584 {
    585 	struct raid_softc *rs;
    586 	struct disklabel *lp;
    587 	int     part, unit, omask, size;
    588 
    589 	unit = raidunit(dev);
    590 	if ((rs = raidget(unit)) == NULL)
    591 		return -1;
    592 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    593 		return (-1);
    594 
    595 	part = DISKPART(dev);
    596 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    597 	lp = rs->sc_dkdev.dk_label;
    598 
    599 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    600 		return (-1);
    601 
    602 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    603 		size = -1;
    604 	else
    605 		size = lp->d_partitions[part].p_size *
    606 		    (lp->d_secsize / DEV_BSIZE);
    607 
    608 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    609 		return (-1);
    610 
    611 	return (size);
    612 
    613 }
    614 
/*
 * Kernel crash-dump entry point.  Only RAID 1 sets (one data + one
 * parity column) are supported: pick a live component, preferring the
 * master, then a spare of the master, then the slave, then a spare of
 * the slave, and forward the dump to that component's block driver.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* The dump must be a whole number of DEV_BSIZE blocks... */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* ...and must fit entirely within the RAID device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Which column is this spare standing in for? */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/*
	 * NOTE(review): bdevsw_lookup() can return NULL; confirm the
	 * component dev_t of a live column always has a block driver.
	 */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
    751 /* ARGSUSED */
/*
 * Open a partition of the RAID device.  Takes the unit lock, refuses
 * units being shut down, (re)reads the disklabel on the first open of
 * an initialized set, and marks components dirty on first open.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialized set: read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
    835 /* ARGSUSED */
    836 int
    837 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    838 {
    839 	int     unit = raidunit(dev);
    840 	struct raid_softc *rs;
    841 	int     error = 0;
    842 	int     part;
    843 
    844 	if ((rs = raidget(unit)) == NULL)
    845 		return ENXIO;
    846 
    847 	if ((error = raidlock(rs)) != 0)
    848 		return (error);
    849 
    850 	part = DISKPART(dev);
    851 
    852 	/* ...that much closer to allowing unconfiguration... */
    853 	switch (fmt) {
    854 	case S_IFCHR:
    855 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    856 		break;
    857 
    858 	case S_IFBLK:
    859 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    860 		break;
    861 	}
    862 	rs->sc_dkdev.dk_openmask =
    863 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    864 
    865 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    866 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    867 		/* Last one... device is not unconfigured yet.
    868 		   Device shutdown has taken care of setting the
    869 		   clean bits if RAIDF_INITED is not set
    870 		   mark things as clean... */
    871 
    872 		rf_update_component_labels(&rs->sc_r,
    873 						 RF_FINAL_COMPONENT_UPDATE);
    874 
    875 		/* If the kernel is shutting down, it will detach
    876 		 * this RAID set soon enough.
    877 		 */
    878 	}
    879 
    880 	raidunlock(rs);
    881 	return (0);
    882 
    883 }
    884 
/*
 * Block I/O entry point.  Validates the request, bounds-checks it
 * against the media size (raw partition) or the disklabel, then queues
 * the buffer and wakes the per-array I/O thread via iodone_cv.
 * Failed requests are completed immediately with biodone().
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors to DEV_BSIZE units, shifting in
		   whichever direction the sector size differs. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* schedule the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    955 /* ARGSUSED */
    956 int
    957 raidread(dev_t dev, struct uio *uio, int flags)
    958 {
    959 	int     unit = raidunit(dev);
    960 	struct raid_softc *rs;
    961 
    962 	if ((rs = raidget(unit)) == NULL)
    963 		return ENXIO;
    964 
    965 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    966 		return (ENXIO);
    967 
    968 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    969 
    970 }
    971 /* ARGSUSED */
    972 int
    973 raidwrite(dev_t dev, struct uio *uio, int flags)
    974 {
    975 	int     unit = raidunit(dev);
    976 	struct raid_softc *rs;
    977 
    978 	if ((rs = raidget(unit)) == NULL)
    979 		return ENXIO;
    980 
    981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    982 		return (ENXIO);
    983 
    984 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    985 
    986 }
    987 
    988 static int
    989 raid_detach_unlocked(struct raid_softc *rs)
    990 {
    991 	int error;
    992 	RF_Raid_t *raidPtr;
    993 
    994 	raidPtr = &rs->sc_r;
    995 
    996 	/*
    997 	 * If somebody has a partition mounted, we shouldn't
    998 	 * shutdown.
    999 	 */
   1000 	if (rs->sc_dkdev.dk_openmask != 0)
   1001 		return EBUSY;
   1002 
   1003 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1004 		;	/* not initialized: nothing to do */
   1005 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1006 		return error;
   1007 	else
   1008 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1009 
   1010 	/* Detach the disk. */
   1011 	dkwedge_delall(&rs->sc_dkdev);
   1012 	disk_detach(&rs->sc_dkdev);
   1013 	disk_destroy(&rs->sc_dkdev);
   1014 
   1015 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1016 
   1017 	return 0;
   1018 }
   1019 
   1020 int
   1021 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1022 {
   1023 	int     unit = raidunit(dev);
   1024 	int     error = 0;
   1025 	int     part, pmask, s;
   1026 	cfdata_t cf;
   1027 	struct raid_softc *rs;
   1028 	RF_Config_t *k_cfg, *u_cfg;
   1029 	RF_Raid_t *raidPtr;
   1030 	RF_RaidDisk_t *diskPtr;
   1031 	RF_AccTotals_t *totals;
   1032 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1033 	u_char *specific_buf;
   1034 	int retcode = 0;
   1035 	int column;
   1036 /*	int raidid; */
   1037 	struct rf_recon_req *rrcopy, *rr;
   1038 	RF_ComponentLabel_t *clabel;
   1039 	RF_ComponentLabel_t *ci_label;
   1040 	RF_ComponentLabel_t **clabel_ptr;
   1041 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1042 	RF_SingleComponent_t component;
   1043 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1044 	int i, j, d;
   1045 #ifdef __HAVE_OLD_DISKLABEL
   1046 	struct disklabel newlabel;
   1047 #endif
   1048 	struct dkwedge_info *dkw;
   1049 
   1050 	if ((rs = raidget(unit)) == NULL)
   1051 		return ENXIO;
   1052 	raidPtr = &rs->sc_r;
   1053 
   1054 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1055 		(int) DISKPART(dev), (int) unit, cmd));
   1056 
   1057 	/* Must be open for writes for these commands... */
   1058 	switch (cmd) {
   1059 #ifdef DIOCGSECTORSIZE
   1060 	case DIOCGSECTORSIZE:
   1061 		*(u_int *)data = raidPtr->bytesPerSector;
   1062 		return 0;
   1063 	case DIOCGMEDIASIZE:
   1064 		*(off_t *)data =
   1065 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1066 		return 0;
   1067 #endif
   1068 	case DIOCSDINFO:
   1069 	case DIOCWDINFO:
   1070 #ifdef __HAVE_OLD_DISKLABEL
   1071 	case ODIOCWDINFO:
   1072 	case ODIOCSDINFO:
   1073 #endif
   1074 	case DIOCWLABEL:
   1075 	case DIOCAWEDGE:
   1076 	case DIOCDWEDGE:
   1077 	case DIOCSSTRATEGY:
   1078 		if ((flag & FWRITE) == 0)
   1079 			return (EBADF);
   1080 	}
   1081 
   1082 	/* Must be initialized for these... */
   1083 	switch (cmd) {
   1084 	case DIOCGDINFO:
   1085 	case DIOCSDINFO:
   1086 	case DIOCWDINFO:
   1087 #ifdef __HAVE_OLD_DISKLABEL
   1088 	case ODIOCGDINFO:
   1089 	case ODIOCWDINFO:
   1090 	case ODIOCSDINFO:
   1091 	case ODIOCGDEFLABEL:
   1092 #endif
   1093 	case DIOCGPART:
   1094 	case DIOCWLABEL:
   1095 	case DIOCGDEFLABEL:
   1096 	case DIOCAWEDGE:
   1097 	case DIOCDWEDGE:
   1098 	case DIOCLWEDGES:
   1099 	case DIOCCACHESYNC:
   1100 	case RAIDFRAME_SHUTDOWN:
   1101 	case RAIDFRAME_REWRITEPARITY:
   1102 	case RAIDFRAME_GET_INFO:
   1103 	case RAIDFRAME_RESET_ACCTOTALS:
   1104 	case RAIDFRAME_GET_ACCTOTALS:
   1105 	case RAIDFRAME_KEEP_ACCTOTALS:
   1106 	case RAIDFRAME_GET_SIZE:
   1107 	case RAIDFRAME_FAIL_DISK:
   1108 	case RAIDFRAME_COPYBACK:
   1109 	case RAIDFRAME_CHECK_RECON_STATUS:
   1110 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1111 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1112 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1113 	case RAIDFRAME_ADD_HOT_SPARE:
   1114 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1115 	case RAIDFRAME_INIT_LABELS:
   1116 	case RAIDFRAME_REBUILD_IN_PLACE:
   1117 	case RAIDFRAME_CHECK_PARITY:
   1118 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1119 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1120 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1121 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1122 	case RAIDFRAME_SET_AUTOCONFIG:
   1123 	case RAIDFRAME_SET_ROOT:
   1124 	case RAIDFRAME_DELETE_COMPONENT:
   1125 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1126 	case RAIDFRAME_PARITYMAP_STATUS:
   1127 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1128 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1129 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1130 	case DIOCGSTRATEGY:
   1131 	case DIOCSSTRATEGY:
   1132 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1133 			return (ENXIO);
   1134 	}
   1135 
   1136 	switch (cmd) {
   1137 #ifdef COMPAT_50
   1138 	case RAIDFRAME_GET_INFO50:
   1139 		return rf_get_info50(raidPtr, data);
   1140 
   1141 	case RAIDFRAME_CONFIGURE50:
   1142 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1143 			return retcode;
   1144 		goto config;
   1145 #endif
   1146 		/* configure the system */
   1147 	case RAIDFRAME_CONFIGURE:
   1148 
   1149 		if (raidPtr->valid) {
   1150 			/* There is a valid RAID set running on this unit! */
   1151 			printf("raid%d: Device already configured!\n",unit);
   1152 			return(EINVAL);
   1153 		}
   1154 
   1155 		/* copy-in the configuration information */
   1156 		/* data points to a pointer to the configuration structure */
   1157 
   1158 		u_cfg = *((RF_Config_t **) data);
   1159 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1160 		if (k_cfg == NULL) {
   1161 			return (ENOMEM);
   1162 		}
   1163 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1164 		if (retcode) {
   1165 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1166 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1167 				retcode));
   1168 			return (retcode);
   1169 		}
   1170 		goto config;
   1171 	config:
   1172 		/* allocate a buffer for the layout-specific data, and copy it
   1173 		 * in */
   1174 		if (k_cfg->layoutSpecificSize) {
   1175 			if (k_cfg->layoutSpecificSize > 10000) {
   1176 				/* sanity check */
   1177 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1178 				return (EINVAL);
   1179 			}
   1180 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1181 			    (u_char *));
   1182 			if (specific_buf == NULL) {
   1183 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1184 				return (ENOMEM);
   1185 			}
   1186 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1187 			    k_cfg->layoutSpecificSize);
   1188 			if (retcode) {
   1189 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1190 				RF_Free(specific_buf,
   1191 					k_cfg->layoutSpecificSize);
   1192 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1193 					retcode));
   1194 				return (retcode);
   1195 			}
   1196 		} else
   1197 			specific_buf = NULL;
   1198 		k_cfg->layoutSpecific = specific_buf;
   1199 
   1200 		/* should do some kind of sanity check on the configuration.
   1201 		 * Store the sum of all the bytes in the last byte? */
   1202 
   1203 		/* configure the system */
   1204 
   1205 		/*
   1206 		 * Clear the entire RAID descriptor, just to make sure
   1207 		 *  there is no stale data left in the case of a
   1208 		 *  reconfiguration
   1209 		 */
   1210 		memset(raidPtr, 0, sizeof(*raidPtr));
   1211 		raidPtr->softc = rs;
   1212 		raidPtr->raidid = unit;
   1213 
   1214 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1215 
   1216 		if (retcode == 0) {
   1217 
   1218 			/* allow this many simultaneous IO's to
   1219 			   this RAID device */
   1220 			raidPtr->openings = RAIDOUTSTANDING;
   1221 
   1222 			raidinit(rs);
   1223 			rf_markalldirty(raidPtr);
   1224 		}
   1225 		/* free the buffers.  No return code here. */
   1226 		if (k_cfg->layoutSpecificSize) {
   1227 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1228 		}
   1229 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1230 
   1231 		return (retcode);
   1232 
   1233 		/* shutdown the system */
   1234 	case RAIDFRAME_SHUTDOWN:
   1235 
   1236 		part = DISKPART(dev);
   1237 		pmask = (1 << part);
   1238 
   1239 		if ((error = raidlock(rs)) != 0)
   1240 			return (error);
   1241 
   1242 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1243 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1244 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1245 			retcode = EBUSY;
   1246 		else {
   1247 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1248 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1249 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1250 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1251 			retcode = 0;
   1252 		}
   1253 
   1254 		raidunlock(rs);
   1255 
   1256 		if (retcode != 0)
   1257 			return retcode;
   1258 
   1259 		/* free the pseudo device attach bits */
   1260 
   1261 		cf = device_cfdata(rs->sc_dev);
   1262 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1263 			free(cf, M_RAIDFRAME);
   1264 
   1265 		return (retcode);
   1266 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1267 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1268 		/* need to read the component label for the disk indicated
   1269 		   by row,column in clabel */
   1270 
   1271 		/*
   1272 		 * Perhaps there should be an option to skip the in-core
   1273 		 * copy and hit the disk, as with disklabel(8).
   1274 		 */
   1275 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1276 
   1277 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1278 
   1279 		if (retcode) {
   1280 			RF_Free(clabel, sizeof(*clabel));
   1281 			return retcode;
   1282 		}
   1283 
   1284 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1285 
   1286 		column = clabel->column;
   1287 
   1288 		if ((column < 0) || (column >= raidPtr->numCol +
   1289 		    raidPtr->numSpare)) {
   1290 			RF_Free(clabel, sizeof(*clabel));
   1291 			return EINVAL;
   1292 		}
   1293 
   1294 		RF_Free(clabel, sizeof(*clabel));
   1295 
   1296 		clabel = raidget_component_label(raidPtr, column);
   1297 
   1298 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1299 
   1300 #if 0
   1301 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1302 		clabel = (RF_ComponentLabel_t *) data;
   1303 
   1304 		/* XXX check the label for valid stuff... */
   1305 		/* Note that some things *should not* get modified --
   1306 		   the user should be re-initing the labels instead of
   1307 		   trying to patch things.
   1308 		   */
   1309 
   1310 		raidid = raidPtr->raidid;
   1311 #ifdef DEBUG
   1312 		printf("raid%d: Got component label:\n", raidid);
   1313 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1314 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1315 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1316 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1317 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1318 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1319 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1320 #endif
   1321 		clabel->row = 0;
   1322 		column = clabel->column;
   1323 
   1324 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1325 			return(EINVAL);
   1326 		}
   1327 
   1328 		/* XXX this isn't allowed to do anything for now :-) */
   1329 
   1330 		/* XXX and before it is, we need to fill in the rest
   1331 		   of the fields!?!?!?! */
   1332 		memcpy(raidget_component_label(raidPtr, column),
   1333 		    clabel, sizeof(*clabel));
   1334 		raidflush_component_label(raidPtr, column);
   1335 		return (0);
   1336 #endif
   1337 
   1338 	case RAIDFRAME_INIT_LABELS:
   1339 		clabel = (RF_ComponentLabel_t *) data;
   1340 		/*
   1341 		   we only want the serial number from
   1342 		   the above.  We get all the rest of the information
   1343 		   from the config that was used to create this RAID
   1344 		   set.
   1345 		   */
   1346 
   1347 		raidPtr->serial_number = clabel->serial_number;
   1348 
   1349 		for(column=0;column<raidPtr->numCol;column++) {
   1350 			diskPtr = &raidPtr->Disks[column];
   1351 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1352 				ci_label = raidget_component_label(raidPtr,
   1353 				    column);
   1354 				/* Zeroing this is important. */
   1355 				memset(ci_label, 0, sizeof(*ci_label));
   1356 				raid_init_component_label(raidPtr, ci_label);
   1357 				ci_label->serial_number =
   1358 				    raidPtr->serial_number;
   1359 				ci_label->row = 0; /* we dont' pretend to support more */
   1360 				rf_component_label_set_partitionsize(ci_label,
   1361 				    diskPtr->partitionSize);
   1362 				ci_label->column = column;
   1363 				raidflush_component_label(raidPtr, column);
   1364 			}
   1365 			/* XXXjld what about the spares? */
   1366 		}
   1367 
   1368 		return (retcode);
   1369 	case RAIDFRAME_SET_AUTOCONFIG:
   1370 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1371 		printf("raid%d: New autoconfig value is: %d\n",
   1372 		       raidPtr->raidid, d);
   1373 		*(int *) data = d;
   1374 		return (retcode);
   1375 
   1376 	case RAIDFRAME_SET_ROOT:
   1377 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1378 		printf("raid%d: New rootpartition value is: %d\n",
   1379 		       raidPtr->raidid, d);
   1380 		*(int *) data = d;
   1381 		return (retcode);
   1382 
   1383 		/* initialize all parity */
   1384 	case RAIDFRAME_REWRITEPARITY:
   1385 
   1386 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1387 			/* Parity for RAID 0 is trivially correct */
   1388 			raidPtr->parity_good = RF_RAID_CLEAN;
   1389 			return(0);
   1390 		}
   1391 
   1392 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1393 			/* Re-write is already in progress! */
   1394 			return(EINVAL);
   1395 		}
   1396 
   1397 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1398 					   rf_RewriteParityThread,
   1399 					   raidPtr,"raid_parity");
   1400 		return (retcode);
   1401 
   1402 
   1403 	case RAIDFRAME_ADD_HOT_SPARE:
   1404 		sparePtr = (RF_SingleComponent_t *) data;
   1405 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1406 		retcode = rf_add_hot_spare(raidPtr, &component);
   1407 		return(retcode);
   1408 
   1409 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1410 		return(retcode);
   1411 
   1412 	case RAIDFRAME_DELETE_COMPONENT:
   1413 		componentPtr = (RF_SingleComponent_t *)data;
   1414 		memcpy( &component, componentPtr,
   1415 			sizeof(RF_SingleComponent_t));
   1416 		retcode = rf_delete_component(raidPtr, &component);
   1417 		return(retcode);
   1418 
   1419 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1420 		componentPtr = (RF_SingleComponent_t *)data;
   1421 		memcpy( &component, componentPtr,
   1422 			sizeof(RF_SingleComponent_t));
   1423 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1424 		return(retcode);
   1425 
   1426 	case RAIDFRAME_REBUILD_IN_PLACE:
   1427 
   1428 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1429 			/* Can't do this on a RAID 0!! */
   1430 			return(EINVAL);
   1431 		}
   1432 
   1433 		if (raidPtr->recon_in_progress == 1) {
   1434 			/* a reconstruct is already in progress! */
   1435 			return(EINVAL);
   1436 		}
   1437 
   1438 		componentPtr = (RF_SingleComponent_t *) data;
   1439 		memcpy( &component, componentPtr,
   1440 			sizeof(RF_SingleComponent_t));
   1441 		component.row = 0; /* we don't support any more */
   1442 		column = component.column;
   1443 
   1444 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1445 			return(EINVAL);
   1446 		}
   1447 
   1448 		rf_lock_mutex2(raidPtr->mutex);
   1449 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1450 		    (raidPtr->numFailures > 0)) {
   1451 			/* XXX 0 above shouldn't be constant!!! */
   1452 			/* some component other than this has failed.
   1453 			   Let's not make things worse than they already
   1454 			   are... */
   1455 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1456 			       raidPtr->raidid);
   1457 			printf("raid%d:     Col: %d   Too many failures.\n",
   1458 			       raidPtr->raidid, column);
   1459 			rf_unlock_mutex2(raidPtr->mutex);
   1460 			return (EINVAL);
   1461 		}
   1462 		if (raidPtr->Disks[column].status ==
   1463 		    rf_ds_reconstructing) {
   1464 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1465 			       raidPtr->raidid);
   1466 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1467 
   1468 			rf_unlock_mutex2(raidPtr->mutex);
   1469 			return (EINVAL);
   1470 		}
   1471 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1472 			rf_unlock_mutex2(raidPtr->mutex);
   1473 			return (EINVAL);
   1474 		}
   1475 		rf_unlock_mutex2(raidPtr->mutex);
   1476 
   1477 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1478 		if (rrcopy == NULL)
   1479 			return(ENOMEM);
   1480 
   1481 		rrcopy->raidPtr = (void *) raidPtr;
   1482 		rrcopy->col = column;
   1483 
   1484 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1485 					   rf_ReconstructInPlaceThread,
   1486 					   rrcopy,"raid_reconip");
   1487 		return(retcode);
   1488 
   1489 	case RAIDFRAME_GET_INFO:
   1490 		if (!raidPtr->valid)
   1491 			return (ENODEV);
   1492 		ucfgp = (RF_DeviceConfig_t **) data;
   1493 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1494 			  (RF_DeviceConfig_t *));
   1495 		if (d_cfg == NULL)
   1496 			return (ENOMEM);
   1497 		d_cfg->rows = 1; /* there is only 1 row now */
   1498 		d_cfg->cols = raidPtr->numCol;
   1499 		d_cfg->ndevs = raidPtr->numCol;
   1500 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1501 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1502 			return (ENOMEM);
   1503 		}
   1504 		d_cfg->nspares = raidPtr->numSpare;
   1505 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1506 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1507 			return (ENOMEM);
   1508 		}
   1509 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1510 		d = 0;
   1511 		for (j = 0; j < d_cfg->cols; j++) {
   1512 			d_cfg->devs[d] = raidPtr->Disks[j];
   1513 			d++;
   1514 		}
   1515 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1516 			d_cfg->spares[i] = raidPtr->Disks[j];
   1517 		}
   1518 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1519 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1520 
   1521 		return (retcode);
   1522 
   1523 	case RAIDFRAME_CHECK_PARITY:
   1524 		*(int *) data = raidPtr->parity_good;
   1525 		return (0);
   1526 
   1527 	case RAIDFRAME_PARITYMAP_STATUS:
   1528 		if (rf_paritymap_ineligible(raidPtr))
   1529 			return EINVAL;
   1530 		rf_paritymap_status(raidPtr->parity_map,
   1531 		    (struct rf_pmstat *)data);
   1532 		return 0;
   1533 
   1534 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1535 		if (rf_paritymap_ineligible(raidPtr))
   1536 			return EINVAL;
   1537 		if (raidPtr->parity_map == NULL)
   1538 			return ENOENT; /* ??? */
   1539 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1540 			(struct rf_pmparams *)data, 1))
   1541 			return EINVAL;
   1542 		return 0;
   1543 
   1544 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1545 		if (rf_paritymap_ineligible(raidPtr))
   1546 			return EINVAL;
   1547 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1548 		return 0;
   1549 
   1550 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1551 		if (rf_paritymap_ineligible(raidPtr))
   1552 			return EINVAL;
   1553 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1554 		/* XXX should errors be passed up? */
   1555 		return 0;
   1556 
   1557 	case RAIDFRAME_RESET_ACCTOTALS:
   1558 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1559 		return (0);
   1560 
   1561 	case RAIDFRAME_GET_ACCTOTALS:
   1562 		totals = (RF_AccTotals_t *) data;
   1563 		*totals = raidPtr->acc_totals;
   1564 		return (0);
   1565 
   1566 	case RAIDFRAME_KEEP_ACCTOTALS:
   1567 		raidPtr->keep_acc_totals = *(int *)data;
   1568 		return (0);
   1569 
   1570 	case RAIDFRAME_GET_SIZE:
   1571 		*(int *) data = raidPtr->totalSectors;
   1572 		return (0);
   1573 
   1574 		/* fail a disk & optionally start reconstruction */
   1575 	case RAIDFRAME_FAIL_DISK:
   1576 
   1577 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1578 			/* Can't do this on a RAID 0!! */
   1579 			return(EINVAL);
   1580 		}
   1581 
   1582 		rr = (struct rf_recon_req *) data;
   1583 		rr->row = 0;
   1584 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1585 			return (EINVAL);
   1586 
   1587 
   1588 		rf_lock_mutex2(raidPtr->mutex);
   1589 		if (raidPtr->status == rf_rs_reconstructing) {
   1590 			/* you can't fail a disk while we're reconstructing! */
   1591 			/* XXX wrong for RAID6 */
   1592 			rf_unlock_mutex2(raidPtr->mutex);
   1593 			return (EINVAL);
   1594 		}
   1595 		if ((raidPtr->Disks[rr->col].status ==
   1596 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1597 			/* some other component has failed.  Let's not make
   1598 			   things worse. XXX wrong for RAID6 */
   1599 			rf_unlock_mutex2(raidPtr->mutex);
   1600 			return (EINVAL);
   1601 		}
   1602 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1603 			/* Can't fail a spared disk! */
   1604 			rf_unlock_mutex2(raidPtr->mutex);
   1605 			return (EINVAL);
   1606 		}
   1607 		rf_unlock_mutex2(raidPtr->mutex);
   1608 
   1609 		/* make a copy of the recon request so that we don't rely on
   1610 		 * the user's buffer */
   1611 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1612 		if (rrcopy == NULL)
   1613 			return(ENOMEM);
   1614 		memcpy(rrcopy, rr, sizeof(*rr));
   1615 		rrcopy->raidPtr = (void *) raidPtr;
   1616 
   1617 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1618 					   rf_ReconThread,
   1619 					   rrcopy,"raid_recon");
   1620 		return (0);
   1621 
   1622 		/* invoke a copyback operation after recon on whatever disk
   1623 		 * needs it, if any */
   1624 	case RAIDFRAME_COPYBACK:
   1625 
   1626 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1627 			/* This makes no sense on a RAID 0!! */
   1628 			return(EINVAL);
   1629 		}
   1630 
   1631 		if (raidPtr->copyback_in_progress == 1) {
   1632 			/* Copyback is already in progress! */
   1633 			return(EINVAL);
   1634 		}
   1635 
   1636 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1637 					   rf_CopybackThread,
   1638 					   raidPtr,"raid_copyback");
   1639 		return (retcode);
   1640 
   1641 		/* return the percentage completion of reconstruction */
   1642 	case RAIDFRAME_CHECK_RECON_STATUS:
   1643 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1644 			/* This makes no sense on a RAID 0, so tell the
   1645 			   user it's done. */
   1646 			*(int *) data = 100;
   1647 			return(0);
   1648 		}
   1649 		if (raidPtr->status != rf_rs_reconstructing)
   1650 			*(int *) data = 100;
   1651 		else {
   1652 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1653 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1654 			} else {
   1655 				*(int *) data = 0;
   1656 			}
   1657 		}
   1658 		return (0);
   1659 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1660 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1661 		if (raidPtr->status != rf_rs_reconstructing) {
   1662 			progressInfo.remaining = 0;
   1663 			progressInfo.completed = 100;
   1664 			progressInfo.total = 100;
   1665 		} else {
   1666 			progressInfo.total =
   1667 				raidPtr->reconControl->numRUsTotal;
   1668 			progressInfo.completed =
   1669 				raidPtr->reconControl->numRUsComplete;
   1670 			progressInfo.remaining = progressInfo.total -
   1671 				progressInfo.completed;
   1672 		}
   1673 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1674 				  sizeof(RF_ProgressInfo_t));
   1675 		return (retcode);
   1676 
   1677 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1678 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1679 			/* This makes no sense on a RAID 0, so tell the
   1680 			   user it's done. */
   1681 			*(int *) data = 100;
   1682 			return(0);
   1683 		}
   1684 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1685 			*(int *) data = 100 *
   1686 				raidPtr->parity_rewrite_stripes_done /
   1687 				raidPtr->Layout.numStripe;
   1688 		} else {
   1689 			*(int *) data = 100;
   1690 		}
   1691 		return (0);
   1692 
   1693 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1694 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1695 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1696 			progressInfo.total = raidPtr->Layout.numStripe;
   1697 			progressInfo.completed =
   1698 				raidPtr->parity_rewrite_stripes_done;
   1699 			progressInfo.remaining = progressInfo.total -
   1700 				progressInfo.completed;
   1701 		} else {
   1702 			progressInfo.remaining = 0;
   1703 			progressInfo.completed = 100;
   1704 			progressInfo.total = 100;
   1705 		}
   1706 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1707 				  sizeof(RF_ProgressInfo_t));
   1708 		return (retcode);
   1709 
   1710 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1711 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1712 			/* This makes no sense on a RAID 0 */
   1713 			*(int *) data = 100;
   1714 			return(0);
   1715 		}
   1716 		if (raidPtr->copyback_in_progress == 1) {
   1717 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1718 				raidPtr->Layout.numStripe;
   1719 		} else {
   1720 			*(int *) data = 100;
   1721 		}
   1722 		return (0);
   1723 
   1724 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1725 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1726 		if (raidPtr->copyback_in_progress == 1) {
   1727 			progressInfo.total = raidPtr->Layout.numStripe;
   1728 			progressInfo.completed =
   1729 				raidPtr->copyback_stripes_done;
   1730 			progressInfo.remaining = progressInfo.total -
   1731 				progressInfo.completed;
   1732 		} else {
   1733 			progressInfo.remaining = 0;
   1734 			progressInfo.completed = 100;
   1735 			progressInfo.total = 100;
   1736 		}
   1737 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1738 				  sizeof(RF_ProgressInfo_t));
   1739 		return (retcode);
   1740 
   1741 		/* the sparetable daemon calls this to wait for the kernel to
   1742 		 * need a spare table. this ioctl does not return until a
   1743 		 * spare table is needed. XXX -- calling mpsleep here in the
   1744 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1745 		 * -- I should either compute the spare table in the kernel,
   1746 		 * or have a different -- XXX XXX -- interface (a different
   1747 		 * character device) for delivering the table     -- XXX */
   1748 #if 0
   1749 	case RAIDFRAME_SPARET_WAIT:
   1750 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1751 		while (!rf_sparet_wait_queue)
   1752 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1753 		waitreq = rf_sparet_wait_queue;
   1754 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1755 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1756 
   1757 		/* structure assignment */
   1758 		*((RF_SparetWait_t *) data) = *waitreq;
   1759 
   1760 		RF_Free(waitreq, sizeof(*waitreq));
   1761 		return (0);
   1762 
   1763 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1764 		 * code in it that will cause the dameon to exit */
   1765 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1766 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1767 		waitreq->fcol = -1;
   1768 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1769 		waitreq->next = rf_sparet_wait_queue;
   1770 		rf_sparet_wait_queue = waitreq;
   1771 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1772 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1773 		return (0);
   1774 
   1775 		/* used by the spare table daemon to deliver a spare table
   1776 		 * into the kernel */
   1777 	case RAIDFRAME_SEND_SPARET:
   1778 
   1779 		/* install the spare table */
   1780 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1781 
   1782 		/* respond to the requestor.  the return status of the spare
   1783 		 * table installation is passed in the "fcol" field */
   1784 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1785 		waitreq->fcol = retcode;
   1786 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1787 		waitreq->next = rf_sparet_resp_queue;
   1788 		rf_sparet_resp_queue = waitreq;
   1789 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1790 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1791 
   1792 		return (retcode);
   1793 #endif
   1794 
   1795 	default:
   1796 		break; /* fall through to the os-specific code below */
   1797 
   1798 	}
   1799 
   1800 	if (!raidPtr->valid)
   1801 		return (EINVAL);
   1802 
   1803 	/*
   1804 	 * Add support for "regular" device ioctls here.
   1805 	 */
   1806 
   1807 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1808 	if (error != EPASSTHROUGH)
   1809 		return (error);
   1810 
   1811 	switch (cmd) {
   1812 	case DIOCGDINFO:
   1813 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1814 		break;
   1815 #ifdef __HAVE_OLD_DISKLABEL
   1816 	case ODIOCGDINFO:
   1817 		newlabel = *(rs->sc_dkdev.dk_label);
   1818 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1819 			return ENOTTY;
   1820 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1821 		break;
   1822 #endif
   1823 
   1824 	case DIOCGPART:
   1825 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1826 		((struct partinfo *) data)->part =
   1827 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1828 		break;
   1829 
   1830 	case DIOCWDINFO:
   1831 	case DIOCSDINFO:
   1832 #ifdef __HAVE_OLD_DISKLABEL
   1833 	case ODIOCWDINFO:
   1834 	case ODIOCSDINFO:
   1835 #endif
   1836 	{
   1837 		struct disklabel *lp;
   1838 #ifdef __HAVE_OLD_DISKLABEL
   1839 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1840 			memset(&newlabel, 0, sizeof newlabel);
   1841 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1842 			lp = &newlabel;
   1843 		} else
   1844 #endif
   1845 		lp = (struct disklabel *)data;
   1846 
   1847 		if ((error = raidlock(rs)) != 0)
   1848 			return (error);
   1849 
   1850 		rs->sc_flags |= RAIDF_LABELLING;
   1851 
   1852 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1853 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1854 		if (error == 0) {
   1855 			if (cmd == DIOCWDINFO
   1856 #ifdef __HAVE_OLD_DISKLABEL
   1857 			    || cmd == ODIOCWDINFO
   1858 #endif
   1859 			   )
   1860 				error = writedisklabel(RAIDLABELDEV(dev),
   1861 				    raidstrategy, rs->sc_dkdev.dk_label,
   1862 				    rs->sc_dkdev.dk_cpulabel);
   1863 		}
   1864 		rs->sc_flags &= ~RAIDF_LABELLING;
   1865 
   1866 		raidunlock(rs);
   1867 
   1868 		if (error)
   1869 			return (error);
   1870 		break;
   1871 	}
   1872 
   1873 	case DIOCWLABEL:
   1874 		if (*(int *) data != 0)
   1875 			rs->sc_flags |= RAIDF_WLABEL;
   1876 		else
   1877 			rs->sc_flags &= ~RAIDF_WLABEL;
   1878 		break;
   1879 
   1880 	case DIOCGDEFLABEL:
   1881 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1882 		break;
   1883 
   1884 #ifdef __HAVE_OLD_DISKLABEL
   1885 	case ODIOCGDEFLABEL:
   1886 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1887 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1888 			return ENOTTY;
   1889 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1890 		break;
   1891 #endif
   1892 
   1893 	case DIOCAWEDGE:
   1894 	case DIOCDWEDGE:
   1895 	    	dkw = (void *)data;
   1896 
   1897 		/* If the ioctl happens here, the parent is us. */
   1898 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1899 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1900 
   1901 	case DIOCLWEDGES:
   1902 		return dkwedge_list(&rs->sc_dkdev,
   1903 		    (struct dkwedge_list *)data, l);
   1904 	case DIOCCACHESYNC:
   1905 		return rf_sync_component_caches(raidPtr);
   1906 
   1907 	case DIOCGSTRATEGY:
   1908 	    {
   1909 		struct disk_strategy *dks = (void *)data;
   1910 
   1911 		s = splbio();
   1912 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1913 		    sizeof(dks->dks_name));
   1914 		splx(s);
   1915 		dks->dks_paramlen = 0;
   1916 
   1917 		return 0;
   1918 	    }
   1919 
   1920 	case DIOCSSTRATEGY:
   1921 	    {
   1922 		struct disk_strategy *dks = (void *)data;
   1923 		struct bufq_state *new;
   1924 		struct bufq_state *old;
   1925 
   1926 		if (dks->dks_param != NULL) {
   1927 			return EINVAL;
   1928 		}
   1929 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1930 		error = bufq_alloc(&new, dks->dks_name,
   1931 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1932 		if (error) {
   1933 			return error;
   1934 		}
   1935 		s = splbio();
   1936 		old = rs->buf_queue;
   1937 		bufq_move(new, old);
   1938 		rs->buf_queue = new;
   1939 		splx(s);
   1940 		bufq_free(old);
   1941 
   1942 		return 0;
   1943 	    }
   1944 
   1945 	default:
   1946 		retcode = ENOTTY;
   1947 	}
   1948 	return (retcode);
   1949 
   1950 }
   1951 
   1952 
   1953 /* raidinit -- complete the rest of the initialization for the
   1954    RAIDframe device.  */
   1955 
   1956 
   1957 static void
   1958 raidinit(struct raid_softc *rs)
   1959 {
   1960 	cfdata_t cf;
   1961 	int     unit;
   1962 	RF_Raid_t *raidPtr = &rs->sc_r;
   1963 
   1964 	unit = raidPtr->raidid;
   1965 
   1966 
   1967 	/* XXX should check return code first... */
   1968 	rs->sc_flags |= RAIDF_INITED;
   1969 
   1970 	/* XXX doesn't check bounds. */
   1971 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1972 
   1973 	/* attach the pseudo device */
   1974 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1975 	cf->cf_name = raid_cd.cd_name;
   1976 	cf->cf_atname = raid_cd.cd_name;
   1977 	cf->cf_unit = unit;
   1978 	cf->cf_fstate = FSTATE_STAR;
   1979 
   1980 	rs->sc_dev = config_attach_pseudo(cf);
   1981 
   1982 	if (rs->sc_dev == NULL) {
   1983 		printf("raid%d: config_attach_pseudo failed\n",
   1984 		    raidPtr->raidid);
   1985 		rs->sc_flags &= ~RAIDF_INITED;
   1986 		free(cf, M_RAIDFRAME);
   1987 		return;
   1988 	}
   1989 
   1990 	/* disk_attach actually creates space for the CPU disklabel, among
   1991 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1992 	 * with disklabels. */
   1993 
   1994 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1995 	disk_attach(&rs->sc_dkdev);
   1996 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1997 
   1998 	/* XXX There may be a weird interaction here between this, and
   1999 	 * protectedSectors, as used in RAIDframe.  */
   2000 
   2001 	rs->sc_size = raidPtr->totalSectors;
   2002 
   2003 	dkwedge_discover(&rs->sc_dkdev);
   2004 
   2005 	rf_set_geometry(rs, raidPtr);
   2006 
   2007 }
   2008 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2009 /* wake up the daemon & tell it to get us a spare table
   2010  * XXX
   2011  * the entries in the queues should be tagged with the raidPtr
   2012  * so that in the extremely rare case that two recons happen at once,
   2013  * we know for which device were requesting a spare table
   2014  * XXX
   2015  *
   2016  * XXX This code is not currently used. GO
   2017  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post our request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the response; note `req' now points at the daemon's
	 * entry, not the one the caller passed in. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon returns its status in the fcol field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2041 #endif
   2042 
   2043 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2044  * bp & passes it down.
   2045  * any calls originating in the kernel must use non-blocking I/O
   2046  * do some extra sanity checking to return "appropriate" error values for
   2047  * certain conditions (to make some standard utilities work)
   2048  *
   2049  * Formerly known as: rf_DoAccessKernel
   2050  */
/*
 * Drain requests from rs->buf_queue and hand each one to rf_DoAccess()
 * until the queue empties or the array runs out of openings.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* component label updates are done with the mutex dropped */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Invariant: raidPtr->mutex is held at the top of every iteration
	 * and on loop exit. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector, if any */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject I/O past the end of the array; the `sum <' checks
		 * catch wraparound in the addition above. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that are not whole sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Consume one opening for this request. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* Submission failed outright: fail the buffer here;
			 * otherwise completion happens via the iodone path. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2168 
   2169 
   2170 
   2171 
   2172 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2173 
/*
 * Dispatch one RAIDframe disk-queue request to its component device.
 * Called with the disk queue locked (see header comment above); always
 * returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete immediately; no hardware I/O is issued. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Point bp at the component device; completion arrives
		 * via KernelWakeupFunc() as the b_iodone callback. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2247 /* this is the callback function associated with a I/O invoked from
   2248    kernel code.
   2249  */
/*
 * b_iodone callback for component I/O issued by rf_DispatchKernelIO().
 * Records the error status, marks a failing component dead when doing so
 * does not exceed the layout's fault tolerance, and queues the request
 * on the raidio thread's "finished" list.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private at dispatch. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2315 
   2316 
   2317 /*
   2318  * initialize a buf structure for doing an I/O in the kernel.
   2319  */
   2320 static void
   2321 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2322        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2323        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2324        struct proc *b_proc)
   2325 {
   2326 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2327 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2328 	bp->b_oflags = 0;
   2329 	bp->b_cflags = 0;
   2330 	bp->b_bcount = numSect << logBytesPerSector;
   2331 	bp->b_bufsize = bp->b_bcount;
   2332 	bp->b_error = 0;
   2333 	bp->b_dev = dev;
   2334 	bp->b_data = bf;
   2335 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2336 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2337 	if (bp->b_bcount == 0) {
   2338 		panic("bp->b_bcount is zero in InitBP!!");
   2339 	}
   2340 	bp->b_proc = b_proc;
   2341 	bp->b_iodone = cbFunc;
   2342 	bp->b_private = cbArg;
   2343 }
   2344 
   2345 static void
   2346 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2347 		    struct disklabel *lp)
   2348 {
   2349 	memset(lp, 0, sizeof(*lp));
   2350 
   2351 	/* fabricate a label... */
   2352 	lp->d_secperunit = raidPtr->totalSectors;
   2353 	lp->d_secsize = raidPtr->bytesPerSector;
   2354 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2355 	lp->d_ntracks = 4 * raidPtr->numCol;
   2356 	lp->d_ncylinders = raidPtr->totalSectors /
   2357 		(lp->d_nsectors * lp->d_ntracks);
   2358 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2359 
   2360 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2361 	lp->d_type = DTYPE_RAID;
   2362 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2363 	lp->d_rpm = 3600;
   2364 	lp->d_interleave = 1;
   2365 	lp->d_flags = 0;
   2366 
   2367 	lp->d_partitions[RAW_PART].p_offset = 0;
   2368 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2369 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2370 	lp->d_npartitions = RAW_PART + 1;
   2371 
   2372 	lp->d_magic = DISKMAGIC;
   2373 	lp->d_magic2 = DISKMAGIC;
   2374 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2375 
   2376 }
   2377 /*
   2378  * Read the disklabel from the raid device.  If one is not present, fake one
   2379  * up.
   2380  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* Start from a fabricated default in case no label is on disk. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* No usable on-disk label: synthesize one. */
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		/* Warn (but don't reject) partitions extending past the
		 * end of the array. */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
   2439 /*
   2440  * Take care of things one might want to take care of in the event
   2441  * that a disklabel isn't present.
   2442  */
   2443 static void
   2444 raidmakedisklabel(struct raid_softc *rs)
   2445 {
   2446 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2447 	db1_printf(("Making a label..\n"));
   2448 
   2449 	/*
   2450 	 * For historical reasons, if there's no disklabel present
   2451 	 * the raw partition must be marked FS_BSDFFS.
   2452 	 */
   2453 
   2454 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2455 
   2456 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2457 
   2458 	lp->d_checksum = dkcksum(lp);
   2459 }
   2460 /*
   2461  * Wait interruptibly for an exclusive lock.
   2462  *
   2463  * XXX
   2464  * Several drivers do this; it should be abstracted and made MP-safe.
   2465  * (Hmm... where have we seen this warning before :->  GO )
   2466  */
   2467 static int
   2468 raidlock(struct raid_softc *rs)
   2469 {
   2470 	int     error;
   2471 
   2472 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2473 		rs->sc_flags |= RAIDF_WANTED;
   2474 		if ((error =
   2475 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2476 			return (error);
   2477 	}
   2478 	rs->sc_flags |= RAIDF_LOCKED;
   2479 	return (0);
   2480 }
   2481 /*
   2482  * Unlock and wake up any waiters.
   2483  */
   2484 static void
   2485 raidunlock(struct raid_softc *rs)
   2486 {
   2487 
   2488 	rs->sc_flags &= ~RAIDF_LOCKED;
   2489 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2490 		rs->sc_flags &= ~RAIDF_WANTED;
   2491 		wakeup(rs);
   2492 	}
   2493 }
   2494 
   2495 
   2496 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2497 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2498 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2499 
/* Fixed byte offset of the component info (label) area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2506 
   2507 static daddr_t
   2508 rf_component_info_size(unsigned secsize)
   2509 {
   2510 	daddr_t info_size;
   2511 
   2512 	KASSERT(secsize);
   2513 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2514 		info_size = secsize;
   2515 	else
   2516 		info_size = RF_COMPONENT_INFO_SIZE;
   2517 
   2518 	return info_size;
   2519 }
   2520 
   2521 static daddr_t
   2522 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2523 {
   2524 	daddr_t map_offset;
   2525 
   2526 	KASSERT(raidPtr->bytesPerSector);
   2527 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2528 		map_offset = raidPtr->bytesPerSector;
   2529 	else
   2530 		map_offset = RF_COMPONENT_INFO_SIZE;
   2531 	map_offset += rf_component_info_offset();
   2532 
   2533 	return map_offset;
   2534 }
   2535 
   2536 static daddr_t
   2537 rf_parity_map_size(RF_Raid_t *raidPtr)
   2538 {
   2539 	daddr_t map_size;
   2540 
   2541 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2542 		map_size = raidPtr->bytesPerSector;
   2543 	else
   2544 		map_size = RF_PARITY_MAP_SIZE;
   2545 
   2546 	return map_size;
   2547 }
   2548 
   2549 int
   2550 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2551 {
   2552 	RF_ComponentLabel_t *clabel;
   2553 
   2554 	clabel = raidget_component_label(raidPtr, col);
   2555 	clabel->clean = RF_RAID_CLEAN;
   2556 	raidflush_component_label(raidPtr, col);
   2557 	return(0);
   2558 }
   2559 
   2560 
   2561 int
   2562 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2563 {
   2564 	RF_ComponentLabel_t *clabel;
   2565 
   2566 	clabel = raidget_component_label(raidPtr, col);
   2567 	clabel->clean = RF_RAID_DIRTY;
   2568 	raidflush_component_label(raidPtr, col);
   2569 	return(0);
   2570 }
   2571 
/*
 * Re-read the on-disk component label for column `col' into the in-core
 * copy.  Returns 0 or an errno from the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2581 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2587 
/*
 * Stamp the in-core label for column `col' with the array's current
 * mod_counter and write it synchronously to that component.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* Keep the parity-map generation in lockstep with the label. */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2602 
   2603 
/*
 * Read one component's label from its fixed on-disk location into
 * *clabel.  Thin wrapper around raidread_component_area().
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2613 
   2614 /* ARGSUSED */
   2615 static int
   2616 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2617     size_t msize, daddr_t offset, daddr_t dsize)
   2618 {
   2619 	struct buf *bp;
   2620 	const struct bdevsw *bdev;
   2621 	int error;
   2622 
   2623 	/* XXX should probably ensure that we don't try to do this if
   2624 	   someone has changed rf_protected_sectors. */
   2625 
   2626 	if (b_vp == NULL) {
   2627 		/* For whatever reason, this component is not valid.
   2628 		   Don't try to read a component label from it. */
   2629 		return(EINVAL);
   2630 	}
   2631 
   2632 	/* get a block of the appropriate size... */
   2633 	bp = geteblk((int)dsize);
   2634 	bp->b_dev = dev;
   2635 
   2636 	/* get our ducks in a row for the read */
   2637 	bp->b_blkno = offset / DEV_BSIZE;
   2638 	bp->b_bcount = dsize;
   2639 	bp->b_flags |= B_READ;
   2640  	bp->b_resid = dsize;
   2641 
   2642 	bdev = bdevsw_lookup(bp->b_dev);
   2643 	if (bdev == NULL)
   2644 		return (ENXIO);
   2645 	(*bdev->d_strategy)(bp);
   2646 
   2647 	error = biowait(bp);
   2648 
   2649 	if (!error) {
   2650 		memcpy(data, bp->b_data, msize);
   2651 	}
   2652 
   2653 	brelse(bp, 0);
   2654 	return(error);
   2655 }
   2656 
   2657 
/*
 * Write *clabel to one component's fixed on-disk label location,
 * synchronously.  Thin wrapper around raidwrite_component_area().
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2667 
   2668 /* ARGSUSED */
   2669 static int
   2670 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2671     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2672 {
   2673 	struct buf *bp;
   2674 	const struct bdevsw *bdev;
   2675 	int error;
   2676 
   2677 	/* get a block of the appropriate size... */
   2678 	bp = geteblk((int)dsize);
   2679 	bp->b_dev = dev;
   2680 
   2681 	/* get our ducks in a row for the write */
   2682 	bp->b_blkno = offset / DEV_BSIZE;
   2683 	bp->b_bcount = dsize;
   2684 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2685  	bp->b_resid = dsize;
   2686 
   2687 	memset(bp->b_data, 0, dsize);
   2688 	memcpy(bp->b_data, data, msize);
   2689 
   2690 	bdev = bdevsw_lookup(bp->b_dev);
   2691 	if (bdev == NULL)
   2692 		return (ENXIO);
   2693 	(*bdev->d_strategy)(bp);
   2694 	if (asyncp)
   2695 		return 0;
   2696 	error = biowait(bp);
   2697 	brelse(bp, 0);
   2698 	if (error) {
   2699 #if 1
   2700 		printf("Failed to write RAID component info!\n");
   2701 #endif
   2702 	}
   2703 
   2704 	return(error);
   2705 }
   2706 
/*
 * Write the on-disk parity map image *map to every live component.
 * Write errors from individual components are not reported (see XXXjld).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2724 
/*
 * Read the parity map from every live component and combine the copies
 * into *map via rf_paritymap_merge().  The first live component's map
 * seeds the result; subsequent ones are merged in.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2749 
/*
 * Bump the array's mod_counter and mark every usable component (and
 * every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Now handle the in-use spares: each one takes on the identity
	 * (column) of the disk it replaced. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this spare,
			   scol keeps its previous value (initially -1) and
			   is written to clabel->column below -- presumably
			   that cannot happen for rf_ds_used_spare; verify. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2809 
   2810 
/*
 * Write fresh component labels to every optimal component and every
 * in-use spare.  The mod_counter is bumped first so the new labels
 * supersede any stale ones at autoconfiguration time.  When "final"
 * is RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels
 * are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2885 
   2886 void
   2887 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2888 {
   2889 
   2890 	if (vp != NULL) {
   2891 		if (auto_configured == 1) {
   2892 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2893 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2894 			vput(vp);
   2895 
   2896 		} else {
   2897 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2898 		}
   2899 	}
   2900 }
   2901 
   2902 
   2903 void
   2904 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2905 {
   2906 	int r,c;
   2907 	struct vnode *vp;
   2908 	int acd;
   2909 
   2910 
   2911 	/* We take this opportunity to close the vnodes like we should.. */
   2912 
   2913 	for (c = 0; c < raidPtr->numCol; c++) {
   2914 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2915 		acd = raidPtr->Disks[c].auto_configured;
   2916 		rf_close_component(raidPtr, vp, acd);
   2917 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2918 		raidPtr->Disks[c].auto_configured = 0;
   2919 	}
   2920 
   2921 	for (r = 0; r < raidPtr->numSpare; r++) {
   2922 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2923 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2924 		rf_close_component(raidPtr, vp, acd);
   2925 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2926 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2927 	}
   2928 }
   2929 
   2930 
   2931 void
   2932 rf_ReconThread(struct rf_recon_req *req)
   2933 {
   2934 	int     s;
   2935 	RF_Raid_t *raidPtr;
   2936 
   2937 	s = splbio();
   2938 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2939 	raidPtr->recon_in_progress = 1;
   2940 
   2941 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2942 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2943 
   2944 	RF_Free(req, sizeof(*req));
   2945 
   2946 	raidPtr->recon_in_progress = 0;
   2947 	splx(s);
   2948 
   2949 	/* That's all... */
   2950 	kthread_exit(0);	/* does not return */
   2951 }
   2952 
   2953 void
   2954 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2955 {
   2956 	int retcode;
   2957 	int s;
   2958 
   2959 	raidPtr->parity_rewrite_stripes_done = 0;
   2960 	raidPtr->parity_rewrite_in_progress = 1;
   2961 	s = splbio();
   2962 	retcode = rf_RewriteParity(raidPtr);
   2963 	splx(s);
   2964 	if (retcode) {
   2965 		printf("raid%d: Error re-writing parity (%d)!\n",
   2966 		    raidPtr->raidid, retcode);
   2967 	} else {
   2968 		/* set the clean bit!  If we shutdown correctly,
   2969 		   the clean bit on each component label will get
   2970 		   set */
   2971 		raidPtr->parity_good = RF_RAID_CLEAN;
   2972 	}
   2973 	raidPtr->parity_rewrite_in_progress = 0;
   2974 
   2975 	/* Anyone waiting for us to stop?  If so, inform them... */
   2976 	if (raidPtr->waitShutdown) {
   2977 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2978 	}
   2979 
   2980 	/* That's all... */
   2981 	kthread_exit(0);	/* does not return */
   2982 }
   2983 
   2984 
   2985 void
   2986 rf_CopybackThread(RF_Raid_t *raidPtr)
   2987 {
   2988 	int s;
   2989 
   2990 	raidPtr->copyback_in_progress = 1;
   2991 	s = splbio();
   2992 	rf_CopybackReconstructedData(raidPtr);
   2993 	splx(s);
   2994 	raidPtr->copyback_in_progress = 0;
   2995 
   2996 	/* That's all... */
   2997 	kthread_exit(0);	/* does not return */
   2998 }
   2999 
   3000 
   3001 void
   3002 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3003 {
   3004 	int s;
   3005 	RF_Raid_t *raidPtr;
   3006 
   3007 	s = splbio();
   3008 	raidPtr = req->raidPtr;
   3009 	raidPtr->recon_in_progress = 1;
   3010 	rf_ReconstructInPlace(raidPtr, req->col);
   3011 	RF_Free(req, sizeof(*req));
   3012 	raidPtr->recon_in_progress = 0;
   3013 	splx(s);
   3014 
   3015 	/* That's all... */
   3016 	kthread_exit(0);	/* does not return */
   3017 }
   3018 
/*
 * Read the component label from (dev, vp) and, if it looks like a
 * valid RAIDframe component, prepend a new RF_AutoConfig_t for it to
 * ac_list.  If the label is unreadable or unreasonable the vnode is
 * closed and released here.  On memory exhaustion the entire ac_list
 * is torn down and NULL is returned.  Returns the (possibly updated)
 * list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
		/* Out of memory: free every entry (and its label) on the
		   list we were building, then bail out entirely. */
oomem:
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* Not a usable component: drop the label and give the
		   vnode back. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3076 
/*
 * Scan every disk-class device in the system for RAIDframe components:
 * wedges whose partition type is RAIDFRAME, disklabel partitions of
 * type FS_RAID, and -- when neither yields anything -- the raw
 * partition itself.  Candidates are collected (and their labels
 * validated) via rf_get_component().  Returns the resulting
 * RF_AutoConfig_t list, or NULL if nothing was found.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		/* FSILENT: open quietly -- absent units are expected. */
		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedges carry their type in the wedge info, not
			   in a disklabel; check it and hand the whole
			   wedge to rf_get_component(). */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component() takes ownership of vp. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3258 
   3259 
   3260 int
   3261 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3262 {
   3263 
   3264 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3265 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3266 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3267 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3268 	    clabel->row >=0 &&
   3269 	    clabel->column >= 0 &&
   3270 	    clabel->num_rows > 0 &&
   3271 	    clabel->num_columns > 0 &&
   3272 	    clabel->row < clabel->num_rows &&
   3273 	    clabel->column < clabel->num_columns &&
   3274 	    clabel->blockSize > 0 &&
   3275 	    /*
   3276 	     * numBlocksHi may contain garbage, but it is ok since
   3277 	     * the type is unsigned.  If it is really garbage,
   3278 	     * rf_fix_old_label_size() will fix it.
   3279 	     */
   3280 	    rf_component_label_numblocks(clabel) > 0) {
   3281 		/*
   3282 		 * label looks reasonable enough...
   3283 		 * let's make sure it has no old garbage.
   3284 		 */
   3285 		if (numsecs)
   3286 			rf_fix_old_label_size(clabel, numsecs);
   3287 		return(1);
   3288 	}
   3289 	return(0);
   3290 }
   3291 
   3292 
   3293 /*
   3294  * For reasons yet unknown, some old component labels have garbage in
   3295  * the newer numBlocksHi region, and this causes lossage.  Since those
   3296  * disks will also have numsecs set to less than 32 bits of sectors,
   3297  * we can determine when this corruption has occurred, and fix it.
   3298  *
   3299  * The exact same problem, with the same unknown reason, happens to
   3300  * the partitionSizeHi member as well.
   3301  */
   3302 static void
   3303 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3304 {
   3305 
   3306 	if (numsecs < ((uint64_t)1 << 32)) {
   3307 		if (clabel->numBlocksHi) {
   3308 			printf("WARNING: total sectors < 32 bits, yet "
   3309 			       "numBlocksHi set\n"
   3310 			       "WARNING: resetting numBlocksHi to zero.\n");
   3311 			clabel->numBlocksHi = 0;
   3312 		}
   3313 
   3314 		if (clabel->partitionSizeHi) {
   3315 			printf("WARNING: total sectors < 32 bits, yet "
   3316 			       "partitionSizeHi set\n"
   3317 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3318 			clabel->partitionSizeHi = 0;
   3319 		}
   3320 	}
   3321 }
   3322 
   3323 
   3324 #ifdef DEBUG
   3325 void
   3326 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3327 {
   3328 	uint64_t numBlocks;
   3329 	static const char *rp[] = {
   3330 	    "No", "Force", "Soft", "*invalid*"
   3331 	};
   3332 
   3333 
   3334 	numBlocks = rf_component_label_numblocks(clabel);
   3335 
   3336 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3337 	       clabel->row, clabel->column,
   3338 	       clabel->num_rows, clabel->num_columns);
   3339 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3340 	       clabel->version, clabel->serial_number,
   3341 	       clabel->mod_counter);
   3342 	printf("   Clean: %s Status: %d\n",
   3343 	       clabel->clean ? "Yes" : "No", clabel->status);
   3344 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3345 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3346 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3347 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3348 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3349 	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
   3350 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3351 #if 0
   3352 	   printf("   Config order: %d\n", clabel->config_order);
   3353 #endif
   3354 
   3355 }
   3356 #endif
   3357 
   3358 RF_ConfigSet_t *
   3359 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3360 {
   3361 	RF_AutoConfig_t *ac;
   3362 	RF_ConfigSet_t *config_sets;
   3363 	RF_ConfigSet_t *cset;
   3364 	RF_AutoConfig_t *ac_next;
   3365 
   3366 
   3367 	config_sets = NULL;
   3368 
   3369 	/* Go through the AutoConfig list, and figure out which components
   3370 	   belong to what sets.  */
   3371 	ac = ac_list;
   3372 	while(ac!=NULL) {
   3373 		/* we're going to putz with ac->next, so save it here
   3374 		   for use at the end of the loop */
   3375 		ac_next = ac->next;
   3376 
   3377 		if (config_sets == NULL) {
   3378 			/* will need at least this one... */
   3379 			config_sets = (RF_ConfigSet_t *)
   3380 				malloc(sizeof(RF_ConfigSet_t),
   3381 				       M_RAIDFRAME, M_NOWAIT);
   3382 			if (config_sets == NULL) {
   3383 				panic("rf_create_auto_sets: No memory!");
   3384 			}
   3385 			/* this one is easy :) */
   3386 			config_sets->ac = ac;
   3387 			config_sets->next = NULL;
   3388 			config_sets->rootable = 0;
   3389 			ac->next = NULL;
   3390 		} else {
   3391 			/* which set does this component fit into? */
   3392 			cset = config_sets;
   3393 			while(cset!=NULL) {
   3394 				if (rf_does_it_fit(cset, ac)) {
   3395 					/* looks like it matches... */
   3396 					ac->next = cset->ac;
   3397 					cset->ac = ac;
   3398 					break;
   3399 				}
   3400 				cset = cset->next;
   3401 			}
   3402 			if (cset==NULL) {
   3403 				/* didn't find a match above... new set..*/
   3404 				cset = (RF_ConfigSet_t *)
   3405 					malloc(sizeof(RF_ConfigSet_t),
   3406 					       M_RAIDFRAME, M_NOWAIT);
   3407 				if (cset == NULL) {
   3408 					panic("rf_create_auto_sets: No memory!");
   3409 				}
   3410 				cset->ac = ac;
   3411 				ac->next = NULL;
   3412 				cset->next = config_sets;
   3413 				cset->rootable = 0;
   3414 				config_sets = cset;
   3415 			}
   3416 		}
   3417 		ac = ac_next;
   3418 	}
   3419 
   3420 
   3421 	return(config_sets);
   3422 }
   3423 
   3424 static int
   3425 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3426 {
   3427 	RF_ComponentLabel_t *clabel1, *clabel2;
   3428 
   3429 	/* If this one matches the *first* one in the set, that's good
   3430 	   enough, since the other members of the set would have been
   3431 	   through here too... */
   3432 	/* note that we are not checking partitionSize here..
   3433 
   3434 	   Note that we are also not checking the mod_counters here.
   3435 	   If everything else matches except the mod_counter, that's
   3436 	   good enough for this test.  We will deal with the mod_counters
   3437 	   a little later in the autoconfiguration process.
   3438 
   3439 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3440 
   3441 	   The reason we don't check for this is that failed disks
   3442 	   will have lower modification counts.  If those disks are
   3443 	   not added to the set they used to belong to, then they will
   3444 	   form their own set, which may result in 2 different sets,
   3445 	   for example, competing to be configured at raid0, and
   3446 	   perhaps competing to be the root filesystem set.  If the
   3447 	   wrong ones get configured, or both attempt to become /,
   3448 	   weird behaviour and or serious lossage will occur.  Thus we
   3449 	   need to bring them into the fold here, and kick them out at
   3450 	   a later point.
   3451 
   3452 	*/
   3453 
   3454 	clabel1 = cset->ac->clabel;
   3455 	clabel2 = ac->clabel;
   3456 	if ((clabel1->version == clabel2->version) &&
   3457 	    (clabel1->serial_number == clabel2->serial_number) &&
   3458 	    (clabel1->num_rows == clabel2->num_rows) &&
   3459 	    (clabel1->num_columns == clabel2->num_columns) &&
   3460 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3461 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3462 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3463 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3464 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3465 	    (clabel1->blockSize == clabel2->blockSize) &&
   3466 	    rf_component_label_numblocks(clabel1) ==
   3467 	    rf_component_label_numblocks(clabel2) &&
   3468 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3469 	    (clabel1->root_partition == clabel2->root_partition) &&
   3470 	    (clabel1->last_unit == clabel2->last_unit) &&
   3471 	    (clabel1->config_order == clabel2->config_order)) {
   3472 		/* if it get's here, it almost *has* to be a match */
   3473 	} else {
   3474 		/* it's not consistent with somebody in the set..
   3475 		   punt */
   3476 		return(0);
   3477 	}
   3478 	/* all was fine.. it must fit... */
   3479 	return(1);
   3480 }
   3481 
/*
 * Decide whether this configuration set has enough live components to
 * be configured.  The authoritative mod_counter is the highest one in
 * the set; only components carrying it count as present.  RAID 1 is
 * special-cased: components pair up (even, odd), and the set is only
 * rejected when both halves of a pair are missing.  For RAID 0/4/5
 * the usual missing-component limits apply.  Returns 1 if the set is
 * configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a component at column c with the current
		   mod_counter. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3584 
   3585 void
   3586 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3587 			RF_Raid_t *raidPtr)
   3588 {
   3589 	RF_ComponentLabel_t *clabel;
   3590 	int i;
   3591 
   3592 	clabel = ac->clabel;
   3593 
   3594 	/* 1. Fill in the common stuff */
   3595 	config->numRow = clabel->num_rows = 1;
   3596 	config->numCol = clabel->num_columns;
   3597 	config->numSpare = 0; /* XXX should this be set here? */
   3598 	config->sectPerSU = clabel->sectPerSU;
   3599 	config->SUsPerPU = clabel->SUsPerPU;
   3600 	config->SUsPerRU = clabel->SUsPerRU;
   3601 	config->parityConfig = clabel->parityConfig;
   3602 	/* XXX... */
   3603 	strcpy(config->diskQueueType,"fifo");
   3604 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3605 	config->layoutSpecificSize = 0; /* XXX ?? */
   3606 
   3607 	while(ac!=NULL) {
   3608 		/* row/col values will be in range due to the checks
   3609 		   in reasonable_label() */
   3610 		strcpy(config->devnames[0][ac->clabel->column],
   3611 		       ac->devname);
   3612 		ac = ac->next;
   3613 	}
   3614 
   3615 	for(i=0;i<RF_MAXDBGV;i++) {
   3616 		config->debugVars[i][0] = 0;
   3617 	}
   3618 }
   3619 
   3620 int
   3621 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3622 {
   3623 	RF_ComponentLabel_t *clabel;
   3624 	int column;
   3625 	int sparecol;
   3626 
   3627 	raidPtr->autoconfigure = new_value;
   3628 
   3629 	for(column=0; column<raidPtr->numCol; column++) {
   3630 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3631 			clabel = raidget_component_label(raidPtr, column);
   3632 			clabel->autoconfigure = new_value;
   3633 			raidflush_component_label(raidPtr, column);
   3634 		}
   3635 	}
   3636 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3637 		sparecol = raidPtr->numCol + column;
   3638 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3639 			clabel = raidget_component_label(raidPtr, sparecol);
   3640 			clabel->autoconfigure = new_value;
   3641 			raidflush_component_label(raidPtr, sparecol);
   3642 		}
   3643 	}
   3644 	return(new_value);
   3645 }
   3646 
   3647 int
   3648 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3649 {
   3650 	RF_ComponentLabel_t *clabel;
   3651 	int column;
   3652 	int sparecol;
   3653 
   3654 	raidPtr->root_partition = new_value;
   3655 	for(column=0; column<raidPtr->numCol; column++) {
   3656 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3657 			clabel = raidget_component_label(raidPtr, column);
   3658 			clabel->root_partition = new_value;
   3659 			raidflush_component_label(raidPtr, column);
   3660 		}
   3661 	}
   3662 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3663 		sparecol = raidPtr->numCol + column;
   3664 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3665 			clabel = raidget_component_label(raidPtr, sparecol);
   3666 			clabel->root_partition = new_value;
   3667 			raidflush_component_label(raidPtr, sparecol);
   3668 		}
   3669 	}
   3670 	return(new_value);
   3671 }
   3672 
   3673 void
   3674 rf_release_all_vps(RF_ConfigSet_t *cset)
   3675 {
   3676 	RF_AutoConfig_t *ac;
   3677 
   3678 	ac = cset->ac;
   3679 	while(ac!=NULL) {
   3680 		/* Close the vp, and give it back */
   3681 		if (ac->vp) {
   3682 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3683 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3684 			vput(ac->vp);
   3685 			ac->vp = NULL;
   3686 		}
   3687 		ac = ac->next;
   3688 	}
   3689 }
   3690 
   3691 
   3692 void
   3693 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3694 {
   3695 	RF_AutoConfig_t *ac;
   3696 	RF_AutoConfig_t *next_ac;
   3697 
   3698 	ac = cset->ac;
   3699 	while(ac!=NULL) {
   3700 		next_ac = ac->next;
   3701 		/* nuke the label */
   3702 		free(ac->clabel, M_RAIDFRAME);
   3703 		/* cleanup the config structure */
   3704 		free(ac, M_RAIDFRAME);
   3705 		/* "next.." */
   3706 		ac = next_ac;
   3707 	}
   3708 	/* and, finally, nuke the config set */
   3709 	free(cset, M_RAIDFRAME);
   3710 }
   3711 
   3712 
   3713 void
   3714 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3715 {
   3716 	/* current version number */
   3717 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3718 	clabel->serial_number = raidPtr->serial_number;
   3719 	clabel->mod_counter = raidPtr->mod_counter;
   3720 
   3721 	clabel->num_rows = 1;
   3722 	clabel->num_columns = raidPtr->numCol;
   3723 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3724 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3725 
   3726 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3727 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3728 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3729 
   3730 	clabel->blockSize = raidPtr->bytesPerSector;
   3731 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3732 
   3733 	/* XXX not portable */
   3734 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3735 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3736 	clabel->autoconfigure = raidPtr->autoconfigure;
   3737 	clabel->root_partition = raidPtr->root_partition;
   3738 	clabel->last_unit = raidPtr->raidid;
   3739 	clabel->config_order = raidPtr->config_order;
   3740 
   3741 #ifndef RF_NO_PARITY_MAP
   3742 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3743 #endif
   3744 }
   3745 
/*
 * Bring up a RAID set from an auto-configuration set: build an
 * RF_Config_t from the component labels, pick a unit number, and run
 * the normal configuration path.  Returns the configured softc, or
 * NULL if the config allocation or rf_Configure() fails.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/*
	 * Start from the unit recorded in the label and walk upward
	 * until we find a softc that is not already a valid (active)
	 * set.  NOTE(review): raidget() is assumed never to return
	 * NULL here — confirm against its definition before relying
	 * on this loop.
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		/* parity state is unknown until checked; mark it dirty */
		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3819 
   3820 void
   3821 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3822 {
   3823 	struct buf *bp;
   3824 	struct raid_softc *rs;
   3825 
   3826 	bp = (struct buf *)desc->bp;
   3827 	rs = desc->raidPtr->softc;
   3828 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3829 	    (bp->b_flags & B_READ));
   3830 }
   3831 
/*
 * Initialize a pool(9) used by RAIDframe and pre-set its watermarks:
 * cap it at xmax items, pre-allocate and keep at least xmin items so
 * allocations at IPL_BIO are less likely to fail.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);	/* pre-allocate xmin items up front */
	pool_setlowat(p, xmin);
}
   3841 
   3842 /*
   3843  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3844  * if there is IO pending and if that IO could possibly be done for a
   3845  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3846  * otherwise.
   3847  *
   3848  */
   3849 
   3850 int
   3851 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3852 {
   3853 	struct raid_softc *rs = raidPtr->softc;
   3854 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3855 		/* there is work to do */
   3856 		return 0;
   3857 	}
   3858 	/* default is nothing to do */
   3859 	return 1;
   3860 }
   3861 
   3862 int
   3863 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3864 {
   3865 	uint64_t numsecs;
   3866 	unsigned secsize;
   3867 	int error;
   3868 
   3869 	error = getdisksize(vp, &numsecs, &secsize);
   3870 	if (error == 0) {
   3871 		diskPtr->blockSize = secsize;
   3872 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3873 		diskPtr->partitionSize = numsecs;
   3874 		return 0;
   3875 	}
   3876 	return error;
   3877 }
   3878 
/*
 * autoconf(9) match: the raid pseudo-device always attaches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3884 
/*
 * autoconf(9) attach: nothing to do here; real setup happens when the
 * set is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3890 
   3891 
   3892 static int
   3893 raid_detach(device_t self, int flags)
   3894 {
   3895 	int error;
   3896 	struct raid_softc *rs = raidget(device_unit(self));
   3897 
   3898 	if (rs == NULL)
   3899 		return ENXIO;
   3900 
   3901 	if ((error = raidlock(rs)) != 0)
   3902 		return (error);
   3903 
   3904 	error = raid_detach_unlocked(rs);
   3905 
   3906 	raidunlock(rs);
   3907 
   3908 	/* XXXkd: raidput(rs) ??? */
   3909 
   3910 	return error;
   3911 }
   3912 
/*
 * Publish a disk geometry for the RAID set via disk_set_info(9).
 * The cylinder/track layout is synthetic: sectors-per-track is the
 * data stripe width, and tracks-per-cylinder is fabricated as
 * 4 * numCol (there is no physical geometry for a software RAID).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;	/* arbitrary, see above */

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
   3927 
   3928 /*
   3929  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3930  * We end up returning whatever error was returned by the first cache flush
   3931  * that fails.
   3932  */
   3933 
   3934 int
   3935 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3936 {
   3937 	int c, sparecol;
   3938 	int e,error;
   3939 	int force = 1;
   3940 
   3941 	error = 0;
   3942 	for (c = 0; c < raidPtr->numCol; c++) {
   3943 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3944 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3945 					  &force, FWRITE, NOCRED);
   3946 			if (e) {
   3947 				if (e != ENODEV)
   3948 					printf("raid%d: cache flush to component %s failed.\n",
   3949 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3950 				if (error == 0) {
   3951 					error = e;
   3952 				}
   3953 			}
   3954 		}
   3955 	}
   3956 
   3957 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3958 		sparecol = raidPtr->numCol + c;
   3959 		/* Need to ensure that the reconstruct actually completed! */
   3960 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3961 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3962 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3963 			if (e) {
   3964 				if (e != ENODEV)
   3965 					printf("raid%d: cache flush to component %s failed.\n",
   3966 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3967 				if (error == 0) {
   3968 					error = e;
   3969 				}
   3970 			}
   3971 		}
   3972 	}
   3973 	return error;
   3974 }
   3975