Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.311
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.311 2014/07/25 08:02:20 dholland Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.311 2014/07/25 08:02:20 dholland Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	.d_open = raidopen,
    209 	.d_close = raidclose,
    210 	.d_strategy = raidstrategy,
    211 	.d_ioctl = raidioctl,
    212 	.d_dump = raiddump,
    213 	.d_psize = raidsize,
    214 	.d_discard = nodiscard,
    215 	.d_flag = D_DISK
    216 };
    217 
    218 const struct cdevsw raid_cdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_read = raidread,
    222 	.d_write = raidwrite,
    223 	.d_ioctl = raidioctl,
    224 	.d_stop = nostop,
    225 	.d_tty = notty,
    226 	.d_poll = nopoll,
    227 	.d_mmap = nommap,
    228 	.d_kqfilter = nokqfilter,
    229 	.d_flag = D_DISK
    230 };
    231 
    232 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    233 
    234 struct raid_softc {
    235 	device_t sc_dev;
    236 	int	sc_unit;
    237 	int     sc_flags;	/* flags */
    238 	int     sc_cflags;	/* configuration flags */
    239 	uint64_t sc_size;	/* size of the raid device */
    240 	char    sc_xname[20];	/* XXX external name */
    241 	struct disk sc_dkdev;	/* generic disk device info */
    242 	struct bufq_state *buf_queue;	/* used for the device queue */
    243 	RF_Raid_t sc_r;
    244 	LIST_ENTRY(raid_softc) sc_link;
    245 };
    246 /* sc_flags */
    247 #define RAIDF_INITED	0x01	/* unit has been initialized */
    248 #define RAIDF_WLABEL	0x02	/* label area is writable */
    249 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    250 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    251 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    252 #define RAIDF_LOCKED	0x80	/* unit is locked */
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 
    256 extern struct cfdriver raid_cd;
    257 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    258     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    259     DVF_DETACH_SHUTDOWN);
    260 
    261 /*
    262  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    263  * Be aware that large numbers can allow the driver to consume a lot of
    264  * kernel memory, especially on writes, and in degraded mode reads.
    265  *
    266  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    267  * a single 64K write will typically require 64K for the old data,
    268  * 64K for the old parity, and 64K for the new parity, for a total
    269  * of 192K (if the parity buffer is not re-used immediately).
    270  * Even it if is used immediately, that's still 128K, which when multiplied
    271  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    272  *
    273  * Now in degraded mode, for example, a 64K read on the above setup may
    274  * require data reconstruction, which will require *all* of the 4 remaining
    275  * disks to participate -- 4 * 32K/disk == 128K again.
    276  */
    277 
    278 #ifndef RAIDOUTSTANDING
    279 #define RAIDOUTSTANDING   6
    280 #endif
    281 
    282 #define RAIDLABELDEV(dev)	\
    283 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    284 
    285 /* declared here, and made public, for the benefit of KVM stuff.. */
    286 
    287 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    288 				     struct disklabel *);
    289 static void raidgetdisklabel(dev_t);
    290 static void raidmakedisklabel(struct raid_softc *);
    291 
    292 static int raidlock(struct raid_softc *);
    293 static void raidunlock(struct raid_softc *);
    294 
    295 static int raid_detach_unlocked(struct raid_softc *);
    296 
    297 static void rf_markalldirty(RF_Raid_t *);
    298 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    299 
    300 void rf_ReconThread(struct rf_recon_req *);
    301 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    302 void rf_CopybackThread(RF_Raid_t *raidPtr);
    303 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    304 int rf_autoconfig(device_t);
    305 void rf_buildroothack(RF_ConfigSet_t *);
    306 
    307 RF_AutoConfig_t *rf_find_raid_components(void);
    308 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    309 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    310 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    311 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    312 int rf_set_autoconfig(RF_Raid_t *, int);
    313 int rf_set_rootpartition(RF_Raid_t *, int);
    314 void rf_release_all_vps(RF_ConfigSet_t *);
    315 void rf_cleanup_config_set(RF_ConfigSet_t *);
    316 int rf_have_enough_components(RF_ConfigSet_t *);
    317 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    318 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    319 
    320 /*
    321  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    322  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    323  * in the kernel config file.
    324  */
    325 #ifdef RAID_AUTOCONFIG
    326 int raidautoconfig = 1;
    327 #else
    328 int raidautoconfig = 0;
    329 #endif
    330 static bool raidautoconfigdone = false;
    331 
    332 struct RF_Pools_s rf_pools;
    333 
    334 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    335 static kmutex_t raid_lock;
    336 
    337 static struct raid_softc *
    338 raidcreate(int unit) {
    339 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    340 	if (sc == NULL) {
    341 #ifdef DIAGNOSTIC
    342 		printf("%s: out of memory\n", __func__);
    343 #endif
    344 		return NULL;
    345 	}
    346 	sc->sc_unit = unit;
    347 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    348 	return sc;
    349 }
    350 
/*
 * raiddestroy: release a softc allocated by raidcreate() -- free the
 * buffer queue first, then the softc itself.  The caller must already
 * have unlinked it from the global `raids' list (see raidput()).
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    356 
    357 static struct raid_softc *
    358 raidget(int unit) {
    359 	struct raid_softc *sc;
    360 	if (unit < 0) {
    361 #ifdef DIAGNOSTIC
    362 		panic("%s: unit %d!", __func__, unit);
    363 #endif
    364 		return NULL;
    365 	}
    366 	mutex_enter(&raid_lock);
    367 	LIST_FOREACH(sc, &raids, sc_link) {
    368 		if (sc->sc_unit == unit) {
    369 			mutex_exit(&raid_lock);
    370 			return sc;
    371 		}
    372 	}
    373 	mutex_exit(&raid_lock);
    374 	if ((sc = raidcreate(unit)) == NULL)
    375 		return NULL;
    376 	mutex_enter(&raid_lock);
    377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    378 	mutex_exit(&raid_lock);
    379 	return sc;
    380 }
    381 
/*
 * raidput: unlink a softc from the global `raids' list (under
 * raid_lock) and destroy it.  Counterpart of raidget()'s creation
 * path.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    389 
/*
 * raidattach: pseudo-device attach routine, called once at boot
 * (the `num' argument -- the unit count from the kernel config -- is
 * not used).  Initializes driver-global state, boots the RAIDframe
 * core, hooks the cfattach into autoconf, and registers a finalizer
 * that performs RAID auto-configuration after all real hardware has
 * attached.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Primitives and queues for the spare-table install handshake. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Bring up the RAIDframe core; a failure here is unrecoverable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    422 
    423 int
    424 rf_autoconfig(device_t self)
    425 {
    426 	RF_AutoConfig_t *ac_list;
    427 	RF_ConfigSet_t *config_sets;
    428 
    429 	if (!raidautoconfig || raidautoconfigdone == true)
    430 		return (0);
    431 
    432 	/* XXX This code can only be run once. */
    433 	raidautoconfigdone = true;
    434 
    435 #ifdef __HAVE_CPU_BOOTCONF
    436 	/*
    437 	 * 0. find the boot device if needed first so we can use it later
    438 	 * this needs to be done before we autoconfigure any raid sets,
    439 	 * because if we use wedges we are not going to be able to open
    440 	 * the boot device later
    441 	 */
    442 	if (booted_device == NULL)
    443 		cpu_bootconf();
    444 #endif
    445 	/* 1. locate all RAID components on the system */
    446 	aprint_debug("Searching for RAID components...\n");
    447 	ac_list = rf_find_raid_components();
    448 
    449 	/* 2. Sort them into their respective sets. */
    450 	config_sets = rf_create_auto_sets(ac_list);
    451 
    452 	/*
    453 	 * 3. Evaluate each set and configure the valid ones.
    454 	 * This gets done in rf_buildroothack().
    455 	 */
    456 	rf_buildroothack(config_sets);
    457 
    458 	return 1;
    459 }
    460 
    461 static int
    462 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    463 	const char *bootname = device_xname(bdv);
    464 	size_t len = strlen(bootname);
    465 
    466 	for (int col = 0; col < r->numCol; col++) {
    467 		const char *devname = r->Disks[col].devname;
    468 		devname += sizeof("/dev/") - 1;
    469 		if (strncmp(devname, "dk", 2) == 0) {
    470 			const char *parent =
    471 			    dkwedge_get_parent_name(r->Disks[col].dev);
    472 			if (parent != NULL)
    473 				devname = parent;
    474 		}
    475 		if (strncmp(devname, bootname, len) == 0) {
    476 			struct raid_softc *sc = r->softc;
    477 			aprint_debug("raid%d includes boot device %s\n",
    478 			    sc->sc_unit, devname);
    479 			return 1;
    480 		}
    481 	}
    482 	return 0;
    483 }
    484 
/*
 * rf_buildroothack: walk the list of candidate RAID sets, configure
 * each one that has enough components and requests autoconfiguration,
 * and release the resources of all others.  Then, unless the user
 * hardwired a root spec, possibly redirect booted_device to a
 * rootable RAID set: with exactly one candidate it is taken (subject
 * to the checks below); with several, the one containing the boot
 * device wins, or the user is asked via RB_ASKNAME.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				/* remember the last rootable set seen */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup; the cset list is consumed by this loop */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		/* prefer a wedge named after the raid device, if one exists */
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		/*
		 * Take over root when nothing else claimed it, when the
		 * set is forced root (root_partition == 1), or when the
		 * set contains the device we actually booted from.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* narrow the field to forced-root sets that contain the
		   boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    595 
    596 
    597 int
    598 raidsize(dev_t dev)
    599 {
    600 	struct raid_softc *rs;
    601 	struct disklabel *lp;
    602 	int     part, unit, omask, size;
    603 
    604 	unit = raidunit(dev);
    605 	if ((rs = raidget(unit)) == NULL)
    606 		return -1;
    607 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    608 		return (-1);
    609 
    610 	part = DISKPART(dev);
    611 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    612 	lp = rs->sc_dkdev.dk_label;
    613 
    614 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    615 		return (-1);
    616 
    617 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    618 		size = -1;
    619 	else
    620 		size = lp->d_partitions[part].p_size *
    621 		    (lp->d_secsize / DEV_BSIZE);
    622 
    623 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    624 		return (-1);
    625 
    626 	return (size);
    627 
    628 }
    629 
/*
 * raiddump: crash-dump entry point.  Writes `size' bytes from `va'
 * at block `blkno' of the partition of `dev', by passing the dump
 * straight through to one live underlying component.
 *
 * Only RAID 1 sets (one data column, one parity column) are
 * supported, since only there does every component hold a complete
 * copy of the data.  Returns 0 on success or an errno.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps must be whole DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* refuse to write past the end of the RAID device */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/*
 * raidopen: block/character device open entry point.
 *
 * Validates the requested partition, records the open in the
 * per-unit open masks (which prevents unconfiguration while open),
 * and on the very first open of a configured set marks all
 * components dirty -- see the comment below.  Returns 0 or an errno.
 */
/* ARGSUSED */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is being shut down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of a configured unit: (re)read the disklabel */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
/*
 * raidclose: block/character device close entry point.
 *
 * Clears this partition from the appropriate open mask; on the last
 * close of a configured unit, writes final component labels so the
 * set is marked clean.  Always returns 0 once the unit lock is held.
 */
/* ARGSUSED */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
    899 
/*
 * raidstrategy: block I/O entry point for the RAID pseudo-device.
 *
 * Validates the request against the unit's state, bounds-checks the
 * transfer, then enqueues the buffer for the RAID I/O thread and wakes
 * it via the iodone condvar.  On any validation failure the buffer is
 * completed immediately via biodone() with b_resid set to the full
 * count (b_error is set for real errors; a zero-length transfer
 * "succeeds" trivially with no error).
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	/* Reject I/O to a nonexistent or never-configured unit. */
	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfer: nothing to do, complete it as success. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/*
		 * Convert totalSectors (in native-sector units) to
		 * DEV_BSIZE units, shifting in whichever direction the
		 * sector-size difference requires.
		 */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	/*
	 * Queue the buffer and signal the I/O thread while holding
	 * iodone_lock, so the wakeup cannot be lost.
	 */
	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* schedule the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    970 /* ARGSUSED */
    971 int
    972 raidread(dev_t dev, struct uio *uio, int flags)
    973 {
    974 	int     unit = raidunit(dev);
    975 	struct raid_softc *rs;
    976 
    977 	if ((rs = raidget(unit)) == NULL)
    978 		return ENXIO;
    979 
    980 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    981 		return (ENXIO);
    982 
    983 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    984 
    985 }
    986 /* ARGSUSED */
    987 int
    988 raidwrite(dev_t dev, struct uio *uio, int flags)
    989 {
    990 	int     unit = raidunit(dev);
    991 	struct raid_softc *rs;
    992 
    993 	if ((rs = raidget(unit)) == NULL)
    994 		return ENXIO;
    995 
    996 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    997 		return (ENXIO);
    998 
    999 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1000 
   1001 }
   1002 
   1003 static int
   1004 raid_detach_unlocked(struct raid_softc *rs)
   1005 {
   1006 	int error;
   1007 	RF_Raid_t *raidPtr;
   1008 
   1009 	raidPtr = &rs->sc_r;
   1010 
   1011 	/*
   1012 	 * If somebody has a partition mounted, we shouldn't
   1013 	 * shutdown.
   1014 	 */
   1015 	if (rs->sc_dkdev.dk_openmask != 0)
   1016 		return EBUSY;
   1017 
   1018 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1019 		;	/* not initialized: nothing to do */
   1020 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1021 		return error;
   1022 	else
   1023 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1024 
   1025 	/* Detach the disk. */
   1026 	dkwedge_delall(&rs->sc_dkdev);
   1027 	disk_detach(&rs->sc_dkdev);
   1028 	disk_destroy(&rs->sc_dkdev);
   1029 
   1030 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1031 
   1032 	return 0;
   1033 }
   1034 
   1035 int
   1036 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1037 {
   1038 	int     unit = raidunit(dev);
   1039 	int     error = 0;
   1040 	int     part, pmask, s;
   1041 	cfdata_t cf;
   1042 	struct raid_softc *rs;
   1043 	RF_Config_t *k_cfg, *u_cfg;
   1044 	RF_Raid_t *raidPtr;
   1045 	RF_RaidDisk_t *diskPtr;
   1046 	RF_AccTotals_t *totals;
   1047 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1048 	u_char *specific_buf;
   1049 	int retcode = 0;
   1050 	int column;
   1051 /*	int raidid; */
   1052 	struct rf_recon_req *rrcopy, *rr;
   1053 	RF_ComponentLabel_t *clabel;
   1054 	RF_ComponentLabel_t *ci_label;
   1055 	RF_ComponentLabel_t **clabel_ptr;
   1056 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1057 	RF_SingleComponent_t component;
   1058 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1059 	int i, j, d;
   1060 #ifdef __HAVE_OLD_DISKLABEL
   1061 	struct disklabel newlabel;
   1062 #endif
   1063 	struct dkwedge_info *dkw;
   1064 
   1065 	if ((rs = raidget(unit)) == NULL)
   1066 		return ENXIO;
   1067 	raidPtr = &rs->sc_r;
   1068 
   1069 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1070 		(int) DISKPART(dev), (int) unit, cmd));
   1071 
   1072 	/* Must be open for writes for these commands... */
   1073 	switch (cmd) {
   1074 #ifdef DIOCGSECTORSIZE
   1075 	case DIOCGSECTORSIZE:
   1076 		*(u_int *)data = raidPtr->bytesPerSector;
   1077 		return 0;
   1078 	case DIOCGMEDIASIZE:
   1079 		*(off_t *)data =
   1080 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1081 		return 0;
   1082 #endif
   1083 	case DIOCSDINFO:
   1084 	case DIOCWDINFO:
   1085 #ifdef __HAVE_OLD_DISKLABEL
   1086 	case ODIOCWDINFO:
   1087 	case ODIOCSDINFO:
   1088 #endif
   1089 	case DIOCWLABEL:
   1090 	case DIOCAWEDGE:
   1091 	case DIOCDWEDGE:
   1092 	case DIOCSSTRATEGY:
   1093 		if ((flag & FWRITE) == 0)
   1094 			return (EBADF);
   1095 	}
   1096 
   1097 	/* Must be initialized for these... */
   1098 	switch (cmd) {
   1099 	case DIOCGDINFO:
   1100 	case DIOCSDINFO:
   1101 	case DIOCWDINFO:
   1102 #ifdef __HAVE_OLD_DISKLABEL
   1103 	case ODIOCGDINFO:
   1104 	case ODIOCWDINFO:
   1105 	case ODIOCSDINFO:
   1106 	case ODIOCGDEFLABEL:
   1107 #endif
   1108 	case DIOCGPART:
   1109 	case DIOCWLABEL:
   1110 	case DIOCGDEFLABEL:
   1111 	case DIOCAWEDGE:
   1112 	case DIOCDWEDGE:
   1113 	case DIOCLWEDGES:
   1114 	case DIOCCACHESYNC:
   1115 	case RAIDFRAME_SHUTDOWN:
   1116 	case RAIDFRAME_REWRITEPARITY:
   1117 	case RAIDFRAME_GET_INFO:
   1118 	case RAIDFRAME_RESET_ACCTOTALS:
   1119 	case RAIDFRAME_GET_ACCTOTALS:
   1120 	case RAIDFRAME_KEEP_ACCTOTALS:
   1121 	case RAIDFRAME_GET_SIZE:
   1122 	case RAIDFRAME_FAIL_DISK:
   1123 	case RAIDFRAME_COPYBACK:
   1124 	case RAIDFRAME_CHECK_RECON_STATUS:
   1125 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1126 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1127 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1128 	case RAIDFRAME_ADD_HOT_SPARE:
   1129 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1130 	case RAIDFRAME_INIT_LABELS:
   1131 	case RAIDFRAME_REBUILD_IN_PLACE:
   1132 	case RAIDFRAME_CHECK_PARITY:
   1133 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1134 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1135 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1136 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1137 	case RAIDFRAME_SET_AUTOCONFIG:
   1138 	case RAIDFRAME_SET_ROOT:
   1139 	case RAIDFRAME_DELETE_COMPONENT:
   1140 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1141 	case RAIDFRAME_PARITYMAP_STATUS:
   1142 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1143 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1144 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1145 	case DIOCGSTRATEGY:
   1146 	case DIOCSSTRATEGY:
   1147 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1148 			return (ENXIO);
   1149 	}
   1150 
   1151 	switch (cmd) {
   1152 #ifdef COMPAT_50
   1153 	case RAIDFRAME_GET_INFO50:
   1154 		return rf_get_info50(raidPtr, data);
   1155 
   1156 	case RAIDFRAME_CONFIGURE50:
   1157 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1158 			return retcode;
   1159 		goto config;
   1160 #endif
   1161 		/* configure the system */
   1162 	case RAIDFRAME_CONFIGURE:
   1163 
   1164 		if (raidPtr->valid) {
   1165 			/* There is a valid RAID set running on this unit! */
   1166 			printf("raid%d: Device already configured!\n",unit);
   1167 			return(EINVAL);
   1168 		}
   1169 
   1170 		/* copy-in the configuration information */
   1171 		/* data points to a pointer to the configuration structure */
   1172 
   1173 		u_cfg = *((RF_Config_t **) data);
   1174 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1175 		if (k_cfg == NULL) {
   1176 			return (ENOMEM);
   1177 		}
   1178 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1179 		if (retcode) {
   1180 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1181 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1182 				retcode));
   1183 			return (retcode);
   1184 		}
   1185 		goto config;
   1186 	config:
   1187 		/* allocate a buffer for the layout-specific data, and copy it
   1188 		 * in */
   1189 		if (k_cfg->layoutSpecificSize) {
   1190 			if (k_cfg->layoutSpecificSize > 10000) {
   1191 				/* sanity check */
   1192 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1193 				return (EINVAL);
   1194 			}
   1195 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1196 			    (u_char *));
   1197 			if (specific_buf == NULL) {
   1198 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1199 				return (ENOMEM);
   1200 			}
   1201 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1202 			    k_cfg->layoutSpecificSize);
   1203 			if (retcode) {
   1204 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1205 				RF_Free(specific_buf,
   1206 					k_cfg->layoutSpecificSize);
   1207 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1208 					retcode));
   1209 				return (retcode);
   1210 			}
   1211 		} else
   1212 			specific_buf = NULL;
   1213 		k_cfg->layoutSpecific = specific_buf;
   1214 
   1215 		/* should do some kind of sanity check on the configuration.
   1216 		 * Store the sum of all the bytes in the last byte? */
   1217 
   1218 		/* configure the system */
   1219 
   1220 		/*
   1221 		 * Clear the entire RAID descriptor, just to make sure
   1222 		 *  there is no stale data left in the case of a
   1223 		 *  reconfiguration
   1224 		 */
   1225 		memset(raidPtr, 0, sizeof(*raidPtr));
   1226 		raidPtr->softc = rs;
   1227 		raidPtr->raidid = unit;
   1228 
   1229 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1230 
   1231 		if (retcode == 0) {
   1232 
   1233 			/* allow this many simultaneous IO's to
   1234 			   this RAID device */
   1235 			raidPtr->openings = RAIDOUTSTANDING;
   1236 
   1237 			raidinit(rs);
   1238 			rf_markalldirty(raidPtr);
   1239 		}
   1240 		/* free the buffers.  No return code here. */
   1241 		if (k_cfg->layoutSpecificSize) {
   1242 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1243 		}
   1244 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1245 
   1246 		return (retcode);
   1247 
   1248 		/* shutdown the system */
   1249 	case RAIDFRAME_SHUTDOWN:
   1250 
   1251 		part = DISKPART(dev);
   1252 		pmask = (1 << part);
   1253 
   1254 		if ((error = raidlock(rs)) != 0)
   1255 			return (error);
   1256 
   1257 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1258 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1259 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1260 			retcode = EBUSY;
   1261 		else {
   1262 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1263 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1264 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1265 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1266 			retcode = 0;
   1267 		}
   1268 
   1269 		raidunlock(rs);
   1270 
   1271 		if (retcode != 0)
   1272 			return retcode;
   1273 
   1274 		/* free the pseudo device attach bits */
   1275 
   1276 		cf = device_cfdata(rs->sc_dev);
   1277 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1278 			free(cf, M_RAIDFRAME);
   1279 
   1280 		return (retcode);
   1281 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1282 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1283 		/* need to read the component label for the disk indicated
   1284 		   by row,column in clabel */
   1285 
   1286 		/*
   1287 		 * Perhaps there should be an option to skip the in-core
   1288 		 * copy and hit the disk, as with disklabel(8).
   1289 		 */
   1290 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1291 
   1292 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1293 
   1294 		if (retcode) {
   1295 			RF_Free(clabel, sizeof(*clabel));
   1296 			return retcode;
   1297 		}
   1298 
   1299 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1300 
   1301 		column = clabel->column;
   1302 
   1303 		if ((column < 0) || (column >= raidPtr->numCol +
   1304 		    raidPtr->numSpare)) {
   1305 			RF_Free(clabel, sizeof(*clabel));
   1306 			return EINVAL;
   1307 		}
   1308 
   1309 		RF_Free(clabel, sizeof(*clabel));
   1310 
   1311 		clabel = raidget_component_label(raidPtr, column);
   1312 
   1313 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1314 
   1315 #if 0
   1316 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1317 		clabel = (RF_ComponentLabel_t *) data;
   1318 
   1319 		/* XXX check the label for valid stuff... */
   1320 		/* Note that some things *should not* get modified --
   1321 		   the user should be re-initing the labels instead of
   1322 		   trying to patch things.
   1323 		   */
   1324 
   1325 		raidid = raidPtr->raidid;
   1326 #ifdef DEBUG
   1327 		printf("raid%d: Got component label:\n", raidid);
   1328 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1329 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1330 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1331 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1332 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1333 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1334 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1335 #endif
   1336 		clabel->row = 0;
   1337 		column = clabel->column;
   1338 
   1339 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1340 			return(EINVAL);
   1341 		}
   1342 
   1343 		/* XXX this isn't allowed to do anything for now :-) */
   1344 
   1345 		/* XXX and before it is, we need to fill in the rest
   1346 		   of the fields!?!?!?! */
   1347 		memcpy(raidget_component_label(raidPtr, column),
   1348 		    clabel, sizeof(*clabel));
   1349 		raidflush_component_label(raidPtr, column);
   1350 		return (0);
   1351 #endif
   1352 
   1353 	case RAIDFRAME_INIT_LABELS:
   1354 		clabel = (RF_ComponentLabel_t *) data;
   1355 		/*
   1356 		   we only want the serial number from
   1357 		   the above.  We get all the rest of the information
   1358 		   from the config that was used to create this RAID
   1359 		   set.
   1360 		   */
   1361 
   1362 		raidPtr->serial_number = clabel->serial_number;
   1363 
   1364 		for(column=0;column<raidPtr->numCol;column++) {
   1365 			diskPtr = &raidPtr->Disks[column];
   1366 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1367 				ci_label = raidget_component_label(raidPtr,
   1368 				    column);
   1369 				/* Zeroing this is important. */
   1370 				memset(ci_label, 0, sizeof(*ci_label));
   1371 				raid_init_component_label(raidPtr, ci_label);
   1372 				ci_label->serial_number =
   1373 				    raidPtr->serial_number;
   1374 				ci_label->row = 0; /* we dont' pretend to support more */
   1375 				rf_component_label_set_partitionsize(ci_label,
   1376 				    diskPtr->partitionSize);
   1377 				ci_label->column = column;
   1378 				raidflush_component_label(raidPtr, column);
   1379 			}
   1380 			/* XXXjld what about the spares? */
   1381 		}
   1382 
   1383 		return (retcode);
   1384 	case RAIDFRAME_SET_AUTOCONFIG:
   1385 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1386 		printf("raid%d: New autoconfig value is: %d\n",
   1387 		       raidPtr->raidid, d);
   1388 		*(int *) data = d;
   1389 		return (retcode);
   1390 
   1391 	case RAIDFRAME_SET_ROOT:
   1392 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1393 		printf("raid%d: New rootpartition value is: %d\n",
   1394 		       raidPtr->raidid, d);
   1395 		*(int *) data = d;
   1396 		return (retcode);
   1397 
   1398 		/* initialize all parity */
   1399 	case RAIDFRAME_REWRITEPARITY:
   1400 
   1401 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1402 			/* Parity for RAID 0 is trivially correct */
   1403 			raidPtr->parity_good = RF_RAID_CLEAN;
   1404 			return(0);
   1405 		}
   1406 
   1407 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1408 			/* Re-write is already in progress! */
   1409 			return(EINVAL);
   1410 		}
   1411 
   1412 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1413 					   rf_RewriteParityThread,
   1414 					   raidPtr,"raid_parity");
   1415 		return (retcode);
   1416 
   1417 
   1418 	case RAIDFRAME_ADD_HOT_SPARE:
   1419 		sparePtr = (RF_SingleComponent_t *) data;
   1420 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1421 		retcode = rf_add_hot_spare(raidPtr, &component);
   1422 		return(retcode);
   1423 
   1424 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1425 		return(retcode);
   1426 
   1427 	case RAIDFRAME_DELETE_COMPONENT:
   1428 		componentPtr = (RF_SingleComponent_t *)data;
   1429 		memcpy( &component, componentPtr,
   1430 			sizeof(RF_SingleComponent_t));
   1431 		retcode = rf_delete_component(raidPtr, &component);
   1432 		return(retcode);
   1433 
   1434 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1435 		componentPtr = (RF_SingleComponent_t *)data;
   1436 		memcpy( &component, componentPtr,
   1437 			sizeof(RF_SingleComponent_t));
   1438 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1439 		return(retcode);
   1440 
   1441 	case RAIDFRAME_REBUILD_IN_PLACE:
   1442 
   1443 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1444 			/* Can't do this on a RAID 0!! */
   1445 			return(EINVAL);
   1446 		}
   1447 
   1448 		if (raidPtr->recon_in_progress == 1) {
   1449 			/* a reconstruct is already in progress! */
   1450 			return(EINVAL);
   1451 		}
   1452 
   1453 		componentPtr = (RF_SingleComponent_t *) data;
   1454 		memcpy( &component, componentPtr,
   1455 			sizeof(RF_SingleComponent_t));
   1456 		component.row = 0; /* we don't support any more */
   1457 		column = component.column;
   1458 
   1459 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1460 			return(EINVAL);
   1461 		}
   1462 
   1463 		rf_lock_mutex2(raidPtr->mutex);
   1464 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1465 		    (raidPtr->numFailures > 0)) {
   1466 			/* XXX 0 above shouldn't be constant!!! */
   1467 			/* some component other than this has failed.
   1468 			   Let's not make things worse than they already
   1469 			   are... */
   1470 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1471 			       raidPtr->raidid);
   1472 			printf("raid%d:     Col: %d   Too many failures.\n",
   1473 			       raidPtr->raidid, column);
   1474 			rf_unlock_mutex2(raidPtr->mutex);
   1475 			return (EINVAL);
   1476 		}
   1477 		if (raidPtr->Disks[column].status ==
   1478 		    rf_ds_reconstructing) {
   1479 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1480 			       raidPtr->raidid);
   1481 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1482 
   1483 			rf_unlock_mutex2(raidPtr->mutex);
   1484 			return (EINVAL);
   1485 		}
   1486 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1487 			rf_unlock_mutex2(raidPtr->mutex);
   1488 			return (EINVAL);
   1489 		}
   1490 		rf_unlock_mutex2(raidPtr->mutex);
   1491 
   1492 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1493 		if (rrcopy == NULL)
   1494 			return(ENOMEM);
   1495 
   1496 		rrcopy->raidPtr = (void *) raidPtr;
   1497 		rrcopy->col = column;
   1498 
   1499 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1500 					   rf_ReconstructInPlaceThread,
   1501 					   rrcopy,"raid_reconip");
   1502 		return(retcode);
   1503 
   1504 	case RAIDFRAME_GET_INFO:
   1505 		if (!raidPtr->valid)
   1506 			return (ENODEV);
   1507 		ucfgp = (RF_DeviceConfig_t **) data;
   1508 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1509 			  (RF_DeviceConfig_t *));
   1510 		if (d_cfg == NULL)
   1511 			return (ENOMEM);
   1512 		d_cfg->rows = 1; /* there is only 1 row now */
   1513 		d_cfg->cols = raidPtr->numCol;
   1514 		d_cfg->ndevs = raidPtr->numCol;
   1515 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1516 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1517 			return (ENOMEM);
   1518 		}
   1519 		d_cfg->nspares = raidPtr->numSpare;
   1520 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1521 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1522 			return (ENOMEM);
   1523 		}
   1524 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1525 		d = 0;
   1526 		for (j = 0; j < d_cfg->cols; j++) {
   1527 			d_cfg->devs[d] = raidPtr->Disks[j];
   1528 			d++;
   1529 		}
   1530 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1531 			d_cfg->spares[i] = raidPtr->Disks[j];
   1532 		}
   1533 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1534 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1535 
   1536 		return (retcode);
   1537 
   1538 	case RAIDFRAME_CHECK_PARITY:
   1539 		*(int *) data = raidPtr->parity_good;
   1540 		return (0);
   1541 
   1542 	case RAIDFRAME_PARITYMAP_STATUS:
   1543 		if (rf_paritymap_ineligible(raidPtr))
   1544 			return EINVAL;
   1545 		rf_paritymap_status(raidPtr->parity_map,
   1546 		    (struct rf_pmstat *)data);
   1547 		return 0;
   1548 
   1549 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1550 		if (rf_paritymap_ineligible(raidPtr))
   1551 			return EINVAL;
   1552 		if (raidPtr->parity_map == NULL)
   1553 			return ENOENT; /* ??? */
   1554 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1555 			(struct rf_pmparams *)data, 1))
   1556 			return EINVAL;
   1557 		return 0;
   1558 
   1559 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1560 		if (rf_paritymap_ineligible(raidPtr))
   1561 			return EINVAL;
   1562 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1563 		return 0;
   1564 
   1565 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1566 		if (rf_paritymap_ineligible(raidPtr))
   1567 			return EINVAL;
   1568 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1569 		/* XXX should errors be passed up? */
   1570 		return 0;
   1571 
   1572 	case RAIDFRAME_RESET_ACCTOTALS:
   1573 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1574 		return (0);
   1575 
   1576 	case RAIDFRAME_GET_ACCTOTALS:
   1577 		totals = (RF_AccTotals_t *) data;
   1578 		*totals = raidPtr->acc_totals;
   1579 		return (0);
   1580 
   1581 	case RAIDFRAME_KEEP_ACCTOTALS:
   1582 		raidPtr->keep_acc_totals = *(int *)data;
   1583 		return (0);
   1584 
   1585 	case RAIDFRAME_GET_SIZE:
   1586 		*(int *) data = raidPtr->totalSectors;
   1587 		return (0);
   1588 
   1589 		/* fail a disk & optionally start reconstruction */
   1590 	case RAIDFRAME_FAIL_DISK:
   1591 
   1592 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1593 			/* Can't do this on a RAID 0!! */
   1594 			return(EINVAL);
   1595 		}
   1596 
   1597 		rr = (struct rf_recon_req *) data;
   1598 		rr->row = 0;
   1599 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1600 			return (EINVAL);
   1601 
   1602 
   1603 		rf_lock_mutex2(raidPtr->mutex);
   1604 		if (raidPtr->status == rf_rs_reconstructing) {
   1605 			/* you can't fail a disk while we're reconstructing! */
   1606 			/* XXX wrong for RAID6 */
   1607 			rf_unlock_mutex2(raidPtr->mutex);
   1608 			return (EINVAL);
   1609 		}
   1610 		if ((raidPtr->Disks[rr->col].status ==
   1611 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1612 			/* some other component has failed.  Let's not make
   1613 			   things worse. XXX wrong for RAID6 */
   1614 			rf_unlock_mutex2(raidPtr->mutex);
   1615 			return (EINVAL);
   1616 		}
   1617 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1618 			/* Can't fail a spared disk! */
   1619 			rf_unlock_mutex2(raidPtr->mutex);
   1620 			return (EINVAL);
   1621 		}
   1622 		rf_unlock_mutex2(raidPtr->mutex);
   1623 
   1624 		/* make a copy of the recon request so that we don't rely on
   1625 		 * the user's buffer */
   1626 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1627 		if (rrcopy == NULL)
   1628 			return(ENOMEM);
   1629 		memcpy(rrcopy, rr, sizeof(*rr));
   1630 		rrcopy->raidPtr = (void *) raidPtr;
   1631 
   1632 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1633 					   rf_ReconThread,
   1634 					   rrcopy,"raid_recon");
   1635 		return (0);
   1636 
   1637 		/* invoke a copyback operation after recon on whatever disk
   1638 		 * needs it, if any */
   1639 	case RAIDFRAME_COPYBACK:
   1640 
   1641 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1642 			/* This makes no sense on a RAID 0!! */
   1643 			return(EINVAL);
   1644 		}
   1645 
   1646 		if (raidPtr->copyback_in_progress == 1) {
   1647 			/* Copyback is already in progress! */
   1648 			return(EINVAL);
   1649 		}
   1650 
   1651 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1652 					   rf_CopybackThread,
   1653 					   raidPtr,"raid_copyback");
   1654 		return (retcode);
   1655 
   1656 		/* return the percentage completion of reconstruction */
   1657 	case RAIDFRAME_CHECK_RECON_STATUS:
   1658 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1659 			/* This makes no sense on a RAID 0, so tell the
   1660 			   user it's done. */
   1661 			*(int *) data = 100;
   1662 			return(0);
   1663 		}
   1664 		if (raidPtr->status != rf_rs_reconstructing)
   1665 			*(int *) data = 100;
   1666 		else {
   1667 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1668 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1669 			} else {
   1670 				*(int *) data = 0;
   1671 			}
   1672 		}
   1673 		return (0);
   1674 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1675 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1676 		if (raidPtr->status != rf_rs_reconstructing) {
   1677 			progressInfo.remaining = 0;
   1678 			progressInfo.completed = 100;
   1679 			progressInfo.total = 100;
   1680 		} else {
   1681 			progressInfo.total =
   1682 				raidPtr->reconControl->numRUsTotal;
   1683 			progressInfo.completed =
   1684 				raidPtr->reconControl->numRUsComplete;
   1685 			progressInfo.remaining = progressInfo.total -
   1686 				progressInfo.completed;
   1687 		}
   1688 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1689 				  sizeof(RF_ProgressInfo_t));
   1690 		return (retcode);
   1691 
   1692 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1693 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1694 			/* This makes no sense on a RAID 0, so tell the
   1695 			   user it's done. */
   1696 			*(int *) data = 100;
   1697 			return(0);
   1698 		}
   1699 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1700 			*(int *) data = 100 *
   1701 				raidPtr->parity_rewrite_stripes_done /
   1702 				raidPtr->Layout.numStripe;
   1703 		} else {
   1704 			*(int *) data = 100;
   1705 		}
   1706 		return (0);
   1707 
   1708 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1709 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1710 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1711 			progressInfo.total = raidPtr->Layout.numStripe;
   1712 			progressInfo.completed =
   1713 				raidPtr->parity_rewrite_stripes_done;
   1714 			progressInfo.remaining = progressInfo.total -
   1715 				progressInfo.completed;
   1716 		} else {
   1717 			progressInfo.remaining = 0;
   1718 			progressInfo.completed = 100;
   1719 			progressInfo.total = 100;
   1720 		}
   1721 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1722 				  sizeof(RF_ProgressInfo_t));
   1723 		return (retcode);
   1724 
   1725 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1726 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1727 			/* This makes no sense on a RAID 0 */
   1728 			*(int *) data = 100;
   1729 			return(0);
   1730 		}
   1731 		if (raidPtr->copyback_in_progress == 1) {
   1732 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1733 				raidPtr->Layout.numStripe;
   1734 		} else {
   1735 			*(int *) data = 100;
   1736 		}
   1737 		return (0);
   1738 
   1739 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1740 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1741 		if (raidPtr->copyback_in_progress == 1) {
   1742 			progressInfo.total = raidPtr->Layout.numStripe;
   1743 			progressInfo.completed =
   1744 				raidPtr->copyback_stripes_done;
   1745 			progressInfo.remaining = progressInfo.total -
   1746 				progressInfo.completed;
   1747 		} else {
   1748 			progressInfo.remaining = 0;
   1749 			progressInfo.completed = 100;
   1750 			progressInfo.total = 100;
   1751 		}
   1752 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1753 				  sizeof(RF_ProgressInfo_t));
   1754 		return (retcode);
   1755 
   1756 		/* the sparetable daemon calls this to wait for the kernel to
   1757 		 * need a spare table. this ioctl does not return until a
   1758 		 * spare table is needed. XXX -- calling mpsleep here in the
   1759 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1760 		 * -- I should either compute the spare table in the kernel,
   1761 		 * or have a different -- XXX XXX -- interface (a different
   1762 		 * character device) for delivering the table     -- XXX */
   1763 #if 0
   1764 	case RAIDFRAME_SPARET_WAIT:
   1765 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1766 		while (!rf_sparet_wait_queue)
   1767 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1768 		waitreq = rf_sparet_wait_queue;
   1769 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1770 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1771 
   1772 		/* structure assignment */
   1773 		*((RF_SparetWait_t *) data) = *waitreq;
   1774 
   1775 		RF_Free(waitreq, sizeof(*waitreq));
   1776 		return (0);
   1777 
   1778 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1779 		 * code in it that will cause the dameon to exit */
   1780 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1781 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1782 		waitreq->fcol = -1;
   1783 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1784 		waitreq->next = rf_sparet_wait_queue;
   1785 		rf_sparet_wait_queue = waitreq;
   1786 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1787 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1788 		return (0);
   1789 
   1790 		/* used by the spare table daemon to deliver a spare table
   1791 		 * into the kernel */
   1792 	case RAIDFRAME_SEND_SPARET:
   1793 
   1794 		/* install the spare table */
   1795 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1796 
   1797 		/* respond to the requestor.  the return status of the spare
   1798 		 * table installation is passed in the "fcol" field */
   1799 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1800 		waitreq->fcol = retcode;
   1801 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1802 		waitreq->next = rf_sparet_resp_queue;
   1803 		rf_sparet_resp_queue = waitreq;
   1804 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1805 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1806 
   1807 		return (retcode);
   1808 #endif
   1809 
   1810 	default:
   1811 		break; /* fall through to the os-specific code below */
   1812 
   1813 	}
   1814 
   1815 	if (!raidPtr->valid)
   1816 		return (EINVAL);
   1817 
   1818 	/*
   1819 	 * Add support for "regular" device ioctls here.
   1820 	 */
   1821 
   1822 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1823 	if (error != EPASSTHROUGH)
   1824 		return (error);
   1825 
   1826 	switch (cmd) {
   1827 	case DIOCGDINFO:
   1828 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1829 		break;
   1830 #ifdef __HAVE_OLD_DISKLABEL
   1831 	case ODIOCGDINFO:
   1832 		newlabel = *(rs->sc_dkdev.dk_label);
   1833 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1834 			return ENOTTY;
   1835 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1836 		break;
   1837 #endif
   1838 
   1839 	case DIOCGPART:
   1840 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1841 		((struct partinfo *) data)->part =
   1842 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1843 		break;
   1844 
   1845 	case DIOCWDINFO:
   1846 	case DIOCSDINFO:
   1847 #ifdef __HAVE_OLD_DISKLABEL
   1848 	case ODIOCWDINFO:
   1849 	case ODIOCSDINFO:
   1850 #endif
   1851 	{
   1852 		struct disklabel *lp;
   1853 #ifdef __HAVE_OLD_DISKLABEL
   1854 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1855 			memset(&newlabel, 0, sizeof newlabel);
   1856 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1857 			lp = &newlabel;
   1858 		} else
   1859 #endif
   1860 		lp = (struct disklabel *)data;
   1861 
   1862 		if ((error = raidlock(rs)) != 0)
   1863 			return (error);
   1864 
   1865 		rs->sc_flags |= RAIDF_LABELLING;
   1866 
   1867 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1868 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1869 		if (error == 0) {
   1870 			if (cmd == DIOCWDINFO
   1871 #ifdef __HAVE_OLD_DISKLABEL
   1872 			    || cmd == ODIOCWDINFO
   1873 #endif
   1874 			   )
   1875 				error = writedisklabel(RAIDLABELDEV(dev),
   1876 				    raidstrategy, rs->sc_dkdev.dk_label,
   1877 				    rs->sc_dkdev.dk_cpulabel);
   1878 		}
   1879 		rs->sc_flags &= ~RAIDF_LABELLING;
   1880 
   1881 		raidunlock(rs);
   1882 
   1883 		if (error)
   1884 			return (error);
   1885 		break;
   1886 	}
   1887 
   1888 	case DIOCWLABEL:
   1889 		if (*(int *) data != 0)
   1890 			rs->sc_flags |= RAIDF_WLABEL;
   1891 		else
   1892 			rs->sc_flags &= ~RAIDF_WLABEL;
   1893 		break;
   1894 
   1895 	case DIOCGDEFLABEL:
   1896 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1897 		break;
   1898 
   1899 #ifdef __HAVE_OLD_DISKLABEL
   1900 	case ODIOCGDEFLABEL:
   1901 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1902 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1903 			return ENOTTY;
   1904 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1905 		break;
   1906 #endif
   1907 
   1908 	case DIOCAWEDGE:
   1909 	case DIOCDWEDGE:
   1910 	    	dkw = (void *)data;
   1911 
   1912 		/* If the ioctl happens here, the parent is us. */
   1913 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1914 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1915 
   1916 	case DIOCLWEDGES:
   1917 		return dkwedge_list(&rs->sc_dkdev,
   1918 		    (struct dkwedge_list *)data, l);
   1919 	case DIOCCACHESYNC:
   1920 		return rf_sync_component_caches(raidPtr);
   1921 
   1922 	case DIOCGSTRATEGY:
   1923 	    {
   1924 		struct disk_strategy *dks = (void *)data;
   1925 
   1926 		s = splbio();
   1927 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1928 		    sizeof(dks->dks_name));
   1929 		splx(s);
   1930 		dks->dks_paramlen = 0;
   1931 
   1932 		return 0;
   1933 	    }
   1934 
   1935 	case DIOCSSTRATEGY:
   1936 	    {
   1937 		struct disk_strategy *dks = (void *)data;
   1938 		struct bufq_state *new;
   1939 		struct bufq_state *old;
   1940 
   1941 		if (dks->dks_param != NULL) {
   1942 			return EINVAL;
   1943 		}
   1944 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1945 		error = bufq_alloc(&new, dks->dks_name,
   1946 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1947 		if (error) {
   1948 			return error;
   1949 		}
   1950 		s = splbio();
   1951 		old = rs->buf_queue;
   1952 		bufq_move(new, old);
   1953 		rs->buf_queue = new;
   1954 		splx(s);
   1955 		bufq_free(old);
   1956 
   1957 		return 0;
   1958 	    }
   1959 
   1960 	default:
   1961 		retcode = ENOTTY;
   1962 	}
   1963 	return (retcode);
   1964 
   1965 }
   1966 
   1967 
   1968 /* raidinit -- complete the rest of the initialization for the
   1969    RAIDframe device.  */
   1970 
   1971 
   1972 static void
   1973 raidinit(struct raid_softc *rs)
   1974 {
   1975 	cfdata_t cf;
   1976 	int     unit;
   1977 	RF_Raid_t *raidPtr = &rs->sc_r;
   1978 
   1979 	unit = raidPtr->raidid;
   1980 
   1981 
   1982 	/* XXX should check return code first... */
   1983 	rs->sc_flags |= RAIDF_INITED;
   1984 
   1985 	/* XXX doesn't check bounds. */
   1986 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1987 
   1988 	/* attach the pseudo device */
   1989 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1990 	cf->cf_name = raid_cd.cd_name;
   1991 	cf->cf_atname = raid_cd.cd_name;
   1992 	cf->cf_unit = unit;
   1993 	cf->cf_fstate = FSTATE_STAR;
   1994 
   1995 	rs->sc_dev = config_attach_pseudo(cf);
   1996 
   1997 	if (rs->sc_dev == NULL) {
   1998 		printf("raid%d: config_attach_pseudo failed\n",
   1999 		    raidPtr->raidid);
   2000 		rs->sc_flags &= ~RAIDF_INITED;
   2001 		free(cf, M_RAIDFRAME);
   2002 		return;
   2003 	}
   2004 
   2005 	/* disk_attach actually creates space for the CPU disklabel, among
   2006 	 * other things, so it's critical to call this *BEFORE* we try putzing
   2007 	 * with disklabels. */
   2008 
   2009 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   2010 	disk_attach(&rs->sc_dkdev);
   2011 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   2012 
   2013 	/* XXX There may be a weird interaction here between this, and
   2014 	 * protectedSectors, as used in RAIDframe.  */
   2015 
   2016 	rs->sc_size = raidPtr->totalSectors;
   2017 
   2018 	dkwedge_discover(&rs->sc_dkdev);
   2019 
   2020 	rf_set_geometry(rs, raidPtr);
   2021 
   2022 }
   2023 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
   2033 int
   2034 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   2035 {
   2036 	int     retcode;
   2037 
   2038 	rf_lock_mutex2(rf_sparet_wait_mutex);
   2039 	req->next = rf_sparet_wait_queue;
   2040 	rf_sparet_wait_queue = req;
   2041 	rf_broadcast_cond2(rf_sparet_wait_cv);
   2042 
   2043 	/* mpsleep unlocks the mutex */
   2044 	while (!rf_sparet_resp_queue) {
   2045 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   2046 	}
   2047 	req = rf_sparet_resp_queue;
   2048 	rf_sparet_resp_queue = req->next;
   2049 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   2050 
   2051 	retcode = req->fcol;
   2052 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2053 					 * alloc'd */
   2054 	return (retcode);
   2055 }
   2056 #endif
   2057 
   2058 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2059  * bp & passes it down.
   2060  * any calls originating in the kernel must use non-blocking I/O
   2061  * do some extra sanity checking to return "appropriate" error values for
   2062  * certain conditions (to make some standard utilities work)
   2063  *
   2064  * Formerly known as: rf_DoAccessKernel
   2065  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the lock: rf_update_component_labels does its own
		 * locking internally */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held at the top of every
	 * iteration and released while a request is being issued. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			/* NB: returns with raidPtr->mutex NOT held */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector, if any */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests that run off the end of the array; the
		 * (sum < x) comparisons catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject I/O that is not a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* Claim one opening before dispatching the request. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		/* NOTE(review): assumes the matching disk_unbusy() happens
		 * on the I/O completion path -- confirm against raiddone */
		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		/* On immediate failure, fail the buf here; on success the
		 * completion machinery finishes it asynchronously. */
		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2183 
   2184 
   2185 
   2186 
   2187 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2188 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* map RAIDframe I/O type onto the buf(9) read/write flag */
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): plain printf with doubled parens -- looks
		 * like it was meant to be db1_printf; confirm */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately via the normal callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device I/O; completion
		 * is delivered through KernelWakeupFunc */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* always returns 0; errors are reported via the completion path */
	return (0);
}
   2262 /* this is the callback function associated with a I/O invoked from
   2263    kernel code.
   2264  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the originating request was stashed in b_private by InitBP /
	 * rf_DispatchKernelIO */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* everything below runs under the array's iodone lock */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		/* i.e. only fail a component that is currently optimal or
		 * an in-use spare, and only while the set can still
		 * tolerate another failure */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update the next
			 * time raidstart() runs */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2330 
   2331 
   2332 /*
   2333  * initialize a buf structure for doing an I/O in the kernel.
   2334  */
   2335 static void
   2336 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2337        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2338        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2339        struct proc *b_proc)
   2340 {
   2341 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2342 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2343 	bp->b_oflags = 0;
   2344 	bp->b_cflags = 0;
   2345 	bp->b_bcount = numSect << logBytesPerSector;
   2346 	bp->b_bufsize = bp->b_bcount;
   2347 	bp->b_error = 0;
   2348 	bp->b_dev = dev;
   2349 	bp->b_data = bf;
   2350 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2351 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2352 	if (bp->b_bcount == 0) {
   2353 		panic("bp->b_bcount is zero in InitBP!!");
   2354 	}
   2355 	bp->b_proc = b_proc;
   2356 	bp->b_iodone = cbFunc;
   2357 	bp->b_private = cbArg;
   2358 }
   2359 
   2360 static void
   2361 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2362 		    struct disklabel *lp)
   2363 {
   2364 	memset(lp, 0, sizeof(*lp));
   2365 
   2366 	/* fabricate a label... */
   2367 	lp->d_secperunit = raidPtr->totalSectors;
   2368 	lp->d_secsize = raidPtr->bytesPerSector;
   2369 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2370 	lp->d_ntracks = 4 * raidPtr->numCol;
   2371 	lp->d_ncylinders = raidPtr->totalSectors /
   2372 		(lp->d_nsectors * lp->d_ntracks);
   2373 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2374 
   2375 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2376 	lp->d_type = DTYPE_RAID;
   2377 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2378 	lp->d_rpm = 3600;
   2379 	lp->d_interleave = 1;
   2380 	lp->d_flags = 0;
   2381 
   2382 	lp->d_partitions[RAW_PART].p_offset = 0;
   2383 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2384 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2385 	lp->d_npartitions = RAW_PART + 1;
   2386 
   2387 	lp->d_magic = DISKMAGIC;
   2388 	lp->d_magic2 = DISKMAGIC;
   2389 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2390 
   2391 }
   2392 /*
   2393  * Read the disklabel from the raid device.  If one is not present, fake one
   2394  * up.
   2395  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	/* no such unit -> nothing to do */
	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default label */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no usable on-disk label: synthesize one */
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		/* warn (but do not reject) partitions extending past the
		 * end of the array */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
   2454 /*
   2455  * Take care of things one might want to take care of in the event
   2456  * that a disklabel isn't present.
   2457  */
   2458 static void
   2459 raidmakedisklabel(struct raid_softc *rs)
   2460 {
   2461 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2462 	db1_printf(("Making a label..\n"));
   2463 
   2464 	/*
   2465 	 * For historical reasons, if there's no disklabel present
   2466 	 * the raw partition must be marked FS_BSDFFS.
   2467 	 */
   2468 
   2469 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2470 
   2471 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2472 
   2473 	lp->d_checksum = dkcksum(lp);
   2474 }
   2475 /*
   2476  * Wait interruptibly for an exclusive lock.
   2477  *
   2478  * XXX
   2479  * Several drivers do this; it should be abstracted and made MP-safe.
   2480  * (Hmm... where have we seen this warning before :->  GO )
   2481  */
   2482 static int
   2483 raidlock(struct raid_softc *rs)
   2484 {
   2485 	int     error;
   2486 
   2487 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2488 		rs->sc_flags |= RAIDF_WANTED;
   2489 		if ((error =
   2490 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2491 			return (error);
   2492 	}
   2493 	rs->sc_flags |= RAIDF_LOCKED;
   2494 	return (0);
   2495 }
   2496 /*
   2497  * Unlock and wake up any waiters.
   2498  */
   2499 static void
   2500 raidunlock(struct raid_softc *rs)
   2501 {
   2502 
   2503 	rs->sc_flags &= ~RAIDF_LOCKED;
   2504 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2505 		rs->sc_flags &= ~RAIDF_WANTED;
   2506 		wakeup(rs);
   2507 	}
   2508 }
   2509 
   2510 
   2511 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2512 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2513 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2514 
   2515 static daddr_t
   2516 rf_component_info_offset(void)
   2517 {
   2518 
   2519 	return RF_COMPONENT_INFO_OFFSET;
   2520 }
   2521 
   2522 static daddr_t
   2523 rf_component_info_size(unsigned secsize)
   2524 {
   2525 	daddr_t info_size;
   2526 
   2527 	KASSERT(secsize);
   2528 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2529 		info_size = secsize;
   2530 	else
   2531 		info_size = RF_COMPONENT_INFO_SIZE;
   2532 
   2533 	return info_size;
   2534 }
   2535 
   2536 static daddr_t
   2537 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2538 {
   2539 	daddr_t map_offset;
   2540 
   2541 	KASSERT(raidPtr->bytesPerSector);
   2542 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2543 		map_offset = raidPtr->bytesPerSector;
   2544 	else
   2545 		map_offset = RF_COMPONENT_INFO_SIZE;
   2546 	map_offset += rf_component_info_offset();
   2547 
   2548 	return map_offset;
   2549 }
   2550 
   2551 static daddr_t
   2552 rf_parity_map_size(RF_Raid_t *raidPtr)
   2553 {
   2554 	daddr_t map_size;
   2555 
   2556 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2557 		map_size = raidPtr->bytesPerSector;
   2558 	else
   2559 		map_size = RF_PARITY_MAP_SIZE;
   2560 
   2561 	return map_size;
   2562 }
   2563 
   2564 int
   2565 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2566 {
   2567 	RF_ComponentLabel_t *clabel;
   2568 
   2569 	clabel = raidget_component_label(raidPtr, col);
   2570 	clabel->clean = RF_RAID_CLEAN;
   2571 	raidflush_component_label(raidPtr, col);
   2572 	return(0);
   2573 }
   2574 
   2575 
   2576 int
   2577 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2578 {
   2579 	RF_ComponentLabel_t *clabel;
   2580 
   2581 	clabel = raidget_component_label(raidPtr, col);
   2582 	clabel->clean = RF_RAID_DIRTY;
   2583 	raidflush_component_label(raidPtr, col);
   2584 	return(0);
   2585 }
   2586 
   2587 int
   2588 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2589 {
   2590 	KASSERT(raidPtr->bytesPerSector);
   2591 	return raidread_component_label(raidPtr->bytesPerSector,
   2592 	    raidPtr->Disks[col].dev,
   2593 	    raidPtr->raid_cinfo[col].ci_vp,
   2594 	    &raidPtr->raid_cinfo[col].ci_label);
   2595 }
   2596 
   2597 RF_ComponentLabel_t *
   2598 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2599 {
   2600 	return &raidPtr->raid_cinfo[col].ci_label;
   2601 }
   2602 
   2603 int
   2604 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2605 {
   2606 	RF_ComponentLabel_t *label;
   2607 
   2608 	label = &raidPtr->raid_cinfo[col].ci_label;
   2609 	label->mod_counter = raidPtr->mod_counter;
   2610 #ifndef RF_NO_PARITY_MAP
   2611 	label->parity_map_modcount = label->mod_counter;
   2612 #endif
   2613 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2614 	    raidPtr->Disks[col].dev,
   2615 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2616 }
   2617 
   2618 
   2619 static int
   2620 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2621     RF_ComponentLabel_t *clabel)
   2622 {
   2623 	return raidread_component_area(dev, b_vp, clabel,
   2624 	    sizeof(RF_ComponentLabel_t),
   2625 	    rf_component_info_offset(),
   2626 	    rf_component_info_size(secsize));
   2627 }
   2628 
   2629 /* ARGSUSED */
   2630 static int
   2631 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2632     size_t msize, daddr_t offset, daddr_t dsize)
   2633 {
   2634 	struct buf *bp;
   2635 	const struct bdevsw *bdev;
   2636 	int error;
   2637 
   2638 	/* XXX should probably ensure that we don't try to do this if
   2639 	   someone has changed rf_protected_sectors. */
   2640 
   2641 	if (b_vp == NULL) {
   2642 		/* For whatever reason, this component is not valid.
   2643 		   Don't try to read a component label from it. */
   2644 		return(EINVAL);
   2645 	}
   2646 
   2647 	/* get a block of the appropriate size... */
   2648 	bp = geteblk((int)dsize);
   2649 	bp->b_dev = dev;
   2650 
   2651 	/* get our ducks in a row for the read */
   2652 	bp->b_blkno = offset / DEV_BSIZE;
   2653 	bp->b_bcount = dsize;
   2654 	bp->b_flags |= B_READ;
   2655  	bp->b_resid = dsize;
   2656 
   2657 	bdev = bdevsw_lookup(bp->b_dev);
   2658 	if (bdev == NULL)
   2659 		return (ENXIO);
   2660 	(*bdev->d_strategy)(bp);
   2661 
   2662 	error = biowait(bp);
   2663 
   2664 	if (!error) {
   2665 		memcpy(data, bp->b_data, msize);
   2666 	}
   2667 
   2668 	brelse(bp, 0);
   2669 	return(error);
   2670 }
   2671 
   2672 
   2673 static int
   2674 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2675     RF_ComponentLabel_t *clabel)
   2676 {
   2677 	return raidwrite_component_area(dev, b_vp, clabel,
   2678 	    sizeof(RF_ComponentLabel_t),
   2679 	    rf_component_info_offset(),
   2680 	    rf_component_info_size(secsize), 0);
   2681 }
   2682 
   2683 /* ARGSUSED */
   2684 static int
   2685 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2686     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2687 {
   2688 	struct buf *bp;
   2689 	const struct bdevsw *bdev;
   2690 	int error;
   2691 
   2692 	/* get a block of the appropriate size... */
   2693 	bp = geteblk((int)dsize);
   2694 	bp->b_dev = dev;
   2695 
   2696 	/* get our ducks in a row for the write */
   2697 	bp->b_blkno = offset / DEV_BSIZE;
   2698 	bp->b_bcount = dsize;
   2699 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2700  	bp->b_resid = dsize;
   2701 
   2702 	memset(bp->b_data, 0, dsize);
   2703 	memcpy(bp->b_data, data, msize);
   2704 
   2705 	bdev = bdevsw_lookup(bp->b_dev);
   2706 	if (bdev == NULL)
   2707 		return (ENXIO);
   2708 	(*bdev->d_strategy)(bp);
   2709 	if (asyncp)
   2710 		return 0;
   2711 	error = biowait(bp);
   2712 	brelse(bp, 0);
   2713 	if (error) {
   2714 #if 1
   2715 		printf("Failed to write RAID component info!\n");
   2716 #endif
   2717 	}
   2718 
   2719 	return(error);
   2720 }
   2721 
   2722 void
   2723 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2724 {
   2725 	int c;
   2726 
   2727 	for (c = 0; c < raidPtr->numCol; c++) {
   2728 		/* Skip dead disks. */
   2729 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2730 			continue;
   2731 		/* XXXjld: what if an error occurs here? */
   2732 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2733 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2734 		    RF_PARITYMAP_NBYTE,
   2735 		    rf_parity_map_offset(raidPtr),
   2736 		    rf_parity_map_size(raidPtr), 0);
   2737 	}
   2738 }
   2739 
   2740 void
   2741 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2742 {
   2743 	struct rf_paritymap_ondisk tmp;
   2744 	int c,first;
   2745 
   2746 	first=1;
   2747 	for (c = 0; c < raidPtr->numCol; c++) {
   2748 		/* Skip dead disks. */
   2749 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2750 			continue;
   2751 		raidread_component_area(raidPtr->Disks[c].dev,
   2752 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2753 		    RF_PARITYMAP_NBYTE,
   2754 		    rf_parity_map_offset(raidPtr),
   2755 		    rf_parity_map_size(raidPtr));
   2756 		if (first) {
   2757 			memcpy(map, &tmp, sizeof(*map));
   2758 			first = 0;
   2759 		} else {
   2760 			rf_paritymap_merge(map, &tmp);
   2761 		}
   2762 	}
   2763 }
   2764 
/*
 * Bump the modification counter and mark the component labels of all
 * live components (and in-use spares) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now handle the spares that are in active use */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			/* NOTE(review): if no column references this spare,
			 * scol stays -1 (or keeps a stale value from a
			 * previous iteration) and is written into
			 * clabel->column below -- confirm this can't happen
			 * for a disk in rf_ds_used_spare state */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2824 
   2825 
   2826 void
   2827 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2828 {
   2829 	RF_ComponentLabel_t *clabel;
   2830 	int sparecol;
   2831 	int c;
   2832 	int j;
   2833 	int scol;
   2834 
   2835 	scol = -1;
   2836 
   2837 	/* XXX should do extra checks to make sure things really are clean,
   2838 	   rather than blindly setting the clean bit... */
   2839 
   2840 	raidPtr->mod_counter++;
   2841 
   2842 	for (c = 0; c < raidPtr->numCol; c++) {
   2843 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2844 			clabel = raidget_component_label(raidPtr, c);
   2845 			/* make sure status is noted */
   2846 			clabel->status = rf_ds_optimal;
   2847 
   2848 			/* note what unit we are configured as */
   2849 			clabel->last_unit = raidPtr->raidid;
   2850 
   2851 			raidflush_component_label(raidPtr, c);
   2852 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2853 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2854 					raidmarkclean(raidPtr, c);
   2855 				}
   2856 			}
   2857 		}
   2858 		/* else we don't touch it.. */
   2859 	}
   2860 
   2861 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2862 		sparecol = raidPtr->numCol + c;
   2863 		/* Need to ensure that the reconstruct actually completed! */
   2864 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2865 			/*
   2866 
   2867 			   we claim this disk is "optimal" if it's
   2868 			   rf_ds_used_spare, as that means it should be
   2869 			   directly substitutable for the disk it replaced.
   2870 			   We note that too...
   2871 
   2872 			 */
   2873 
   2874 			for(j=0;j<raidPtr->numCol;j++) {
   2875 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2876 					scol = j;
   2877 					break;
   2878 				}
   2879 			}
   2880 
   2881 			/* XXX shouldn't *really* need this... */
   2882 			clabel = raidget_component_label(raidPtr, sparecol);
   2883 			/* make sure status is noted */
   2884 
   2885 			raid_init_component_label(raidPtr, clabel);
   2886 
   2887 			clabel->column = scol;
   2888 			clabel->status = rf_ds_optimal;
   2889 			clabel->last_unit = raidPtr->raidid;
   2890 
   2891 			raidflush_component_label(raidPtr, sparecol);
   2892 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2893 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2894 					raidmarkclean(raidPtr, sparecol);
   2895 				}
   2896 			}
   2897 		}
   2898 	}
   2899 }
   2900 
   2901 void
   2902 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2903 {
   2904 
   2905 	if (vp != NULL) {
   2906 		if (auto_configured == 1) {
   2907 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2908 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2909 			vput(vp);
   2910 
   2911 		} else {
   2912 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2913 		}
   2914 	}
   2915 }
   2916 
   2917 
   2918 void
   2919 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2920 {
   2921 	int r,c;
   2922 	struct vnode *vp;
   2923 	int acd;
   2924 
   2925 
   2926 	/* We take this opportunity to close the vnodes like we should.. */
   2927 
   2928 	for (c = 0; c < raidPtr->numCol; c++) {
   2929 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2930 		acd = raidPtr->Disks[c].auto_configured;
   2931 		rf_close_component(raidPtr, vp, acd);
   2932 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2933 		raidPtr->Disks[c].auto_configured = 0;
   2934 	}
   2935 
   2936 	for (r = 0; r < raidPtr->numSpare; r++) {
   2937 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2938 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2939 		rf_close_component(raidPtr, vp, acd);
   2940 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2941 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2942 	}
   2943 }
   2944 
   2945 
   2946 void
   2947 rf_ReconThread(struct rf_recon_req *req)
   2948 {
   2949 	int     s;
   2950 	RF_Raid_t *raidPtr;
   2951 
   2952 	s = splbio();
   2953 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2954 	raidPtr->recon_in_progress = 1;
   2955 
   2956 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2957 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2958 
   2959 	RF_Free(req, sizeof(*req));
   2960 
   2961 	raidPtr->recon_in_progress = 0;
   2962 	splx(s);
   2963 
   2964 	/* That's all... */
   2965 	kthread_exit(0);	/* does not return */
   2966 }
   2967 
   2968 void
   2969 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2970 {
   2971 	int retcode;
   2972 	int s;
   2973 
   2974 	raidPtr->parity_rewrite_stripes_done = 0;
   2975 	raidPtr->parity_rewrite_in_progress = 1;
   2976 	s = splbio();
   2977 	retcode = rf_RewriteParity(raidPtr);
   2978 	splx(s);
   2979 	if (retcode) {
   2980 		printf("raid%d: Error re-writing parity (%d)!\n",
   2981 		    raidPtr->raidid, retcode);
   2982 	} else {
   2983 		/* set the clean bit!  If we shutdown correctly,
   2984 		   the clean bit on each component label will get
   2985 		   set */
   2986 		raidPtr->parity_good = RF_RAID_CLEAN;
   2987 	}
   2988 	raidPtr->parity_rewrite_in_progress = 0;
   2989 
   2990 	/* Anyone waiting for us to stop?  If so, inform them... */
   2991 	if (raidPtr->waitShutdown) {
   2992 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2993 	}
   2994 
   2995 	/* That's all... */
   2996 	kthread_exit(0);	/* does not return */
   2997 }
   2998 
   2999 
   3000 void
   3001 rf_CopybackThread(RF_Raid_t *raidPtr)
   3002 {
   3003 	int s;
   3004 
   3005 	raidPtr->copyback_in_progress = 1;
   3006 	s = splbio();
   3007 	rf_CopybackReconstructedData(raidPtr);
   3008 	splx(s);
   3009 	raidPtr->copyback_in_progress = 0;
   3010 
   3011 	/* That's all... */
   3012 	kthread_exit(0);	/* does not return */
   3013 }
   3014 
   3015 
   3016 void
   3017 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3018 {
   3019 	int s;
   3020 	RF_Raid_t *raidPtr;
   3021 
   3022 	s = splbio();
   3023 	raidPtr = req->raidPtr;
   3024 	raidPtr->recon_in_progress = 1;
   3025 	rf_ReconstructInPlace(raidPtr, req->col);
   3026 	RF_Free(req, sizeof(*req));
   3027 	raidPtr->recon_in_progress = 0;
   3028 	splx(s);
   3029 
   3030 	/* That's all... */
   3031 	kthread_exit(0);	/* does not return */
   3032 }
   3033 
/*
 * Read the component label from dev/vp and, if it looks like a
 * plausible RAIDframe component (label is reasonable and the
 * recorded partition size fits within `size'), prepend a new
 * RF_AutoConfig_t entry to ac_list; ownership of vp passes to that
 * entry.  Otherwise the vnode is closed and released here.  Returns
 * the (possibly updated) list head, or NULL if label memory could
 * not be allocated, in which case the whole list is torn down.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: discard everything collected so far.
		       NOTE(review): neither vp nor the vnodes held by the
		       discarded entries are closed on this path — looks
		       like a vnode leak; confirm before relying on it. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so release the label buffer
		   and the component's vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3091 
/*
 * Walk every disk-class device in the system looking for RAIDframe
 * components.  Three places are probed, in order: a wedge (dk) with
 * ptype DKW_PTYPE_RAIDFRAME, disklabel partitions of type FS_RAID,
 * and — if neither yielded anything — the raw partition itself.
 * Each hit is added to the RF_AutoConfig_t list via
 * rf_get_component(), which takes ownership of the opened vnode.
 * Returns the accumulated list (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		/* FSILENT: don't log an error for devices that aren't
		   actually there */
		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* A dk wedge is a component candidate if and only
			   if its partition type says so; no disklabel scan
			   is done for wedges. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component() now owns vp. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		/* No usable disklabel: move on to the next device. */
		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3273 
   3274 
   3275 int
   3276 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3277 {
   3278 
   3279 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3280 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3281 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3282 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3283 	    clabel->row >=0 &&
   3284 	    clabel->column >= 0 &&
   3285 	    clabel->num_rows > 0 &&
   3286 	    clabel->num_columns > 0 &&
   3287 	    clabel->row < clabel->num_rows &&
   3288 	    clabel->column < clabel->num_columns &&
   3289 	    clabel->blockSize > 0 &&
   3290 	    /*
   3291 	     * numBlocksHi may contain garbage, but it is ok since
   3292 	     * the type is unsigned.  If it is really garbage,
   3293 	     * rf_fix_old_label_size() will fix it.
   3294 	     */
   3295 	    rf_component_label_numblocks(clabel) > 0) {
   3296 		/*
   3297 		 * label looks reasonable enough...
   3298 		 * let's make sure it has no old garbage.
   3299 		 */
   3300 		if (numsecs)
   3301 			rf_fix_old_label_size(clabel, numsecs);
   3302 		return(1);
   3303 	}
   3304 	return(0);
   3305 }
   3306 
   3307 
   3308 /*
   3309  * For reasons yet unknown, some old component labels have garbage in
   3310  * the newer numBlocksHi region, and this causes lossage.  Since those
   3311  * disks will also have numsecs set to less than 32 bits of sectors,
   3312  * we can determine when this corruption has occurred, and fix it.
   3313  *
   3314  * The exact same problem, with the same unknown reason, happens to
   3315  * the partitionSizeHi member as well.
   3316  */
   3317 static void
   3318 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3319 {
   3320 
   3321 	if (numsecs < ((uint64_t)1 << 32)) {
   3322 		if (clabel->numBlocksHi) {
   3323 			printf("WARNING: total sectors < 32 bits, yet "
   3324 			       "numBlocksHi set\n"
   3325 			       "WARNING: resetting numBlocksHi to zero.\n");
   3326 			clabel->numBlocksHi = 0;
   3327 		}
   3328 
   3329 		if (clabel->partitionSizeHi) {
   3330 			printf("WARNING: total sectors < 32 bits, yet "
   3331 			       "partitionSizeHi set\n"
   3332 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3333 			clabel->partitionSizeHi = 0;
   3334 		}
   3335 	}
   3336 }
   3337 
   3338 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Index is root_partition masked to 2 bits; 3 is out of range. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3372 
   3373 RF_ConfigSet_t *
   3374 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3375 {
   3376 	RF_AutoConfig_t *ac;
   3377 	RF_ConfigSet_t *config_sets;
   3378 	RF_ConfigSet_t *cset;
   3379 	RF_AutoConfig_t *ac_next;
   3380 
   3381 
   3382 	config_sets = NULL;
   3383 
   3384 	/* Go through the AutoConfig list, and figure out which components
   3385 	   belong to what sets.  */
   3386 	ac = ac_list;
   3387 	while(ac!=NULL) {
   3388 		/* we're going to putz with ac->next, so save it here
   3389 		   for use at the end of the loop */
   3390 		ac_next = ac->next;
   3391 
   3392 		if (config_sets == NULL) {
   3393 			/* will need at least this one... */
   3394 			config_sets = (RF_ConfigSet_t *)
   3395 				malloc(sizeof(RF_ConfigSet_t),
   3396 				       M_RAIDFRAME, M_NOWAIT);
   3397 			if (config_sets == NULL) {
   3398 				panic("rf_create_auto_sets: No memory!");
   3399 			}
   3400 			/* this one is easy :) */
   3401 			config_sets->ac = ac;
   3402 			config_sets->next = NULL;
   3403 			config_sets->rootable = 0;
   3404 			ac->next = NULL;
   3405 		} else {
   3406 			/* which set does this component fit into? */
   3407 			cset = config_sets;
   3408 			while(cset!=NULL) {
   3409 				if (rf_does_it_fit(cset, ac)) {
   3410 					/* looks like it matches... */
   3411 					ac->next = cset->ac;
   3412 					cset->ac = ac;
   3413 					break;
   3414 				}
   3415 				cset = cset->next;
   3416 			}
   3417 			if (cset==NULL) {
   3418 				/* didn't find a match above... new set..*/
   3419 				cset = (RF_ConfigSet_t *)
   3420 					malloc(sizeof(RF_ConfigSet_t),
   3421 					       M_RAIDFRAME, M_NOWAIT);
   3422 				if (cset == NULL) {
   3423 					panic("rf_create_auto_sets: No memory!");
   3424 				}
   3425 				cset->ac = ac;
   3426 				ac->next = NULL;
   3427 				cset->next = config_sets;
   3428 				cset->rootable = 0;
   3429 				config_sets = cset;
   3430 			}
   3431 		}
   3432 		ac = ac_next;
   3433 	}
   3434 
   3435 
   3436 	return(config_sets);
   3437 }
   3438 
   3439 static int
   3440 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3441 {
   3442 	RF_ComponentLabel_t *clabel1, *clabel2;
   3443 
   3444 	/* If this one matches the *first* one in the set, that's good
   3445 	   enough, since the other members of the set would have been
   3446 	   through here too... */
   3447 	/* note that we are not checking partitionSize here..
   3448 
   3449 	   Note that we are also not checking the mod_counters here.
   3450 	   If everything else matches except the mod_counter, that's
   3451 	   good enough for this test.  We will deal with the mod_counters
   3452 	   a little later in the autoconfiguration process.
   3453 
   3454 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3455 
   3456 	   The reason we don't check for this is that failed disks
   3457 	   will have lower modification counts.  If those disks are
   3458 	   not added to the set they used to belong to, then they will
   3459 	   form their own set, which may result in 2 different sets,
   3460 	   for example, competing to be configured at raid0, and
   3461 	   perhaps competing to be the root filesystem set.  If the
   3462 	   wrong ones get configured, or both attempt to become /,
   3463 	   weird behaviour and or serious lossage will occur.  Thus we
   3464 	   need to bring them into the fold here, and kick them out at
   3465 	   a later point.
   3466 
   3467 	*/
   3468 
   3469 	clabel1 = cset->ac->clabel;
   3470 	clabel2 = ac->clabel;
   3471 	if ((clabel1->version == clabel2->version) &&
   3472 	    (clabel1->serial_number == clabel2->serial_number) &&
   3473 	    (clabel1->num_rows == clabel2->num_rows) &&
   3474 	    (clabel1->num_columns == clabel2->num_columns) &&
   3475 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3476 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3477 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3478 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3479 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3480 	    (clabel1->blockSize == clabel2->blockSize) &&
   3481 	    rf_component_label_numblocks(clabel1) ==
   3482 	    rf_component_label_numblocks(clabel2) &&
   3483 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3484 	    (clabel1->root_partition == clabel2->root_partition) &&
   3485 	    (clabel1->last_unit == clabel2->last_unit) &&
   3486 	    (clabel1->config_order == clabel2->config_order)) {
   3487 		/* if it get's here, it almost *has* to be a match */
   3488 	} else {
   3489 		/* it's not consistent with somebody in the set..
   3490 		   punt */
   3491 		return(0);
   3492 	}
   3493 	/* all was fine.. it must fit... */
   3494 	return(1);
   3495 }
   3496 
/*
 * Decide whether a config set has enough live, up-to-date components
 * to be configured.  "Up to date" means the component's mod_counter
 * equals the maximum found in the set.  RAID 1 is special-cased:
 * only the loss of BOTH halves of a mirror pair (even column and the
 * following odd column) is fatal.  For levels 0/4/5 the allowed
 * number of missing components is 0/1/1.  Returns 1 if configurable,
 * 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum mod_counter over all members) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd (second) half of a
				   mirror pair without bailing.. reset the
				   even_pair_failed flag, and go on to the
				   next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3599 
   3600 void
   3601 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3602 			RF_Raid_t *raidPtr)
   3603 {
   3604 	RF_ComponentLabel_t *clabel;
   3605 	int i;
   3606 
   3607 	clabel = ac->clabel;
   3608 
   3609 	/* 1. Fill in the common stuff */
   3610 	config->numRow = clabel->num_rows = 1;
   3611 	config->numCol = clabel->num_columns;
   3612 	config->numSpare = 0; /* XXX should this be set here? */
   3613 	config->sectPerSU = clabel->sectPerSU;
   3614 	config->SUsPerPU = clabel->SUsPerPU;
   3615 	config->SUsPerRU = clabel->SUsPerRU;
   3616 	config->parityConfig = clabel->parityConfig;
   3617 	/* XXX... */
   3618 	strcpy(config->diskQueueType,"fifo");
   3619 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3620 	config->layoutSpecificSize = 0; /* XXX ?? */
   3621 
   3622 	while(ac!=NULL) {
   3623 		/* row/col values will be in range due to the checks
   3624 		   in reasonable_label() */
   3625 		strcpy(config->devnames[0][ac->clabel->column],
   3626 		       ac->devname);
   3627 		ac = ac->next;
   3628 	}
   3629 
   3630 	for(i=0;i<RF_MAXDBGV;i++) {
   3631 		config->debugVars[i][0] = 0;
   3632 	}
   3633 }
   3634 
   3635 int
   3636 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3637 {
   3638 	RF_ComponentLabel_t *clabel;
   3639 	int column;
   3640 	int sparecol;
   3641 
   3642 	raidPtr->autoconfigure = new_value;
   3643 
   3644 	for(column=0; column<raidPtr->numCol; column++) {
   3645 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3646 			clabel = raidget_component_label(raidPtr, column);
   3647 			clabel->autoconfigure = new_value;
   3648 			raidflush_component_label(raidPtr, column);
   3649 		}
   3650 	}
   3651 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3652 		sparecol = raidPtr->numCol + column;
   3653 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3654 			clabel = raidget_component_label(raidPtr, sparecol);
   3655 			clabel->autoconfigure = new_value;
   3656 			raidflush_component_label(raidPtr, sparecol);
   3657 		}
   3658 	}
   3659 	return(new_value);
   3660 }
   3661 
   3662 int
   3663 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3664 {
   3665 	RF_ComponentLabel_t *clabel;
   3666 	int column;
   3667 	int sparecol;
   3668 
   3669 	raidPtr->root_partition = new_value;
   3670 	for(column=0; column<raidPtr->numCol; column++) {
   3671 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3672 			clabel = raidget_component_label(raidPtr, column);
   3673 			clabel->root_partition = new_value;
   3674 			raidflush_component_label(raidPtr, column);
   3675 		}
   3676 	}
   3677 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3678 		sparecol = raidPtr->numCol + column;
   3679 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3680 			clabel = raidget_component_label(raidPtr, sparecol);
   3681 			clabel->root_partition = new_value;
   3682 			raidflush_component_label(raidPtr, sparecol);
   3683 		}
   3684 	}
   3685 	return(new_value);
   3686 }
   3687 
   3688 void
   3689 rf_release_all_vps(RF_ConfigSet_t *cset)
   3690 {
   3691 	RF_AutoConfig_t *ac;
   3692 
   3693 	ac = cset->ac;
   3694 	while(ac!=NULL) {
   3695 		/* Close the vp, and give it back */
   3696 		if (ac->vp) {
   3697 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3698 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3699 			vput(ac->vp);
   3700 			ac->vp = NULL;
   3701 		}
   3702 		ac = ac->next;
   3703 	}
   3704 }
   3705 
   3706 
   3707 void
   3708 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3709 {
   3710 	RF_AutoConfig_t *ac;
   3711 	RF_AutoConfig_t *next_ac;
   3712 
   3713 	ac = cset->ac;
   3714 	while(ac!=NULL) {
   3715 		next_ac = ac->next;
   3716 		/* nuke the label */
   3717 		free(ac->clabel, M_RAIDFRAME);
   3718 		/* cleanup the config structure */
   3719 		free(ac, M_RAIDFRAME);
   3720 		/* "next.." */
   3721 		ac = next_ac;
   3722 	}
   3723 	/* and, finally, nuke the config set */
   3724 	free(cset, M_RAIDFRAME);
   3725 }
   3726 
   3727 
/*
 * Initialize *clabel from the RAID set's current in-core state:
 * version, serial/mod counters, geometry, layout parameters, and
 * configuration flags.  The label is marked dirty and optimal;
 * callers adjust column/status afterwards as needed.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the count into numBlocks/numBlocksHi words */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3760 
/*
 * Configure one auto-detected RAID set.  Builds an RF_Config_t from
 * the set's component labels, picks a free raid unit (preferring the
 * unit recorded in the label), and configures the array.
 *
 * Returns the attached softc on success, or NULL if memory could not
 * be allocated or rf_Configure() failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Start at the label's last unit; walk forward until we find a
	   unit that is not already configured (valid == 0). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		/* parity is unknown until checked/rewritten */
		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3834 
   3835 void
   3836 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3837 {
   3838 	struct buf *bp;
   3839 	struct raid_softc *rs;
   3840 
   3841 	bp = (struct buf *)desc->bp;
   3842 	rs = desc->raidPtr->softc;
   3843 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3844 	    (bp->b_flags & B_READ));
   3845 }
   3846 
/*
 * Initialize a pool with RAIDframe's standard settings: IPL_BIO
 * protection and no special alignment, pre-primed with xmin items
 * and with the idle-item high-water mark set to xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);	/* cap cached idle items at xmax */
	pool_prime(p, xmin);	/* allocate xmin items up front */
	pool_setlowat(p, xmin);	/* try to keep at least xmin available */
}
   3856 
   3857 /*
 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buf_queue to see
   3859  * if there is IO pending and if that IO could possibly be done for a
   3860  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3861  * otherwise.
   3862  *
   3863  */
   3864 
   3865 int
   3866 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3867 {
   3868 	struct raid_softc *rs = raidPtr->softc;
   3869 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3870 		/* there is work to do */
   3871 		return 0;
   3872 	}
   3873 	/* default is nothing to do */
   3874 	return 1;
   3875 }
   3876 
   3877 int
   3878 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3879 {
   3880 	uint64_t numsecs;
   3881 	unsigned secsize;
   3882 	int error;
   3883 
   3884 	error = getdisksize(vp, &numsecs, &secsize);
   3885 	if (error == 0) {
   3886 		diskPtr->blockSize = secsize;
   3887 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3888 		diskPtr->partitionSize = numsecs;
   3889 		return 0;
   3890 	}
   3891 	return error;
   3892 }
   3893 
/*
 * Autoconfiguration match routine.  raid is a pseudo-device, so it
 * always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3899 
/*
 * Autoconfiguration attach routine.  Intentionally empty: the real
 * setup happens when a set is configured (see raidinit() callers
 * such as rf_auto_config_set()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3905 
   3906 
   3907 static int
   3908 raid_detach(device_t self, int flags)
   3909 {
   3910 	int error;
   3911 	struct raid_softc *rs = raidget(device_unit(self));
   3912 
   3913 	if (rs == NULL)
   3914 		return ENXIO;
   3915 
   3916 	if ((error = raidlock(rs)) != 0)
   3917 		return (error);
   3918 
   3919 	error = raid_detach_unlocked(rs);
   3920 
   3921 	raidunlock(rs);
   3922 
   3923 	/* XXXkd: raidput(rs) ??? */
   3924 
   3925 	return error;
   3926 }
   3927 
   3928 static void
   3929 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3930 {
   3931 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3932 
   3933 	memset(dg, 0, sizeof(*dg));
   3934 
   3935 	dg->dg_secperunit = raidPtr->totalSectors;
   3936 	dg->dg_secsize = raidPtr->bytesPerSector;
   3937 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3938 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3939 
   3940 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3941 }
   3942 
   3943 /*
   3944  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3945  * We end up returning whatever error was returned by the first cache flush
   3946  * that fails.
   3947  */
   3948 
   3949 int
   3950 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3951 {
   3952 	int c, sparecol;
   3953 	int e,error;
   3954 	int force = 1;
   3955 
   3956 	error = 0;
   3957 	for (c = 0; c < raidPtr->numCol; c++) {
   3958 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3959 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3960 					  &force, FWRITE, NOCRED);
   3961 			if (e) {
   3962 				if (e != ENODEV)
   3963 					printf("raid%d: cache flush to component %s failed.\n",
   3964 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3965 				if (error == 0) {
   3966 					error = e;
   3967 				}
   3968 			}
   3969 		}
   3970 	}
   3971 
   3972 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3973 		sparecol = raidPtr->numCol + c;
   3974 		/* Need to ensure that the reconstruct actually completed! */
   3975 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3976 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3977 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3978 			if (e) {
   3979 				if (e != ENODEV)
   3980 					printf("raid%d: cache flush to component %s failed.\n",
   3981 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3982 				if (error == 0) {
   3983 					error = e;
   3984 				}
   3985 			}
   3986 		}
   3987 	}
   3988 	return error;
   3989 }
   3990