Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.312.2.4
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.312.2.4 2014/12/22 02:19:32 msaitoh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.312.2.4 2014/12/22 02:19:32 msaitoh Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	.d_open = raidopen,
    209 	.d_close = raidclose,
    210 	.d_strategy = raidstrategy,
    211 	.d_ioctl = raidioctl,
    212 	.d_dump = raiddump,
    213 	.d_psize = raidsize,
    214 	.d_discard = nodiscard,
    215 	.d_flag = D_DISK
    216 };
    217 
    218 const struct cdevsw raid_cdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_read = raidread,
    222 	.d_write = raidwrite,
    223 	.d_ioctl = raidioctl,
    224 	.d_stop = nostop,
    225 	.d_tty = notty,
    226 	.d_poll = nopoll,
    227 	.d_mmap = nommap,
    228 	.d_kqfilter = nokqfilter,
    229 	.d_discard = nodiscard,
    230 	.d_flag = D_DISK
    231 };
    232 
    233 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    234 
    235 struct raid_softc {
    236 	device_t sc_dev;
    237 	int	sc_unit;
    238 	int     sc_flags;	/* flags */
    239 	int     sc_cflags;	/* configuration flags */
    240 	uint64_t sc_size;	/* size of the raid device */
    241 	char    sc_xname[20];	/* XXX external name */
    242 	struct disk sc_dkdev;	/* generic disk device info */
    243 	struct bufq_state *buf_queue;	/* used for the device queue */
    244 	RF_Raid_t sc_r;
    245 	LIST_ENTRY(raid_softc) sc_link;
    246 };
    247 /* sc_flags */
    248 #define RAIDF_INITED	0x01	/* unit has been initialized */
    249 #define RAIDF_WLABEL	0x02	/* label area is writable */
    250 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    251 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    252 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    253 #define RAIDF_LOCKED	0x80	/* unit is locked */
    254 
    255 #define	raidunit(x)	DISKUNIT(x)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /*
    263  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    264  * Be aware that large numbers can allow the driver to consume a lot of
    265  * kernel memory, especially on writes, and in degraded mode reads.
    266  *
    267  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    268  * a single 64K write will typically require 64K for the old data,
    269  * 64K for the old parity, and 64K for the new parity, for a total
    270  * of 192K (if the parity buffer is not re-used immediately).
    271  * Even it if is used immediately, that's still 128K, which when multiplied
    272  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    273  *
    274  * Now in degraded mode, for example, a 64K read on the above setup may
    275  * require data reconstruction, which will require *all* of the 4 remaining
    276  * disks to participate -- 4 * 32K/disk == 128K again.
    277  */
    278 
    279 #ifndef RAIDOUTSTANDING
    280 #define RAIDOUTSTANDING   6
    281 #endif
    282 
    283 #define RAIDLABELDEV(dev)	\
    284 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    285 
    286 /* declared here, and made public, for the benefit of KVM stuff.. */
    287 
    288 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    289 				     struct disklabel *);
    290 static void raidgetdisklabel(dev_t);
    291 static void raidmakedisklabel(struct raid_softc *);
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	if (sc == NULL) {
    342 #ifdef DIAGNOSTIC
    343 		printf("%s: out of memory\n", __func__);
    344 #endif
    345 		return NULL;
    346 	}
    347 	sc->sc_unit = unit;
    348 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    349 	return sc;
    350 }
    351 
/*
 * Release a softc created by raidcreate(): free the buffer queue
 * first, then the softc itself.  The caller must have removed the
 * softc from the global list (or never inserted it).
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    357 
    358 static struct raid_softc *
    359 raidget(int unit) {
    360 	struct raid_softc *sc;
    361 	if (unit < 0) {
    362 #ifdef DIAGNOSTIC
    363 		panic("%s: unit %d!", __func__, unit);
    364 #endif
    365 		return NULL;
    366 	}
    367 	mutex_enter(&raid_lock);
    368 	LIST_FOREACH(sc, &raids, sc_link) {
    369 		if (sc->sc_unit == unit) {
    370 			mutex_exit(&raid_lock);
    371 			return sc;
    372 		}
    373 	}
    374 	mutex_exit(&raid_lock);
    375 	if ((sc = raidcreate(unit)) == NULL)
    376 		return NULL;
    377 	mutex_enter(&raid_lock);
    378 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    379 	mutex_exit(&raid_lock);
    380 	return sc;
    381 }
    382 
/*
 * Unlink a softc from the global list and destroy it.  Must not be
 * called while the unit is still configured or open.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    390 
/*
 * Pseudo-device attach entry point, called once at boot.  "num" (the
 * unit count from the kernel config) is unused here; softcs are
 * created on demand by raidget().  Initializes the global lock and
 * (optionally) the spare-table wait machinery, boots the RAIDframe
 * core, hooks the driver into autoconf, and registers a finalizer so
 * auto-configuration runs after all real hardware has attached.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for the spare-table installation protocol. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Boot the RAIDframe core; failure here is unrecoverable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    423 
/*
 * Autoconf finalizer: locate all RAID components on the system and
 * configure eligible sets.  Runs at most once (guarded by
 * raidautoconfigdone) and only when raidautoconfig is enabled.
 * Returns 1 when a configuration pass was performed, 0 when there
 * was nothing to do.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
/*
 * Configure every eligible auto-configured set in "config_sets" and,
 * if appropriate, point booted_device at a RAID set so the root
 * filesystem can live on RAID.  Consumes the config-set list: all
 * resources are released and the list is cleaned up regardless of
 * whether configuration succeeded.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of sets marked root-eligible */
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... (if num_root == 0 the
	   conditionals below do nothing and we leave booted_device
	   alone) */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* root set has wedges: boot from the 'a'-named
			   wedge rather than the raw raid device */
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		/* Override root if nothing was booted yet, if the set
		   is forced root (root_partition == 1), or if the set
		   contains the device we actually booted from. */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Disambiguate: count only configured, root-eligible
		   sets that contain the actual boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    596 
    597 
    598 int
    599 raidsize(dev_t dev)
    600 {
    601 	struct raid_softc *rs;
    602 	struct disklabel *lp;
    603 	int     part, unit, omask, size;
    604 
    605 	unit = raidunit(dev);
    606 	if ((rs = raidget(unit)) == NULL)
    607 		return -1;
    608 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    609 		return (-1);
    610 
    611 	part = DISKPART(dev);
    612 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    613 	lp = rs->sc_dkdev.dk_label;
    614 
    615 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    616 		return (-1);
    617 
    618 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    619 		size = -1;
    620 	else
    621 		size = lp->d_partitions[part].p_size *
    622 		    (lp->d_secsize / DEV_BSIZE);
    623 
    624 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    625 		return (-1);
    626 
    627 	return (size);
    628 
    629 }
    630 
/*
 * Kernel crash-dump entry point.  Writes "size" bytes from "va" at
 * block "blkno" of the given partition, by passing the request
 * straight through to one live underlying component.  Only RAID 1
 * sets (1 data + 1 parity column) are supported, since only there
 * does a single component hold a complete copy of the data.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be a whole number of DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* Reject requests extending past the end of the raid device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: the first optimal (live) component wins. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
    767 /* ARGSUSED */
/*
 * Open entry point (both block and character devices).  Validates
 * the partition, reads the disklabel on first open of a configured
 * set, records the open in the per-mode open masks, and marks all
 * components dirty on the first open so a crash while open is
 * detectable.  Note that the "bad:" label is reached on the success
 * path as well; it only releases the unit lock.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Unit is on its way out; refuse new opens. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured, wedge-less set: (re)read the
	   disklabel.  raidgetdisklabel() fills in dk_label in place,
	   which "lp" above already points at. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_nwedges == 0) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	/* Recompute the combined open mask from the per-mode masks. */
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
    852 /* ARGSUSED */
    853 int
    854 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    855 {
    856 	int     unit = raidunit(dev);
    857 	struct raid_softc *rs;
    858 	int     error = 0;
    859 	int     part;
    860 
    861 	if ((rs = raidget(unit)) == NULL)
    862 		return ENXIO;
    863 
    864 	if ((error = raidlock(rs)) != 0)
    865 		return (error);
    866 
    867 	part = DISKPART(dev);
    868 
    869 	/* ...that much closer to allowing unconfiguration... */
    870 	switch (fmt) {
    871 	case S_IFCHR:
    872 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    873 		break;
    874 
    875 	case S_IFBLK:
    876 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    877 		break;
    878 	}
    879 	rs->sc_dkdev.dk_openmask =
    880 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    881 
    882 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    883 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    884 		/* Last one... device is not unconfigured yet.
    885 		   Device shutdown has taken care of setting the
    886 		   clean bits if RAIDF_INITED is not set
    887 		   mark things as clean... */
    888 
    889 		rf_update_component_labels(&rs->sc_r,
    890 						 RF_FINAL_COMPONENT_UPDATE);
    891 
    892 		/* If the kernel is shutting down, it will detach
    893 		 * this RAID set soon enough.
    894 		 */
    895 	}
    896 
    897 	raidunlock(rs);
    898 	return (0);
    899 
    900 }
    901 
    902 void
    903 raidstrategy(struct buf *bp)
    904 {
    905 	unsigned int unit = raidunit(bp->b_dev);
    906 	RF_Raid_t *raidPtr;
    907 	int     wlabel;
    908 	struct raid_softc *rs;
    909 
    910 	if ((rs = raidget(unit)) == NULL) {
    911 		bp->b_error = ENXIO;
    912 		goto done;
    913 	}
    914 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    915 		bp->b_error = ENXIO;
    916 		goto done;
    917 	}
    918 	raidPtr = &rs->sc_r;
    919 	if (!raidPtr->valid) {
    920 		bp->b_error = ENODEV;
    921 		goto done;
    922 	}
    923 	if (bp->b_bcount == 0) {
    924 		db1_printf(("b_bcount is zero..\n"));
    925 		goto done;
    926 	}
    927 
    928 	/*
    929 	 * Do bounds checking and adjust transfer.  If there's an
    930 	 * error, the bounds check will flag that for us.
    931 	 */
    932 
    933 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    934 	if (DISKPART(bp->b_dev) == RAW_PART) {
    935 		uint64_t size; /* device size in DEV_BSIZE unit */
    936 
    937 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    938 			size = raidPtr->totalSectors <<
    939 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    940 		} else {
    941 			size = raidPtr->totalSectors >>
    942 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    943 		}
    944 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    945 			goto done;
    946 		}
    947 	} else {
    948 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    949 			db1_printf(("Bounds check failed!!:%d %d\n",
    950 				(int) bp->b_blkno, (int) wlabel));
    951 			goto done;
    952 		}
    953 	}
    954 
    955 	rf_lock_mutex2(raidPtr->iodone_lock);
    956 
    957 	bp->b_resid = 0;
    958 
    959 	/* stuff it onto our queue */
    960 	bufq_put(rs->buf_queue, bp);
    961 
    962 	/* scheduled the IO to happen at the next convenient time */
    963 	rf_signal_cond2(raidPtr->iodone_cv);
    964 	rf_unlock_mutex2(raidPtr->iodone_lock);
    965 
    966 	return;
    967 
    968 done:
    969 	bp->b_resid = bp->b_bcount;
    970 	biodone(bp);
    971 }
    972 /* ARGSUSED */
    973 int
    974 raidread(dev_t dev, struct uio *uio, int flags)
    975 {
    976 	int     unit = raidunit(dev);
    977 	struct raid_softc *rs;
    978 
    979 	if ((rs = raidget(unit)) == NULL)
    980 		return ENXIO;
    981 
    982 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    983 		return (ENXIO);
    984 
    985 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    986 
    987 }
    988 /* ARGSUSED */
    989 int
    990 raidwrite(dev_t dev, struct uio *uio, int flags)
    991 {
    992 	int     unit = raidunit(dev);
    993 	struct raid_softc *rs;
    994 
    995 	if ((rs = raidget(unit)) == NULL)
    996 		return ENXIO;
    997 
    998 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    999 		return (ENXIO);
   1000 
   1001 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1002 
   1003 }
   1004 
   1005 static int
   1006 raid_detach_unlocked(struct raid_softc *rs)
   1007 {
   1008 	int error;
   1009 	RF_Raid_t *raidPtr;
   1010 
   1011 	raidPtr = &rs->sc_r;
   1012 
   1013 	/*
   1014 	 * If somebody has a partition mounted, we shouldn't
   1015 	 * shutdown.
   1016 	 */
   1017 	if (rs->sc_dkdev.dk_openmask != 0)
   1018 		return EBUSY;
   1019 
   1020 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1021 		;	/* not initialized: nothing to do */
   1022 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1023 		return error;
   1024 	else
   1025 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1026 
   1027 	/* Detach the disk. */
   1028 	dkwedge_delall(&rs->sc_dkdev);
   1029 	disk_detach(&rs->sc_dkdev);
   1030 	disk_destroy(&rs->sc_dkdev);
   1031 
   1032 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1033 
   1034 	return 0;
   1035 }
   1036 
   1037 int
   1038 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1039 {
   1040 	int     unit = raidunit(dev);
   1041 	int     error = 0;
   1042 	int     part, pmask, s;
   1043 	cfdata_t cf;
   1044 	struct raid_softc *rs;
   1045 	RF_Config_t *k_cfg, *u_cfg;
   1046 	RF_Raid_t *raidPtr;
   1047 	RF_RaidDisk_t *diskPtr;
   1048 	RF_AccTotals_t *totals;
   1049 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1050 	u_char *specific_buf;
   1051 	int retcode = 0;
   1052 	int column;
   1053 /*	int raidid; */
   1054 	struct rf_recon_req *rrcopy, *rr;
   1055 	RF_ComponentLabel_t *clabel;
   1056 	RF_ComponentLabel_t *ci_label;
   1057 	RF_ComponentLabel_t **clabel_ptr;
   1058 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1059 	RF_SingleComponent_t component;
   1060 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1061 	int i, j, d;
   1062 #ifdef __HAVE_OLD_DISKLABEL
   1063 	struct disklabel newlabel;
   1064 #endif
   1065 	struct dkwedge_info *dkw;
   1066 
   1067 	if ((rs = raidget(unit)) == NULL)
   1068 		return ENXIO;
   1069 	raidPtr = &rs->sc_r;
   1070 
   1071 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1072 		(int) DISKPART(dev), (int) unit, cmd));
   1073 
   1074 	/* Must be open for writes for these commands... */
   1075 	switch (cmd) {
   1076 #ifdef DIOCGSECTORSIZE
   1077 	case DIOCGSECTORSIZE:
   1078 		*(u_int *)data = raidPtr->bytesPerSector;
   1079 		return 0;
   1080 	case DIOCGMEDIASIZE:
   1081 		*(off_t *)data =
   1082 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1083 		return 0;
   1084 #endif
   1085 	case DIOCSDINFO:
   1086 	case DIOCWDINFO:
   1087 #ifdef __HAVE_OLD_DISKLABEL
   1088 	case ODIOCWDINFO:
   1089 	case ODIOCSDINFO:
   1090 #endif
   1091 	case DIOCWLABEL:
   1092 	case DIOCAWEDGE:
   1093 	case DIOCDWEDGE:
   1094 	case DIOCMWEDGES:
   1095 	case DIOCSSTRATEGY:
   1096 		if ((flag & FWRITE) == 0)
   1097 			return (EBADF);
   1098 	}
   1099 
   1100 	/* Must be initialized for these... */
   1101 	switch (cmd) {
   1102 	case DIOCGDINFO:
   1103 	case DIOCSDINFO:
   1104 	case DIOCWDINFO:
   1105 #ifdef __HAVE_OLD_DISKLABEL
   1106 	case ODIOCGDINFO:
   1107 	case ODIOCWDINFO:
   1108 	case ODIOCSDINFO:
   1109 	case ODIOCGDEFLABEL:
   1110 #endif
   1111 	case DIOCGPART:
   1112 	case DIOCWLABEL:
   1113 	case DIOCGDEFLABEL:
   1114 	case DIOCAWEDGE:
   1115 	case DIOCDWEDGE:
   1116 	case DIOCLWEDGES:
   1117 	case DIOCMWEDGES:
   1118 	case DIOCCACHESYNC:
   1119 	case RAIDFRAME_SHUTDOWN:
   1120 	case RAIDFRAME_REWRITEPARITY:
   1121 	case RAIDFRAME_GET_INFO:
   1122 	case RAIDFRAME_RESET_ACCTOTALS:
   1123 	case RAIDFRAME_GET_ACCTOTALS:
   1124 	case RAIDFRAME_KEEP_ACCTOTALS:
   1125 	case RAIDFRAME_GET_SIZE:
   1126 	case RAIDFRAME_FAIL_DISK:
   1127 	case RAIDFRAME_COPYBACK:
   1128 	case RAIDFRAME_CHECK_RECON_STATUS:
   1129 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1130 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1131 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1132 	case RAIDFRAME_ADD_HOT_SPARE:
   1133 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1134 	case RAIDFRAME_INIT_LABELS:
   1135 	case RAIDFRAME_REBUILD_IN_PLACE:
   1136 	case RAIDFRAME_CHECK_PARITY:
   1137 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1138 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1139 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1140 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1141 	case RAIDFRAME_SET_AUTOCONFIG:
   1142 	case RAIDFRAME_SET_ROOT:
   1143 	case RAIDFRAME_DELETE_COMPONENT:
   1144 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1145 	case RAIDFRAME_PARITYMAP_STATUS:
   1146 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1147 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1148 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1149 	case DIOCGSTRATEGY:
   1150 	case DIOCSSTRATEGY:
   1151 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1152 			return (ENXIO);
   1153 	}
   1154 
   1155 	switch (cmd) {
   1156 #ifdef COMPAT_50
   1157 	case RAIDFRAME_GET_INFO50:
   1158 		return rf_get_info50(raidPtr, data);
   1159 
   1160 	case RAIDFRAME_CONFIGURE50:
   1161 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1162 			return retcode;
   1163 		goto config;
   1164 #endif
   1165 		/* configure the system */
   1166 	case RAIDFRAME_CONFIGURE:
   1167 
   1168 		if (raidPtr->valid) {
   1169 			/* There is a valid RAID set running on this unit! */
   1170 			printf("raid%d: Device already configured!\n",unit);
   1171 			return(EINVAL);
   1172 		}
   1173 
   1174 		/* copy-in the configuration information */
   1175 		/* data points to a pointer to the configuration structure */
   1176 
   1177 		u_cfg = *((RF_Config_t **) data);
   1178 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1179 		if (k_cfg == NULL) {
   1180 			return (ENOMEM);
   1181 		}
   1182 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1183 		if (retcode) {
   1184 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1185 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1186 				retcode));
   1187 			return (retcode);
   1188 		}
   1189 		goto config;
   1190 	config:
   1191 		/* allocate a buffer for the layout-specific data, and copy it
   1192 		 * in */
   1193 		if (k_cfg->layoutSpecificSize) {
   1194 			if (k_cfg->layoutSpecificSize > 10000) {
   1195 				/* sanity check */
   1196 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1197 				return (EINVAL);
   1198 			}
   1199 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1200 			    (u_char *));
   1201 			if (specific_buf == NULL) {
   1202 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1203 				return (ENOMEM);
   1204 			}
   1205 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1206 			    k_cfg->layoutSpecificSize);
   1207 			if (retcode) {
   1208 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1209 				RF_Free(specific_buf,
   1210 					k_cfg->layoutSpecificSize);
   1211 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1212 					retcode));
   1213 				return (retcode);
   1214 			}
   1215 		} else
   1216 			specific_buf = NULL;
   1217 		k_cfg->layoutSpecific = specific_buf;
   1218 
   1219 		/* should do some kind of sanity check on the configuration.
   1220 		 * Store the sum of all the bytes in the last byte? */
   1221 
   1222 		/* configure the system */
   1223 
   1224 		/*
   1225 		 * Clear the entire RAID descriptor, just to make sure
   1226 		 *  there is no stale data left in the case of a
   1227 		 *  reconfiguration
   1228 		 */
   1229 		memset(raidPtr, 0, sizeof(*raidPtr));
   1230 		raidPtr->softc = rs;
   1231 		raidPtr->raidid = unit;
   1232 
   1233 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1234 
   1235 		if (retcode == 0) {
   1236 
   1237 			/* allow this many simultaneous IO's to
   1238 			   this RAID device */
   1239 			raidPtr->openings = RAIDOUTSTANDING;
   1240 
   1241 			raidinit(rs);
   1242 			rf_markalldirty(raidPtr);
   1243 		}
   1244 		/* free the buffers.  No return code here. */
   1245 		if (k_cfg->layoutSpecificSize) {
   1246 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1247 		}
   1248 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1249 
   1250 		return (retcode);
   1251 
   1252 		/* shutdown the system */
   1253 	case RAIDFRAME_SHUTDOWN:
   1254 
   1255 		part = DISKPART(dev);
   1256 		pmask = (1 << part);
   1257 
   1258 		if ((error = raidlock(rs)) != 0)
   1259 			return (error);
   1260 
   1261 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1262 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1263 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1264 			retcode = EBUSY;
   1265 		else {
   1266 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1267 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1268 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1269 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1270 			retcode = 0;
   1271 		}
   1272 
   1273 		raidunlock(rs);
   1274 
   1275 		if (retcode != 0)
   1276 			return retcode;
   1277 
   1278 		/* free the pseudo device attach bits */
   1279 
   1280 		cf = device_cfdata(rs->sc_dev);
   1281 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1282 			free(cf, M_RAIDFRAME);
   1283 
   1284 		return (retcode);
   1285 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1286 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1287 		/* need to read the component label for the disk indicated
   1288 		   by row,column in clabel */
   1289 
   1290 		/*
   1291 		 * Perhaps there should be an option to skip the in-core
   1292 		 * copy and hit the disk, as with disklabel(8).
   1293 		 */
   1294 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1295 
   1296 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1297 
   1298 		if (retcode) {
   1299 			RF_Free(clabel, sizeof(*clabel));
   1300 			return retcode;
   1301 		}
   1302 
   1303 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1304 
   1305 		column = clabel->column;
   1306 
   1307 		if ((column < 0) || (column >= raidPtr->numCol +
   1308 		    raidPtr->numSpare)) {
   1309 			RF_Free(clabel, sizeof(*clabel));
   1310 			return EINVAL;
   1311 		}
   1312 
   1313 		RF_Free(clabel, sizeof(*clabel));
   1314 
   1315 		clabel = raidget_component_label(raidPtr, column);
   1316 
   1317 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1318 
   1319 #if 0
   1320 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1321 		clabel = (RF_ComponentLabel_t *) data;
   1322 
   1323 		/* XXX check the label for valid stuff... */
   1324 		/* Note that some things *should not* get modified --
   1325 		   the user should be re-initing the labels instead of
   1326 		   trying to patch things.
   1327 		   */
   1328 
   1329 		raidid = raidPtr->raidid;
   1330 #ifdef DEBUG
   1331 		printf("raid%d: Got component label:\n", raidid);
   1332 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1333 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1334 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1335 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1336 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1337 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1338 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1339 #endif
   1340 		clabel->row = 0;
   1341 		column = clabel->column;
   1342 
   1343 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1344 			return(EINVAL);
   1345 		}
   1346 
   1347 		/* XXX this isn't allowed to do anything for now :-) */
   1348 
   1349 		/* XXX and before it is, we need to fill in the rest
   1350 		   of the fields!?!?!?! */
   1351 		memcpy(raidget_component_label(raidPtr, column),
   1352 		    clabel, sizeof(*clabel));
   1353 		raidflush_component_label(raidPtr, column);
   1354 		return (0);
   1355 #endif
   1356 
   1357 	case RAIDFRAME_INIT_LABELS:
   1358 		clabel = (RF_ComponentLabel_t *) data;
   1359 		/*
   1360 		   we only want the serial number from
   1361 		   the above.  We get all the rest of the information
   1362 		   from the config that was used to create this RAID
   1363 		   set.
   1364 		   */
   1365 
   1366 		raidPtr->serial_number = clabel->serial_number;
   1367 
   1368 		for(column=0;column<raidPtr->numCol;column++) {
   1369 			diskPtr = &raidPtr->Disks[column];
   1370 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1371 				ci_label = raidget_component_label(raidPtr,
   1372 				    column);
   1373 				/* Zeroing this is important. */
   1374 				memset(ci_label, 0, sizeof(*ci_label));
   1375 				raid_init_component_label(raidPtr, ci_label);
   1376 				ci_label->serial_number =
   1377 				    raidPtr->serial_number;
   1378 				ci_label->row = 0; /* we dont' pretend to support more */
   1379 				rf_component_label_set_partitionsize(ci_label,
   1380 				    diskPtr->partitionSize);
   1381 				ci_label->column = column;
   1382 				raidflush_component_label(raidPtr, column);
   1383 			}
   1384 			/* XXXjld what about the spares? */
   1385 		}
   1386 
   1387 		return (retcode);
   1388 	case RAIDFRAME_SET_AUTOCONFIG:
   1389 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1390 		printf("raid%d: New autoconfig value is: %d\n",
   1391 		       raidPtr->raidid, d);
   1392 		*(int *) data = d;
   1393 		return (retcode);
   1394 
   1395 	case RAIDFRAME_SET_ROOT:
   1396 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1397 		printf("raid%d: New rootpartition value is: %d\n",
   1398 		       raidPtr->raidid, d);
   1399 		*(int *) data = d;
   1400 		return (retcode);
   1401 
   1402 		/* initialize all parity */
   1403 	case RAIDFRAME_REWRITEPARITY:
   1404 
   1405 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1406 			/* Parity for RAID 0 is trivially correct */
   1407 			raidPtr->parity_good = RF_RAID_CLEAN;
   1408 			return(0);
   1409 		}
   1410 
   1411 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1412 			/* Re-write is already in progress! */
   1413 			return(EINVAL);
   1414 		}
   1415 
   1416 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1417 					   rf_RewriteParityThread,
   1418 					   raidPtr,"raid_parity");
   1419 		return (retcode);
   1420 
   1421 
   1422 	case RAIDFRAME_ADD_HOT_SPARE:
   1423 		sparePtr = (RF_SingleComponent_t *) data;
   1424 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1425 		retcode = rf_add_hot_spare(raidPtr, &component);
   1426 		return(retcode);
   1427 
   1428 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1429 		return(retcode);
   1430 
   1431 	case RAIDFRAME_DELETE_COMPONENT:
   1432 		componentPtr = (RF_SingleComponent_t *)data;
   1433 		memcpy( &component, componentPtr,
   1434 			sizeof(RF_SingleComponent_t));
   1435 		retcode = rf_delete_component(raidPtr, &component);
   1436 		return(retcode);
   1437 
   1438 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1439 		componentPtr = (RF_SingleComponent_t *)data;
   1440 		memcpy( &component, componentPtr,
   1441 			sizeof(RF_SingleComponent_t));
   1442 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1443 		return(retcode);
   1444 
   1445 	case RAIDFRAME_REBUILD_IN_PLACE:
   1446 
   1447 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1448 			/* Can't do this on a RAID 0!! */
   1449 			return(EINVAL);
   1450 		}
   1451 
   1452 		if (raidPtr->recon_in_progress == 1) {
   1453 			/* a reconstruct is already in progress! */
   1454 			return(EINVAL);
   1455 		}
   1456 
   1457 		componentPtr = (RF_SingleComponent_t *) data;
   1458 		memcpy( &component, componentPtr,
   1459 			sizeof(RF_SingleComponent_t));
   1460 		component.row = 0; /* we don't support any more */
   1461 		column = component.column;
   1462 
   1463 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1464 			return(EINVAL);
   1465 		}
   1466 
   1467 		rf_lock_mutex2(raidPtr->mutex);
   1468 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1469 		    (raidPtr->numFailures > 0)) {
   1470 			/* XXX 0 above shouldn't be constant!!! */
   1471 			/* some component other than this has failed.
   1472 			   Let's not make things worse than they already
   1473 			   are... */
   1474 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1475 			       raidPtr->raidid);
   1476 			printf("raid%d:     Col: %d   Too many failures.\n",
   1477 			       raidPtr->raidid, column);
   1478 			rf_unlock_mutex2(raidPtr->mutex);
   1479 			return (EINVAL);
   1480 		}
   1481 		if (raidPtr->Disks[column].status ==
   1482 		    rf_ds_reconstructing) {
   1483 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1484 			       raidPtr->raidid);
   1485 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1486 
   1487 			rf_unlock_mutex2(raidPtr->mutex);
   1488 			return (EINVAL);
   1489 		}
   1490 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1491 			rf_unlock_mutex2(raidPtr->mutex);
   1492 			return (EINVAL);
   1493 		}
   1494 		rf_unlock_mutex2(raidPtr->mutex);
   1495 
   1496 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1497 		if (rrcopy == NULL)
   1498 			return(ENOMEM);
   1499 
   1500 		rrcopy->raidPtr = (void *) raidPtr;
   1501 		rrcopy->col = column;
   1502 
   1503 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1504 					   rf_ReconstructInPlaceThread,
   1505 					   rrcopy,"raid_reconip");
   1506 		return(retcode);
   1507 
   1508 	case RAIDFRAME_GET_INFO:
   1509 		if (!raidPtr->valid)
   1510 			return (ENODEV);
   1511 		ucfgp = (RF_DeviceConfig_t **) data;
   1512 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1513 			  (RF_DeviceConfig_t *));
   1514 		if (d_cfg == NULL)
   1515 			return (ENOMEM);
   1516 		d_cfg->rows = 1; /* there is only 1 row now */
   1517 		d_cfg->cols = raidPtr->numCol;
   1518 		d_cfg->ndevs = raidPtr->numCol;
   1519 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1520 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1521 			return (ENOMEM);
   1522 		}
   1523 		d_cfg->nspares = raidPtr->numSpare;
   1524 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1525 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1526 			return (ENOMEM);
   1527 		}
   1528 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1529 		d = 0;
   1530 		for (j = 0; j < d_cfg->cols; j++) {
   1531 			d_cfg->devs[d] = raidPtr->Disks[j];
   1532 			d++;
   1533 		}
   1534 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1535 			d_cfg->spares[i] = raidPtr->Disks[j];
   1536 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1537 				/* XXX: raidctl(8) expects to see this as a used spare */
   1538 				d_cfg->spares[i].status = rf_ds_used_spare;
   1539 			}
   1540 		}
   1541 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1542 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1543 
   1544 		return (retcode);
   1545 
   1546 	case RAIDFRAME_CHECK_PARITY:
   1547 		*(int *) data = raidPtr->parity_good;
   1548 		return (0);
   1549 
   1550 	case RAIDFRAME_PARITYMAP_STATUS:
   1551 		if (rf_paritymap_ineligible(raidPtr))
   1552 			return EINVAL;
   1553 		rf_paritymap_status(raidPtr->parity_map,
   1554 		    (struct rf_pmstat *)data);
   1555 		return 0;
   1556 
   1557 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1558 		if (rf_paritymap_ineligible(raidPtr))
   1559 			return EINVAL;
   1560 		if (raidPtr->parity_map == NULL)
   1561 			return ENOENT; /* ??? */
   1562 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1563 			(struct rf_pmparams *)data, 1))
   1564 			return EINVAL;
   1565 		return 0;
   1566 
   1567 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1568 		if (rf_paritymap_ineligible(raidPtr))
   1569 			return EINVAL;
   1570 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1571 		return 0;
   1572 
   1573 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1574 		if (rf_paritymap_ineligible(raidPtr))
   1575 			return EINVAL;
   1576 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1577 		/* XXX should errors be passed up? */
   1578 		return 0;
   1579 
   1580 	case RAIDFRAME_RESET_ACCTOTALS:
   1581 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1582 		return (0);
   1583 
   1584 	case RAIDFRAME_GET_ACCTOTALS:
   1585 		totals = (RF_AccTotals_t *) data;
   1586 		*totals = raidPtr->acc_totals;
   1587 		return (0);
   1588 
   1589 	case RAIDFRAME_KEEP_ACCTOTALS:
   1590 		raidPtr->keep_acc_totals = *(int *)data;
   1591 		return (0);
   1592 
   1593 	case RAIDFRAME_GET_SIZE:
   1594 		*(int *) data = raidPtr->totalSectors;
   1595 		return (0);
   1596 
   1597 		/* fail a disk & optionally start reconstruction */
   1598 	case RAIDFRAME_FAIL_DISK:
   1599 
   1600 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1601 			/* Can't do this on a RAID 0!! */
   1602 			return(EINVAL);
   1603 		}
   1604 
   1605 		rr = (struct rf_recon_req *) data;
   1606 		rr->row = 0;
   1607 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1608 			return (EINVAL);
   1609 
   1610 
   1611 		rf_lock_mutex2(raidPtr->mutex);
   1612 		if (raidPtr->status == rf_rs_reconstructing) {
   1613 			/* you can't fail a disk while we're reconstructing! */
   1614 			/* XXX wrong for RAID6 */
   1615 			rf_unlock_mutex2(raidPtr->mutex);
   1616 			return (EINVAL);
   1617 		}
   1618 		if ((raidPtr->Disks[rr->col].status ==
   1619 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1620 			/* some other component has failed.  Let's not make
   1621 			   things worse. XXX wrong for RAID6 */
   1622 			rf_unlock_mutex2(raidPtr->mutex);
   1623 			return (EINVAL);
   1624 		}
   1625 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1626 			/* Can't fail a spared disk! */
   1627 			rf_unlock_mutex2(raidPtr->mutex);
   1628 			return (EINVAL);
   1629 		}
   1630 		rf_unlock_mutex2(raidPtr->mutex);
   1631 
   1632 		/* make a copy of the recon request so that we don't rely on
   1633 		 * the user's buffer */
   1634 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1635 		if (rrcopy == NULL)
   1636 			return(ENOMEM);
   1637 		memcpy(rrcopy, rr, sizeof(*rr));
   1638 		rrcopy->raidPtr = (void *) raidPtr;
   1639 
   1640 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1641 					   rf_ReconThread,
   1642 					   rrcopy,"raid_recon");
   1643 		return (0);
   1644 
   1645 		/* invoke a copyback operation after recon on whatever disk
   1646 		 * needs it, if any */
   1647 	case RAIDFRAME_COPYBACK:
   1648 
   1649 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1650 			/* This makes no sense on a RAID 0!! */
   1651 			return(EINVAL);
   1652 		}
   1653 
   1654 		if (raidPtr->copyback_in_progress == 1) {
   1655 			/* Copyback is already in progress! */
   1656 			return(EINVAL);
   1657 		}
   1658 
   1659 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1660 					   rf_CopybackThread,
   1661 					   raidPtr,"raid_copyback");
   1662 		return (retcode);
   1663 
   1664 		/* return the percentage completion of reconstruction */
   1665 	case RAIDFRAME_CHECK_RECON_STATUS:
   1666 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1667 			/* This makes no sense on a RAID 0, so tell the
   1668 			   user it's done. */
   1669 			*(int *) data = 100;
   1670 			return(0);
   1671 		}
   1672 		if (raidPtr->status != rf_rs_reconstructing)
   1673 			*(int *) data = 100;
   1674 		else {
   1675 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1676 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1677 			} else {
   1678 				*(int *) data = 0;
   1679 			}
   1680 		}
   1681 		return (0);
   1682 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1683 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1684 		if (raidPtr->status != rf_rs_reconstructing) {
   1685 			progressInfo.remaining = 0;
   1686 			progressInfo.completed = 100;
   1687 			progressInfo.total = 100;
   1688 		} else {
   1689 			progressInfo.total =
   1690 				raidPtr->reconControl->numRUsTotal;
   1691 			progressInfo.completed =
   1692 				raidPtr->reconControl->numRUsComplete;
   1693 			progressInfo.remaining = progressInfo.total -
   1694 				progressInfo.completed;
   1695 		}
   1696 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1697 				  sizeof(RF_ProgressInfo_t));
   1698 		return (retcode);
   1699 
   1700 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1701 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1702 			/* This makes no sense on a RAID 0, so tell the
   1703 			   user it's done. */
   1704 			*(int *) data = 100;
   1705 			return(0);
   1706 		}
   1707 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1708 			*(int *) data = 100 *
   1709 				raidPtr->parity_rewrite_stripes_done /
   1710 				raidPtr->Layout.numStripe;
   1711 		} else {
   1712 			*(int *) data = 100;
   1713 		}
   1714 		return (0);
   1715 
   1716 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1717 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1718 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1719 			progressInfo.total = raidPtr->Layout.numStripe;
   1720 			progressInfo.completed =
   1721 				raidPtr->parity_rewrite_stripes_done;
   1722 			progressInfo.remaining = progressInfo.total -
   1723 				progressInfo.completed;
   1724 		} else {
   1725 			progressInfo.remaining = 0;
   1726 			progressInfo.completed = 100;
   1727 			progressInfo.total = 100;
   1728 		}
   1729 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1730 				  sizeof(RF_ProgressInfo_t));
   1731 		return (retcode);
   1732 
   1733 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1734 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1735 			/* This makes no sense on a RAID 0 */
   1736 			*(int *) data = 100;
   1737 			return(0);
   1738 		}
   1739 		if (raidPtr->copyback_in_progress == 1) {
   1740 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1741 				raidPtr->Layout.numStripe;
   1742 		} else {
   1743 			*(int *) data = 100;
   1744 		}
   1745 		return (0);
   1746 
   1747 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1748 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1749 		if (raidPtr->copyback_in_progress == 1) {
   1750 			progressInfo.total = raidPtr->Layout.numStripe;
   1751 			progressInfo.completed =
   1752 				raidPtr->copyback_stripes_done;
   1753 			progressInfo.remaining = progressInfo.total -
   1754 				progressInfo.completed;
   1755 		} else {
   1756 			progressInfo.remaining = 0;
   1757 			progressInfo.completed = 100;
   1758 			progressInfo.total = 100;
   1759 		}
   1760 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1761 				  sizeof(RF_ProgressInfo_t));
   1762 		return (retcode);
   1763 
   1764 		/* the sparetable daemon calls this to wait for the kernel to
   1765 		 * need a spare table. this ioctl does not return until a
   1766 		 * spare table is needed. XXX -- calling mpsleep here in the
   1767 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1768 		 * -- I should either compute the spare table in the kernel,
   1769 		 * or have a different -- XXX XXX -- interface (a different
   1770 		 * character device) for delivering the table     -- XXX */
   1771 #if 0
   1772 	case RAIDFRAME_SPARET_WAIT:
   1773 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1774 		while (!rf_sparet_wait_queue)
   1775 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1776 		waitreq = rf_sparet_wait_queue;
   1777 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1778 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1779 
   1780 		/* structure assignment */
   1781 		*((RF_SparetWait_t *) data) = *waitreq;
   1782 
   1783 		RF_Free(waitreq, sizeof(*waitreq));
   1784 		return (0);
   1785 
   1786 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1787 		 * code in it that will cause the dameon to exit */
   1788 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1789 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1790 		waitreq->fcol = -1;
   1791 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1792 		waitreq->next = rf_sparet_wait_queue;
   1793 		rf_sparet_wait_queue = waitreq;
   1794 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1795 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1796 		return (0);
   1797 
   1798 		/* used by the spare table daemon to deliver a spare table
   1799 		 * into the kernel */
   1800 	case RAIDFRAME_SEND_SPARET:
   1801 
   1802 		/* install the spare table */
   1803 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1804 
   1805 		/* respond to the requestor.  the return status of the spare
   1806 		 * table installation is passed in the "fcol" field */
   1807 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1808 		waitreq->fcol = retcode;
   1809 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1810 		waitreq->next = rf_sparet_resp_queue;
   1811 		rf_sparet_resp_queue = waitreq;
   1812 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1813 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1814 
   1815 		return (retcode);
   1816 #endif
   1817 
   1818 	default:
   1819 		break; /* fall through to the os-specific code below */
   1820 
   1821 	}
   1822 
   1823 	if (!raidPtr->valid)
   1824 		return (EINVAL);
   1825 
   1826 	/*
   1827 	 * Add support for "regular" device ioctls here.
   1828 	 */
   1829 
   1830 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1831 	if (error != EPASSTHROUGH)
   1832 		return (error);
   1833 
   1834 	switch (cmd) {
   1835 	case DIOCGDINFO:
   1836 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1837 		break;
   1838 #ifdef __HAVE_OLD_DISKLABEL
   1839 	case ODIOCGDINFO:
   1840 		newlabel = *(rs->sc_dkdev.dk_label);
   1841 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1842 			return ENOTTY;
   1843 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1844 		break;
   1845 #endif
   1846 
   1847 	case DIOCGPART:
   1848 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1849 		((struct partinfo *) data)->part =
   1850 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1851 		break;
   1852 
   1853 	case DIOCWDINFO:
   1854 	case DIOCSDINFO:
   1855 #ifdef __HAVE_OLD_DISKLABEL
   1856 	case ODIOCWDINFO:
   1857 	case ODIOCSDINFO:
   1858 #endif
   1859 	{
   1860 		struct disklabel *lp;
   1861 #ifdef __HAVE_OLD_DISKLABEL
   1862 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1863 			memset(&newlabel, 0, sizeof newlabel);
   1864 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1865 			lp = &newlabel;
   1866 		} else
   1867 #endif
   1868 		lp = (struct disklabel *)data;
   1869 
   1870 		if ((error = raidlock(rs)) != 0)
   1871 			return (error);
   1872 
   1873 		rs->sc_flags |= RAIDF_LABELLING;
   1874 
   1875 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1876 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1877 		if (error == 0) {
   1878 			if (cmd == DIOCWDINFO
   1879 #ifdef __HAVE_OLD_DISKLABEL
   1880 			    || cmd == ODIOCWDINFO
   1881 #endif
   1882 			   )
   1883 				error = writedisklabel(RAIDLABELDEV(dev),
   1884 				    raidstrategy, rs->sc_dkdev.dk_label,
   1885 				    rs->sc_dkdev.dk_cpulabel);
   1886 		}
   1887 		rs->sc_flags &= ~RAIDF_LABELLING;
   1888 
   1889 		raidunlock(rs);
   1890 
   1891 		if (error)
   1892 			return (error);
   1893 		break;
   1894 	}
   1895 
   1896 	case DIOCWLABEL:
   1897 		if (*(int *) data != 0)
   1898 			rs->sc_flags |= RAIDF_WLABEL;
   1899 		else
   1900 			rs->sc_flags &= ~RAIDF_WLABEL;
   1901 		break;
   1902 
   1903 	case DIOCGDEFLABEL:
   1904 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1905 		break;
   1906 
   1907 #ifdef __HAVE_OLD_DISKLABEL
   1908 	case ODIOCGDEFLABEL:
   1909 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1910 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1911 			return ENOTTY;
   1912 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1913 		break;
   1914 #endif
   1915 
   1916 	case DIOCAWEDGE:
   1917 	case DIOCDWEDGE:
   1918 	    	dkw = (void *)data;
   1919 
   1920 		/* If the ioctl happens here, the parent is us. */
   1921 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1922 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1923 
   1924 	case DIOCLWEDGES:
   1925 		return dkwedge_list(&rs->sc_dkdev,
   1926 		    (struct dkwedge_list *)data, l);
   1927 	case DIOCMWEDGES:
   1928 		dkwedge_discover(&rs->sc_dkdev);
   1929 		return 0;
   1930 	case DIOCCACHESYNC:
   1931 		return rf_sync_component_caches(raidPtr);
   1932 
   1933 	case DIOCGSTRATEGY:
   1934 	    {
   1935 		struct disk_strategy *dks = (void *)data;
   1936 
   1937 		s = splbio();
   1938 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1939 		    sizeof(dks->dks_name));
   1940 		splx(s);
   1941 		dks->dks_paramlen = 0;
   1942 
   1943 		return 0;
   1944 	    }
   1945 
   1946 	case DIOCSSTRATEGY:
   1947 	    {
   1948 		struct disk_strategy *dks = (void *)data;
   1949 		struct bufq_state *new;
   1950 		struct bufq_state *old;
   1951 
   1952 		if (dks->dks_param != NULL) {
   1953 			return EINVAL;
   1954 		}
   1955 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1956 		error = bufq_alloc(&new, dks->dks_name,
   1957 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1958 		if (error) {
   1959 			return error;
   1960 		}
   1961 		s = splbio();
   1962 		old = rs->buf_queue;
   1963 		bufq_move(new, old);
   1964 		rs->buf_queue = new;
   1965 		splx(s);
   1966 		bufq_free(old);
   1967 
   1968 		return 0;
   1969 	    }
   1970 
   1971 	default:
   1972 		retcode = ENOTTY;
   1973 	}
   1974 	return (retcode);
   1975 
   1976 }
   1977 
   1978 
   1979 /* raidinit -- complete the rest of the initialization for the
   1980    RAIDframe device.  */
   1981 
   1982 
   1983 static void
   1984 raidinit(struct raid_softc *rs)
   1985 {
   1986 	cfdata_t cf;
   1987 	int     unit;
   1988 	RF_Raid_t *raidPtr = &rs->sc_r;
   1989 
   1990 	unit = raidPtr->raidid;
   1991 
   1992 
   1993 	/* XXX should check return code first... */
   1994 	rs->sc_flags |= RAIDF_INITED;
   1995 
   1996 	/* XXX doesn't check bounds. */
   1997 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1998 
   1999 	/* attach the pseudo device */
   2000 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   2001 	cf->cf_name = raid_cd.cd_name;
   2002 	cf->cf_atname = raid_cd.cd_name;
   2003 	cf->cf_unit = unit;
   2004 	cf->cf_fstate = FSTATE_STAR;
   2005 
   2006 	rs->sc_dev = config_attach_pseudo(cf);
   2007 
   2008 	if (rs->sc_dev == NULL) {
   2009 		printf("raid%d: config_attach_pseudo failed\n",
   2010 		    raidPtr->raidid);
   2011 		rs->sc_flags &= ~RAIDF_INITED;
   2012 		free(cf, M_RAIDFRAME);
   2013 		return;
   2014 	}
   2015 
   2016 	/* disk_attach actually creates space for the CPU disklabel, among
   2017 	 * other things, so it's critical to call this *BEFORE* we try putzing
   2018 	 * with disklabels. */
   2019 
   2020 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   2021 	disk_attach(&rs->sc_dkdev);
   2022 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   2023 
   2024 	/* XXX There may be a weird interaction here between this, and
   2025 	 * protectedSectors, as used in RAIDframe.  */
   2026 
   2027 	rs->sc_size = raidPtr->totalSectors;
   2028 
   2029 	dkwedge_discover(&rs->sc_dkdev);
   2030 
   2031 	rf_set_geometry(rs, raidPtr);
   2032 
   2033 }
   2034 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2035 /* wake up the daemon & tell it to get us a spare table
   2036  * XXX
   2037  * the entries in the queues should be tagged with the raidPtr
   2038  * so that in the extremely rare case that two recons happen at once,
   2039  * we know for which device were requesting a spare table
   2040  * XXX
   2041  *
   2042  * XXX This code is not currently used. GO
   2043  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/*
	 * Post the request on rf_sparet_wait_queue and wake the userland
	 * sparetable daemon (blocked in the RAIDFRAME_SPARET_WAIT ioctl).
	 */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() releases the mutex while sleeping; wait until
	 * the daemon delivers a response on rf_sparet_resp_queue. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the response; note that 'req' is reused to point at the
	 * response entry from here on. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon passes its status back in the "fcol" field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2067 #endif
   2068 
   2069 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2070  * bp & passes it down.
   2071  * any calls originating in the kernel must use non-blocking I/O
   2072  * do some extra sanity checking to return "appropriate" error values for
   2073  * certain conditions (to make some standard utilities work)
   2074  *
   2075  * Formerly known as: rf_DoAccessKernel
   2076  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex while updating labels; it can block */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Invariant: raidPtr->mutex is held at the top of each iteration
	 * and dropped while a request is pulled off the bufq and issued. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do; mutex already dropped above */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert from DEV_BSIZE units to RAID sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector, if any */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE(review): "1 ||" makes this condition always true —
		 * looks like a debugging leftover; db1_printf is likely a
		 * no-op unless debug output is enabled. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject I/O past the end of the array; the extra "sum <"
		 * comparisons catch arithmetic wraparound. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* re-acquire mutex before the loop test */
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not a whole number of sectors */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this request */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* the access could not be started; fail the buf
			 * here since no completion callback will fire */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2194 
   2195 
   2196 
   2197 
   2198 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2199 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately via the normal callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device; completion goes
		 * through KernelWakeupFunc() with 'req' as b_private */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	/* always reports success; errors surface via the callback */
	return (0);
}
   2273 /* this is the callback function associated with a I/O invoked from
   2274    kernel code.
   2275  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the originating request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* everything below runs under the per-array iodone lock */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what later triggers a component
			 * label update from raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2341 
   2342 
   2343 /*
   2344  * initialize a buf structure for doing an I/O in the kernel.
   2345  */
   2346 static void
   2347 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2348        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2349        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2350        struct proc *b_proc)
   2351 {
   2352 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2353 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2354 	bp->b_oflags = 0;
   2355 	bp->b_cflags = 0;
   2356 	bp->b_bcount = numSect << logBytesPerSector;
   2357 	bp->b_bufsize = bp->b_bcount;
   2358 	bp->b_error = 0;
   2359 	bp->b_dev = dev;
   2360 	bp->b_data = bf;
   2361 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2362 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2363 	if (bp->b_bcount == 0) {
   2364 		panic("bp->b_bcount is zero in InitBP!!");
   2365 	}
   2366 	bp->b_proc = b_proc;
   2367 	bp->b_iodone = cbFunc;
   2368 	bp->b_private = cbArg;
   2369 }
   2370 
   2371 static void
   2372 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2373 		    struct disklabel *lp)
   2374 {
   2375 	memset(lp, 0, sizeof(*lp));
   2376 
   2377 	/* fabricate a label... */
   2378 	if (raidPtr->totalSectors > UINT32_MAX)
   2379 		lp->d_secperunit = UINT32_MAX;
   2380 	else
   2381 		lp->d_secperunit = raidPtr->totalSectors;
   2382 	lp->d_secsize = raidPtr->bytesPerSector;
   2383 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2384 	lp->d_ntracks = 4 * raidPtr->numCol;
   2385 	lp->d_ncylinders = raidPtr->totalSectors /
   2386 		(lp->d_nsectors * lp->d_ntracks);
   2387 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2388 
   2389 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2390 	lp->d_type = DTYPE_RAID;
   2391 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2392 	lp->d_rpm = 3600;
   2393 	lp->d_interleave = 1;
   2394 	lp->d_flags = 0;
   2395 
   2396 	lp->d_partitions[RAW_PART].p_offset = 0;
   2397 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2398 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2399 	lp->d_npartitions = RAW_PART + 1;
   2400 
   2401 	lp->d_magic = DISKMAGIC;
   2402 	lp->d_magic2 = DISKMAGIC;
   2403 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2404 
   2405 }
   2406 /*
   2407  * Read the disklabel from the raid device.  If one is not present, fake one
   2408  * up.
   2409  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	/* nothing to do if the unit doesn't exist */
	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default label */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		/* no usable on-disk label; synthesize one */
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		/* d_secperunit saturates at UINT32_MAX, so only a ">"
		 * comparison is meaningful in that case */
		if (lp->d_secperunit < UINT32_MAX ?
		    lp->d_secperunit != rs->sc_size :
		    lp->d_secperunit > rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%ju) != "
			    "the size of raid (%ju)\n", unit, rs->sc_xname,
			    (uintmax_t)lp->d_secperunit,
			    (uintmax_t)rs->sc_size);
		/* warn about partitions extending past the end of the set */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ju)\n",
				       unit, rs->sc_xname, 'a' + i,
				       (uintmax_t)rs->sc_size);
		}
	}

}
   2472 /*
   2473  * Take care of things one might want to take care of in the event
   2474  * that a disklabel isn't present.
   2475  */
   2476 static void
   2477 raidmakedisklabel(struct raid_softc *rs)
   2478 {
   2479 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2480 	db1_printf(("Making a label..\n"));
   2481 
   2482 	/*
   2483 	 * For historical reasons, if there's no disklabel present
   2484 	 * the raw partition must be marked FS_BSDFFS.
   2485 	 */
   2486 
   2487 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2488 
   2489 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2490 
   2491 	lp->d_checksum = dkcksum(lp);
   2492 }
   2493 /*
   2494  * Wait interruptibly for an exclusive lock.
   2495  *
   2496  * XXX
   2497  * Several drivers do this; it should be abstracted and made MP-safe.
   2498  * (Hmm... where have we seen this warning before :->  GO )
   2499  */
   2500 static int
   2501 raidlock(struct raid_softc *rs)
   2502 {
   2503 	int     error;
   2504 
   2505 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2506 		rs->sc_flags |= RAIDF_WANTED;
   2507 		if ((error =
   2508 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2509 			return (error);
   2510 	}
   2511 	rs->sc_flags |= RAIDF_LOCKED;
   2512 	return (0);
   2513 }
   2514 /*
   2515  * Unlock and wake up any waiters.
   2516  */
   2517 static void
   2518 raidunlock(struct raid_softc *rs)
   2519 {
   2520 
   2521 	rs->sc_flags &= ~RAIDF_LOCKED;
   2522 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2523 		rs->sc_flags &= ~RAIDF_WANTED;
   2524 		wakeup(rs);
   2525 	}
   2526 }
   2527 
   2528 
   2529 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2530 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2531 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2532 
   2533 static daddr_t
   2534 rf_component_info_offset(void)
   2535 {
   2536 
   2537 	return RF_COMPONENT_INFO_OFFSET;
   2538 }
   2539 
   2540 static daddr_t
   2541 rf_component_info_size(unsigned secsize)
   2542 {
   2543 	daddr_t info_size;
   2544 
   2545 	KASSERT(secsize);
   2546 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2547 		info_size = secsize;
   2548 	else
   2549 		info_size = RF_COMPONENT_INFO_SIZE;
   2550 
   2551 	return info_size;
   2552 }
   2553 
   2554 static daddr_t
   2555 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2556 {
   2557 	daddr_t map_offset;
   2558 
   2559 	KASSERT(raidPtr->bytesPerSector);
   2560 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2561 		map_offset = raidPtr->bytesPerSector;
   2562 	else
   2563 		map_offset = RF_COMPONENT_INFO_SIZE;
   2564 	map_offset += rf_component_info_offset();
   2565 
   2566 	return map_offset;
   2567 }
   2568 
   2569 static daddr_t
   2570 rf_parity_map_size(RF_Raid_t *raidPtr)
   2571 {
   2572 	daddr_t map_size;
   2573 
   2574 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2575 		map_size = raidPtr->bytesPerSector;
   2576 	else
   2577 		map_size = RF_PARITY_MAP_SIZE;
   2578 
   2579 	return map_size;
   2580 }
   2581 
   2582 int
   2583 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2584 {
   2585 	RF_ComponentLabel_t *clabel;
   2586 
   2587 	clabel = raidget_component_label(raidPtr, col);
   2588 	clabel->clean = RF_RAID_CLEAN;
   2589 	raidflush_component_label(raidPtr, col);
   2590 	return(0);
   2591 }
   2592 
   2593 
   2594 int
   2595 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2596 {
   2597 	RF_ComponentLabel_t *clabel;
   2598 
   2599 	clabel = raidget_component_label(raidPtr, col);
   2600 	clabel->clean = RF_RAID_DIRTY;
   2601 	raidflush_component_label(raidPtr, col);
   2602 	return(0);
   2603 }
   2604 
   2605 int
   2606 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2607 {
   2608 	KASSERT(raidPtr->bytesPerSector);
   2609 	return raidread_component_label(raidPtr->bytesPerSector,
   2610 	    raidPtr->Disks[col].dev,
   2611 	    raidPtr->raid_cinfo[col].ci_vp,
   2612 	    &raidPtr->raid_cinfo[col].ci_label);
   2613 }
   2614 
   2615 RF_ComponentLabel_t *
   2616 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2617 {
   2618 	return &raidPtr->raid_cinfo[col].ci_label;
   2619 }
   2620 
   2621 int
   2622 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2623 {
   2624 	RF_ComponentLabel_t *label;
   2625 
   2626 	label = &raidPtr->raid_cinfo[col].ci_label;
   2627 	label->mod_counter = raidPtr->mod_counter;
   2628 #ifndef RF_NO_PARITY_MAP
   2629 	label->parity_map_modcount = label->mod_counter;
   2630 #endif
   2631 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2632 	    raidPtr->Disks[col].dev,
   2633 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2634 }
   2635 
   2636 
   2637 static int
   2638 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2639     RF_ComponentLabel_t *clabel)
   2640 {
   2641 	return raidread_component_area(dev, b_vp, clabel,
   2642 	    sizeof(RF_ComponentLabel_t),
   2643 	    rf_component_info_offset(),
   2644 	    rf_component_info_size(secsize));
   2645 }
   2646 
   2647 /* ARGSUSED */
   2648 static int
   2649 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2650     size_t msize, daddr_t offset, daddr_t dsize)
   2651 {
   2652 	struct buf *bp;
   2653 	const struct bdevsw *bdev;
   2654 	int error;
   2655 
   2656 	/* XXX should probably ensure that we don't try to do this if
   2657 	   someone has changed rf_protected_sectors. */
   2658 
   2659 	if (b_vp == NULL) {
   2660 		/* For whatever reason, this component is not valid.
   2661 		   Don't try to read a component label from it. */
   2662 		return(EINVAL);
   2663 	}
   2664 
   2665 	/* get a block of the appropriate size... */
   2666 	bp = geteblk((int)dsize);
   2667 	bp->b_dev = dev;
   2668 
   2669 	/* get our ducks in a row for the read */
   2670 	bp->b_blkno = offset / DEV_BSIZE;
   2671 	bp->b_bcount = dsize;
   2672 	bp->b_flags |= B_READ;
   2673  	bp->b_resid = dsize;
   2674 
   2675 	bdev = bdevsw_lookup(bp->b_dev);
   2676 	if (bdev == NULL)
   2677 		return (ENXIO);
   2678 	(*bdev->d_strategy)(bp);
   2679 
   2680 	error = biowait(bp);
   2681 
   2682 	if (!error) {
   2683 		memcpy(data, bp->b_data, msize);
   2684 	}
   2685 
   2686 	brelse(bp, 0);
   2687 	return(error);
   2688 }
   2689 
   2690 
   2691 static int
   2692 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2693     RF_ComponentLabel_t *clabel)
   2694 {
   2695 	return raidwrite_component_area(dev, b_vp, clabel,
   2696 	    sizeof(RF_ComponentLabel_t),
   2697 	    rf_component_info_offset(),
   2698 	    rf_component_info_size(secsize), 0);
   2699 }
   2700 
   2701 /* ARGSUSED */
   2702 static int
   2703 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2704     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2705 {
   2706 	struct buf *bp;
   2707 	const struct bdevsw *bdev;
   2708 	int error;
   2709 
   2710 	/* get a block of the appropriate size... */
   2711 	bp = geteblk((int)dsize);
   2712 	bp->b_dev = dev;
   2713 
   2714 	/* get our ducks in a row for the write */
   2715 	bp->b_blkno = offset / DEV_BSIZE;
   2716 	bp->b_bcount = dsize;
   2717 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2718  	bp->b_resid = dsize;
   2719 
   2720 	memset(bp->b_data, 0, dsize);
   2721 	memcpy(bp->b_data, data, msize);
   2722 
   2723 	bdev = bdevsw_lookup(bp->b_dev);
   2724 	if (bdev == NULL)
   2725 		return (ENXIO);
   2726 	(*bdev->d_strategy)(bp);
   2727 	if (asyncp)
   2728 		return 0;
   2729 	error = biowait(bp);
   2730 	brelse(bp, 0);
   2731 	if (error) {
   2732 #if 1
   2733 		printf("Failed to write RAID component info!\n");
   2734 #endif
   2735 	}
   2736 
   2737 	return(error);
   2738 }
   2739 
/*
 * Write the given on-disk parity map to every live component of the set.
 * Write errors are not reported to the caller (see XXXjld below).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		/* synchronous write (asyncp == 0) of the parity map area */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2757 
/*
 * Read the parity map from every live component and merge them into *map.
 * NOTE: if every component is dead, *map is left untouched (no component
 * ever satisfies the "first" copy below).  Read errors are ignored.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			/* first live component seeds the result... */
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			/* ...subsequent ones are merged in */
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2782 
/*
 * Bump the set's modification counter and mark the component label of
 * every live (non-spared) component, and every in-use spare, as dirty.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* spares live at columns [numCol .. numCol + numSpare) */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this spare,
			 * scol keeps its prior value (-1 or a stale match
			 * from an earlier iteration) — confirm intended */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2842 
   2843 
   2844 void
   2845 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2846 {
   2847 	RF_ComponentLabel_t *clabel;
   2848 	int sparecol;
   2849 	int c;
   2850 	int j;
   2851 	int scol;
   2852 
   2853 	scol = -1;
   2854 
   2855 	/* XXX should do extra checks to make sure things really are clean,
   2856 	   rather than blindly setting the clean bit... */
   2857 
   2858 	raidPtr->mod_counter++;
   2859 
   2860 	for (c = 0; c < raidPtr->numCol; c++) {
   2861 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2862 			clabel = raidget_component_label(raidPtr, c);
   2863 			/* make sure status is noted */
   2864 			clabel->status = rf_ds_optimal;
   2865 
   2866 			/* note what unit we are configured as */
   2867 			clabel->last_unit = raidPtr->raidid;
   2868 
   2869 			raidflush_component_label(raidPtr, c);
   2870 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2871 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2872 					raidmarkclean(raidPtr, c);
   2873 				}
   2874 			}
   2875 		}
   2876 		/* else we don't touch it.. */
   2877 	}
   2878 
   2879 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2880 		sparecol = raidPtr->numCol + c;
   2881 		/* Need to ensure that the reconstruct actually completed! */
   2882 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2883 			/*
   2884 
   2885 			   we claim this disk is "optimal" if it's
   2886 			   rf_ds_used_spare, as that means it should be
   2887 			   directly substitutable for the disk it replaced.
   2888 			   We note that too...
   2889 
   2890 			 */
   2891 
   2892 			for(j=0;j<raidPtr->numCol;j++) {
   2893 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2894 					scol = j;
   2895 					break;
   2896 				}
   2897 			}
   2898 
   2899 			/* XXX shouldn't *really* need this... */
   2900 			clabel = raidget_component_label(raidPtr, sparecol);
   2901 			/* make sure status is noted */
   2902 
   2903 			raid_init_component_label(raidPtr, clabel);
   2904 
   2905 			clabel->column = scol;
   2906 			clabel->status = rf_ds_optimal;
   2907 			clabel->last_unit = raidPtr->raidid;
   2908 
   2909 			raidflush_component_label(raidPtr, sparecol);
   2910 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2911 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2912 					raidmarkclean(raidPtr, sparecol);
   2913 				}
   2914 			}
   2915 		}
   2916 	}
   2917 }
   2918 
   2919 void
   2920 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2921 {
   2922 
   2923 	if (vp != NULL) {
   2924 		if (auto_configured == 1) {
   2925 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2926 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2927 			vput(vp);
   2928 
   2929 		} else {
   2930 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2931 		}
   2932 	}
   2933 }
   2934 
   2935 
   2936 void
   2937 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2938 {
   2939 	int r,c;
   2940 	struct vnode *vp;
   2941 	int acd;
   2942 
   2943 
   2944 	/* We take this opportunity to close the vnodes like we should.. */
   2945 
   2946 	for (c = 0; c < raidPtr->numCol; c++) {
   2947 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2948 		acd = raidPtr->Disks[c].auto_configured;
   2949 		rf_close_component(raidPtr, vp, acd);
   2950 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2951 		raidPtr->Disks[c].auto_configured = 0;
   2952 	}
   2953 
   2954 	for (r = 0; r < raidPtr->numSpare; r++) {
   2955 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2956 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2957 		rf_close_component(raidPtr, vp, acd);
   2958 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2959 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2960 	}
   2961 }
   2962 
   2963 
   2964 void
   2965 rf_ReconThread(struct rf_recon_req *req)
   2966 {
   2967 	int     s;
   2968 	RF_Raid_t *raidPtr;
   2969 
   2970 	s = splbio();
   2971 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2972 	raidPtr->recon_in_progress = 1;
   2973 
   2974 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2975 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2976 
   2977 	RF_Free(req, sizeof(*req));
   2978 
   2979 	raidPtr->recon_in_progress = 0;
   2980 	splx(s);
   2981 
   2982 	/* That's all... */
   2983 	kthread_exit(0);	/* does not return */
   2984 }
   2985 
   2986 void
   2987 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2988 {
   2989 	int retcode;
   2990 	int s;
   2991 
   2992 	raidPtr->parity_rewrite_stripes_done = 0;
   2993 	raidPtr->parity_rewrite_in_progress = 1;
   2994 	s = splbio();
   2995 	retcode = rf_RewriteParity(raidPtr);
   2996 	splx(s);
   2997 	if (retcode) {
   2998 		printf("raid%d: Error re-writing parity (%d)!\n",
   2999 		    raidPtr->raidid, retcode);
   3000 	} else {
   3001 		/* set the clean bit!  If we shutdown correctly,
   3002 		   the clean bit on each component label will get
   3003 		   set */
   3004 		raidPtr->parity_good = RF_RAID_CLEAN;
   3005 	}
   3006 	raidPtr->parity_rewrite_in_progress = 0;
   3007 
   3008 	/* Anyone waiting for us to stop?  If so, inform them... */
   3009 	if (raidPtr->waitShutdown) {
   3010 		wakeup(&raidPtr->parity_rewrite_in_progress);
   3011 	}
   3012 
   3013 	/* That's all... */
   3014 	kthread_exit(0);	/* does not return */
   3015 }
   3016 
   3017 
   3018 void
   3019 rf_CopybackThread(RF_Raid_t *raidPtr)
   3020 {
   3021 	int s;
   3022 
   3023 	raidPtr->copyback_in_progress = 1;
   3024 	s = splbio();
   3025 	rf_CopybackReconstructedData(raidPtr);
   3026 	splx(s);
   3027 	raidPtr->copyback_in_progress = 0;
   3028 
   3029 	/* That's all... */
   3030 	kthread_exit(0);	/* does not return */
   3031 }
   3032 
   3033 
   3034 void
   3035 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3036 {
   3037 	int s;
   3038 	RF_Raid_t *raidPtr;
   3039 
   3040 	s = splbio();
   3041 	raidPtr = req->raidPtr;
   3042 	raidPtr->recon_in_progress = 1;
   3043 	rf_ReconstructInPlace(raidPtr, req->col);
   3044 	RF_Free(req, sizeof(*req));
   3045 	raidPtr->recon_in_progress = 0;
   3046 	splx(s);
   3047 
   3048 	/* That's all... */
   3049 	kthread_exit(0);	/* does not return */
   3050 }
   3051 
/*
 * Read the component label from the already-open vnode vp (device dev,
 * printable name cname, partition size `size' sectors) and, if the
 * label looks valid, prepend a new RF_AutoConfig_t record to ac_list.
 * Returns the (possibly extended) list head.  If the label is missing
 * or unreasonable, the vnode is closed and released.  On allocation
 * failure the entire ac_list is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down everything collected so
		       far; the caller gets NULL and must not continue. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			/* The new record takes ownership of vp and clabel. */
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so release both the label
		   memory and the component's vnode. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3109 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t records for all that were
 * found.  A component may live in a wedge (dk), in a disklabel
 * partition of type FS_RAID, or — failing both — on the raw
 * partition itself.  Returns NULL if nothing was found.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* dk wedges use plain unit minors; everything else gets
		   the raw partition of the disk. */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedges: accept only wedges whose partition type
			   is RAIDframe, then hand the whole wedge to
			   rf_get_component() (which takes over vp). */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* rf_get_component() takes over vp on success,
			   and closes it otherwise. */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3291 
   3292 
   3293 int
   3294 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3295 {
   3296 
   3297 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3298 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3299 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3300 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3301 	    clabel->row >=0 &&
   3302 	    clabel->column >= 0 &&
   3303 	    clabel->num_rows > 0 &&
   3304 	    clabel->num_columns > 0 &&
   3305 	    clabel->row < clabel->num_rows &&
   3306 	    clabel->column < clabel->num_columns &&
   3307 	    clabel->blockSize > 0 &&
   3308 	    /*
   3309 	     * numBlocksHi may contain garbage, but it is ok since
   3310 	     * the type is unsigned.  If it is really garbage,
   3311 	     * rf_fix_old_label_size() will fix it.
   3312 	     */
   3313 	    rf_component_label_numblocks(clabel) > 0) {
   3314 		/*
   3315 		 * label looks reasonable enough...
   3316 		 * let's make sure it has no old garbage.
   3317 		 */
   3318 		if (numsecs)
   3319 			rf_fix_old_label_size(clabel, numsecs);
   3320 		return(1);
   3321 	}
   3322 	return(0);
   3323 }
   3324 
   3325 
   3326 /*
   3327  * For reasons yet unknown, some old component labels have garbage in
   3328  * the newer numBlocksHi region, and this causes lossage.  Since those
   3329  * disks will also have numsecs set to less than 32 bits of sectors,
   3330  * we can determine when this corruption has occurred, and fix it.
   3331  *
   3332  * The exact same problem, with the same unknown reason, happens to
   3333  * the partitionSizeHi member as well.
   3334  */
   3335 static void
   3336 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3337 {
   3338 
   3339 	if (numsecs < ((uint64_t)1 << 32)) {
   3340 		if (clabel->numBlocksHi) {
   3341 			printf("WARNING: total sectors < 32 bits, yet "
   3342 			       "numBlocksHi set\n"
   3343 			       "WARNING: resetting numBlocksHi to zero.\n");
   3344 			clabel->numBlocksHi = 0;
   3345 		}
   3346 
   3347 		if (clabel->partitionSizeHi) {
   3348 			printf("WARNING: total sectors < 32 bits, yet "
   3349 			       "partitionSizeHi set\n"
   3350 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3351 			clabel->partitionSizeHi = 0;
   3352 		}
   3353 	}
   3354 }
   3355 
   3356 
   3357 #ifdef DEBUG
/*
 * Debug helper: dump the contents of a component label to the
 * console (geometry, version/serial/mod counter, clean/status flags,
 * layout parameters, autoconfig and root-partition settings).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Indexed by root_partition & 3; see the printf below. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3389 #endif
   3390 
   3391 RF_ConfigSet_t *
   3392 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3393 {
   3394 	RF_AutoConfig_t *ac;
   3395 	RF_ConfigSet_t *config_sets;
   3396 	RF_ConfigSet_t *cset;
   3397 	RF_AutoConfig_t *ac_next;
   3398 
   3399 
   3400 	config_sets = NULL;
   3401 
   3402 	/* Go through the AutoConfig list, and figure out which components
   3403 	   belong to what sets.  */
   3404 	ac = ac_list;
   3405 	while(ac!=NULL) {
   3406 		/* we're going to putz with ac->next, so save it here
   3407 		   for use at the end of the loop */
   3408 		ac_next = ac->next;
   3409 
   3410 		if (config_sets == NULL) {
   3411 			/* will need at least this one... */
   3412 			config_sets = (RF_ConfigSet_t *)
   3413 				malloc(sizeof(RF_ConfigSet_t),
   3414 				       M_RAIDFRAME, M_NOWAIT);
   3415 			if (config_sets == NULL) {
   3416 				panic("rf_create_auto_sets: No memory!");
   3417 			}
   3418 			/* this one is easy :) */
   3419 			config_sets->ac = ac;
   3420 			config_sets->next = NULL;
   3421 			config_sets->rootable = 0;
   3422 			ac->next = NULL;
   3423 		} else {
   3424 			/* which set does this component fit into? */
   3425 			cset = config_sets;
   3426 			while(cset!=NULL) {
   3427 				if (rf_does_it_fit(cset, ac)) {
   3428 					/* looks like it matches... */
   3429 					ac->next = cset->ac;
   3430 					cset->ac = ac;
   3431 					break;
   3432 				}
   3433 				cset = cset->next;
   3434 			}
   3435 			if (cset==NULL) {
   3436 				/* didn't find a match above... new set..*/
   3437 				cset = (RF_ConfigSet_t *)
   3438 					malloc(sizeof(RF_ConfigSet_t),
   3439 					       M_RAIDFRAME, M_NOWAIT);
   3440 				if (cset == NULL) {
   3441 					panic("rf_create_auto_sets: No memory!");
   3442 				}
   3443 				cset->ac = ac;
   3444 				ac->next = NULL;
   3445 				cset->next = config_sets;
   3446 				cset->rootable = 0;
   3447 				config_sets = cset;
   3448 			}
   3449 		}
   3450 		ac = ac_next;
   3451 	}
   3452 
   3453 
   3454 	return(config_sets);
   3455 }
   3456 
   3457 static int
   3458 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3459 {
   3460 	RF_ComponentLabel_t *clabel1, *clabel2;
   3461 
   3462 	/* If this one matches the *first* one in the set, that's good
   3463 	   enough, since the other members of the set would have been
   3464 	   through here too... */
   3465 	/* note that we are not checking partitionSize here..
   3466 
   3467 	   Note that we are also not checking the mod_counters here.
   3468 	   If everything else matches except the mod_counter, that's
   3469 	   good enough for this test.  We will deal with the mod_counters
   3470 	   a little later in the autoconfiguration process.
   3471 
   3472 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3473 
   3474 	   The reason we don't check for this is that failed disks
   3475 	   will have lower modification counts.  If those disks are
   3476 	   not added to the set they used to belong to, then they will
   3477 	   form their own set, which may result in 2 different sets,
   3478 	   for example, competing to be configured at raid0, and
   3479 	   perhaps competing to be the root filesystem set.  If the
   3480 	   wrong ones get configured, or both attempt to become /,
   3481 	   weird behaviour and or serious lossage will occur.  Thus we
   3482 	   need to bring them into the fold here, and kick them out at
   3483 	   a later point.
   3484 
   3485 	*/
   3486 
   3487 	clabel1 = cset->ac->clabel;
   3488 	clabel2 = ac->clabel;
   3489 	if ((clabel1->version == clabel2->version) &&
   3490 	    (clabel1->serial_number == clabel2->serial_number) &&
   3491 	    (clabel1->num_rows == clabel2->num_rows) &&
   3492 	    (clabel1->num_columns == clabel2->num_columns) &&
   3493 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3494 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3495 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3496 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3497 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3498 	    (clabel1->blockSize == clabel2->blockSize) &&
   3499 	    rf_component_label_numblocks(clabel1) ==
   3500 	    rf_component_label_numblocks(clabel2) &&
   3501 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3502 	    (clabel1->root_partition == clabel2->root_partition) &&
   3503 	    (clabel1->last_unit == clabel2->last_unit) &&
   3504 	    (clabel1->config_order == clabel2->config_order)) {
   3505 		/* if it get's here, it almost *has* to be a match */
   3506 	} else {
   3507 		/* it's not consistent with somebody in the set..
   3508 		   punt */
   3509 		return(0);
   3510 	}
   3511 	/* all was fine.. it must fit... */
   3512 	return(1);
   3513 }
   3514 
/*
 * Decide whether a config set has enough live components (at the
 * set's newest mod_counter) to be configured.  RAID 1 is special:
 * components are paired (even column with the following odd column),
 * and the set survives as long as no pair has lost both members.
 * For RAID 0 no failure is tolerated; RAID 4/5 tolerate one.
 * Returns 1 if the set is viable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members (stale components have lower
	   counters and must not count as live). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component with a current
	   mod_counter; account for the columns that have none. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did the odd component of a
				   pair, and we didn't bail.. reset the
				   even_pair_failed flag, and go on to
				   the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3617 
   3618 void
   3619 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3620 			RF_Raid_t *raidPtr)
   3621 {
   3622 	RF_ComponentLabel_t *clabel;
   3623 	int i;
   3624 
   3625 	clabel = ac->clabel;
   3626 
   3627 	/* 1. Fill in the common stuff */
   3628 	config->numRow = clabel->num_rows = 1;
   3629 	config->numCol = clabel->num_columns;
   3630 	config->numSpare = 0; /* XXX should this be set here? */
   3631 	config->sectPerSU = clabel->sectPerSU;
   3632 	config->SUsPerPU = clabel->SUsPerPU;
   3633 	config->SUsPerRU = clabel->SUsPerRU;
   3634 	config->parityConfig = clabel->parityConfig;
   3635 	/* XXX... */
   3636 	strcpy(config->diskQueueType,"fifo");
   3637 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3638 	config->layoutSpecificSize = 0; /* XXX ?? */
   3639 
   3640 	while(ac!=NULL) {
   3641 		/* row/col values will be in range due to the checks
   3642 		   in reasonable_label() */
   3643 		strcpy(config->devnames[0][ac->clabel->column],
   3644 		       ac->devname);
   3645 		ac = ac->next;
   3646 	}
   3647 
   3648 	for(i=0;i<RF_MAXDBGV;i++) {
   3649 		config->debugVars[i][0] = 0;
   3650 	}
   3651 }
   3652 
   3653 int
   3654 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3655 {
   3656 	RF_ComponentLabel_t *clabel;
   3657 	int column;
   3658 	int sparecol;
   3659 
   3660 	raidPtr->autoconfigure = new_value;
   3661 
   3662 	for(column=0; column<raidPtr->numCol; column++) {
   3663 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3664 			clabel = raidget_component_label(raidPtr, column);
   3665 			clabel->autoconfigure = new_value;
   3666 			raidflush_component_label(raidPtr, column);
   3667 		}
   3668 	}
   3669 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3670 		sparecol = raidPtr->numCol + column;
   3671 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3672 			clabel = raidget_component_label(raidPtr, sparecol);
   3673 			clabel->autoconfigure = new_value;
   3674 			raidflush_component_label(raidPtr, sparecol);
   3675 		}
   3676 	}
   3677 	return(new_value);
   3678 }
   3679 
   3680 int
   3681 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3682 {
   3683 	RF_ComponentLabel_t *clabel;
   3684 	int column;
   3685 	int sparecol;
   3686 
   3687 	raidPtr->root_partition = new_value;
   3688 	for(column=0; column<raidPtr->numCol; column++) {
   3689 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3690 			clabel = raidget_component_label(raidPtr, column);
   3691 			clabel->root_partition = new_value;
   3692 			raidflush_component_label(raidPtr, column);
   3693 		}
   3694 	}
   3695 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3696 		sparecol = raidPtr->numCol + column;
   3697 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3698 			clabel = raidget_component_label(raidPtr, sparecol);
   3699 			clabel->root_partition = new_value;
   3700 			raidflush_component_label(raidPtr, sparecol);
   3701 		}
   3702 	}
   3703 	return(new_value);
   3704 }
   3705 
   3706 void
   3707 rf_release_all_vps(RF_ConfigSet_t *cset)
   3708 {
   3709 	RF_AutoConfig_t *ac;
   3710 
   3711 	ac = cset->ac;
   3712 	while(ac!=NULL) {
   3713 		/* Close the vp, and give it back */
   3714 		if (ac->vp) {
   3715 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3716 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3717 			vput(ac->vp);
   3718 			ac->vp = NULL;
   3719 		}
   3720 		ac = ac->next;
   3721 	}
   3722 }
   3723 
   3724 
   3725 void
   3726 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3727 {
   3728 	RF_AutoConfig_t *ac;
   3729 	RF_AutoConfig_t *next_ac;
   3730 
   3731 	ac = cset->ac;
   3732 	while(ac!=NULL) {
   3733 		next_ac = ac->next;
   3734 		/* nuke the label */
   3735 		free(ac->clabel, M_RAIDFRAME);
   3736 		/* cleanup the config structure */
   3737 		free(ac, M_RAIDFRAME);
   3738 		/* "next.." */
   3739 		ac = next_ac;
   3740 	}
   3741 	/* and, finally, nuke the config set */
   3742 	free(cset, M_RAIDFRAME);
   3743 }
   3744 
   3745 
/*
 * Populate a component label from the current in-core state of the
 * RAID set: version/serial/mod-counter identity, geometry, layout
 * parameters, and autoconfiguration settings.  The label is marked
 * dirty and optimal; the caller is responsible for writing it out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* num_rows is fixed at 1; only the column count varies. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* Stripe/parity/reconstruction unit geometry from the layout. */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3778 
/*
 * Bring an autoconfigured RAID set to life: allocate a configuration
 * structure, choose a unit number (preferring the one recorded in the
 * component labels), build the configuration from the gathered labels,
 * and configure the set.  Returns the softc on success, NULL on
 * allocation or configuration failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk forward from last_unit past units that are already valid.
	   NOTE(review): assumes raidget() never returns NULL here --
	   confirm against raidget()'s allocation behavior. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the unit we claimed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3852 
   3853 void
   3854 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3855 {
   3856 	struct buf *bp;
   3857 	struct raid_softc *rs;
   3858 
   3859 	bp = (struct buf *)desc->bp;
   3860 	rs = desc->raidPtr->softc;
   3861 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3862 	    (bp->b_flags & B_READ));
   3863 }
   3864 
/*
 * Initialize a RAIDframe memory pool at IPL_BIO: xmax becomes the
 * high-water mark for cached items, and the pool is primed with xmin
 * items which also serve as the low-water mark.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3874 
   3875 /*
   3876  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3877  * if there is IO pending and if that IO could possibly be done for a
   3878  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3879  * otherwise.
   3880  *
   3881  */
   3882 
   3883 int
   3884 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3885 {
   3886 	struct raid_softc *rs = raidPtr->softc;
   3887 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3888 		/* there is work to do */
   3889 		return 0;
   3890 	}
   3891 	/* default is nothing to do */
   3892 	return 1;
   3893 }
   3894 
   3895 int
   3896 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3897 {
   3898 	uint64_t numsecs;
   3899 	unsigned secsize;
   3900 	int error;
   3901 
   3902 	error = getdisksize(vp, &numsecs, &secsize);
   3903 	if (error == 0) {
   3904 		diskPtr->blockSize = secsize;
   3905 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3906 		diskPtr->partitionSize = numsecs;
   3907 		return 0;
   3908 	}
   3909 	return error;
   3910 }
   3911 
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3917 
/*
 * Autoconfiguration attach function: intentionally empty -- the real
 * setup happens when a RAID set is configured (see rf_auto_config_set
 * and raidinit).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3923 
   3924 
   3925 static int
   3926 raid_detach(device_t self, int flags)
   3927 {
   3928 	int error;
   3929 	struct raid_softc *rs = raidget(device_unit(self));
   3930 
   3931 	if (rs == NULL)
   3932 		return ENXIO;
   3933 
   3934 	if ((error = raidlock(rs)) != 0)
   3935 		return (error);
   3936 
   3937 	error = raid_detach_unlocked(rs);
   3938 
   3939 	raidunlock(rs);
   3940 
   3941 	/* XXXkd: raidput(rs) ??? */
   3942 
   3943 	return error;
   3944 }
   3945 
/*
 * Synthesize a disk geometry for the RAID set and register it with the
 * disk subsystem.  Only secperunit and secsize reflect real values;
 * sectors-per-track is set to the stripe's data sector count and the
 * track count is fixed at 4 per column (a fabricated geometry).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): cylinders left zero; presumably derived inside
	   disk_set_info() -- confirm against disk(9). */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
   3960 
   3961 /*
   3962  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3963  * We end up returning whatever error was returned by the first cache flush
   3964  * that fails.
   3965  */
   3966 
   3967 int
   3968 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3969 {
   3970 	int c, sparecol;
   3971 	int e,error;
   3972 	int force = 1;
   3973 
   3974 	error = 0;
   3975 	for (c = 0; c < raidPtr->numCol; c++) {
   3976 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3977 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3978 					  &force, FWRITE, NOCRED);
   3979 			if (e) {
   3980 				if (e != ENODEV)
   3981 					printf("raid%d: cache flush to component %s failed.\n",
   3982 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3983 				if (error == 0) {
   3984 					error = e;
   3985 				}
   3986 			}
   3987 		}
   3988 	}
   3989 
   3990 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3991 		sparecol = raidPtr->numCol + c;
   3992 		/* Need to ensure that the reconstruct actually completed! */
   3993 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3994 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3995 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3996 			if (e) {
   3997 				if (e != ENODEV)
   3998 					printf("raid%d: cache flush to component %s failed.\n",
   3999 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   4000 				if (error == 0) {
   4001 					error = e;
   4002 				}
   4003 			}
   4004 		}
   4005 	}
   4006 	return error;
   4007 }
   4008