Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.345.2.4
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.345.2.4 2016/07/19 06:26:59 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.345.2.4 2016/07/19 06:26:59 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/localcount.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #include "ioconf.h"
    157 
    158 #ifdef DEBUG
    159 int     rf_kdebug_level = 0;
    160 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    161 #else				/* DEBUG */
    162 #define db1_printf(a) { }
    163 #endif				/* DEBUG */
    164 
    165 #ifdef DEBUG_ROOT
    166 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    167 #else
    168 #define DPRINTF(a, ...)
    169 #endif
    170 
    171 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    172 static rf_declare_mutex2(rf_sparet_wait_mutex);
    173 static rf_declare_cond2(rf_sparet_wait_cv);
    174 static rf_declare_cond2(rf_sparet_resp_cv);
    175 
    176 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    177 						 * spare table */
    178 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    179 						 * installation process */
    180 #endif
    181 
    182 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    183 
    184 /* prototypes */
    185 static void KernelWakeupFunc(struct buf *);
    186 static void InitBP(struct buf *, struct vnode *, unsigned,
    187     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    188     void *, int, struct proc *);
    189 struct raid_softc;
    190 static void raidinit(struct raid_softc *);
    191 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
/*
 * Block-device switch for raid(4): entry points used by the kernel
 * for block-special opens/closes, strategy, ioctl, crash dumps and
 * partition sizing.  D_DISK marks the device as a disk.
 */
const struct bdevsw raid_bdevsw = {
	LOCALCOUNT_INITIALIZER
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    231 
/*
 * Character-device (raw) switch for raid(4).  Unsupported operations
 * use the standard no*() stubs.
 */
const struct cdevsw raid_cdevsw = {
	LOCALCOUNT_INITIALIZER
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    247 
/*
 * dk(9) driver hooks: the generic disk framework calls back into
 * raid(4) through these for I/O start, dumping and last-close
 * processing.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    257 
/*
 * Per-unit software state: dk(9) glue (sc_dksc), unit bookkeeping,
 * and the RAIDframe controller state proper (sc_r).  Units are kept
 * on the global "raids" list via sc_link.
 */
struct raid_softc {
	struct dk_softc sc_dksc;
	int	sc_unit;	/* unit number of this raid device */
	int     sc_flags;	/* flags (RAIDF_*) */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe state for this set */
	LIST_ENTRY(raid_softc) sc_link;
};
    270 /* sc_flags */
    271 #define RAIDF_INITED		0x01	/* unit has been initialized */
    272 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    273 #define RAIDF_DETACH  		0x04	/* detach after final close */
    274 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    275 #define RAIDF_LOCKED		0x10	/* unit is locked */
    276 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    277 
    278 #define	raidunit(x)	DISKUNIT(x)
    279 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    280 
    281 extern struct cfdriver raid_cd;
    282 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    283     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    284     DVF_DETACH_SHUTDOWN);
    285 
    286 /*
    287  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    288  * Be aware that large numbers can allow the driver to consume a lot of
    289  * kernel memory, especially on writes, and in degraded mode reads.
    290  *
    291  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    292  * a single 64K write will typically require 64K for the old data,
    293  * 64K for the old parity, and 64K for the new parity, for a total
    294  * of 192K (if the parity buffer is not re-used immediately).
    295  * Even it if is used immediately, that's still 128K, which when multiplied
    296  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    297  *
    298  * Now in degraded mode, for example, a 64K read on the above setup may
    299  * require data reconstruction, which will require *all* of the 4 remaining
    300  * disks to participate -- 4 * 32K/disk == 128K again.
    301  */
    302 
    303 #ifndef RAIDOUTSTANDING
    304 #define RAIDOUTSTANDING   6
    305 #endif
    306 
    307 #define RAIDLABELDEV(dev)	\
    308 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    309 
    310 /* declared here, and made public, for the benefit of KVM stuff.. */
    311 
    312 static int raidlock(struct raid_softc *);
    313 static void raidunlock(struct raid_softc *);
    314 
    315 static int raid_detach_unlocked(struct raid_softc *);
    316 
    317 static void rf_markalldirty(RF_Raid_t *);
    318 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    319 
    320 void rf_ReconThread(struct rf_recon_req *);
    321 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    322 void rf_CopybackThread(RF_Raid_t *raidPtr);
    323 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    324 int rf_autoconfig(device_t);
    325 void rf_buildroothack(RF_ConfigSet_t *);
    326 
    327 RF_AutoConfig_t *rf_find_raid_components(void);
    328 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    329 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    330 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    331 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    332 int rf_set_autoconfig(RF_Raid_t *, int);
    333 int rf_set_rootpartition(RF_Raid_t *, int);
    334 void rf_release_all_vps(RF_ConfigSet_t *);
    335 void rf_cleanup_config_set(RF_ConfigSet_t *);
    336 int rf_have_enough_components(RF_ConfigSet_t *);
    337 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    338 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    339 
    340 /*
    341  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    342  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    343  * in the kernel config file.
    344  */
    345 #ifdef RAID_AUTOCONFIG
    346 int raidautoconfig = 1;
    347 #else
    348 int raidautoconfig = 0;
    349 #endif
    350 static bool raidautoconfigdone = false;
    351 
    352 struct RF_Pools_s rf_pools;
    353 
    354 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    355 static kmutex_t raid_lock;
    356 
    357 static struct raid_softc *
    358 raidcreate(int unit) {
    359 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    360 	if (sc == NULL) {
    361 #ifdef DIAGNOSTIC
    362 		printf("%s: out of memory\n", __func__);
    363 #endif
    364 		return NULL;
    365 	}
    366 	sc->sc_unit = unit;
    367 	cv_init(&sc->sc_cv, "raidunit");
    368 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    369 	return sc;
    370 }
    371 
    372 static void
    373 raiddestroy(struct raid_softc *sc) {
    374 	cv_destroy(&sc->sc_cv);
    375 	mutex_destroy(&sc->sc_mutex);
    376 	kmem_free(sc, sizeof(*sc));
    377 }
    378 
    379 static struct raid_softc *
    380 raidget(int unit, bool create) {
    381 	struct raid_softc *sc;
    382 	if (unit < 0) {
    383 #ifdef DIAGNOSTIC
    384 		panic("%s: unit %d!", __func__, unit);
    385 #endif
    386 		return NULL;
    387 	}
    388 	mutex_enter(&raid_lock);
    389 	LIST_FOREACH(sc, &raids, sc_link) {
    390 		if (sc->sc_unit == unit) {
    391 			mutex_exit(&raid_lock);
    392 			return sc;
    393 		}
    394 	}
    395 	mutex_exit(&raid_lock);
    396 	if (!create)
    397 		return NULL;
    398 	if ((sc = raidcreate(unit)) == NULL)
    399 		return NULL;
    400 	mutex_enter(&raid_lock);
    401 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    402 	mutex_exit(&raid_lock);
    403 	return sc;
    404 }
    405 
/*
 * Unlink "sc" from the global raid list and free it.  Counterpart of
 * the creating half of raidget().
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    413 
/*
 * Legacy pseudo-device attach entry point; intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    423 
    424 int
    425 rf_autoconfig(device_t self)
    426 {
    427 	RF_AutoConfig_t *ac_list;
    428 	RF_ConfigSet_t *config_sets;
    429 
    430 	if (!raidautoconfig || raidautoconfigdone == true)
    431 		return (0);
    432 
    433 	/* XXX This code can only be run once. */
    434 	raidautoconfigdone = true;
    435 
    436 #ifdef __HAVE_CPU_BOOTCONF
    437 	/*
    438 	 * 0. find the boot device if needed first so we can use it later
    439 	 * this needs to be done before we autoconfigure any raid sets,
    440 	 * because if we use wedges we are not going to be able to open
    441 	 * the boot device later
    442 	 */
    443 	if (booted_device == NULL)
    444 		cpu_bootconf();
    445 #endif
    446 	/* 1. locate all RAID components on the system */
    447 	aprint_debug("Searching for RAID components...\n");
    448 	ac_list = rf_find_raid_components();
    449 
    450 	/* 2. Sort them into their respective sets. */
    451 	config_sets = rf_create_auto_sets(ac_list);
    452 
    453 	/*
    454 	 * 3. Evaluate each set and configure the valid ones.
    455 	 * This gets done in rf_buildroothack().
    456 	 */
    457 	rf_buildroothack(config_sets);
    458 
    459 	return 1;
    460 }
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
    486 void
    487 rf_buildroothack(RF_ConfigSet_t *config_sets)
    488 {
    489 	RF_ConfigSet_t *cset;
    490 	RF_ConfigSet_t *next_cset;
    491 	int num_root;
    492 	struct raid_softc *sc, *rsc;
    493 	struct dk_softc *dksc;
    494 
    495 	sc = rsc = NULL;
    496 	num_root = 0;
    497 	cset = config_sets;
    498 	while (cset != NULL) {
    499 		next_cset = cset->next;
    500 		if (rf_have_enough_components(cset) &&
    501 		    cset->ac->clabel->autoconfigure == 1) {
    502 			sc = rf_auto_config_set(cset);
    503 			if (sc != NULL) {
    504 				aprint_debug("raid%d: configured ok\n",
    505 				    sc->sc_unit);
    506 				if (cset->rootable) {
    507 					rsc = sc;
    508 					num_root++;
    509 				}
    510 			} else {
    511 				/* The autoconfig didn't work :( */
    512 				aprint_debug("Autoconfig failed\n");
    513 				rf_release_all_vps(cset);
    514 			}
    515 		} else {
    516 			/* we're not autoconfiguring this set...
    517 			   release the associated resources */
    518 			rf_release_all_vps(cset);
    519 		}
    520 		/* cleanup */
    521 		rf_cleanup_config_set(cset);
    522 		cset = next_cset;
    523 	}
    524 	dksc = &rsc->sc_dksc;
    525 
    526 	/* if the user has specified what the root device should be
    527 	   then we don't touch booted_device or boothowto... */
    528 
    529 	if (rootspec != NULL)
    530 		return;
    531 
    532 	/* we found something bootable... */
    533 
    534 	/*
    535 	 * XXX: The following code assumes that the root raid
    536 	 * is the first ('a') partition. This is about the best
    537 	 * we can do with a BSD disklabel, but we might be able
    538 	 * to do better with a GPT label, by setting a specified
    539 	 * attribute to indicate the root partition. We can then
    540 	 * stash the partition number in the r->root_partition
    541 	 * high bits (the bottom 2 bits are already used). For
    542 	 * now we just set booted_partition to 0 when we override
    543 	 * root.
    544 	 */
    545 	if (num_root == 1) {
    546 		device_t candidate_root;
    547 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    548 			char cname[sizeof(cset->ac->devname)];
    549 			/* XXX: assume partition 'a' first */
    550 			snprintf(cname, sizeof(cname), "%s%c",
    551 			    device_xname(dksc->sc_dev), 'a');
    552 			candidate_root = dkwedge_find_by_wname(cname);
    553 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    554 			    cname);
    555 			if (candidate_root == NULL) {
    556 				/*
    557 				 * If that is not found, because we don't use
    558 				 * disklabel, return the first dk child
    559 				 * XXX: we can skip the 'a' check above
    560 				 * and always do this...
    561 				 */
    562 				size_t i = 0;
    563 				candidate_root = dkwedge_find_by_parent(
    564 				    device_xname(dksc->sc_dev), &i);
    565 			}
    566 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    567 			    candidate_root);
    568 		} else
    569 			candidate_root = dksc->sc_dev;
    570 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    571 		DPRINTF("%s: booted_device=%p root_partition=%d "
    572 		   "contains_boot=%d\n", __func__, booted_device,
    573 		   rsc->sc_r.root_partition,
    574 		   rf_containsboot(&rsc->sc_r, booted_device));
    575 		if (booted_device == NULL ||
    576 		    rsc->sc_r.root_partition == 1 ||
    577 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    578 			booted_device = candidate_root;
    579 			booted_partition = 0;	/* XXX assume 'a' */
    580 		}
    581 	} else if (num_root > 1) {
    582 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    583 		    booted_device);
    584 
    585 		/*
    586 		 * Maybe the MD code can help. If it cannot, then
    587 		 * setroot() will discover that we have no
    588 		 * booted_device and will ask the user if nothing was
    589 		 * hardwired in the kernel config file
    590 		 */
    591 		if (booted_device == NULL)
    592 			return;
    593 
    594 		num_root = 0;
    595 		mutex_enter(&raid_lock);
    596 		LIST_FOREACH(sc, &raids, sc_link) {
    597 			RF_Raid_t *r = &sc->sc_r;
    598 			if (r->valid == 0)
    599 				continue;
    600 
    601 			if (r->root_partition == 0)
    602 				continue;
    603 
    604 			if (rf_containsboot(r, booted_device)) {
    605 				num_root++;
    606 				rsc = sc;
    607 				dksc = &rsc->sc_dksc;
    608 			}
    609 		}
    610 		mutex_exit(&raid_lock);
    611 
    612 		if (num_root == 1) {
    613 			booted_device = dksc->sc_dev;
    614 			booted_partition = 0;	/* XXX assume 'a' */
    615 		} else {
    616 			/* we can't guess.. require the user to answer... */
    617 			boothowto |= RB_ASKNAME;
    618 		}
    619 	}
    620 }
    621 
    622 static int
    623 raidsize(dev_t dev)
    624 {
    625 	struct raid_softc *rs;
    626 	struct dk_softc *dksc;
    627 	unsigned int unit;
    628 
    629 	unit = raidunit(dev);
    630 	if ((rs = raidget(unit, false)) == NULL)
    631 		return -1;
    632 	dksc = &rs->sc_dksc;
    633 
    634 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    635 		return -1;
    636 
    637 	return dk_size(dksc, dev);
    638 }
    639 
    640 static int
    641 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    642 {
    643 	unsigned int unit;
    644 	struct raid_softc *rs;
    645 	struct dk_softc *dksc;
    646 
    647 	unit = raidunit(dev);
    648 	if ((rs = raidget(unit, false)) == NULL)
    649 		return ENXIO;
    650 	dksc = &rs->sc_dksc;
    651 
    652 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    653 		return ENODEV;
    654 
    655         /*
    656            Note that blkno is relative to this particular partition.
    657            By adding adding RF_PROTECTED_SECTORS, we get a value that
    658 	   is relative to the partition used for the underlying component.
    659         */
    660 	blkno += RF_PROTECTED_SECTORS;
    661 
    662 	return dk_dump(dksc, dev, blkno, va, size);
    663 }
    664 
/*
 * dkdriver d_dumpblocks hook: write "nblk" blocks of crash-dump data
 * at "va" to component block "blkno".  Only RAID 1 sets (one data
 * column, one parity column) are supported; the dump is written to a
 * single live component, or to a used spare standing in for one.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: any optimal (live) component wins. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare replaces. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the chosen component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    770 
    771 /* ARGSUSED */
    772 static int
    773 raidopen(dev_t dev, int flags, int fmt,
    774     struct lwp *l)
    775 {
    776 	int     unit = raidunit(dev);
    777 	struct raid_softc *rs;
    778 	struct dk_softc *dksc;
    779 	int     error = 0;
    780 	int     part, pmask;
    781 
    782 	if ((rs = raidget(unit, true)) == NULL)
    783 		return ENXIO;
    784 	if ((error = raidlock(rs)) != 0)
    785 		return (error);
    786 
    787 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    788 		error = EBUSY;
    789 		goto bad;
    790 	}
    791 
    792 	dksc = &rs->sc_dksc;
    793 
    794 	part = DISKPART(dev);
    795 	pmask = (1 << part);
    796 
    797 	if (!DK_BUSY(dksc, pmask) &&
    798 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    799 		/* First one... mark things as dirty... Note that we *MUST*
    800 		 have done a configure before this.  I DO NOT WANT TO BE
    801 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    802 		 THAT THEY BELONG TOGETHER!!!!! */
    803 		/* XXX should check to see if we're only open for reading
    804 		   here... If so, we needn't do this, but then need some
    805 		   other way of keeping track of what's happened.. */
    806 
    807 		rf_markalldirty(&rs->sc_r);
    808 	}
    809 
    810 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    811 		error = dk_open(dksc, dev, flags, fmt, l);
    812 
    813 bad:
    814 	raidunlock(rs);
    815 
    816 	return (error);
    817 
    818 
    819 }
    820 
    821 static int
    822 raid_lastclose(device_t self)
    823 {
    824 	struct raid_softc *rs = raidsoftc(self);
    825 
    826 	/* Last one... device is not unconfigured yet.
    827 	   Device shutdown has taken care of setting the
    828 	   clean bits if RAIDF_INITED is not set
    829 	   mark things as clean... */
    830 
    831 	rf_update_component_labels(&rs->sc_r,
    832 	    RF_FINAL_COMPONENT_UPDATE);
    833 
    834 	/* pass to unlocked code */
    835 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    836 		rs->sc_flags |= RAIDF_DETACH;
    837 
    838 	return 0;
    839 }
    840 
    841 /* ARGSUSED */
    842 static int
    843 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    844 {
    845 	int     unit = raidunit(dev);
    846 	struct raid_softc *rs;
    847 	struct dk_softc *dksc;
    848 	cfdata_t cf;
    849 	int     error = 0, do_detach = 0, do_put = 0;
    850 
    851 	if ((rs = raidget(unit, false)) == NULL)
    852 		return ENXIO;
    853 	dksc = &rs->sc_dksc;
    854 
    855 	if ((error = raidlock(rs)) != 0)
    856 		return (error);
    857 
    858 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    859 		error = dk_close(dksc, dev, flags, fmt, l);
    860 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    861 			do_detach = 1;
    862 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    863 		do_put = 1;
    864 
    865 	raidunlock(rs);
    866 
    867 	if (do_detach) {
    868 		/* free the pseudo device attach bits */
    869 		cf = device_cfdata(dksc->sc_dev);
    870 		error = config_detach(dksc->sc_dev, 0);
    871 		if (error == 0)
    872 			free(cf, M_RAIDFRAME);
    873 	} else if (do_put) {
    874 		raidput(rs);
    875 	}
    876 
    877 	return (error);
    878 
    879 }
    880 
/*
 * Wake whoever waits on raidPtr->iodone_cv (signalled under
 * iodone_lock) so that queued I/O gets (re)considered.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    888 
    889 static void
    890 raidstrategy(struct buf *bp)
    891 {
    892 	unsigned int unit;
    893 	struct raid_softc *rs;
    894 	struct dk_softc *dksc;
    895 	RF_Raid_t *raidPtr;
    896 
    897 	unit = raidunit(bp->b_dev);
    898 	if ((rs = raidget(unit, false)) == NULL) {
    899 		bp->b_error = ENXIO;
    900 		goto fail;
    901 	}
    902 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	dksc = &rs->sc_dksc;
    907 	raidPtr = &rs->sc_r;
    908 
    909 	/* Queue IO only */
    910 	if (dk_strategy_defer(dksc, bp))
    911 		goto done;
    912 
    913 	/* schedule the IO to happen at the next convenient time */
    914 	raid_wakeup(raidPtr);
    915 
    916 done:
    917 	return;
    918 
    919 fail:
    920 	bp->b_resid = bp->b_bcount;
    921 	biodone(bp);
    922 }
    923 
    924 static int
    925 raid_diskstart(device_t dev, struct buf *bp)
    926 {
    927 	struct raid_softc *rs = raidsoftc(dev);
    928 	RF_Raid_t *raidPtr;
    929 
    930 	raidPtr = &rs->sc_r;
    931 	if (!raidPtr->valid) {
    932 		db1_printf(("raid is not valid..\n"));
    933 		return ENODEV;
    934 	}
    935 
    936 	/* XXX */
    937 	bp->b_resid = 0;
    938 
    939 	return raiddoaccess(raidPtr, bp);
    940 }
    941 
    942 void
    943 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    944 {
    945 	struct raid_softc *rs;
    946 	struct dk_softc *dksc;
    947 
    948 	rs = raidPtr->softc;
    949 	dksc = &rs->sc_dksc;
    950 
    951 	dk_done(dksc, bp);
    952 
    953 	rf_lock_mutex2(raidPtr->mutex);
    954 	raidPtr->openings++;
    955 	rf_unlock_mutex2(raidPtr->mutex);
    956 
    957 	/* schedule more IO */
    958 	raid_wakeup(raidPtr);
    959 }
    960 
    961 /* ARGSUSED */
    962 static int
    963 raidread(dev_t dev, struct uio *uio, int flags)
    964 {
    965 	int     unit = raidunit(dev);
    966 	struct raid_softc *rs;
    967 
    968 	if ((rs = raidget(unit, false)) == NULL)
    969 		return ENXIO;
    970 
    971 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    972 		return (ENXIO);
    973 
    974 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    975 
    976 }
    977 
    978 /* ARGSUSED */
    979 static int
    980 raidwrite(dev_t dev, struct uio *uio, int flags)
    981 {
    982 	int     unit = raidunit(dev);
    983 	struct raid_softc *rs;
    984 
    985 	if ((rs = raidget(unit, false)) == NULL)
    986 		return ENXIO;
    987 
    988 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    989 		return (ENXIO);
    990 
    991 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    992 
    993 }
    994 
/*
 * Tear down a configured RAID set: shut RAIDframe down, drain and
 * free the buffer queue, and detach the dk(9)/disk(9) state.
 * Returns EBUSY while the unit is open or a reconstruction, parity
 * rewrite, or copyback is in progress; 0 if the unit was never
 * initialized (nothing to do) or on success.
 *
 * NOTE(review): the name suggests this runs without the unit lock
 * held -- the locking protocol is not visible here; confirm against
 * callers.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open, or while background operations run. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to shut down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1032 
/*
 * Ioctl handler for raid(4).  RAIDframe-specific commands
 * (configure/shutdown, rebuild and copyback control, status queries,
 * component-label manipulation, parity-map control) are handled
 * here; anything not recognized falls through to dk_ioctl() for the
 * generic disk ioctls, with DIOCCACHESYNC additionally forwarded to
 * the component disks.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
		/* Shared tail of RAIDFRAME_CONFIGURE and the COMPAT_50
		 * path above; k_cfg is a kernel copy of the config. */
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		/* The user's label is copied in only to learn which
		 * column is wanted; the temporary copy is freed and the
		 * in-core label for that column is copied back out. */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* NOTE(review): no-op -- returns retcode (0) without
		 * removing anything; confirm this is intentional. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares follow the data columns in Disks[]. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/* NOTE(review): returns 0 and discards retcode from
		 * RF_CREATE_THREAD, unlike the other thread-creating
		 * cases above -- confirm this is intentional. */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is optimal. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): "rf_broadcast_conf2" looks like a typo
		 * for rf_broadcast_cond2 (cf. the SEND_SPARET case
		 * below); harmless while under #if 0, but would not
		 * compile if enabled. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	switch (cmd) {
	case DIOCCACHESYNC:
		/* Forward cache sync to the component disks. */
		return rf_sync_component_caches(raidPtr);

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
   1806 
   1807 
/*
 * raidinit -- complete the rest of the initialization for the
 * RAIDframe device: attach the pseudo-device, initialize and attach
 * the dk(9)/disk(9) layers and the buffer queue, mark the unit
 * RAIDF_INITED, and kick off wedge discovery.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: release the cfdata and bail; the unit
		 * is left without RAIDF_INITED set. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Look for wedges (GPT partitions etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1867 
   1868 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue the request for the user-level daemon and wake it. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response on the resp queue.
	 * (Both queues share rf_sparet_wait_mutex.) */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon passes its status back in the "fcol" field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1901 #endif
   1902 
   1903 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1904  * bp & passes it down.
   1905  * any calls originating in the kernel must use non-blocking I/O
   1906  * do some extra sanity checking to return "appropriate" error values for
   1907  * certain conditions (to make some standard utilities work)
   1908  *
   1909  * Formerly known as: rf_DoAccessKernel
   1910  */
   1911 void
   1912 raidstart(RF_Raid_t *raidPtr)
   1913 {
   1914 	struct raid_softc *rs;
   1915 	struct dk_softc *dksc;
   1916 
   1917 	rs = raidPtr->softc;
   1918 	dksc = &rs->sc_dksc;
   1919 	/* quick check to see if anything has died recently */
   1920 	rf_lock_mutex2(raidPtr->mutex);
   1921 	if (raidPtr->numNewFailures > 0) {
   1922 		rf_unlock_mutex2(raidPtr->mutex);
   1923 		rf_update_component_labels(raidPtr,
   1924 					   RF_NORMAL_COMPONENT_UPDATE);
   1925 		rf_lock_mutex2(raidPtr->mutex);
   1926 		raidPtr->numNewFailures--;
   1927 	}
   1928 	rf_unlock_mutex2(raidPtr->mutex);
   1929 
   1930 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1931 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1932 		return;
   1933 	}
   1934 
   1935 	dk_start(dksc, NULL);
   1936 }
   1937 
/*
 * Submit one buf to RAIDframe.
 *
 * Returns EAGAIN when no openings are currently available (the caller
 * will retry later), ENOSPC when the request lies outside the array or
 * is not a whole number of sectors, otherwise the return value of
 * rf_DoAccess().  All I/O issued here is non-blocking/asynchronous.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* Reserve capacity: bail out early if no openings are left. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Whole sectors in the request, plus one partial if unaligned. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this branch; db1_printf may
	 * still be compiled away elsewhere. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject requests past the end of the array; the (sum < x)
	 * comparisons catch arithmetic wrap-around as well. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject requests that aren't a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening; KernelWakeupFunc path gives it back. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2010 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch a single RAIDframe disk-queue request: NOPs are completed
 * immediately via KernelWakeupFunc(); reads and writes are turned into
 * a buf (InitBP) and handed to the component's strategy routine.
 * Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the extra parens around the printf argument
		 * are redundant but harmless. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* Fake an immediately-completed I/O for this queue entry. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf; KernelWakeupFunc runs at biodone time. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.
 */
/*
 * Runs at biodone time for bufs issued by rf_DispatchKernelIO().
 * Records trace timing, marks the component failed on I/O error (if
 * doing so would not exceed the layout's fault tolerance), then moves
 * the request onto raidPtr->iodone and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is consumed by raidstart() to
			 * trigger a component-label update. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2155 
   2156 
   2157 /*
   2158  * initialize a buf structure for doing an I/O in the kernel.
   2159  */
   2160 static void
   2161 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2162        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2163        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2164        struct proc *b_proc)
   2165 {
   2166 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2167 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2168 	bp->b_oflags = 0;
   2169 	bp->b_cflags = 0;
   2170 	bp->b_bcount = numSect << logBytesPerSector;
   2171 	bp->b_bufsize = bp->b_bcount;
   2172 	bp->b_error = 0;
   2173 	bp->b_dev = dev;
   2174 	bp->b_data = bf;
   2175 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2176 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2177 	if (bp->b_bcount == 0) {
   2178 		panic("bp->b_bcount is zero in InitBP!!");
   2179 	}
   2180 	bp->b_proc = b_proc;
   2181 	bp->b_iodone = cbFunc;
   2182 	bp->b_private = cbArg;
   2183 }
   2184 
   2185 /*
   2186  * Wait interruptibly for an exclusive lock.
   2187  *
   2188  * XXX
   2189  * Several drivers do this; it should be abstracted and made MP-safe.
   2190  * (Hmm... where have we seen this warning before :->  GO )
   2191  */
   2192 static int
   2193 raidlock(struct raid_softc *rs)
   2194 {
   2195 	int     error;
   2196 
   2197 	error = 0;
   2198 	mutex_enter(&rs->sc_mutex);
   2199 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2200 		rs->sc_flags |= RAIDF_WANTED;
   2201 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2202 		if (error != 0)
   2203 			goto done;
   2204 	}
   2205 	rs->sc_flags |= RAIDF_LOCKED;
   2206 done:
   2207 	mutex_exit(&rs->sc_mutex);
   2208 	return (error);
   2209 }
   2210 /*
   2211  * Unlock and wake up any waiters.
   2212  */
   2213 static void
   2214 raidunlock(struct raid_softc *rs)
   2215 {
   2216 
   2217 	mutex_enter(&rs->sc_mutex);
   2218 	rs->sc_flags &= ~RAIDF_LOCKED;
   2219 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2220 		rs->sc_flags &= ~RAIDF_WANTED;
   2221 		cv_broadcast(&rs->sc_cv);
   2222 	}
   2223 	mutex_exit(&rs->sc_mutex);
   2224 }
   2225 
   2226 
   2227 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2228 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2229 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2230 
   2231 static daddr_t
   2232 rf_component_info_offset(void)
   2233 {
   2234 
   2235 	return RF_COMPONENT_INFO_OFFSET;
   2236 }
   2237 
   2238 static daddr_t
   2239 rf_component_info_size(unsigned secsize)
   2240 {
   2241 	daddr_t info_size;
   2242 
   2243 	KASSERT(secsize);
   2244 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2245 		info_size = secsize;
   2246 	else
   2247 		info_size = RF_COMPONENT_INFO_SIZE;
   2248 
   2249 	return info_size;
   2250 }
   2251 
   2252 static daddr_t
   2253 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2254 {
   2255 	daddr_t map_offset;
   2256 
   2257 	KASSERT(raidPtr->bytesPerSector);
   2258 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2259 		map_offset = raidPtr->bytesPerSector;
   2260 	else
   2261 		map_offset = RF_COMPONENT_INFO_SIZE;
   2262 	map_offset += rf_component_info_offset();
   2263 
   2264 	return map_offset;
   2265 }
   2266 
   2267 static daddr_t
   2268 rf_parity_map_size(RF_Raid_t *raidPtr)
   2269 {
   2270 	daddr_t map_size;
   2271 
   2272 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2273 		map_size = raidPtr->bytesPerSector;
   2274 	else
   2275 		map_size = RF_PARITY_MAP_SIZE;
   2276 
   2277 	return map_size;
   2278 }
   2279 
   2280 int
   2281 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2282 {
   2283 	RF_ComponentLabel_t *clabel;
   2284 
   2285 	clabel = raidget_component_label(raidPtr, col);
   2286 	clabel->clean = RF_RAID_CLEAN;
   2287 	raidflush_component_label(raidPtr, col);
   2288 	return(0);
   2289 }
   2290 
   2291 
   2292 int
   2293 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2294 {
   2295 	RF_ComponentLabel_t *clabel;
   2296 
   2297 	clabel = raidget_component_label(raidPtr, col);
   2298 	clabel->clean = RF_RAID_DIRTY;
   2299 	raidflush_component_label(raidPtr, col);
   2300 	return(0);
   2301 }
   2302 
   2303 int
   2304 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2305 {
   2306 	KASSERT(raidPtr->bytesPerSector);
   2307 	return raidread_component_label(raidPtr->bytesPerSector,
   2308 	    raidPtr->Disks[col].dev,
   2309 	    raidPtr->raid_cinfo[col].ci_vp,
   2310 	    &raidPtr->raid_cinfo[col].ci_label);
   2311 }
   2312 
   2313 RF_ComponentLabel_t *
   2314 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2315 {
   2316 	return &raidPtr->raid_cinfo[col].ci_label;
   2317 }
   2318 
   2319 int
   2320 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2321 {
   2322 	RF_ComponentLabel_t *label;
   2323 
   2324 	label = &raidPtr->raid_cinfo[col].ci_label;
   2325 	label->mod_counter = raidPtr->mod_counter;
   2326 #ifndef RF_NO_PARITY_MAP
   2327 	label->parity_map_modcount = label->mod_counter;
   2328 #endif
   2329 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2330 	    raidPtr->Disks[col].dev,
   2331 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2332 }
   2333 
   2334 
   2335 static int
   2336 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2337     RF_ComponentLabel_t *clabel)
   2338 {
   2339 	return raidread_component_area(dev, b_vp, clabel,
   2340 	    sizeof(RF_ComponentLabel_t),
   2341 	    rf_component_info_offset(),
   2342 	    rf_component_info_size(secsize));
   2343 }
   2344 
/*
 * Read dsize bytes starting at byte offset "offset" of the raw
 * component device into a scratch buf, then copy the first msize
 * bytes into "data".  Returns 0 or the error from biowait().
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read straight from the block device */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2382 
   2383 
   2384 static int
   2385 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2386     RF_ComponentLabel_t *clabel)
   2387 {
   2388 	return raidwrite_component_area(dev, b_vp, clabel,
   2389 	    sizeof(RF_ComponentLabel_t),
   2390 	    rf_component_info_offset(),
   2391 	    rf_component_info_size(secsize), 0);
   2392 }
   2393 
/*
 * Write msize bytes of "data" (zero-padded out to dsize) at byte
 * offset "offset" of the raw component device.  If asyncp is set the
 * write is fired off B_ASYNC and 0 is returned immediately — errors
 * from an async write are never reported to the caller.
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-fill then copy so the tail of the area is deterministic */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* async: buf is reclaimed on completion — presumably via
		 * biodone's B_ASYNC handling; TODO confirm */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2428 
   2429 void
   2430 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2431 {
   2432 	int c;
   2433 
   2434 	for (c = 0; c < raidPtr->numCol; c++) {
   2435 		/* Skip dead disks. */
   2436 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2437 			continue;
   2438 		/* XXXjld: what if an error occurs here? */
   2439 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2440 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2441 		    RF_PARITYMAP_NBYTE,
   2442 		    rf_parity_map_offset(raidPtr),
   2443 		    rf_parity_map_size(raidPtr), 0);
   2444 	}
   2445 }
   2446 
   2447 void
   2448 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2449 {
   2450 	struct rf_paritymap_ondisk tmp;
   2451 	int c,first;
   2452 
   2453 	first=1;
   2454 	for (c = 0; c < raidPtr->numCol; c++) {
   2455 		/* Skip dead disks. */
   2456 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2457 			continue;
   2458 		raidread_component_area(raidPtr->Disks[c].dev,
   2459 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2460 		    RF_PARITYMAP_NBYTE,
   2461 		    rf_parity_map_offset(raidPtr),
   2462 		    rf_parity_map_size(raidPtr));
   2463 		if (first) {
   2464 			memcpy(map, &tmp, sizeof(*map));
   2465 			first = 0;
   2466 		} else {
   2467 			rf_paritymap_merge(map, &tmp);
   2468 		}
   2469 	}
   2470 }
   2471 
/*
 * Bump the set's modification counter and mark every usable component
 * (and every in-use spare) dirty in its on-disk label.  Spared and
 * dead components are left untouched.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for;
			 * scol stays -1 if no column references it */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2531 
   2532 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares with the current modification counter and unit number.  When
 * "final" is RF_FINAL_COMPONENT_UPDATE and parity is known good, also
 * set the clean bit (shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for;
			 * scol stays -1 if no column references it */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2610 
   2611 void
   2612 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2613 {
   2614 
   2615 	if (vp != NULL) {
   2616 		if (auto_configured == 1) {
   2617 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2618 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2619 			vput(vp);
   2620 
   2621 		} else {
   2622 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2623 		}
   2624 	}
   2625 }
   2626 
   2627 
   2628 void
   2629 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2630 {
   2631 	int r,c;
   2632 	struct vnode *vp;
   2633 	int acd;
   2634 
   2635 
   2636 	/* We take this opportunity to close the vnodes like we should.. */
   2637 
   2638 	for (c = 0; c < raidPtr->numCol; c++) {
   2639 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2640 		acd = raidPtr->Disks[c].auto_configured;
   2641 		rf_close_component(raidPtr, vp, acd);
   2642 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2643 		raidPtr->Disks[c].auto_configured = 0;
   2644 	}
   2645 
   2646 	for (r = 0; r < raidPtr->numSpare; r++) {
   2647 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2648 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2649 		rf_close_component(raidPtr, vp, acd);
   2650 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2651 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2652 	}
   2653 }
   2654 
   2655 
   2656 void
   2657 rf_ReconThread(struct rf_recon_req *req)
   2658 {
   2659 	int     s;
   2660 	RF_Raid_t *raidPtr;
   2661 
   2662 	s = splbio();
   2663 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2664 	raidPtr->recon_in_progress = 1;
   2665 
   2666 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2667 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2668 
   2669 	RF_Free(req, sizeof(*req));
   2670 
   2671 	raidPtr->recon_in_progress = 0;
   2672 	splx(s);
   2673 
   2674 	/* That's all... */
   2675 	kthread_exit(0);	/* does not return */
   2676 }
   2677 
   2678 void
   2679 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2680 {
   2681 	int retcode;
   2682 	int s;
   2683 
   2684 	raidPtr->parity_rewrite_stripes_done = 0;
   2685 	raidPtr->parity_rewrite_in_progress = 1;
   2686 	s = splbio();
   2687 	retcode = rf_RewriteParity(raidPtr);
   2688 	splx(s);
   2689 	if (retcode) {
   2690 		printf("raid%d: Error re-writing parity (%d)!\n",
   2691 		    raidPtr->raidid, retcode);
   2692 	} else {
   2693 		/* set the clean bit!  If we shutdown correctly,
   2694 		   the clean bit on each component label will get
   2695 		   set */
   2696 		raidPtr->parity_good = RF_RAID_CLEAN;
   2697 	}
   2698 	raidPtr->parity_rewrite_in_progress = 0;
   2699 
   2700 	/* Anyone waiting for us to stop?  If so, inform them... */
   2701 	if (raidPtr->waitShutdown) {
   2702 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2703 	}
   2704 
   2705 	/* That's all... */
   2706 	kthread_exit(0);	/* does not return */
   2707 }
   2708 
   2709 
   2710 void
   2711 rf_CopybackThread(RF_Raid_t *raidPtr)
   2712 {
   2713 	int s;
   2714 
   2715 	raidPtr->copyback_in_progress = 1;
   2716 	s = splbio();
   2717 	rf_CopybackReconstructedData(raidPtr);
   2718 	splx(s);
   2719 	raidPtr->copyback_in_progress = 0;
   2720 
   2721 	/* That's all... */
   2722 	kthread_exit(0);	/* does not return */
   2723 }
   2724 
   2725 
   2726 void
   2727 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2728 {
   2729 	int s;
   2730 	RF_Raid_t *raidPtr;
   2731 
   2732 	s = splbio();
   2733 	raidPtr = req->raidPtr;
   2734 	raidPtr->recon_in_progress = 1;
   2735 	rf_ReconstructInPlace(raidPtr, req->col);
   2736 	RF_Free(req, sizeof(*req));
   2737 	raidPtr->recon_in_progress = 0;
   2738 	splx(s);
   2739 
   2740 	/* That's all... */
   2741 	kthread_exit(0);	/* does not return */
   2742 }
   2743 
/*
 * Examine one candidate component during auto-configuration: read its
 * component label and, if it looks reasonable, prepend a new
 * RF_AutoConfig_t to ac_list.  If the label is bad (or unreadable)
 * the vnode is closed and released.  On out-of-memory the entire
 * ac_list is torn down and NULL is returned; otherwise the (possibly
 * extended) list is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* free every entry (and its label) already collected */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* new entry takes ownership of clabel and vp */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2801 
/*
 * Scan all disk-class devices in the system for RAIDframe components.
 *
 * The device list is walked *twice*: the first pass looks only at wedges
 * ("dk" devices), the second at everything else.  This ordering prefers a
 * wedge covering a whole disk over that disk's raw partition.  For each
 * candidate partition the device is opened, its size queried, and
 * rf_get_component() is asked to read and validate the component label.
 *
 * Returns a (possibly NULL) linked list of discovered components; each
 * list entry holds an open vnode that the autoconfig code must eventually
 * close and release.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges expose a single partition; other disks
			   are probed through their raw partition first */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: don't log noise for devices that
			   refuse the open */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* A wedge is a RAID candidate only if its
				   partition type says so; no disklabel to
				   consult here. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of vp
				   (it closes it if no valid label is found) */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3005 
   3006 
   3007 int
   3008 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3009 {
   3010 
   3011 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3012 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3013 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3014 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3015 	    clabel->row >=0 &&
   3016 	    clabel->column >= 0 &&
   3017 	    clabel->num_rows > 0 &&
   3018 	    clabel->num_columns > 0 &&
   3019 	    clabel->row < clabel->num_rows &&
   3020 	    clabel->column < clabel->num_columns &&
   3021 	    clabel->blockSize > 0 &&
   3022 	    /*
   3023 	     * numBlocksHi may contain garbage, but it is ok since
   3024 	     * the type is unsigned.  If it is really garbage,
   3025 	     * rf_fix_old_label_size() will fix it.
   3026 	     */
   3027 	    rf_component_label_numblocks(clabel) > 0) {
   3028 		/*
   3029 		 * label looks reasonable enough...
   3030 		 * let's make sure it has no old garbage.
   3031 		 */
   3032 		if (numsecs)
   3033 			rf_fix_old_label_size(clabel, numsecs);
   3034 		return(1);
   3035 	}
   3036 	return(0);
   3037 }
   3038 
   3039 
   3040 /*
   3041  * For reasons yet unknown, some old component labels have garbage in
   3042  * the newer numBlocksHi region, and this causes lossage.  Since those
   3043  * disks will also have numsecs set to less than 32 bits of sectors,
   3044  * we can determine when this corruption has occurred, and fix it.
   3045  *
   3046  * The exact same problem, with the same unknown reason, happens to
   3047  * the partitionSizeHi member as well.
   3048  */
   3049 static void
   3050 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3051 {
   3052 
   3053 	if (numsecs < ((uint64_t)1 << 32)) {
   3054 		if (clabel->numBlocksHi) {
   3055 			printf("WARNING: total sectors < 32 bits, yet "
   3056 			       "numBlocksHi set\n"
   3057 			       "WARNING: resetting numBlocksHi to zero.\n");
   3058 			clabel->numBlocksHi = 0;
   3059 		}
   3060 
   3061 		if (clabel->partitionSizeHi) {
   3062 			printf("WARNING: total sectors < 32 bits, yet "
   3063 			       "partitionSizeHi set\n"
   3064 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3065 			clabel->partitionSizeHi = 0;
   3066 		}
   3067 	}
   3068 }
   3069 
   3070 
#ifdef DEBUG
/*
 * Pretty-print the interesting fields of a component label to the
 * console.  Compiled into DEBUG kernels only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition display names; indexed by (value & 3) so an
	   out-of-range value prints "*invalid*" instead of overrunning */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3104 
   3105 RF_ConfigSet_t *
   3106 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3107 {
   3108 	RF_AutoConfig_t *ac;
   3109 	RF_ConfigSet_t *config_sets;
   3110 	RF_ConfigSet_t *cset;
   3111 	RF_AutoConfig_t *ac_next;
   3112 
   3113 
   3114 	config_sets = NULL;
   3115 
   3116 	/* Go through the AutoConfig list, and figure out which components
   3117 	   belong to what sets.  */
   3118 	ac = ac_list;
   3119 	while(ac!=NULL) {
   3120 		/* we're going to putz with ac->next, so save it here
   3121 		   for use at the end of the loop */
   3122 		ac_next = ac->next;
   3123 
   3124 		if (config_sets == NULL) {
   3125 			/* will need at least this one... */
   3126 			config_sets = (RF_ConfigSet_t *)
   3127 				malloc(sizeof(RF_ConfigSet_t),
   3128 				       M_RAIDFRAME, M_NOWAIT);
   3129 			if (config_sets == NULL) {
   3130 				panic("rf_create_auto_sets: No memory!");
   3131 			}
   3132 			/* this one is easy :) */
   3133 			config_sets->ac = ac;
   3134 			config_sets->next = NULL;
   3135 			config_sets->rootable = 0;
   3136 			ac->next = NULL;
   3137 		} else {
   3138 			/* which set does this component fit into? */
   3139 			cset = config_sets;
   3140 			while(cset!=NULL) {
   3141 				if (rf_does_it_fit(cset, ac)) {
   3142 					/* looks like it matches... */
   3143 					ac->next = cset->ac;
   3144 					cset->ac = ac;
   3145 					break;
   3146 				}
   3147 				cset = cset->next;
   3148 			}
   3149 			if (cset==NULL) {
   3150 				/* didn't find a match above... new set..*/
   3151 				cset = (RF_ConfigSet_t *)
   3152 					malloc(sizeof(RF_ConfigSet_t),
   3153 					       M_RAIDFRAME, M_NOWAIT);
   3154 				if (cset == NULL) {
   3155 					panic("rf_create_auto_sets: No memory!");
   3156 				}
   3157 				cset->ac = ac;
   3158 				ac->next = NULL;
   3159 				cset->next = config_sets;
   3160 				cset->rootable = 0;
   3161 				config_sets = cset;
   3162 			}
   3163 		}
   3164 		ac = ac_next;
   3165 	}
   3166 
   3167 
   3168 	return(config_sets);
   3169 }
   3170 
   3171 static int
   3172 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3173 {
   3174 	RF_ComponentLabel_t *clabel1, *clabel2;
   3175 
   3176 	/* If this one matches the *first* one in the set, that's good
   3177 	   enough, since the other members of the set would have been
   3178 	   through here too... */
   3179 	/* note that we are not checking partitionSize here..
   3180 
   3181 	   Note that we are also not checking the mod_counters here.
   3182 	   If everything else matches except the mod_counter, that's
   3183 	   good enough for this test.  We will deal with the mod_counters
   3184 	   a little later in the autoconfiguration process.
   3185 
   3186 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3187 
   3188 	   The reason we don't check for this is that failed disks
   3189 	   will have lower modification counts.  If those disks are
   3190 	   not added to the set they used to belong to, then they will
   3191 	   form their own set, which may result in 2 different sets,
   3192 	   for example, competing to be configured at raid0, and
   3193 	   perhaps competing to be the root filesystem set.  If the
   3194 	   wrong ones get configured, or both attempt to become /,
   3195 	   weird behaviour and or serious lossage will occur.  Thus we
   3196 	   need to bring them into the fold here, and kick them out at
   3197 	   a later point.
   3198 
   3199 	*/
   3200 
   3201 	clabel1 = cset->ac->clabel;
   3202 	clabel2 = ac->clabel;
   3203 	if ((clabel1->version == clabel2->version) &&
   3204 	    (clabel1->serial_number == clabel2->serial_number) &&
   3205 	    (clabel1->num_rows == clabel2->num_rows) &&
   3206 	    (clabel1->num_columns == clabel2->num_columns) &&
   3207 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3208 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3209 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3210 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3211 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3212 	    (clabel1->blockSize == clabel2->blockSize) &&
   3213 	    rf_component_label_numblocks(clabel1) ==
   3214 	    rf_component_label_numblocks(clabel2) &&
   3215 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3216 	    (clabel1->root_partition == clabel2->root_partition) &&
   3217 	    (clabel1->last_unit == clabel2->last_unit) &&
   3218 	    (clabel1->config_order == clabel2->config_order)) {
   3219 		/* if it get's here, it almost *has* to be a match */
   3220 	} else {
   3221 		/* it's not consistent with somebody in the set..
   3222 		   punt */
   3223 		return(0);
   3224 	}
   3225 	/* all was fine.. it must fit... */
   3226 	return(1);
   3227 }
   3228 
/*
 * Decide whether a configuration set has enough live components to be
 * configured.  "Live" means: present in the set and carrying the set's
 * newest mod_counter (stale components are treated as failed).
 *
 * Returns 1 if the set can be configured, 0 if too many components are
 * missing for its parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* It is the maximum over all members; components with a lower
	   counter were failed/stale when the set was last written. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a component that claims that column
	   AND is up to date (matches the set's mod_counter). */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 survives no failures; RAID 4/5 survive exactly one.
	   (RAID 1 was fully handled pair-by-pair above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3331 
   3332 void
   3333 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3334 			RF_Raid_t *raidPtr)
   3335 {
   3336 	RF_ComponentLabel_t *clabel;
   3337 	int i;
   3338 
   3339 	clabel = ac->clabel;
   3340 
   3341 	/* 1. Fill in the common stuff */
   3342 	config->numRow = clabel->num_rows = 1;
   3343 	config->numCol = clabel->num_columns;
   3344 	config->numSpare = 0; /* XXX should this be set here? */
   3345 	config->sectPerSU = clabel->sectPerSU;
   3346 	config->SUsPerPU = clabel->SUsPerPU;
   3347 	config->SUsPerRU = clabel->SUsPerRU;
   3348 	config->parityConfig = clabel->parityConfig;
   3349 	/* XXX... */
   3350 	strcpy(config->diskQueueType,"fifo");
   3351 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3352 	config->layoutSpecificSize = 0; /* XXX ?? */
   3353 
   3354 	while(ac!=NULL) {
   3355 		/* row/col values will be in range due to the checks
   3356 		   in reasonable_label() */
   3357 		strcpy(config->devnames[0][ac->clabel->column],
   3358 		       ac->devname);
   3359 		ac = ac->next;
   3360 	}
   3361 
   3362 	for(i=0;i<RF_MAXDBGV;i++) {
   3363 		config->debugVars[i][0] = 0;
   3364 	}
   3365 }
   3366 
   3367 int
   3368 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3369 {
   3370 	RF_ComponentLabel_t *clabel;
   3371 	int column;
   3372 	int sparecol;
   3373 
   3374 	raidPtr->autoconfigure = new_value;
   3375 
   3376 	for(column=0; column<raidPtr->numCol; column++) {
   3377 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3378 			clabel = raidget_component_label(raidPtr, column);
   3379 			clabel->autoconfigure = new_value;
   3380 			raidflush_component_label(raidPtr, column);
   3381 		}
   3382 	}
   3383 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3384 		sparecol = raidPtr->numCol + column;
   3385 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3386 			clabel = raidget_component_label(raidPtr, sparecol);
   3387 			clabel->autoconfigure = new_value;
   3388 			raidflush_component_label(raidPtr, sparecol);
   3389 		}
   3390 	}
   3391 	return(new_value);
   3392 }
   3393 
   3394 int
   3395 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3396 {
   3397 	RF_ComponentLabel_t *clabel;
   3398 	int column;
   3399 	int sparecol;
   3400 
   3401 	raidPtr->root_partition = new_value;
   3402 	for(column=0; column<raidPtr->numCol; column++) {
   3403 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3404 			clabel = raidget_component_label(raidPtr, column);
   3405 			clabel->root_partition = new_value;
   3406 			raidflush_component_label(raidPtr, column);
   3407 		}
   3408 	}
   3409 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3410 		sparecol = raidPtr->numCol + column;
   3411 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3412 			clabel = raidget_component_label(raidPtr, sparecol);
   3413 			clabel->root_partition = new_value;
   3414 			raidflush_component_label(raidPtr, sparecol);
   3415 		}
   3416 	}
   3417 	return(new_value);
   3418 }
   3419 
   3420 void
   3421 rf_release_all_vps(RF_ConfigSet_t *cset)
   3422 {
   3423 	RF_AutoConfig_t *ac;
   3424 
   3425 	ac = cset->ac;
   3426 	while(ac!=NULL) {
   3427 		/* Close the vp, and give it back */
   3428 		if (ac->vp) {
   3429 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3430 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3431 			vput(ac->vp);
   3432 			ac->vp = NULL;
   3433 		}
   3434 		ac = ac->next;
   3435 	}
   3436 }
   3437 
   3438 
   3439 void
   3440 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3441 {
   3442 	RF_AutoConfig_t *ac;
   3443 	RF_AutoConfig_t *next_ac;
   3444 
   3445 	ac = cset->ac;
   3446 	while(ac!=NULL) {
   3447 		next_ac = ac->next;
   3448 		/* nuke the label */
   3449 		free(ac->clabel, M_RAIDFRAME);
   3450 		/* cleanup the config structure */
   3451 		free(ac, M_RAIDFRAME);
   3452 		/* "next.." */
   3453 		ac = next_ac;
   3454 	}
   3455 	/* and, finally, nuke the config set */
   3456 	free(cset, M_RAIDFRAME);
   3457 }
   3458 
   3459 
/*
 * Fill in a component label with the current state of the RAID set.
 * The per-component fields (row, column, partitionSize, ...) are NOT
 * set here; only the fields shared by every component of the set.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* layout geometry */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3492 
/*
 * Autoconfigure one configuration set: pick a unit number (preferring the
 * one recorded in the labels), build an RF_Config_t from the component
 * labels, and configure the set.
 *
 * Returns the configured softc, or NULL on failure.  On success the set's
 * rootable flag is updated from the labels' root_partition preference.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* scan upward from the label's preferred unit until we find one
	   that is free (not already holding a valid configuration) */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* unit was never allocated: create it now */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3576 
/*
 * Initialize a pool of fixed-size items for RAIDframe use: item size
 * "size", wait channel "w_chan", pre-primed with xmin items (which is
 * also the low watermark) and capped at a high watermark of xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3586 
   3587 /*
   3588  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3589  * to see if there is IO pending and if that IO could possibly be done
   3590  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3591  * otherwise.
   3592  *
   3593  */
   3594 int
   3595 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3596 {
   3597 	struct raid_softc *rs;
   3598 	struct dk_softc *dksc;
   3599 
   3600 	rs = raidPtr->softc;
   3601 	dksc = &rs->sc_dksc;
   3602 
   3603 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3604 		return 1;
   3605 
   3606 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3607 		/* there is work to do */
   3608 		return 0;
   3609 	}
   3610 	/* default is nothing to do */
   3611 	return 1;
   3612 }
   3613 
   3614 int
   3615 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3616 {
   3617 	uint64_t numsecs;
   3618 	unsigned secsize;
   3619 	int error;
   3620 
   3621 	error = getdisksize(vp, &numsecs, &secsize);
   3622 	if (error == 0) {
   3623 		diskPtr->blockSize = secsize;
   3624 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3625 		diskPtr->partitionSize = numsecs;
   3626 		return 0;
   3627 	}
   3628 	return error;
   3629 }
   3630 
/* autoconf match function: always matches (raid units are created on
 * demand, so there is nothing to probe) */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3636 
/* autoconf attach function: intentionally empty — all real setup is
 * done elsewhere (see raidinit() and the configuration paths) */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3641 
   3642 
   3643 static int
   3644 raid_detach(device_t self, int flags)
   3645 {
   3646 	int error;
   3647 	struct raid_softc *rs = raidsoftc(self);
   3648 
   3649 	if (rs == NULL)
   3650 		return ENXIO;
   3651 
   3652 	if ((error = raidlock(rs)) != 0)
   3653 		return (error);
   3654 
   3655 	error = raid_detach_unlocked(rs);
   3656 
   3657 	raidunlock(rs);
   3658 
   3659 	/* XXX raid can be referenced here */
   3660 
   3661 	if (error)
   3662 		return error;
   3663 
   3664 	/* Free the softc */
   3665 	raidput(rs);
   3666 
   3667 	return 0;
   3668 }
   3669 
/*
 * Publish a synthetic disk geometry for the RAID unit, derived from the
 * array layout (sectors per stripe as "sectors", 4 tracks per column).
 * The values are fabricated — RAID sets have no physical CHS geometry.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3685 
   3686 /*
   3687  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3688  * We end up returning whatever error was returned by the first cache flush
   3689  * that fails.
   3690  */
   3691 
   3692 int
   3693 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3694 {
   3695 	int c, sparecol;
   3696 	int e,error;
   3697 	int force = 1;
   3698 
   3699 	error = 0;
   3700 	for (c = 0; c < raidPtr->numCol; c++) {
   3701 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3702 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3703 					  &force, FWRITE, NOCRED);
   3704 			if (e) {
   3705 				if (e != ENODEV)
   3706 					printf("raid%d: cache flush to component %s failed.\n",
   3707 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3708 				if (error == 0) {
   3709 					error = e;
   3710 				}
   3711 			}
   3712 		}
   3713 	}
   3714 
   3715 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3716 		sparecol = raidPtr->numCol + c;
   3717 		/* Need to ensure that the reconstruct actually completed! */
   3718 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3719 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3720 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3721 			if (e) {
   3722 				if (e != ENODEV)
   3723 					printf("raid%d: cache flush to component %s failed.\n",
   3724 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3725 				if (error == 0) {
   3726 					error = e;
   3727 				}
   3728 			}
   3729 		}
   3730 	}
   3731 	return error;
   3732 }
   3733 
   3734 /*
   3735  * Module interface
   3736  */
   3737 
   3738 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3739 
   3740 #ifdef _MODULE
   3741 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3742 #endif
   3743 
   3744 static int raid_modcmd(modcmd_t, void *);
   3745 static int raid_modcmd_init(void);
   3746 static int raid_modcmd_fini(void);
   3747 
   3748 static int
   3749 raid_modcmd(modcmd_t cmd, void *data)
   3750 {
   3751 	int error;
   3752 
   3753 	error = 0;
   3754 	switch (cmd) {
   3755 	case MODULE_CMD_INIT:
   3756 		error = raid_modcmd_init();
   3757 		break;
   3758 	case MODULE_CMD_FINI:
   3759 		error = raid_modcmd_fini();
   3760 		break;
   3761 	default:
   3762 		error = ENOTTY;
   3763 		break;
   3764 	}
   3765 	return error;
   3766 }
   3767 
   3768 static int
   3769 raid_modcmd_init(void)
   3770 {
   3771 	int error;
   3772 #ifdef _MODULE
   3773 	int bmajor, cmajor;
   3774 #endif
   3775 
   3776 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3777 	mutex_enter(&raid_lock);
   3778 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3779 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3780 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3781 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3782 
   3783 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3784 #endif
   3785 
   3786 #ifdef _MODULE
   3787 	bmajor = cmajor = -1;
   3788 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3789 	    &raid_cdevsw, &cmajor);
   3790 	if (error != 0) {
   3791 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3792 		mutex_exit(&raid_lock);
   3793 		return error;
   3794 	}
   3795 	error = config_cfdriver_attach(&raid_cd);
   3796 	if (error != 0) {
   3797 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3798 		    __func__, error);
   3799 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3800 		mutex_exit(&raid_lock);
   3801 		return error;
   3802 	}
   3803 #endif
   3804 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3805 	if (error != 0) {
   3806 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3807 		    __func__, error);
   3808 #ifdef _MODULE
   3809 		config_cfdriver_detach(&raid_cd);
   3810 #endif
   3811 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3812 		mutex_exit(&raid_lock);
   3813 		return error;
   3814 	}
   3815 
   3816 	raidautoconfigdone = false;
   3817 
   3818 	mutex_exit(&raid_lock);
   3819 
   3820 	if (error == 0) {
   3821 		if (rf_BootRaidframe(true) == 0)
   3822 			aprint_verbose("Kernelized RAIDframe activated\n");
   3823 		else
   3824 			panic("Serious error activating RAID!!");
   3825 	}
   3826 
   3827 	/*
   3828 	 * Register a finalizer which will be used to auto-config RAID
   3829 	 * sets once all real hardware devices have been found.
   3830 	 */
   3831 	error = config_finalize_register(NULL, rf_autoconfig);
   3832 	if (error != 0) {
   3833 		aprint_error("WARNING: unable to register RAIDframe "
   3834 		    "finalizer\n");
   3835 		error = 0;
   3836 	}
   3837 
   3838 	return error;
   3839 }
   3840 
/*
 * Module finalization: refuse to unload while any raid device exists,
 * then tear down in reverse order of raid_modcmd_init().  If a later
 * detach step fails, the earlier steps are re-attached so the module
 * is left in a consistent, still-loaded state.  Returns 0 on success,
 * EBUSY if raid devices exist, or the error from the failing detach.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detach steps. */
		config_cfdriver_attach(&raid_cd);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	/* Shut down the RAIDframe engine (inverse of rf_BootRaidframe(true)). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3888