Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.345.2.3
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.345.2.3 2016/07/18 11:13:23 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.345.2.3 2016/07/18 11:13:23 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/localcount.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #include "ioconf.h"
    157 
    158 #ifdef DEBUG
    159 int     rf_kdebug_level = 0;
    160 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    161 #else				/* DEBUG */
    162 #define db1_printf(a) { }
    163 #endif				/* DEBUG */
    164 
    165 #ifdef DEBUG_ROOT
    166 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    167 #else
    168 #define DPRINTF(a, ...)
    169 #endif
    170 
    171 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    172 static rf_declare_mutex2(rf_sparet_wait_mutex);
    173 static rf_declare_cond2(rf_sparet_wait_cv);
    174 static rf_declare_cond2(rf_sparet_resp_cv);
    175 
    176 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    177 						 * spare table */
    178 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    179 						 * installation process */
    180 #endif
    181 
    182 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    183 
    184 /* prototypes */
    185 static void KernelWakeupFunc(struct buf *);
    186 static void InitBP(struct buf *, struct vnode *, unsigned,
    187     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    188     void *, int, struct proc *);
    189 struct raid_softc;
    190 static void raidinit(struct raid_softc *);
    191 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
    220 #ifdef _MODULE
    221 struct localcount raid_localcount_bdev, raid_localcount_cdev;
    222 #endif
    223 
/* Block-device switch entry points for /dev/raidN block devices. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
#ifdef _MODULE
	.d_localcount = &raid_localcount_bdev,
#endif
	.d_flag = D_DISK
};
    237 
/* Character-device switch entry points for /dev/rraidN raw devices. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
#ifdef _MODULE
	.d_localcount = &raid_localcount_cdev,
#endif
	.d_flag = D_DISK
};
    255 
/* dk(9) driver glue: how the generic disk layer calls back into RAIDframe. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,	/* start one queued buf */
	.d_dumpblocks = raid_dumpblocks, /* crash-dump support (RAID 1 only) */
	.d_lastclose = raid_lastclose,	/* final close of the unit */
	.d_minphys = minphys
};
    265 
/*
 * Per-unit software state for a RAIDframe device.  Linked on the global
 * `raids' list; looked up by unit number via raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state (must be first) */
	int	sc_unit;		/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* the RAIDframe state proper */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global raids list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    285 
    286 #define	raidunit(x)	DISKUNIT(x)
    287 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    288 
    289 extern struct cfdriver raid_cd;
    290 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    291     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    292     DVF_DETACH_SHUTDOWN);
    293 
    294 /*
    295  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    296  * Be aware that large numbers can allow the driver to consume a lot of
    297  * kernel memory, especially on writes, and in degraded mode reads.
    298  *
    299  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    300  * a single 64K write will typically require 64K for the old data,
    301  * 64K for the old parity, and 64K for the new parity, for a total
    302  * of 192K (if the parity buffer is not re-used immediately).
    303  * Even it if is used immediately, that's still 128K, which when multiplied
    304  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    305  *
    306  * Now in degraded mode, for example, a 64K read on the above setup may
    307  * require data reconstruction, which will require *all* of the 4 remaining
    308  * disks to participate -- 4 * 32K/disk == 128K again.
    309  */
    310 
    311 #ifndef RAIDOUTSTANDING
    312 #define RAIDOUTSTANDING   6
    313 #endif
    314 
    315 #define RAIDLABELDEV(dev)	\
    316 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    317 
    318 /* declared here, and made public, for the benefit of KVM stuff.. */
    319 
    320 static int raidlock(struct raid_softc *);
    321 static void raidunlock(struct raid_softc *);
    322 
    323 static int raid_detach_unlocked(struct raid_softc *);
    324 
    325 static void rf_markalldirty(RF_Raid_t *);
    326 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    327 
    328 void rf_ReconThread(struct rf_recon_req *);
    329 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    330 void rf_CopybackThread(RF_Raid_t *raidPtr);
    331 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    332 int rf_autoconfig(device_t);
    333 void rf_buildroothack(RF_ConfigSet_t *);
    334 
    335 RF_AutoConfig_t *rf_find_raid_components(void);
    336 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    337 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    338 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    339 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    340 int rf_set_autoconfig(RF_Raid_t *, int);
    341 int rf_set_rootpartition(RF_Raid_t *, int);
    342 void rf_release_all_vps(RF_ConfigSet_t *);
    343 void rf_cleanup_config_set(RF_ConfigSet_t *);
    344 int rf_have_enough_components(RF_ConfigSet_t *);
    345 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    346 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    347 
    348 /*
    349  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    350  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    351  * in the kernel config file.
    352  */
    353 #ifdef RAID_AUTOCONFIG
    354 int raidautoconfig = 1;
    355 #else
    356 int raidautoconfig = 0;
    357 #endif
    358 static bool raidautoconfigdone = false;
    359 
    360 struct RF_Pools_s rf_pools;
    361 
    362 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    363 static kmutex_t raid_lock;
    364 
    365 static struct raid_softc *
    366 raidcreate(int unit) {
    367 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    368 	if (sc == NULL) {
    369 #ifdef DIAGNOSTIC
    370 		printf("%s: out of memory\n", __func__);
    371 #endif
    372 		return NULL;
    373 	}
    374 	sc->sc_unit = unit;
    375 	cv_init(&sc->sc_cv, "raidunit");
    376 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    377 	return sc;
    378 }
    379 
    380 static void
    381 raiddestroy(struct raid_softc *sc) {
    382 	cv_destroy(&sc->sc_cv);
    383 	mutex_destroy(&sc->sc_mutex);
    384 	kmem_free(sc, sizeof(*sc));
    385 }
    386 
    387 static struct raid_softc *
    388 raidget(int unit, bool create) {
    389 	struct raid_softc *sc;
    390 	if (unit < 0) {
    391 #ifdef DIAGNOSTIC
    392 		panic("%s: unit %d!", __func__, unit);
    393 #endif
    394 		return NULL;
    395 	}
    396 	mutex_enter(&raid_lock);
    397 	LIST_FOREACH(sc, &raids, sc_link) {
    398 		if (sc->sc_unit == unit) {
    399 			mutex_exit(&raid_lock);
    400 			return sc;
    401 		}
    402 	}
    403 	mutex_exit(&raid_lock);
    404 	if (!create)
    405 		return NULL;
    406 	if ((sc = raidcreate(unit)) == NULL)
    407 		return NULL;
    408 	mutex_enter(&raid_lock);
    409 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    410 	mutex_exit(&raid_lock);
    411 	return sc;
    412 }
    413 
/*
 * Unlink a softc from the global list (under the list lock) and
 * free it.  Counterpart of raidget(..., true).
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    421 
/*
 * Legacy pseudo-device attach hook; `num' is ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    431 
/*
 * Autoconfigure RAID sets found on the system's disks.  Runs at most
 * once (guarded by raidautoconfigdone) and only if raidautoconfig is
 * enabled.  Returns 1 if a configuration pass was attempted, 0 if
 * autoconfiguration was skipped.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    469 
    470 static int
    471 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    472 	const char *bootname = device_xname(bdv);
    473 	size_t len = strlen(bootname);
    474 
    475 	for (int col = 0; col < r->numCol; col++) {
    476 		const char *devname = r->Disks[col].devname;
    477 		devname += sizeof("/dev/") - 1;
    478 		if (strncmp(devname, "dk", 2) == 0) {
    479 			const char *parent =
    480 			    dkwedge_get_parent_name(r->Disks[col].dev);
    481 			if (parent != NULL)
    482 				devname = parent;
    483 		}
    484 		if (strncmp(devname, bootname, len) == 0) {
    485 			struct raid_softc *sc = r->softc;
    486 			aprint_debug("raid%d includes boot device %s\n",
    487 			    sc->sc_unit, devname);
    488 			return 1;
    489 		}
    490 	}
    491 	return 0;
    492 }
    493 
    494 void
    495 rf_buildroothack(RF_ConfigSet_t *config_sets)
    496 {
    497 	RF_ConfigSet_t *cset;
    498 	RF_ConfigSet_t *next_cset;
    499 	int num_root;
    500 	struct raid_softc *sc, *rsc;
    501 	struct dk_softc *dksc;
    502 
    503 	sc = rsc = NULL;
    504 	num_root = 0;
    505 	cset = config_sets;
    506 	while (cset != NULL) {
    507 		next_cset = cset->next;
    508 		if (rf_have_enough_components(cset) &&
    509 		    cset->ac->clabel->autoconfigure == 1) {
    510 			sc = rf_auto_config_set(cset);
    511 			if (sc != NULL) {
    512 				aprint_debug("raid%d: configured ok\n",
    513 				    sc->sc_unit);
    514 				if (cset->rootable) {
    515 					rsc = sc;
    516 					num_root++;
    517 				}
    518 			} else {
    519 				/* The autoconfig didn't work :( */
    520 				aprint_debug("Autoconfig failed\n");
    521 				rf_release_all_vps(cset);
    522 			}
    523 		} else {
    524 			/* we're not autoconfiguring this set...
    525 			   release the associated resources */
    526 			rf_release_all_vps(cset);
    527 		}
    528 		/* cleanup */
    529 		rf_cleanup_config_set(cset);
    530 		cset = next_cset;
    531 	}
    532 	dksc = &rsc->sc_dksc;
    533 
    534 	/* if the user has specified what the root device should be
    535 	   then we don't touch booted_device or boothowto... */
    536 
    537 	if (rootspec != NULL)
    538 		return;
    539 
    540 	/* we found something bootable... */
    541 
    542 	/*
    543 	 * XXX: The following code assumes that the root raid
    544 	 * is the first ('a') partition. This is about the best
    545 	 * we can do with a BSD disklabel, but we might be able
    546 	 * to do better with a GPT label, by setting a specified
    547 	 * attribute to indicate the root partition. We can then
    548 	 * stash the partition number in the r->root_partition
    549 	 * high bits (the bottom 2 bits are already used). For
    550 	 * now we just set booted_partition to 0 when we override
    551 	 * root.
    552 	 */
    553 	if (num_root == 1) {
    554 		device_t candidate_root;
    555 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    556 			char cname[sizeof(cset->ac->devname)];
    557 			/* XXX: assume partition 'a' first */
    558 			snprintf(cname, sizeof(cname), "%s%c",
    559 			    device_xname(dksc->sc_dev), 'a');
    560 			candidate_root = dkwedge_find_by_wname(cname);
    561 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    562 			    cname);
    563 			if (candidate_root == NULL) {
    564 				/*
    565 				 * If that is not found, because we don't use
    566 				 * disklabel, return the first dk child
    567 				 * XXX: we can skip the 'a' check above
    568 				 * and always do this...
    569 				 */
    570 				size_t i = 0;
    571 				candidate_root = dkwedge_find_by_parent(
    572 				    device_xname(dksc->sc_dev), &i);
    573 			}
    574 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    575 			    candidate_root);
    576 		} else
    577 			candidate_root = dksc->sc_dev;
    578 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    579 		DPRINTF("%s: booted_device=%p root_partition=%d "
    580 		   "contains_boot=%d\n", __func__, booted_device,
    581 		   rsc->sc_r.root_partition,
    582 		   rf_containsboot(&rsc->sc_r, booted_device));
    583 		if (booted_device == NULL ||
    584 		    rsc->sc_r.root_partition == 1 ||
    585 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    586 			booted_device = candidate_root;
    587 			booted_partition = 0;	/* XXX assume 'a' */
    588 		}
    589 	} else if (num_root > 1) {
    590 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    591 		    booted_device);
    592 
    593 		/*
    594 		 * Maybe the MD code can help. If it cannot, then
    595 		 * setroot() will discover that we have no
    596 		 * booted_device and will ask the user if nothing was
    597 		 * hardwired in the kernel config file
    598 		 */
    599 		if (booted_device == NULL)
    600 			return;
    601 
    602 		num_root = 0;
    603 		mutex_enter(&raid_lock);
    604 		LIST_FOREACH(sc, &raids, sc_link) {
    605 			RF_Raid_t *r = &sc->sc_r;
    606 			if (r->valid == 0)
    607 				continue;
    608 
    609 			if (r->root_partition == 0)
    610 				continue;
    611 
    612 			if (rf_containsboot(r, booted_device)) {
    613 				num_root++;
    614 				rsc = sc;
    615 				dksc = &rsc->sc_dksc;
    616 			}
    617 		}
    618 		mutex_exit(&raid_lock);
    619 
    620 		if (num_root == 1) {
    621 			booted_device = dksc->sc_dev;
    622 			booted_partition = 0;	/* XXX assume 'a' */
    623 		} else {
    624 			/* we can't guess.. require the user to answer... */
    625 			boothowto |= RB_ASKNAME;
    626 		}
    627 	}
    628 }
    629 
    630 static int
    631 raidsize(dev_t dev)
    632 {
    633 	struct raid_softc *rs;
    634 	struct dk_softc *dksc;
    635 	unsigned int unit;
    636 
    637 	unit = raidunit(dev);
    638 	if ((rs = raidget(unit, false)) == NULL)
    639 		return -1;
    640 	dksc = &rs->sc_dksc;
    641 
    642 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    643 		return -1;
    644 
    645 	return dk_size(dksc, dev);
    646 }
    647 
/*
 * bdev d_dump entry point: write `size' bytes of crash-dump data from
 * `va' at block `blkno' of partition `dev'.  The real per-component
 * work happens in raid_dumpblocks() via dk_dump().
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size);
}
    672 
/*
 * Write `nblk' sectors of crash-dump data at `blkno' directly to one
 * live component of the set.  Only RAID 1 sets are supported, because
 * then a single component holds a complete copy of the data.  Returns
 * 0 on success or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Dispatch the dump through the component's own block device. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    778 
/*
 * bdev/cdev d_open entry point.  Creates the softc on first reference
 * (raidget with create=true), and on the first open of a configured
 * unit marks all components dirty before handing off to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    828 
    829 static int
    830 raid_lastclose(device_t self)
    831 {
    832 	struct raid_softc *rs = raidsoftc(self);
    833 
    834 	/* Last one... device is not unconfigured yet.
    835 	   Device shutdown has taken care of setting the
    836 	   clean bits if RAIDF_INITED is not set
    837 	   mark things as clean... */
    838 
    839 	rf_update_component_labels(&rs->sc_r,
    840 	    RF_FINAL_COMPONENT_UPDATE);
    841 
    842 	/* pass to unlocked code */
    843 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    844 		rs->sc_flags |= RAIDF_DETACH;
    845 
    846 	return 0;
    847 }
    848 
/*
 * bdev/cdev d_close entry point.  Closes via dk_close() when the unit
 * is configured; afterwards (outside the unit lock) either detaches
 * the pseudo-device (RAIDF_DETACH, set by raid_lastclose) or frees
 * the never-configured softc (RAIDF_SHUTDOWN).
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* Drop the unit lock before detaching/destroying the unit. */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    888 
/*
 * Poke the RAIDframe I/O completion thread: take the iodone lock and
 * signal the iodone condition variable so queued work is (re)examined.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    896 
    897 static void
    898 raidstrategy(struct buf *bp)
    899 {
    900 	unsigned int unit;
    901 	struct raid_softc *rs;
    902 	struct dk_softc *dksc;
    903 	RF_Raid_t *raidPtr;
    904 
    905 	unit = raidunit(bp->b_dev);
    906 	if ((rs = raidget(unit, false)) == NULL) {
    907 		bp->b_error = ENXIO;
    908 		goto fail;
    909 	}
    910 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    911 		bp->b_error = ENXIO;
    912 		goto fail;
    913 	}
    914 	dksc = &rs->sc_dksc;
    915 	raidPtr = &rs->sc_r;
    916 
    917 	/* Queue IO only */
    918 	if (dk_strategy_defer(dksc, bp))
    919 		goto done;
    920 
    921 	/* schedule the IO to happen at the next convenient time */
    922 	raid_wakeup(raidPtr);
    923 
    924 done:
    925 	return;
    926 
    927 fail:
    928 	bp->b_resid = bp->b_bcount;
    929 	biodone(bp);
    930 }
    931 
    932 static int
    933 raid_diskstart(device_t dev, struct buf *bp)
    934 {
    935 	struct raid_softc *rs = raidsoftc(dev);
    936 	RF_Raid_t *raidPtr;
    937 
    938 	raidPtr = &rs->sc_r;
    939 	if (!raidPtr->valid) {
    940 		db1_printf(("raid is not valid..\n"));
    941 		return ENODEV;
    942 	}
    943 
    944 	/* XXX */
    945 	bp->b_resid = 0;
    946 
    947 	return raiddoaccess(raidPtr, bp);
    948 }
    949 
    950 void
    951 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    952 {
    953 	struct raid_softc *rs;
    954 	struct dk_softc *dksc;
    955 
    956 	rs = raidPtr->softc;
    957 	dksc = &rs->sc_dksc;
    958 
    959 	dk_done(dksc, bp);
    960 
    961 	rf_lock_mutex2(raidPtr->mutex);
    962 	raidPtr->openings++;
    963 	rf_unlock_mutex2(raidPtr->mutex);
    964 
    965 	/* schedule more IO */
    966 	raid_wakeup(raidPtr);
    967 }
    968 
    969 /* ARGSUSED */
    970 static int
    971 raidread(dev_t dev, struct uio *uio, int flags)
    972 {
    973 	int     unit = raidunit(dev);
    974 	struct raid_softc *rs;
    975 
    976 	if ((rs = raidget(unit, false)) == NULL)
    977 		return ENXIO;
    978 
    979 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    980 		return (ENXIO);
    981 
    982 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    983 
    984 }
    985 
    986 /* ARGSUSED */
    987 static int
    988 raidwrite(dev_t dev, struct uio *uio, int flags)
    989 {
    990 	int     unit = raidunit(dev);
    991 	struct raid_softc *rs;
    992 
    993 	if ((rs = raidget(unit, false)) == NULL)
    994 		return ENXIO;
    995 
    996 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    997 		return (ENXIO);
    998 
    999 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1000 
   1001 }
   1002 
/*
 * Tear down a configured RAID set; the caller must hold the softc lock.
 * Fails with EBUSY while the device is open or while a reconstruction,
 * parity rewrite or copyback is still in progress.  Returns 0 without
 * doing anything if the set was never initialized.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while the device is busy or background ops are running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* The detach is happening now; clear any pending shutdown request. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1040 
   1041 static int
   1042 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1043 {
   1044 	int     unit = raidunit(dev);
   1045 	int     error = 0;
   1046 	int     part, pmask;
   1047 	struct raid_softc *rs;
   1048 	struct dk_softc *dksc;
   1049 	RF_Config_t *k_cfg, *u_cfg;
   1050 	RF_Raid_t *raidPtr;
   1051 	RF_RaidDisk_t *diskPtr;
   1052 	RF_AccTotals_t *totals;
   1053 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1054 	u_char *specific_buf;
   1055 	int retcode = 0;
   1056 	int column;
   1057 /*	int raidid; */
   1058 	struct rf_recon_req *rrcopy, *rr;
   1059 	RF_ComponentLabel_t *clabel;
   1060 	RF_ComponentLabel_t *ci_label;
   1061 	RF_ComponentLabel_t **clabel_ptr;
   1062 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1063 	RF_SingleComponent_t component;
   1064 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1065 	int i, j, d;
   1066 
   1067 	if ((rs = raidget(unit, false)) == NULL)
   1068 		return ENXIO;
   1069 	dksc = &rs->sc_dksc;
   1070 	raidPtr = &rs->sc_r;
   1071 
   1072 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1073 		(int) DISKPART(dev), (int) unit, cmd));
   1074 
   1075 	/* Must be initialized for these... */
   1076 	switch (cmd) {
   1077 	case RAIDFRAME_REWRITEPARITY:
   1078 	case RAIDFRAME_GET_INFO:
   1079 	case RAIDFRAME_RESET_ACCTOTALS:
   1080 	case RAIDFRAME_GET_ACCTOTALS:
   1081 	case RAIDFRAME_KEEP_ACCTOTALS:
   1082 	case RAIDFRAME_GET_SIZE:
   1083 	case RAIDFRAME_FAIL_DISK:
   1084 	case RAIDFRAME_COPYBACK:
   1085 	case RAIDFRAME_CHECK_RECON_STATUS:
   1086 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1087 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1088 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1089 	case RAIDFRAME_ADD_HOT_SPARE:
   1090 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1091 	case RAIDFRAME_INIT_LABELS:
   1092 	case RAIDFRAME_REBUILD_IN_PLACE:
   1093 	case RAIDFRAME_CHECK_PARITY:
   1094 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1095 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1096 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1097 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1098 	case RAIDFRAME_SET_AUTOCONFIG:
   1099 	case RAIDFRAME_SET_ROOT:
   1100 	case RAIDFRAME_DELETE_COMPONENT:
   1101 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1102 	case RAIDFRAME_PARITYMAP_STATUS:
   1103 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1104 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1105 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1106 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1107 			return (ENXIO);
   1108 	}
   1109 
   1110 	switch (cmd) {
   1111 #ifdef COMPAT_50
   1112 	case RAIDFRAME_GET_INFO50:
   1113 		return rf_get_info50(raidPtr, data);
   1114 
   1115 	case RAIDFRAME_CONFIGURE50:
   1116 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1117 			return retcode;
   1118 		goto config;
   1119 #endif
   1120 		/* configure the system */
   1121 	case RAIDFRAME_CONFIGURE:
   1122 
   1123 		if (raidPtr->valid) {
   1124 			/* There is a valid RAID set running on this unit! */
   1125 			printf("raid%d: Device already configured!\n",unit);
   1126 			return(EINVAL);
   1127 		}
   1128 
   1129 		/* copy-in the configuration information */
   1130 		/* data points to a pointer to the configuration structure */
   1131 
   1132 		u_cfg = *((RF_Config_t **) data);
   1133 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1134 		if (k_cfg == NULL) {
   1135 			return (ENOMEM);
   1136 		}
   1137 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1138 		if (retcode) {
   1139 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1140 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1141 				retcode));
   1142 			goto no_config;
   1143 		}
   1144 		goto config;
   1145 	config:
   1146 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1147 
   1148 		/* allocate a buffer for the layout-specific data, and copy it
   1149 		 * in */
   1150 		if (k_cfg->layoutSpecificSize) {
   1151 			if (k_cfg->layoutSpecificSize > 10000) {
   1152 				/* sanity check */
   1153 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1154 				retcode = EINVAL;
   1155 				goto no_config;
   1156 			}
   1157 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1158 			    (u_char *));
   1159 			if (specific_buf == NULL) {
   1160 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1161 				retcode = ENOMEM;
   1162 				goto no_config;
   1163 			}
   1164 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1165 			    k_cfg->layoutSpecificSize);
   1166 			if (retcode) {
   1167 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1168 				RF_Free(specific_buf,
   1169 					k_cfg->layoutSpecificSize);
   1170 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1171 					retcode));
   1172 				goto no_config;
   1173 			}
   1174 		} else
   1175 			specific_buf = NULL;
   1176 		k_cfg->layoutSpecific = specific_buf;
   1177 
   1178 		/* should do some kind of sanity check on the configuration.
   1179 		 * Store the sum of all the bytes in the last byte? */
   1180 
   1181 		/* configure the system */
   1182 
   1183 		/*
   1184 		 * Clear the entire RAID descriptor, just to make sure
   1185 		 *  there is no stale data left in the case of a
   1186 		 *  reconfiguration
   1187 		 */
   1188 		memset(raidPtr, 0, sizeof(*raidPtr));
   1189 		raidPtr->softc = rs;
   1190 		raidPtr->raidid = unit;
   1191 
   1192 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1193 
   1194 		if (retcode == 0) {
   1195 
   1196 			/* allow this many simultaneous IO's to
   1197 			   this RAID device */
   1198 			raidPtr->openings = RAIDOUTSTANDING;
   1199 
   1200 			raidinit(rs);
   1201 			raid_wakeup(raidPtr);
   1202 			rf_markalldirty(raidPtr);
   1203 		}
   1204 		/* free the buffers.  No return code here. */
   1205 		if (k_cfg->layoutSpecificSize) {
   1206 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1207 		}
   1208 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1209 
   1210 	no_config:
   1211 		/*
   1212 		 * If configuration failed, set sc_flags so that we
   1213 		 * will detach the device when we close it.
   1214 		 */
   1215 		if (retcode != 0)
   1216 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1217 		return (retcode);
   1218 
   1219 		/* shutdown the system */
   1220 	case RAIDFRAME_SHUTDOWN:
   1221 
   1222 		part = DISKPART(dev);
   1223 		pmask = (1 << part);
   1224 
   1225 		if ((error = raidlock(rs)) != 0)
   1226 			return (error);
   1227 
   1228 		if (DK_BUSY(dksc, pmask) ||
   1229 		    raidPtr->recon_in_progress != 0 ||
   1230 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1231 		    raidPtr->copyback_in_progress != 0)
   1232 			retcode = EBUSY;
   1233 		else {
   1234 			/* detach and free on close */
   1235 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1236 			retcode = 0;
   1237 		}
   1238 
   1239 		raidunlock(rs);
   1240 
   1241 		return (retcode);
   1242 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1243 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1244 		/* need to read the component label for the disk indicated
   1245 		   by row,column in clabel */
   1246 
   1247 		/*
   1248 		 * Perhaps there should be an option to skip the in-core
   1249 		 * copy and hit the disk, as with disklabel(8).
   1250 		 */
   1251 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1252 
   1253 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1254 
   1255 		if (retcode) {
   1256 			RF_Free(clabel, sizeof(*clabel));
   1257 			return retcode;
   1258 		}
   1259 
   1260 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1261 
   1262 		column = clabel->column;
   1263 
   1264 		if ((column < 0) || (column >= raidPtr->numCol +
   1265 		    raidPtr->numSpare)) {
   1266 			RF_Free(clabel, sizeof(*clabel));
   1267 			return EINVAL;
   1268 		}
   1269 
   1270 		RF_Free(clabel, sizeof(*clabel));
   1271 
   1272 		clabel = raidget_component_label(raidPtr, column);
   1273 
   1274 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1275 
   1276 #if 0
   1277 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1278 		clabel = (RF_ComponentLabel_t *) data;
   1279 
   1280 		/* XXX check the label for valid stuff... */
   1281 		/* Note that some things *should not* get modified --
   1282 		   the user should be re-initing the labels instead of
   1283 		   trying to patch things.
   1284 		   */
   1285 
   1286 		raidid = raidPtr->raidid;
   1287 #ifdef DEBUG
   1288 		printf("raid%d: Got component label:\n", raidid);
   1289 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1290 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1291 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1292 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1293 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1294 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1295 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1296 #endif
   1297 		clabel->row = 0;
   1298 		column = clabel->column;
   1299 
   1300 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1301 			return(EINVAL);
   1302 		}
   1303 
   1304 		/* XXX this isn't allowed to do anything for now :-) */
   1305 
   1306 		/* XXX and before it is, we need to fill in the rest
   1307 		   of the fields!?!?!?! */
   1308 		memcpy(raidget_component_label(raidPtr, column),
   1309 		    clabel, sizeof(*clabel));
   1310 		raidflush_component_label(raidPtr, column);
   1311 		return (0);
   1312 #endif
   1313 
   1314 	case RAIDFRAME_INIT_LABELS:
   1315 		clabel = (RF_ComponentLabel_t *) data;
   1316 		/*
   1317 		   we only want the serial number from
   1318 		   the above.  We get all the rest of the information
   1319 		   from the config that was used to create this RAID
   1320 		   set.
   1321 		   */
   1322 
   1323 		raidPtr->serial_number = clabel->serial_number;
   1324 
   1325 		for(column=0;column<raidPtr->numCol;column++) {
   1326 			diskPtr = &raidPtr->Disks[column];
   1327 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1328 				ci_label = raidget_component_label(raidPtr,
   1329 				    column);
   1330 				/* Zeroing this is important. */
   1331 				memset(ci_label, 0, sizeof(*ci_label));
   1332 				raid_init_component_label(raidPtr, ci_label);
   1333 				ci_label->serial_number =
   1334 				    raidPtr->serial_number;
   1335 				ci_label->row = 0; /* we dont' pretend to support more */
   1336 				rf_component_label_set_partitionsize(ci_label,
   1337 				    diskPtr->partitionSize);
   1338 				ci_label->column = column;
   1339 				raidflush_component_label(raidPtr, column);
   1340 			}
   1341 			/* XXXjld what about the spares? */
   1342 		}
   1343 
   1344 		return (retcode);
   1345 	case RAIDFRAME_SET_AUTOCONFIG:
   1346 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1347 		printf("raid%d: New autoconfig value is: %d\n",
   1348 		       raidPtr->raidid, d);
   1349 		*(int *) data = d;
   1350 		return (retcode);
   1351 
   1352 	case RAIDFRAME_SET_ROOT:
   1353 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1354 		printf("raid%d: New rootpartition value is: %d\n",
   1355 		       raidPtr->raidid, d);
   1356 		*(int *) data = d;
   1357 		return (retcode);
   1358 
   1359 		/* initialize all parity */
   1360 	case RAIDFRAME_REWRITEPARITY:
   1361 
   1362 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1363 			/* Parity for RAID 0 is trivially correct */
   1364 			raidPtr->parity_good = RF_RAID_CLEAN;
   1365 			return(0);
   1366 		}
   1367 
   1368 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1369 			/* Re-write is already in progress! */
   1370 			return(EINVAL);
   1371 		}
   1372 
   1373 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1374 					   rf_RewriteParityThread,
   1375 					   raidPtr,"raid_parity");
   1376 		return (retcode);
   1377 
   1378 
   1379 	case RAIDFRAME_ADD_HOT_SPARE:
   1380 		sparePtr = (RF_SingleComponent_t *) data;
   1381 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1382 		retcode = rf_add_hot_spare(raidPtr, &component);
   1383 		return(retcode);
   1384 
   1385 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1386 		return(retcode);
   1387 
   1388 	case RAIDFRAME_DELETE_COMPONENT:
   1389 		componentPtr = (RF_SingleComponent_t *)data;
   1390 		memcpy( &component, componentPtr,
   1391 			sizeof(RF_SingleComponent_t));
   1392 		retcode = rf_delete_component(raidPtr, &component);
   1393 		return(retcode);
   1394 
   1395 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1396 		componentPtr = (RF_SingleComponent_t *)data;
   1397 		memcpy( &component, componentPtr,
   1398 			sizeof(RF_SingleComponent_t));
   1399 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1400 		return(retcode);
   1401 
   1402 	case RAIDFRAME_REBUILD_IN_PLACE:
   1403 
   1404 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1405 			/* Can't do this on a RAID 0!! */
   1406 			return(EINVAL);
   1407 		}
   1408 
   1409 		if (raidPtr->recon_in_progress == 1) {
   1410 			/* a reconstruct is already in progress! */
   1411 			return(EINVAL);
   1412 		}
   1413 
   1414 		componentPtr = (RF_SingleComponent_t *) data;
   1415 		memcpy( &component, componentPtr,
   1416 			sizeof(RF_SingleComponent_t));
   1417 		component.row = 0; /* we don't support any more */
   1418 		column = component.column;
   1419 
   1420 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1421 			return(EINVAL);
   1422 		}
   1423 
   1424 		rf_lock_mutex2(raidPtr->mutex);
   1425 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1426 		    (raidPtr->numFailures > 0)) {
   1427 			/* XXX 0 above shouldn't be constant!!! */
   1428 			/* some component other than this has failed.
   1429 			   Let's not make things worse than they already
   1430 			   are... */
   1431 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1432 			       raidPtr->raidid);
   1433 			printf("raid%d:     Col: %d   Too many failures.\n",
   1434 			       raidPtr->raidid, column);
   1435 			rf_unlock_mutex2(raidPtr->mutex);
   1436 			return (EINVAL);
   1437 		}
   1438 		if (raidPtr->Disks[column].status ==
   1439 		    rf_ds_reconstructing) {
   1440 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1441 			       raidPtr->raidid);
   1442 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1443 
   1444 			rf_unlock_mutex2(raidPtr->mutex);
   1445 			return (EINVAL);
   1446 		}
   1447 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1448 			rf_unlock_mutex2(raidPtr->mutex);
   1449 			return (EINVAL);
   1450 		}
   1451 		rf_unlock_mutex2(raidPtr->mutex);
   1452 
   1453 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1454 		if (rrcopy == NULL)
   1455 			return(ENOMEM);
   1456 
   1457 		rrcopy->raidPtr = (void *) raidPtr;
   1458 		rrcopy->col = column;
   1459 
   1460 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1461 					   rf_ReconstructInPlaceThread,
   1462 					   rrcopy,"raid_reconip");
   1463 		return(retcode);
   1464 
   1465 	case RAIDFRAME_GET_INFO:
   1466 		if (!raidPtr->valid)
   1467 			return (ENODEV);
   1468 		ucfgp = (RF_DeviceConfig_t **) data;
   1469 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1470 			  (RF_DeviceConfig_t *));
   1471 		if (d_cfg == NULL)
   1472 			return (ENOMEM);
   1473 		d_cfg->rows = 1; /* there is only 1 row now */
   1474 		d_cfg->cols = raidPtr->numCol;
   1475 		d_cfg->ndevs = raidPtr->numCol;
   1476 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1477 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1478 			return (ENOMEM);
   1479 		}
   1480 		d_cfg->nspares = raidPtr->numSpare;
   1481 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1482 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1483 			return (ENOMEM);
   1484 		}
   1485 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1486 		d = 0;
   1487 		for (j = 0; j < d_cfg->cols; j++) {
   1488 			d_cfg->devs[d] = raidPtr->Disks[j];
   1489 			d++;
   1490 		}
   1491 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1492 			d_cfg->spares[i] = raidPtr->Disks[j];
   1493 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1494 				/* XXX: raidctl(8) expects to see this as a used spare */
   1495 				d_cfg->spares[i].status = rf_ds_used_spare;
   1496 			}
   1497 		}
   1498 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1499 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1500 
   1501 		return (retcode);
   1502 
   1503 	case RAIDFRAME_CHECK_PARITY:
   1504 		*(int *) data = raidPtr->parity_good;
   1505 		return (0);
   1506 
   1507 	case RAIDFRAME_PARITYMAP_STATUS:
   1508 		if (rf_paritymap_ineligible(raidPtr))
   1509 			return EINVAL;
   1510 		rf_paritymap_status(raidPtr->parity_map,
   1511 		    (struct rf_pmstat *)data);
   1512 		return 0;
   1513 
   1514 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1515 		if (rf_paritymap_ineligible(raidPtr))
   1516 			return EINVAL;
   1517 		if (raidPtr->parity_map == NULL)
   1518 			return ENOENT; /* ??? */
   1519 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1520 			(struct rf_pmparams *)data, 1))
   1521 			return EINVAL;
   1522 		return 0;
   1523 
   1524 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1525 		if (rf_paritymap_ineligible(raidPtr))
   1526 			return EINVAL;
   1527 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1528 		return 0;
   1529 
   1530 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1531 		if (rf_paritymap_ineligible(raidPtr))
   1532 			return EINVAL;
   1533 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1534 		/* XXX should errors be passed up? */
   1535 		return 0;
   1536 
   1537 	case RAIDFRAME_RESET_ACCTOTALS:
   1538 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1539 		return (0);
   1540 
   1541 	case RAIDFRAME_GET_ACCTOTALS:
   1542 		totals = (RF_AccTotals_t *) data;
   1543 		*totals = raidPtr->acc_totals;
   1544 		return (0);
   1545 
   1546 	case RAIDFRAME_KEEP_ACCTOTALS:
   1547 		raidPtr->keep_acc_totals = *(int *)data;
   1548 		return (0);
   1549 
   1550 	case RAIDFRAME_GET_SIZE:
   1551 		*(int *) data = raidPtr->totalSectors;
   1552 		return (0);
   1553 
   1554 		/* fail a disk & optionally start reconstruction */
   1555 	case RAIDFRAME_FAIL_DISK:
   1556 
   1557 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1558 			/* Can't do this on a RAID 0!! */
   1559 			return(EINVAL);
   1560 		}
   1561 
   1562 		rr = (struct rf_recon_req *) data;
   1563 		rr->row = 0;
   1564 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1565 			return (EINVAL);
   1566 
   1567 
   1568 		rf_lock_mutex2(raidPtr->mutex);
   1569 		if (raidPtr->status == rf_rs_reconstructing) {
   1570 			/* you can't fail a disk while we're reconstructing! */
   1571 			/* XXX wrong for RAID6 */
   1572 			rf_unlock_mutex2(raidPtr->mutex);
   1573 			return (EINVAL);
   1574 		}
   1575 		if ((raidPtr->Disks[rr->col].status ==
   1576 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1577 			/* some other component has failed.  Let's not make
   1578 			   things worse. XXX wrong for RAID6 */
   1579 			rf_unlock_mutex2(raidPtr->mutex);
   1580 			return (EINVAL);
   1581 		}
   1582 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1583 			/* Can't fail a spared disk! */
   1584 			rf_unlock_mutex2(raidPtr->mutex);
   1585 			return (EINVAL);
   1586 		}
   1587 		rf_unlock_mutex2(raidPtr->mutex);
   1588 
   1589 		/* make a copy of the recon request so that we don't rely on
   1590 		 * the user's buffer */
   1591 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1592 		if (rrcopy == NULL)
   1593 			return(ENOMEM);
   1594 		memcpy(rrcopy, rr, sizeof(*rr));
   1595 		rrcopy->raidPtr = (void *) raidPtr;
   1596 
   1597 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1598 					   rf_ReconThread,
   1599 					   rrcopy,"raid_recon");
   1600 		return (0);
   1601 
   1602 		/* invoke a copyback operation after recon on whatever disk
   1603 		 * needs it, if any */
   1604 	case RAIDFRAME_COPYBACK:
   1605 
   1606 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1607 			/* This makes no sense on a RAID 0!! */
   1608 			return(EINVAL);
   1609 		}
   1610 
   1611 		if (raidPtr->copyback_in_progress == 1) {
   1612 			/* Copyback is already in progress! */
   1613 			return(EINVAL);
   1614 		}
   1615 
   1616 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1617 					   rf_CopybackThread,
   1618 					   raidPtr,"raid_copyback");
   1619 		return (retcode);
   1620 
   1621 		/* return the percentage completion of reconstruction */
   1622 	case RAIDFRAME_CHECK_RECON_STATUS:
   1623 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1624 			/* This makes no sense on a RAID 0, so tell the
   1625 			   user it's done. */
   1626 			*(int *) data = 100;
   1627 			return(0);
   1628 		}
   1629 		if (raidPtr->status != rf_rs_reconstructing)
   1630 			*(int *) data = 100;
   1631 		else {
   1632 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1633 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1634 			} else {
   1635 				*(int *) data = 0;
   1636 			}
   1637 		}
   1638 		return (0);
   1639 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1640 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1641 		if (raidPtr->status != rf_rs_reconstructing) {
   1642 			progressInfo.remaining = 0;
   1643 			progressInfo.completed = 100;
   1644 			progressInfo.total = 100;
   1645 		} else {
   1646 			progressInfo.total =
   1647 				raidPtr->reconControl->numRUsTotal;
   1648 			progressInfo.completed =
   1649 				raidPtr->reconControl->numRUsComplete;
   1650 			progressInfo.remaining = progressInfo.total -
   1651 				progressInfo.completed;
   1652 		}
   1653 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1654 				  sizeof(RF_ProgressInfo_t));
   1655 		return (retcode);
   1656 
   1657 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1658 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1659 			/* This makes no sense on a RAID 0, so tell the
   1660 			   user it's done. */
   1661 			*(int *) data = 100;
   1662 			return(0);
   1663 		}
   1664 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1665 			*(int *) data = 100 *
   1666 				raidPtr->parity_rewrite_stripes_done /
   1667 				raidPtr->Layout.numStripe;
   1668 		} else {
   1669 			*(int *) data = 100;
   1670 		}
   1671 		return (0);
   1672 
   1673 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1674 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1675 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1676 			progressInfo.total = raidPtr->Layout.numStripe;
   1677 			progressInfo.completed =
   1678 				raidPtr->parity_rewrite_stripes_done;
   1679 			progressInfo.remaining = progressInfo.total -
   1680 				progressInfo.completed;
   1681 		} else {
   1682 			progressInfo.remaining = 0;
   1683 			progressInfo.completed = 100;
   1684 			progressInfo.total = 100;
   1685 		}
   1686 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1687 				  sizeof(RF_ProgressInfo_t));
   1688 		return (retcode);
   1689 
   1690 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1691 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1692 			/* This makes no sense on a RAID 0 */
   1693 			*(int *) data = 100;
   1694 			return(0);
   1695 		}
   1696 		if (raidPtr->copyback_in_progress == 1) {
   1697 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1698 				raidPtr->Layout.numStripe;
   1699 		} else {
   1700 			*(int *) data = 100;
   1701 		}
   1702 		return (0);
   1703 
   1704 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1705 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1706 		if (raidPtr->copyback_in_progress == 1) {
   1707 			progressInfo.total = raidPtr->Layout.numStripe;
   1708 			progressInfo.completed =
   1709 				raidPtr->copyback_stripes_done;
   1710 			progressInfo.remaining = progressInfo.total -
   1711 				progressInfo.completed;
   1712 		} else {
   1713 			progressInfo.remaining = 0;
   1714 			progressInfo.completed = 100;
   1715 			progressInfo.total = 100;
   1716 		}
   1717 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1718 				  sizeof(RF_ProgressInfo_t));
   1719 		return (retcode);
   1720 
   1721 	case RAIDFRAME_SET_LAST_UNIT:
   1722 		for (column = 0; column < raidPtr->numCol; column++)
   1723 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1724 				return EBUSY;
   1725 
   1726 		for (column = 0; column < raidPtr->numCol; column++) {
   1727 			clabel = raidget_component_label(raidPtr, column);
   1728 			clabel->last_unit = *(int *)data;
   1729 			raidflush_component_label(raidPtr, column);
   1730 		}
   1731 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1732 		return 0;
   1733 
   1734 		/* the sparetable daemon calls this to wait for the kernel to
   1735 		 * need a spare table. this ioctl does not return until a
   1736 		 * spare table is needed. XXX -- calling mpsleep here in the
   1737 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1738 		 * -- I should either compute the spare table in the kernel,
   1739 		 * or have a different -- XXX XXX -- interface (a different
   1740 		 * character device) for delivering the table     -- XXX */
   1741 #if 0
   1742 	case RAIDFRAME_SPARET_WAIT:
   1743 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1744 		while (!rf_sparet_wait_queue)
   1745 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1746 		waitreq = rf_sparet_wait_queue;
   1747 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1748 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1749 
   1750 		/* structure assignment */
   1751 		*((RF_SparetWait_t *) data) = *waitreq;
   1752 
   1753 		RF_Free(waitreq, sizeof(*waitreq));
   1754 		return (0);
   1755 
   1756 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1757 		 * code in it that will cause the dameon to exit */
   1758 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1759 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1760 		waitreq->fcol = -1;
   1761 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1762 		waitreq->next = rf_sparet_wait_queue;
   1763 		rf_sparet_wait_queue = waitreq;
   1764 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1765 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1766 		return (0);
   1767 
   1768 		/* used by the spare table daemon to deliver a spare table
   1769 		 * into the kernel */
   1770 	case RAIDFRAME_SEND_SPARET:
   1771 
   1772 		/* install the spare table */
   1773 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1774 
   1775 		/* respond to the requestor.  the return status of the spare
   1776 		 * table installation is passed in the "fcol" field */
   1777 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1778 		waitreq->fcol = retcode;
   1779 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1780 		waitreq->next = rf_sparet_resp_queue;
   1781 		rf_sparet_resp_queue = waitreq;
   1782 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1783 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1784 
   1785 		return (retcode);
   1786 #endif
   1787 
   1788 	default:
   1789 		break; /* fall through to the os-specific code below */
   1790 
   1791 	}
   1792 
   1793 	if (!raidPtr->valid)
   1794 		return (EINVAL);
   1795 
   1796 	/*
   1797 	 * Add support for "regular" device ioctls here.
   1798 	 */
   1799 
   1800 	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1801 	if (error != EPASSTHROUGH)
   1802 		return (error);
   1803 
   1804 	switch (cmd) {
   1805 	case DIOCCACHESYNC:
   1806 		return rf_sync_component_caches(raidPtr);
   1807 
   1808 	default:
   1809 		retcode = ENOTTY;
   1810 	}
   1811 	return (retcode);
   1812 
   1813 }
   1814 
   1815 
   1816 /* raidinit -- complete the rest of the initialization for the
   1817    RAIDframe device.  */
   1818 
   1819 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device: build a cfdata record by hand and
	 * hand it to autoconf */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* attach failed: unit stays uninitialized (RAIDF_INITED
		 * is never set below) */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan for wedges now that the disk exists */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1875 
   1876 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1877 /* wake up the daemon & tell it to get us a spare table
   1878  * XXX
   1879  * the entries in the queues should be tagged with the raidPtr
   1880  * so that in the extremely rare case that two recons happen at once,
   1881  * we know for which device were requesting a spare table
   1882  * XXX
   1883  *
   1884  * XXX This code is not currently used. GO
   1885  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* post the request on the wait queue and wake the daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 releases the mutex while asleep and reacquires
	 * it before returning, so the queue check stays protected */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon's response carries its status in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1909 #endif
   1910 
   1911 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1912  * bp & passes it down.
   1913  * any calls originating in the kernel must use non-blocking I/O
   1914  * do some extra sanity checking to return "appropriate" error values for
   1915  * certain conditions (to make some standard utilities work)
   1916  *
   1917  * Formerly known as: rf_DoAccessKernel
   1918  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* NOTE(review): the mutex is dropped across the label
		 * update, presumably because it sleeps or takes locks of
		 * its own -- confirm */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to issue I/O before raidinit() has completed */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* kick the dk(4) queue processing */
	dk_start(dksc, NULL);
}
   1945 
   1946 static int
   1947 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1948 {
   1949 	RF_SectorCount_t num_blocks, pb, sum;
   1950 	RF_RaidAddr_t raid_addr;
   1951 	daddr_t blocknum;
   1952 	int     do_async;
   1953 	int rc;
   1954 
   1955 	rf_lock_mutex2(raidPtr->mutex);
   1956 	if (raidPtr->openings == 0) {
   1957 		rf_unlock_mutex2(raidPtr->mutex);
   1958 		return EAGAIN;
   1959 	}
   1960 	rf_unlock_mutex2(raidPtr->mutex);
   1961 
   1962 	blocknum = bp->b_rawblkno;
   1963 
   1964 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1965 		    (int) blocknum));
   1966 
   1967 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1968 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1969 
   1970 	/* *THIS* is where we adjust what block we're going to...
   1971 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1972 	raid_addr = blocknum;
   1973 
   1974 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1975 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1976 	sum = raid_addr + num_blocks + pb;
   1977 	if (1 || rf_debugKernelAccess) {
   1978 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1979 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1980 			    (int) pb, (int) bp->b_resid));
   1981 	}
   1982 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1983 	    || (sum < num_blocks) || (sum < pb)) {
   1984 		rc = ENOSPC;
   1985 		goto done;
   1986 	}
   1987 	/*
   1988 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1989 	 */
   1990 
   1991 	if (bp->b_bcount & raidPtr->sectorMask) {
   1992 		rc = ENOSPC;
   1993 		goto done;
   1994 	}
   1995 	db1_printf(("Calling DoAccess..\n"));
   1996 
   1997 
   1998 	rf_lock_mutex2(raidPtr->mutex);
   1999 	raidPtr->openings--;
   2000 	rf_unlock_mutex2(raidPtr->mutex);
   2001 
   2002 	/*
   2003 	 * Everything is async.
   2004 	 */
   2005 	do_async = 1;
   2006 
   2007 	/* don't ever condition on bp->b_flags & B_WRITE.
   2008 	 * always condition on B_READ instead */
   2009 
   2010 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2011 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2012 			 do_async, raid_addr, num_blocks,
   2013 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2014 
   2015 done:
   2016 	return rc;
   2017 }
   2018 
   2019 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2020 
   2021 int
   2022 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2023 {
   2024 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2025 	struct buf *bp;
   2026 
   2027 	req->queue = queue;
   2028 	bp = req->bp;
   2029 
   2030 	switch (req->type) {
   2031 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2032 		/* XXX need to do something extra here.. */
   2033 		/* I'm leaving this in, as I've never actually seen it used,
   2034 		 * and I'd like folks to report it... GO */
   2035 		printf(("WAKEUP CALLED\n"));
   2036 		queue->numOutstanding++;
   2037 
   2038 		bp->b_flags = 0;
   2039 		bp->b_private = req;
   2040 
   2041 		KernelWakeupFunc(bp);
   2042 		break;
   2043 
   2044 	case RF_IO_TYPE_READ:
   2045 	case RF_IO_TYPE_WRITE:
   2046 #if RF_ACC_TRACE > 0
   2047 		if (req->tracerec) {
   2048 			RF_ETIMER_START(req->tracerec->timer);
   2049 		}
   2050 #endif
   2051 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2052 		    op, queue->rf_cinfo->ci_dev,
   2053 		    req->sectorOffset, req->numSector,
   2054 		    req->buf, KernelWakeupFunc, (void *) req,
   2055 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2056 
   2057 		if (rf_debugKernelAccess) {
   2058 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2059 				(long) bp->b_blkno));
   2060 		}
   2061 		queue->numOutstanding++;
   2062 		queue->last_deq_sector = req->sectorOffset;
   2063 		/* acc wouldn't have been let in if there were any pending
   2064 		 * reqs at any other priority */
   2065 		queue->curPriority = req->priority;
   2066 
   2067 		db1_printf(("Going for %c to unit %d col %d\n",
   2068 			    req->type, queue->raidPtr->raidid,
   2069 			    queue->col));
   2070 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2071 			(int) req->sectorOffset, (int) req->numSector,
   2072 			(int) (req->numSector <<
   2073 			    queue->raidPtr->logBytesPerSector),
   2074 			(int) queue->raidPtr->logBytesPerSector));
   2075 
   2076 		/*
   2077 		 * XXX: drop lock here since this can block at
   2078 		 * least with backing SCSI devices.  Retake it
   2079 		 * to minimize fuss with calling interfaces.
   2080 		 */
   2081 
   2082 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2083 		bdev_strategy(bp);
   2084 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2085 		break;
   2086 
   2087 	default:
   2088 		panic("bad req->type in rf_DispatchKernelIO");
   2089 	}
   2090 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2091 
   2092 	return (0);
   2093 }
   2094 /* this is the callback function associated with a I/O invoked from
   2095    kernel code.
   2096  */
   2097 static void
   2098 KernelWakeupFunc(struct buf *bp)
   2099 {
   2100 	RF_DiskQueueData_t *req = NULL;
   2101 	RF_DiskQueue_t *queue;
   2102 
   2103 	db1_printf(("recovering the request queue:\n"));
   2104 
   2105 	req = bp->b_private;
   2106 
   2107 	queue = (RF_DiskQueue_t *) req->queue;
   2108 
   2109 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2110 
   2111 #if RF_ACC_TRACE > 0
   2112 	if (req->tracerec) {
   2113 		RF_ETIMER_STOP(req->tracerec->timer);
   2114 		RF_ETIMER_EVAL(req->tracerec->timer);
   2115 		rf_lock_mutex2(rf_tracing_mutex);
   2116 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2117 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2118 		req->tracerec->num_phys_ios++;
   2119 		rf_unlock_mutex2(rf_tracing_mutex);
   2120 	}
   2121 #endif
   2122 
   2123 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2124 	 * ballistic, and mark the component as hosed... */
   2125 
   2126 	if (bp->b_error != 0) {
   2127 		/* Mark the disk as dead */
   2128 		/* but only mark it once... */
   2129 		/* and only if it wouldn't leave this RAID set
   2130 		   completely broken */
   2131 		if (((queue->raidPtr->Disks[queue->col].status ==
   2132 		      rf_ds_optimal) ||
   2133 		     (queue->raidPtr->Disks[queue->col].status ==
   2134 		      rf_ds_used_spare)) &&
   2135 		     (queue->raidPtr->numFailures <
   2136 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2137 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2138 			       queue->raidPtr->raidid,
   2139 			       bp->b_error,
   2140 			       queue->raidPtr->Disks[queue->col].devname);
   2141 			queue->raidPtr->Disks[queue->col].status =
   2142 			    rf_ds_failed;
   2143 			queue->raidPtr->status = rf_rs_degraded;
   2144 			queue->raidPtr->numFailures++;
   2145 			queue->raidPtr->numNewFailures++;
   2146 		} else {	/* Disk is already dead... */
   2147 			/* printf("Disk already marked as dead!\n"); */
   2148 		}
   2149 
   2150 	}
   2151 
   2152 	/* Fill in the error value */
   2153 	req->error = bp->b_error;
   2154 
   2155 	/* Drop this one on the "finished" queue... */
   2156 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2157 
   2158 	/* Let the raidio thread know there is work to be done. */
   2159 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2160 
   2161 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2162 }
   2163 
   2164 
   2165 /*
   2166  * initialize a buf structure for doing an I/O in the kernel.
   2167  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* NOTE(review): b_vp is accepted but never referenced here */
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert sector number -> byte offset -> DEV_BSIZE block number */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	/* completion callback and its argument (see KernelWakeupFunc) */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2192 
   2193 /*
   2194  * Wait interruptibly for an exclusive lock.
   2195  *
   2196  * XXX
   2197  * Several drivers do this; it should be abstracted and made MP-safe.
   2198  * (Hmm... where have we seen this warning before :->  GO )
   2199  */
   2200 static int
   2201 raidlock(struct raid_softc *rs)
   2202 {
   2203 	int     error;
   2204 
   2205 	error = 0;
   2206 	mutex_enter(&rs->sc_mutex);
   2207 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2208 		rs->sc_flags |= RAIDF_WANTED;
   2209 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2210 		if (error != 0)
   2211 			goto done;
   2212 	}
   2213 	rs->sc_flags |= RAIDF_LOCKED;
   2214 done:
   2215 	mutex_exit(&rs->sc_mutex);
   2216 	return (error);
   2217 }
   2218 /*
   2219  * Unlock and wake up any waiters.
   2220  */
   2221 static void
   2222 raidunlock(struct raid_softc *rs)
   2223 {
   2224 
   2225 	mutex_enter(&rs->sc_mutex);
   2226 	rs->sc_flags &= ~RAIDF_LOCKED;
   2227 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2228 		rs->sc_flags &= ~RAIDF_WANTED;
   2229 		cv_broadcast(&rs->sc_cv);
   2230 	}
   2231 	mutex_exit(&rs->sc_mutex);
   2232 }
   2233 
   2234 
/* On-disk layout of per-component metadata: the component label lives
 * RF_COMPONENT_INFO_OFFSET bytes into the component; the parity map
 * follows it (see rf_parity_map_offset()).  Sizes are rounded up to a
 * whole sector for large-sector devices by the helpers below. */
#define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
#define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
#define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2238 
static daddr_t
rf_component_info_offset(void)
{

	/* Fixed byte offset of the component label within a component. */
	return RF_COMPONENT_INFO_OFFSET;
}
   2245 
   2246 static daddr_t
   2247 rf_component_info_size(unsigned secsize)
   2248 {
   2249 	daddr_t info_size;
   2250 
   2251 	KASSERT(secsize);
   2252 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2253 		info_size = secsize;
   2254 	else
   2255 		info_size = RF_COMPONENT_INFO_SIZE;
   2256 
   2257 	return info_size;
   2258 }
   2259 
   2260 static daddr_t
   2261 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2262 {
   2263 	daddr_t map_offset;
   2264 
   2265 	KASSERT(raidPtr->bytesPerSector);
   2266 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2267 		map_offset = raidPtr->bytesPerSector;
   2268 	else
   2269 		map_offset = RF_COMPONENT_INFO_SIZE;
   2270 	map_offset += rf_component_info_offset();
   2271 
   2272 	return map_offset;
   2273 }
   2274 
   2275 static daddr_t
   2276 rf_parity_map_size(RF_Raid_t *raidPtr)
   2277 {
   2278 	daddr_t map_size;
   2279 
   2280 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2281 		map_size = raidPtr->bytesPerSector;
   2282 	else
   2283 		map_size = RF_PARITY_MAP_SIZE;
   2284 
   2285 	return map_size;
   2286 }
   2287 
   2288 int
   2289 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2290 {
   2291 	RF_ComponentLabel_t *clabel;
   2292 
   2293 	clabel = raidget_component_label(raidPtr, col);
   2294 	clabel->clean = RF_RAID_CLEAN;
   2295 	raidflush_component_label(raidPtr, col);
   2296 	return(0);
   2297 }
   2298 
   2299 
   2300 int
   2301 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2302 {
   2303 	RF_ComponentLabel_t *clabel;
   2304 
   2305 	clabel = raidget_component_label(raidPtr, col);
   2306 	clabel->clean = RF_RAID_DIRTY;
   2307 	raidflush_component_label(raidPtr, col);
   2308 	return(0);
   2309 }
   2310 
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Read the on-disk label for column `col' into the in-core copy
	 * at raid_cinfo[col].ci_label. */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2320 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return the in-core label for column `col'; callers modify it
	 * in place and then call raidflush_component_label(). */
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2326 
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	/* Stamp the in-core label with the current mod counter and
	 * write it out to the component in column `col'. */
	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's counter in lockstep with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2341 
   2342 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Read the component label from its reserved on-disk area. */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2352 
   2353 /* ARGSUSED */
   2354 static int
   2355 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2356     size_t msize, daddr_t offset, daddr_t dsize)
   2357 {
   2358 	struct buf *bp;
   2359 	int error;
   2360 
   2361 	/* XXX should probably ensure that we don't try to do this if
   2362 	   someone has changed rf_protected_sectors. */
   2363 
   2364 	if (b_vp == NULL) {
   2365 		/* For whatever reason, this component is not valid.
   2366 		   Don't try to read a component label from it. */
   2367 		return(EINVAL);
   2368 	}
   2369 
   2370 	/* get a block of the appropriate size... */
   2371 	bp = geteblk((int)dsize);
   2372 	bp->b_dev = dev;
   2373 
   2374 	/* get our ducks in a row for the read */
   2375 	bp->b_blkno = offset / DEV_BSIZE;
   2376 	bp->b_bcount = dsize;
   2377 	bp->b_flags |= B_READ;
   2378  	bp->b_resid = dsize;
   2379 
   2380 	bdev_strategy(bp);
   2381 	error = biowait(bp);
   2382 
   2383 	if (!error) {
   2384 		memcpy(data, bp->b_data, msize);
   2385 	}
   2386 
   2387 	brelse(bp, 0);
   2388 	return(error);
   2389 }
   2390 
   2391 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Synchronously (trailing 0 = not async) write the component
	 * label to its reserved on-disk area. */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2401 
   2402 /* ARGSUSED */
   2403 static int
   2404 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2405     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2406 {
   2407 	struct buf *bp;
   2408 	int error;
   2409 
   2410 	/* get a block of the appropriate size... */
   2411 	bp = geteblk((int)dsize);
   2412 	bp->b_dev = dev;
   2413 
   2414 	/* get our ducks in a row for the write */
   2415 	bp->b_blkno = offset / DEV_BSIZE;
   2416 	bp->b_bcount = dsize;
   2417 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2418  	bp->b_resid = dsize;
   2419 
   2420 	memset(bp->b_data, 0, dsize);
   2421 	memcpy(bp->b_data, data, msize);
   2422 
   2423 	bdev_strategy(bp);
   2424 	if (asyncp)
   2425 		return 0;
   2426 	error = biowait(bp);
   2427 	brelse(bp, 0);
   2428 	if (error) {
   2429 #if 1
   2430 		printf("Failed to write RAID component info!\n");
   2431 #endif
   2432 	}
   2433 
   2434 	return(error);
   2435 }
   2436 
   2437 void
   2438 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2439 {
   2440 	int c;
   2441 
   2442 	for (c = 0; c < raidPtr->numCol; c++) {
   2443 		/* Skip dead disks. */
   2444 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2445 			continue;
   2446 		/* XXXjld: what if an error occurs here? */
   2447 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2448 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2449 		    RF_PARITYMAP_NBYTE,
   2450 		    rf_parity_map_offset(raidPtr),
   2451 		    rf_parity_map_size(raidPtr), 0);
   2452 	}
   2453 }
   2454 
   2455 void
   2456 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2457 {
   2458 	struct rf_paritymap_ondisk tmp;
   2459 	int c,first;
   2460 
   2461 	first=1;
   2462 	for (c = 0; c < raidPtr->numCol; c++) {
   2463 		/* Skip dead disks. */
   2464 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2465 			continue;
   2466 		raidread_component_area(raidPtr->Disks[c].dev,
   2467 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2468 		    RF_PARITYMAP_NBYTE,
   2469 		    rf_parity_map_offset(raidPtr),
   2470 		    rf_parity_map_size(raidPtr));
   2471 		if (first) {
   2472 			memcpy(map, &tmp, sizeof(*map));
   2473 			first = 0;
   2474 		} else {
   2475 			rf_paritymap_merge(map, &tmp);
   2476 		}
   2477 	}
   2478 }
   2479 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the mod counter and mark every live component (and every
	 * in-use spare) dirty on disk. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column maps to this spare,
			 * scol is still -1 here -- confirm that cannot
			 * happen for an rf_ds_used_spare disk */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2539 
   2540 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	/* first pass: rewrite the label on every optimal component */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* on shutdown, also set the clean bit if parity
			 * is known good */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* second pass: in-use spares get a label for the column they
	 * substitute for */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2618 
   2619 void
   2620 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2621 {
   2622 
   2623 	if (vp != NULL) {
   2624 		if (auto_configured == 1) {
   2625 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2626 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2627 			vput(vp);
   2628 
   2629 		} else {
   2630 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2631 		}
   2632 	}
   2633 }
   2634 
   2635 
   2636 void
   2637 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2638 {
   2639 	int r,c;
   2640 	struct vnode *vp;
   2641 	int acd;
   2642 
   2643 
   2644 	/* We take this opportunity to close the vnodes like we should.. */
   2645 
   2646 	for (c = 0; c < raidPtr->numCol; c++) {
   2647 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2648 		acd = raidPtr->Disks[c].auto_configured;
   2649 		rf_close_component(raidPtr, vp, acd);
   2650 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2651 		raidPtr->Disks[c].auto_configured = 0;
   2652 	}
   2653 
   2654 	for (r = 0; r < raidPtr->numSpare; r++) {
   2655 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2656 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2657 		rf_close_component(raidPtr, vp, acd);
   2658 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2659 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2660 	}
   2661 }
   2662 
   2663 
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: fail the requested component, passing
	 * along whether reconstruction was requested (RF_FDFLAGS_RECON). */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* done with the request structure */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2685 
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* Kernel thread body: rewrite all parity, then mark the set
	 * clean on success. */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2716 
   2717 
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	/* Kernel thread body: copy reconstructed data back from spares
	 * to replaced components. */
	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2732 
   2733 
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: reconstruct the named column in place. */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* done with the request structure */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2751 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/* Try to read a component label from (dev, vp); if it looks
	 * reasonable, prepend a new RF_AutoConfig_t to ac_list and
	 * return the new list head.  Rejected components have their
	 * vnode closed here. */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down the entire list built
		     * so far and give up on autoconfiguration */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not joining a set, so discard the label and
		 * close the component's vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2809 
/*
 * Scan every disk-class device in the system looking for RAIDframe
 * components and return a list of RF_AutoConfig_t entries, one per
 * component found (built up via rf_get_component()).
 *
 * The device tree is walked twice: wedges ("dk" devices) first, then
 * everything else, so that a wedge covering a whole disk is preferred
 * over that disk's raw partition.  For non-wedge disks each disklabel
 * partition marked FS_RAID is probed; if none is found, the raw
 * partition is probed as a last resort.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only RAIDframe-typed wedges
				   are probed for a component label. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the vnode. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3013 
   3014 
   3015 int
   3016 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3017 {
   3018 
   3019 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3020 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3021 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3022 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3023 	    clabel->row >=0 &&
   3024 	    clabel->column >= 0 &&
   3025 	    clabel->num_rows > 0 &&
   3026 	    clabel->num_columns > 0 &&
   3027 	    clabel->row < clabel->num_rows &&
   3028 	    clabel->column < clabel->num_columns &&
   3029 	    clabel->blockSize > 0 &&
   3030 	    /*
   3031 	     * numBlocksHi may contain garbage, but it is ok since
   3032 	     * the type is unsigned.  If it is really garbage,
   3033 	     * rf_fix_old_label_size() will fix it.
   3034 	     */
   3035 	    rf_component_label_numblocks(clabel) > 0) {
   3036 		/*
   3037 		 * label looks reasonable enough...
   3038 		 * let's make sure it has no old garbage.
   3039 		 */
   3040 		if (numsecs)
   3041 			rf_fix_old_label_size(clabel, numsecs);
   3042 		return(1);
   3043 	}
   3044 	return(0);
   3045 }
   3046 
   3047 
   3048 /*
   3049  * For reasons yet unknown, some old component labels have garbage in
   3050  * the newer numBlocksHi region, and this causes lossage.  Since those
   3051  * disks will also have numsecs set to less than 32 bits of sectors,
   3052  * we can determine when this corruption has occurred, and fix it.
   3053  *
   3054  * The exact same problem, with the same unknown reason, happens to
   3055  * the partitionSizeHi member as well.
   3056  */
   3057 static void
   3058 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3059 {
   3060 
   3061 	if (numsecs < ((uint64_t)1 << 32)) {
   3062 		if (clabel->numBlocksHi) {
   3063 			printf("WARNING: total sectors < 32 bits, yet "
   3064 			       "numBlocksHi set\n"
   3065 			       "WARNING: resetting numBlocksHi to zero.\n");
   3066 			clabel->numBlocksHi = 0;
   3067 		}
   3068 
   3069 		if (clabel->partitionSizeHi) {
   3070 			printf("WARNING: total sectors < 32 bits, yet "
   3071 			       "partitionSizeHi set\n"
   3072 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3073 			clabel->partitionSizeHi = 0;
   3074 		}
   3075 	}
   3076 }
   3077 
   3078 
#ifdef DEBUG
/*
 * Dump a component label to the console in human-readable form.
 * Debug builds only; used e.g. when a component is discovered
 * during autoconfiguration.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is printed via a 2-bit index into this table. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3112 
   3113 RF_ConfigSet_t *
   3114 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3115 {
   3116 	RF_AutoConfig_t *ac;
   3117 	RF_ConfigSet_t *config_sets;
   3118 	RF_ConfigSet_t *cset;
   3119 	RF_AutoConfig_t *ac_next;
   3120 
   3121 
   3122 	config_sets = NULL;
   3123 
   3124 	/* Go through the AutoConfig list, and figure out which components
   3125 	   belong to what sets.  */
   3126 	ac = ac_list;
   3127 	while(ac!=NULL) {
   3128 		/* we're going to putz with ac->next, so save it here
   3129 		   for use at the end of the loop */
   3130 		ac_next = ac->next;
   3131 
   3132 		if (config_sets == NULL) {
   3133 			/* will need at least this one... */
   3134 			config_sets = (RF_ConfigSet_t *)
   3135 				malloc(sizeof(RF_ConfigSet_t),
   3136 				       M_RAIDFRAME, M_NOWAIT);
   3137 			if (config_sets == NULL) {
   3138 				panic("rf_create_auto_sets: No memory!");
   3139 			}
   3140 			/* this one is easy :) */
   3141 			config_sets->ac = ac;
   3142 			config_sets->next = NULL;
   3143 			config_sets->rootable = 0;
   3144 			ac->next = NULL;
   3145 		} else {
   3146 			/* which set does this component fit into? */
   3147 			cset = config_sets;
   3148 			while(cset!=NULL) {
   3149 				if (rf_does_it_fit(cset, ac)) {
   3150 					/* looks like it matches... */
   3151 					ac->next = cset->ac;
   3152 					cset->ac = ac;
   3153 					break;
   3154 				}
   3155 				cset = cset->next;
   3156 			}
   3157 			if (cset==NULL) {
   3158 				/* didn't find a match above... new set..*/
   3159 				cset = (RF_ConfigSet_t *)
   3160 					malloc(sizeof(RF_ConfigSet_t),
   3161 					       M_RAIDFRAME, M_NOWAIT);
   3162 				if (cset == NULL) {
   3163 					panic("rf_create_auto_sets: No memory!");
   3164 				}
   3165 				cset->ac = ac;
   3166 				ac->next = NULL;
   3167 				cset->next = config_sets;
   3168 				cset->rootable = 0;
   3169 				config_sets = cset;
   3170 			}
   3171 		}
   3172 		ac = ac_next;
   3173 	}
   3174 
   3175 
   3176 	return(config_sets);
   3177 }
   3178 
   3179 static int
   3180 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3181 {
   3182 	RF_ComponentLabel_t *clabel1, *clabel2;
   3183 
   3184 	/* If this one matches the *first* one in the set, that's good
   3185 	   enough, since the other members of the set would have been
   3186 	   through here too... */
   3187 	/* note that we are not checking partitionSize here..
   3188 
   3189 	   Note that we are also not checking the mod_counters here.
   3190 	   If everything else matches except the mod_counter, that's
   3191 	   good enough for this test.  We will deal with the mod_counters
   3192 	   a little later in the autoconfiguration process.
   3193 
   3194 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3195 
   3196 	   The reason we don't check for this is that failed disks
   3197 	   will have lower modification counts.  If those disks are
   3198 	   not added to the set they used to belong to, then they will
   3199 	   form their own set, which may result in 2 different sets,
   3200 	   for example, competing to be configured at raid0, and
   3201 	   perhaps competing to be the root filesystem set.  If the
   3202 	   wrong ones get configured, or both attempt to become /,
   3203 	   weird behaviour and or serious lossage will occur.  Thus we
   3204 	   need to bring them into the fold here, and kick them out at
   3205 	   a later point.
   3206 
   3207 	*/
   3208 
   3209 	clabel1 = cset->ac->clabel;
   3210 	clabel2 = ac->clabel;
   3211 	if ((clabel1->version == clabel2->version) &&
   3212 	    (clabel1->serial_number == clabel2->serial_number) &&
   3213 	    (clabel1->num_rows == clabel2->num_rows) &&
   3214 	    (clabel1->num_columns == clabel2->num_columns) &&
   3215 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3216 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3217 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3218 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3219 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3220 	    (clabel1->blockSize == clabel2->blockSize) &&
   3221 	    rf_component_label_numblocks(clabel1) ==
   3222 	    rf_component_label_numblocks(clabel2) &&
   3223 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3224 	    (clabel1->root_partition == clabel2->root_partition) &&
   3225 	    (clabel1->last_unit == clabel2->last_unit) &&
   3226 	    (clabel1->config_order == clabel2->config_order)) {
   3227 		/* if it get's here, it almost *has* to be a match */
   3228 	} else {
   3229 		/* it's not consistent with somebody in the set..
   3230 		   punt */
   3231 		return(0);
   3232 	}
   3233 	/* all was fine.. it must fit... */
   3234 	return(1);
   3235 }
   3236 
/*
 * Decide whether enough live components of this config set are
 * present to configure the array.  A component counts as "live" only
 * if its mod_counter equals the newest mod_counter seen in the set.
 *
 * RAID 1 ('1') is handled specially: components are examined in
 * even/odd pairs, and only a pair where *both* halves are missing is
 * fatal.  For RAID 0 any missing component is fatal; RAID 4/5
 * tolerate one missing component.
 *
 * Returns 1 if the set can be configured, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a component with the current
	   mod_counter; a stale or absent component is "missing". */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3339 
   3340 void
   3341 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3342 			RF_Raid_t *raidPtr)
   3343 {
   3344 	RF_ComponentLabel_t *clabel;
   3345 	int i;
   3346 
   3347 	clabel = ac->clabel;
   3348 
   3349 	/* 1. Fill in the common stuff */
   3350 	config->numRow = clabel->num_rows = 1;
   3351 	config->numCol = clabel->num_columns;
   3352 	config->numSpare = 0; /* XXX should this be set here? */
   3353 	config->sectPerSU = clabel->sectPerSU;
   3354 	config->SUsPerPU = clabel->SUsPerPU;
   3355 	config->SUsPerRU = clabel->SUsPerRU;
   3356 	config->parityConfig = clabel->parityConfig;
   3357 	/* XXX... */
   3358 	strcpy(config->diskQueueType,"fifo");
   3359 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3360 	config->layoutSpecificSize = 0; /* XXX ?? */
   3361 
   3362 	while(ac!=NULL) {
   3363 		/* row/col values will be in range due to the checks
   3364 		   in reasonable_label() */
   3365 		strcpy(config->devnames[0][ac->clabel->column],
   3366 		       ac->devname);
   3367 		ac = ac->next;
   3368 	}
   3369 
   3370 	for(i=0;i<RF_MAXDBGV;i++) {
   3371 		config->debugVars[i][0] = 0;
   3372 	}
   3373 }
   3374 
   3375 int
   3376 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3377 {
   3378 	RF_ComponentLabel_t *clabel;
   3379 	int column;
   3380 	int sparecol;
   3381 
   3382 	raidPtr->autoconfigure = new_value;
   3383 
   3384 	for(column=0; column<raidPtr->numCol; column++) {
   3385 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3386 			clabel = raidget_component_label(raidPtr, column);
   3387 			clabel->autoconfigure = new_value;
   3388 			raidflush_component_label(raidPtr, column);
   3389 		}
   3390 	}
   3391 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3392 		sparecol = raidPtr->numCol + column;
   3393 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3394 			clabel = raidget_component_label(raidPtr, sparecol);
   3395 			clabel->autoconfigure = new_value;
   3396 			raidflush_component_label(raidPtr, sparecol);
   3397 		}
   3398 	}
   3399 	return(new_value);
   3400 }
   3401 
   3402 int
   3403 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3404 {
   3405 	RF_ComponentLabel_t *clabel;
   3406 	int column;
   3407 	int sparecol;
   3408 
   3409 	raidPtr->root_partition = new_value;
   3410 	for(column=0; column<raidPtr->numCol; column++) {
   3411 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3412 			clabel = raidget_component_label(raidPtr, column);
   3413 			clabel->root_partition = new_value;
   3414 			raidflush_component_label(raidPtr, column);
   3415 		}
   3416 	}
   3417 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3418 		sparecol = raidPtr->numCol + column;
   3419 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3420 			clabel = raidget_component_label(raidPtr, sparecol);
   3421 			clabel->root_partition = new_value;
   3422 			raidflush_component_label(raidPtr, sparecol);
   3423 		}
   3424 	}
   3425 	return(new_value);
   3426 }
   3427 
   3428 void
   3429 rf_release_all_vps(RF_ConfigSet_t *cset)
   3430 {
   3431 	RF_AutoConfig_t *ac;
   3432 
   3433 	ac = cset->ac;
   3434 	while(ac!=NULL) {
   3435 		/* Close the vp, and give it back */
   3436 		if (ac->vp) {
   3437 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3438 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3439 			vput(ac->vp);
   3440 			ac->vp = NULL;
   3441 		}
   3442 		ac = ac->next;
   3443 	}
   3444 }
   3445 
   3446 
   3447 void
   3448 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3449 {
   3450 	RF_AutoConfig_t *ac;
   3451 	RF_AutoConfig_t *next_ac;
   3452 
   3453 	ac = cset->ac;
   3454 	while(ac!=NULL) {
   3455 		next_ac = ac->next;
   3456 		/* nuke the label */
   3457 		free(ac->clabel, M_RAIDFRAME);
   3458 		/* cleanup the config structure */
   3459 		free(ac, M_RAIDFRAME);
   3460 		/* "next.." */
   3461 		ac = next_ac;
   3462 	}
   3463 	/* and, finally, nuke the config set */
   3464 	free(cset, M_RAIDFRAME);
   3465 }
   3466 
   3467 
/*
 * Populate a component label with the current geometry and state of
 * raidPtr.  Used when (re)writing labels for a configured set; the
 * row/column and mod_counter-sensitive fields are handled by the
 * callers.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3500 
/*
 * Configure one autodetected config set.  Chooses a raid unit
 * (preferring the unit the set was last configured as, falling
 * forward to the next free one), builds an RF_Config_t from the
 * component labels and runs rf_Configure().  On success the set's
 * rootable status is noted from the label and the softc is returned;
 * on failure NULL is returned.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* Walk forward past units that are already configured. */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3584 
/*
 * Initialize a RAIDframe pool at IPL_BIO, pre-priming it with xmin
 * items and setting the low/high water marks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3594 
   3595 /*
   3596  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3597  * to see if there is IO pending and if that IO could possibly be done
   3598  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3599  * otherwise.
   3600  *
   3601  */
   3602 int
   3603 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3604 {
   3605 	struct raid_softc *rs;
   3606 	struct dk_softc *dksc;
   3607 
   3608 	rs = raidPtr->softc;
   3609 	dksc = &rs->sc_dksc;
   3610 
   3611 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3612 		return 1;
   3613 
   3614 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3615 		/* there is work to do */
   3616 		return 0;
   3617 	}
   3618 	/* default is nothing to do */
   3619 	return 1;
   3620 }
   3621 
   3622 int
   3623 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3624 {
   3625 	uint64_t numsecs;
   3626 	unsigned secsize;
   3627 	int error;
   3628 
   3629 	error = getdisksize(vp, &numsecs, &secsize);
   3630 	if (error == 0) {
   3631 		diskPtr->blockSize = secsize;
   3632 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3633 		diskPtr->partitionSize = numsecs;
   3634 		return 0;
   3635 	}
   3636 	return error;
   3637 }
   3638 
/* Autoconf match function: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3644 
/*
 * Autoconf attach function: intentionally empty; the real setup
 * happens when a set is configured (see rf_auto_config_set()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3649 
   3650 
   3651 static int
   3652 raid_detach(device_t self, int flags)
   3653 {
   3654 	int error;
   3655 	struct raid_softc *rs = raidsoftc(self);
   3656 
   3657 	if (rs == NULL)
   3658 		return ENXIO;
   3659 
   3660 	if ((error = raidlock(rs)) != 0)
   3661 		return (error);
   3662 
   3663 	error = raid_detach_unlocked(rs);
   3664 
   3665 	raidunlock(rs);
   3666 
   3667 	/* XXX raid can be referenced here */
   3668 
   3669 	if (error)
   3670 		return error;
   3671 
   3672 	/* Free the softc */
   3673 	raidput(rs);
   3674 
   3675 	return 0;
   3676 }
   3677 
/*
 * Fill in the disk geometry of the RAID pseudo-disk from the
 * RAIDframe configuration and push it to the disk(9) layer.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count: 4 tracks per column */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3693 
   3694 /*
   3695  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3696  * We end up returning whatever error was returned by the first cache flush
   3697  * that fails.
   3698  */
   3699 
   3700 int
   3701 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3702 {
   3703 	int c, sparecol;
   3704 	int e,error;
   3705 	int force = 1;
   3706 
   3707 	error = 0;
   3708 	for (c = 0; c < raidPtr->numCol; c++) {
   3709 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3710 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3711 					  &force, FWRITE, NOCRED);
   3712 			if (e) {
   3713 				if (e != ENODEV)
   3714 					printf("raid%d: cache flush to component %s failed.\n",
   3715 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3716 				if (error == 0) {
   3717 					error = e;
   3718 				}
   3719 			}
   3720 		}
   3721 	}
   3722 
   3723 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3724 		sparecol = raidPtr->numCol + c;
   3725 		/* Need to ensure that the reconstruct actually completed! */
   3726 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3727 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3728 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3729 			if (e) {
   3730 				if (e != ENODEV)
   3731 					printf("raid%d: cache flush to component %s failed.\n",
   3732 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3733 				if (error == 0) {
   3734 					error = e;
   3735 				}
   3736 			}
   3737 		}
   3738 	}
   3739 	return error;
   3740 }
   3741 
   3742 /*
   3743  * Module interface
   3744  */
   3745 
/* Declare the raid module; it depends on the dk_subr module. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* When built as a module, declare the cfdriver here as well. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module control entry point and its init/fini helpers. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3755 
   3756 static int
   3757 raid_modcmd(modcmd_t cmd, void *data)
   3758 {
   3759 	int error;
   3760 
   3761 	error = 0;
   3762 	switch (cmd) {
   3763 	case MODULE_CMD_INIT:
   3764 		error = raid_modcmd_init();
   3765 		break;
   3766 	case MODULE_CMD_FINI:
   3767 		error = raid_modcmd_fini();
   3768 		break;
   3769 	default:
   3770 		error = ENOTTY;
   3771 		break;
   3772 	}
   3773 	return error;
   3774 }
   3775 
   3776 static int
   3777 raid_modcmd_init(void)
   3778 {
   3779 	int error;
   3780 #ifdef _MODULE
   3781 	int bmajor, cmajor;
   3782 #endif
   3783 
   3784 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3785 	mutex_enter(&raid_lock);
   3786 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3787 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3788 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3789 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3790 
   3791 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3792 #endif
   3793 
   3794 #ifdef _MODULE
   3795 	bmajor = cmajor = -1;
   3796 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3797 	    &raid_cdevsw, &cmajor);
   3798 	if (error != 0) {
   3799 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3800 		mutex_exit(&raid_lock);
   3801 		return error;
   3802 	}
   3803 	error = config_cfdriver_attach(&raid_cd);
   3804 	if (error != 0) {
   3805 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3806 		    __func__, error);
   3807 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3808 		mutex_exit(&raid_lock);
   3809 		return error;
   3810 	}
   3811 #endif
   3812 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3813 	if (error != 0) {
   3814 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3815 		    __func__, error);
   3816 #ifdef _MODULE
   3817 		config_cfdriver_detach(&raid_cd);
   3818 #endif
   3819 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3820 		mutex_exit(&raid_lock);
   3821 		return error;
   3822 	}
   3823 
   3824 	raidautoconfigdone = false;
   3825 
   3826 	mutex_exit(&raid_lock);
   3827 
   3828 	if (error == 0) {
   3829 		if (rf_BootRaidframe(true) == 0)
   3830 			aprint_verbose("Kernelized RAIDframe activated\n");
   3831 		else
   3832 			panic("Serious error activating RAID!!");
   3833 	}
   3834 
   3835 	/*
   3836 	 * Register a finalizer which will be used to auto-config RAID
   3837 	 * sets once all real hardware devices have been found.
   3838 	 */
   3839 	error = config_finalize_register(NULL, rf_autoconfig);
   3840 	if (error != 0) {
   3841 		aprint_error("WARNING: unable to register RAIDframe "
   3842 		    "finalizer\n");
   3843 		error = 0;
   3844 	}
   3845 
   3846 	return error;
   3847 }
   3848 
/*
 * Module unload: refuse while any raid device exists, then detach the
 * autoconfiguration glue and devsw in the reverse order of attachment.
 * If a later detach step fails, the earlier steps are re-attached so the
 * driver is left in a consistent, usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detach steps, in order. */
		config_cfdriver_attach(&raid_cd);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	/* All glue gone: shut down the RAIDframe core and free globals. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3896