Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.349.4.3
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.349.4.3 2017/05/17 01:44:18 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.349.4.3 2017/05/17 01:44:18 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
/*
 * Debug output helpers.
 *
 * db1_printf((fmt, ...)) takes a parenthesised printf argument list.  With
 * DEBUG it prints only while rf_kdebug_level is greater than zero; without
 * DEBUG it expands to an empty statement.  Both forms are wrapped in
 * do { } while (0) so the macro behaves as exactly one statement: it can be
 * followed by `else' and cannot capture an `else' that belongs to the
 * caller's own `if'.
 *
 * DPRINTF(fmt, ...) prints unconditionally when DEBUG_ROOT is defined and
 * expands to nothing otherwise.  At least one argument after the format is
 * required.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) \
	do { if (rf_kdebug_level > 0) printf a; } while (/*CONSTCOND*/ 0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (/*CONSTCOND*/ 0)
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
    169 
    170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    171 static rf_declare_mutex2(rf_sparet_wait_mutex);
    172 static rf_declare_cond2(rf_sparet_wait_cv);
    173 static rf_declare_cond2(rf_sparet_resp_cv);
    174 
    175 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    176 						 * spare table */
    177 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    178 						 * installation process */
    179 #endif
    180 
    181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    182 
    183 /* prototypes */
    184 static void KernelWakeupFunc(struct buf *);
    185 static void InitBP(struct buf *, struct vnode *, unsigned,
    186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    187     void *, int, struct proc *);
    188 struct raid_softc;
    189 static void raidinit(struct raid_softc *);
    190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
    220 const struct bdevsw raid_bdevsw = {
    221 	DEVSW_MODULE_INIT
    222 	.d_open = raidopen,
    223 	.d_close = raidclose,
    224 	.d_strategy = raidstrategy,
    225 	.d_ioctl = raidioctl,
    226 	.d_dump = raiddump,
    227 	.d_psize = raidsize,
    228 	.d_discard = nodiscard,
    229 	.d_flag = D_DISK
    230 };
    231 
    232 const struct cdevsw raid_cdevsw = {
    233 	DEVSW_MODULE_INIT
    234 	.d_open = raidopen,
    235 	.d_close = raidclose,
    236 	.d_read = raidread,
    237 	.d_write = raidwrite,
    238 	.d_ioctl = raidioctl,
    239 	.d_stop = nostop,
    240 	.d_tty = notty,
    241 	.d_poll = nopoll,
    242 	.d_mmap = nommap,
    243 	.d_kqfilter = nokqfilter,
    244 	.d_discard = nodiscard,
    245 	.d_flag = D_DISK
    246 };
    247 
    248 static struct dkdriver rf_dkdriver = {
    249 	.d_open = raidopen,
    250 	.d_close = raidclose,
    251 	.d_strategy = raidstrategy,
    252 	.d_diskstart = raid_diskstart,
    253 	.d_dumpblocks = raid_dumpblocks,
    254 	.d_lastclose = raid_lastclose,
    255 	.d_minphys = minphys
    256 };
    257 
    258 struct raid_softc {
    259 	struct dk_softc sc_dksc;
    260 	int	sc_unit;
    261 	int     sc_flags;	/* flags */
    262 	int     sc_cflags;	/* configuration flags */
    263 	kmutex_t sc_mutex;	/* interlock mutex */
    264 	kcondvar_t sc_cv;	/* and the condvar */
    265 	uint64_t sc_size;	/* size of the raid device */
    266 	char    sc_xname[20];	/* XXX external name */
    267 	RF_Raid_t sc_r;
    268 	LIST_ENTRY(raid_softc) sc_link;
    269 };
/* Bits kept in raid_softc.sc_flags. */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* Unit number encoded in a dev_t. */
#define	raidunit(x)	DISKUNIT(x)
/* softc pointer stored in the RF_Raid_t of a device_t's private data. */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    280 
    281 extern struct cfdriver raid_cd;
    282 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    283     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    284     DVF_DETACH_SHUTDOWN);
    285 
    286 /*
    287  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    288  * Be aware that large numbers can allow the driver to consume a lot of
    289  * kernel memory, especially on writes, and in degraded mode reads.
    290  *
    291  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    292  * a single 64K write will typically require 64K for the old data,
    293  * 64K for the old parity, and 64K for the new parity, for a total
    294  * of 192K (if the parity buffer is not re-used immediately).
    295  * Even if it is used immediately, that's still 128K, which when multiplied
    296  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    297  *
    298  * Now in degraded mode, for example, a 64K read on the above setup may
    299  * require data reconstruction, which will require *all* of the 4 remaining
    300  * disks to participate -- 4 * 32K/disk == 128K again.
    301  */
    302 
    303 #ifndef RAIDOUTSTANDING
    304 #define RAIDOUTSTANDING   6
    305 #endif
    306 
    307 #define RAIDLABELDEV(dev)	\
    308 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    309 
    310 /* declared here, and made public, for the benefit of KVM stuff.. */
    311 
    312 static int raidlock(struct raid_softc *);
    313 static void raidunlock(struct raid_softc *);
    314 
    315 static int raid_detach_unlocked(struct raid_softc *);
    316 
    317 static void rf_markalldirty(RF_Raid_t *);
    318 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    319 
    320 void rf_ReconThread(struct rf_recon_req *);
    321 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    322 void rf_CopybackThread(RF_Raid_t *raidPtr);
    323 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    324 int rf_autoconfig(device_t);
    325 void rf_buildroothack(RF_ConfigSet_t *);
    326 
    327 RF_AutoConfig_t *rf_find_raid_components(void);
    328 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    329 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    330 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    331 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    332 int rf_set_autoconfig(RF_Raid_t *, int);
    333 int rf_set_rootpartition(RF_Raid_t *, int);
    334 void rf_release_all_vps(RF_ConfigSet_t *);
    335 void rf_cleanup_config_set(RF_ConfigSet_t *);
    336 int rf_have_enough_components(RF_ConfigSet_t *);
    337 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    338 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    339 
    340 /*
    341  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    342  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    343  * in the kernel config file.
    344  */
    345 #ifdef RAID_AUTOCONFIG
    346 int raidautoconfig = 1;
    347 #else
    348 int raidautoconfig = 0;
    349 #endif
    350 static bool raidautoconfigdone = false;
    351 
    352 struct RF_Pools_s rf_pools;
    353 
    354 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    355 static kmutex_t raid_lock;
    356 
    357 static struct raid_softc *
    358 raidcreate(int unit) {
    359 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    360 	if (sc == NULL) {
    361 #ifdef DIAGNOSTIC
    362 		printf("%s: out of memory\n", __func__);
    363 #endif
    364 		return NULL;
    365 	}
    366 	sc->sc_unit = unit;
    367 	cv_init(&sc->sc_cv, "raidunit");
    368 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    369 	return sc;
    370 }
    371 
    372 static void
    373 raiddestroy(struct raid_softc *sc) {
    374 	cv_destroy(&sc->sc_cv);
    375 	mutex_destroy(&sc->sc_mutex);
    376 	kmem_free(sc, sizeof(*sc));
    377 }
    378 
    379 static struct raid_softc *
    380 raidget(int unit, bool create) {
    381 	struct raid_softc *sc;
    382 	if (unit < 0) {
    383 #ifdef DIAGNOSTIC
    384 		panic("%s: unit %d!", __func__, unit);
    385 #endif
    386 		return NULL;
    387 	}
    388 	mutex_enter(&raid_lock);
    389 	LIST_FOREACH(sc, &raids, sc_link) {
    390 		if (sc->sc_unit == unit) {
    391 			mutex_exit(&raid_lock);
    392 			return sc;
    393 		}
    394 	}
    395 	mutex_exit(&raid_lock);
    396 	if (!create)
    397 		return NULL;
    398 	if ((sc = raidcreate(unit)) == NULL)
    399 		return NULL;
    400 	mutex_enter(&raid_lock);
    401 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    402 	mutex_exit(&raid_lock);
    403 	return sc;
    404 }
    405 
    406 static void
    407 raidput(struct raid_softc *sc) {
    408 	mutex_enter(&raid_lock);
    409 	LIST_REMOVE(sc, sc_link);
    410 	mutex_exit(&raid_lock);
    411 	raiddestroy(sc);
    412 }
    413 
/*
 * raidattach --
 *	Pseudo-device attach hook.  Intentionally empty: attachment and the
 *	associated initialization are performed during module
 *	initialization instead.  `num' is ignored.
 */
void
raidattach(int num)
{
	/* Nothing to do here. */
}
    423 
    424 int
    425 rf_autoconfig(device_t self)
    426 {
    427 	RF_AutoConfig_t *ac_list;
    428 	RF_ConfigSet_t *config_sets;
    429 
    430 	if (!raidautoconfig || raidautoconfigdone == true)
    431 		return (0);
    432 
    433 	/* XXX This code can only be run once. */
    434 	raidautoconfigdone = true;
    435 
    436 #ifdef __HAVE_CPU_BOOTCONF
    437 	/*
    438 	 * 0. find the boot device if needed first so we can use it later
    439 	 * this needs to be done before we autoconfigure any raid sets,
    440 	 * because if we use wedges we are not going to be able to open
    441 	 * the boot device later
    442 	 */
    443 	if (booted_device == NULL)
    444 		cpu_bootconf();
    445 #endif
    446 	/* 1. locate all RAID components on the system */
    447 	aprint_debug("Searching for RAID components...\n");
    448 	ac_list = rf_find_raid_components();
    449 
    450 	/* 2. Sort them into their respective sets. */
    451 	config_sets = rf_create_auto_sets(ac_list);
    452 
    453 	/*
    454 	 * 3. Evaluate each set and configure the valid ones.
    455 	 * This gets done in rf_buildroothack().
    456 	 */
    457 	rf_buildroothack(config_sets);
    458 
    459 	return 1;
    460 }
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
    486 void
    487 rf_buildroothack(RF_ConfigSet_t *config_sets)
    488 {
    489 	RF_ConfigSet_t *cset;
    490 	RF_ConfigSet_t *next_cset;
    491 	int num_root;
    492 	struct raid_softc *sc, *rsc;
    493 	struct dk_softc *dksc;
    494 
    495 	sc = rsc = NULL;
    496 	num_root = 0;
    497 	cset = config_sets;
    498 	while (cset != NULL) {
    499 		next_cset = cset->next;
    500 		if (rf_have_enough_components(cset) &&
    501 		    cset->ac->clabel->autoconfigure == 1) {
    502 			sc = rf_auto_config_set(cset);
    503 			if (sc != NULL) {
    504 				aprint_debug("raid%d: configured ok\n",
    505 				    sc->sc_unit);
    506 				if (cset->rootable) {
    507 					rsc = sc;
    508 					num_root++;
    509 				}
    510 			} else {
    511 				/* The autoconfig didn't work :( */
    512 				aprint_debug("Autoconfig failed\n");
    513 				rf_release_all_vps(cset);
    514 			}
    515 		} else {
    516 			/* we're not autoconfiguring this set...
    517 			   release the associated resources */
    518 			rf_release_all_vps(cset);
    519 		}
    520 		/* cleanup */
    521 		rf_cleanup_config_set(cset);
    522 		cset = next_cset;
    523 	}
    524 	dksc = &rsc->sc_dksc;
    525 
    526 	/* if the user has specified what the root device should be
    527 	   then we don't touch booted_device or boothowto... */
    528 
    529 	if (rootspec != NULL)
    530 		return;
    531 
    532 	/* we found something bootable... */
    533 
    534 	/*
    535 	 * XXX: The following code assumes that the root raid
    536 	 * is the first ('a') partition. This is about the best
    537 	 * we can do with a BSD disklabel, but we might be able
    538 	 * to do better with a GPT label, by setting a specified
    539 	 * attribute to indicate the root partition. We can then
    540 	 * stash the partition number in the r->root_partition
    541 	 * high bits (the bottom 2 bits are already used). For
    542 	 * now we just set booted_partition to 0 when we override
    543 	 * root.
    544 	 */
    545 	if (num_root == 1) {
    546 		device_t candidate_root;
    547 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    548 			char cname[sizeof(cset->ac->devname)];
    549 			/* XXX: assume partition 'a' first */
    550 			snprintf(cname, sizeof(cname), "%s%c",
    551 			    device_xname(dksc->sc_dev), 'a');
    552 			candidate_root = dkwedge_find_by_wname(cname);
    553 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    554 			    cname);
    555 			if (candidate_root == NULL) {
    556 				/*
    557 				 * If that is not found, because we don't use
    558 				 * disklabel, return the first dk child
    559 				 * XXX: we can skip the 'a' check above
    560 				 * and always do this...
    561 				 */
    562 				size_t i = 0;
    563 				candidate_root = dkwedge_find_by_parent(
    564 				    device_xname(dksc->sc_dev), &i);
    565 			}
    566 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    567 			    candidate_root);
    568 		} else
    569 			candidate_root = dksc->sc_dev;
    570 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    571 		DPRINTF("%s: booted_device=%p root_partition=%d "
    572 		   "contains_boot=%d\n", __func__, booted_device,
    573 		   rsc->sc_r.root_partition,
    574 		   rf_containsboot(&rsc->sc_r, booted_device));
    575 		if (booted_device == NULL ||
    576 		    rsc->sc_r.root_partition == 1 ||
    577 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    578 			booted_device = candidate_root;
    579 			booted_partition = 0;	/* XXX assume 'a' */
    580 		}
    581 	} else if (num_root > 1) {
    582 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    583 		    booted_device);
    584 
    585 		/*
    586 		 * Maybe the MD code can help. If it cannot, then
    587 		 * setroot() will discover that we have no
    588 		 * booted_device and will ask the user if nothing was
    589 		 * hardwired in the kernel config file
    590 		 */
    591 		if (booted_device == NULL)
    592 			return;
    593 
    594 		num_root = 0;
    595 		mutex_enter(&raid_lock);
    596 		LIST_FOREACH(sc, &raids, sc_link) {
    597 			RF_Raid_t *r = &sc->sc_r;
    598 			if (r->valid == 0)
    599 				continue;
    600 
    601 			if (r->root_partition == 0)
    602 				continue;
    603 
    604 			if (rf_containsboot(r, booted_device)) {
    605 				num_root++;
    606 				rsc = sc;
    607 				dksc = &rsc->sc_dksc;
    608 			}
    609 		}
    610 		mutex_exit(&raid_lock);
    611 
    612 		if (num_root == 1) {
    613 			booted_device = dksc->sc_dev;
    614 			booted_partition = 0;	/* XXX assume 'a' */
    615 		} else {
    616 			/* we can't guess.. require the user to answer... */
    617 			boothowto |= RB_ASKNAME;
    618 		}
    619 	}
    620 }
    621 
    622 static int
    623 raidsize(dev_t dev)
    624 {
    625 	struct raid_softc *rs;
    626 	struct dk_softc *dksc;
    627 	unsigned int unit;
    628 
    629 	unit = raidunit(dev);
    630 	if ((rs = raidget(unit, false)) == NULL)
    631 		return -1;
    632 	dksc = &rs->sc_dksc;
    633 
    634 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    635 		return -1;
    636 
    637 	return dk_size(dksc, dev);
    638 }
    639 
    640 static int
    641 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    642 {
    643 	unsigned int unit;
    644 	struct raid_softc *rs;
    645 	struct dk_softc *dksc;
    646 
    647 	unit = raidunit(dev);
    648 	if ((rs = raidget(unit, false)) == NULL)
    649 		return ENXIO;
    650 	dksc = &rs->sc_dksc;
    651 
    652 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    653 		return ENODEV;
    654 
    655         /*
    656            Note that blkno is relative to this particular partition.
    657            By adding adding RF_PROTECTED_SECTORS, we get a value that
    658 	   is relative to the partition used for the underlying component.
    659         */
    660 	blkno += RF_PROTECTED_SECTORS;
    661 
    662 	return dk_dump(dksc, dev, blkno, va, size);
    663 }
    664 
    665 static int
    666 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    667 {
    668 	struct raid_softc *rs = raidsoftc(dev);
    669 	const struct bdevsw *bdev;
    670 	RF_Raid_t *raidPtr;
    671 	int     c, sparecol, j, scol, dumpto;
    672 	int     error = 0;
    673 
    674 	raidPtr = &rs->sc_r;
    675 
    676 	/* we only support dumping to RAID 1 sets */
    677 	if (raidPtr->Layout.numDataCol != 1 ||
    678 	    raidPtr->Layout.numParityCol != 1)
    679 		return EINVAL;
    680 
    681 	if ((error = raidlock(rs)) != 0)
    682 		return error;
    683 
    684 	/* figure out what device is alive.. */
    685 
    686 	/*
    687 	   Look for a component to dump to.  The preference for the
    688 	   component to dump to is as follows:
    689 	   1) the master
    690 	   2) a used_spare of the master
    691 	   3) the slave
    692 	   4) a used_spare of the slave
    693 	*/
    694 
    695 	dumpto = -1;
    696 	for (c = 0; c < raidPtr->numCol; c++) {
    697 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    698 			/* this might be the one */
    699 			dumpto = c;
    700 			break;
    701 		}
    702 	}
    703 
    704 	/*
    705 	   At this point we have possibly selected a live master or a
    706 	   live slave.  We now check to see if there is a spared
    707 	   master (or a spared slave), if we didn't find a live master
    708 	   or a live slave.
    709 	*/
    710 
    711 	for (c = 0; c < raidPtr->numSpare; c++) {
    712 		sparecol = raidPtr->numCol + c;
    713 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    714 			/* How about this one? */
    715 			scol = -1;
    716 			for(j=0;j<raidPtr->numCol;j++) {
    717 				if (raidPtr->Disks[j].spareCol == sparecol) {
    718 					scol = j;
    719 					break;
    720 				}
    721 			}
    722 			if (scol == 0) {
    723 				/*
    724 				   We must have found a spared master!
    725 				   We'll take that over anything else
    726 				   found so far.  (We couldn't have
    727 				   found a real master before, since
    728 				   this is a used spare, and it's
    729 				   saying that it's replacing the
    730 				   master.)  On reboot (with
    731 				   autoconfiguration turned on)
    732 				   sparecol will become the 1st
    733 				   component (component0) of this set.
    734 				*/
    735 				dumpto = sparecol;
    736 				break;
    737 			} else if (scol != -1) {
    738 				/*
    739 				   Must be a spared slave.  We'll dump
    740 				   to that if we havn't found anything
    741 				   else so far.
    742 				*/
    743 				if (dumpto == -1)
    744 					dumpto = sparecol;
    745 			}
    746 		}
    747 	}
    748 
    749 	if (dumpto == -1) {
    750 		/* we couldn't find any live components to dump to!?!?
    751 		 */
    752 		error = EINVAL;
    753 		goto out;
    754 	}
    755 
    756 	bdev = bdevsw_lookup_acquire(raidPtr->Disks[dumpto].dev);
    757 	if (bdev == NULL) {
    758 		error = ENXIO;
    759 		goto out;
    760 	}
    761 
    762 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    763 				blkno, va, nblk * raidPtr->bytesPerSector);
    764 	bdevsw_release(bdev);
    765 out:
    766 	raidunlock(rs);
    767 
    768 	return error;
    769 }
    770 
    771 /* ARGSUSED */
    772 static int
    773 raidopen(dev_t dev, int flags, int fmt,
    774     struct lwp *l)
    775 {
    776 	int     unit = raidunit(dev);
    777 	struct raid_softc *rs;
    778 	struct dk_softc *dksc;
    779 	int     error = 0;
    780 	int     part, pmask;
    781 
    782 	if ((rs = raidget(unit, true)) == NULL)
    783 		return ENXIO;
    784 	if ((error = raidlock(rs)) != 0)
    785 		return (error);
    786 
    787 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    788 		error = EBUSY;
    789 		goto bad;
    790 	}
    791 
    792 	dksc = &rs->sc_dksc;
    793 
    794 	part = DISKPART(dev);
    795 	pmask = (1 << part);
    796 
    797 	if (!DK_BUSY(dksc, pmask) &&
    798 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    799 		/* First one... mark things as dirty... Note that we *MUST*
    800 		 have done a configure before this.  I DO NOT WANT TO BE
    801 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    802 		 THAT THEY BELONG TOGETHER!!!!! */
    803 		/* XXX should check to see if we're only open for reading
    804 		   here... If so, we needn't do this, but then need some
    805 		   other way of keeping track of what's happened.. */
    806 
    807 		rf_markalldirty(&rs->sc_r);
    808 	}
    809 
    810 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    811 		error = dk_open(dksc, dev, flags, fmt, l);
    812 
    813 bad:
    814 	raidunlock(rs);
    815 
    816 	return (error);
    817 
    818 
    819 }
    820 
    821 static int
    822 raid_lastclose(device_t self)
    823 {
    824 	struct raid_softc *rs = raidsoftc(self);
    825 
    826 	/* Last one... device is not unconfigured yet.
    827 	   Device shutdown has taken care of setting the
    828 	   clean bits if RAIDF_INITED is not set
    829 	   mark things as clean... */
    830 
    831 	rf_update_component_labels(&rs->sc_r,
    832 	    RF_FINAL_COMPONENT_UPDATE);
    833 
    834 	/* pass to unlocked code */
    835 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    836 		rs->sc_flags |= RAIDF_DETACH;
    837 
    838 	return 0;
    839 }
    840 
    841 /* ARGSUSED */
    842 static int
    843 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    844 {
    845 	int     unit = raidunit(dev);
    846 	struct raid_softc *rs;
    847 	struct dk_softc *dksc;
    848 	cfdata_t cf;
    849 	int     error = 0, do_detach = 0, do_put = 0;
    850 
    851 	if ((rs = raidget(unit, false)) == NULL)
    852 		return ENXIO;
    853 	dksc = &rs->sc_dksc;
    854 
    855 	if ((error = raidlock(rs)) != 0)
    856 		return (error);
    857 
    858 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    859 		error = dk_close(dksc, dev, flags, fmt, l);
    860 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    861 			do_detach = 1;
    862 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    863 		do_put = 1;
    864 
    865 	raidunlock(rs);
    866 
    867 	if (do_detach) {
    868 		/* free the pseudo device attach bits */
    869 		cf = device_cfdata(dksc->sc_dev);
    870 		error = config_detach(dksc->sc_dev, 0);
    871 		if (error == 0)
    872 			free(cf, M_RAIDFRAME);
    873 	} else if (do_put) {
    874 		raidput(rs);
    875 	}
    876 
    877 	return (error);
    878 
    879 }
    880 
    881 static void
    882 raid_wakeup(RF_Raid_t *raidPtr)
    883 {
    884 	rf_lock_mutex2(raidPtr->iodone_lock);
    885 	rf_signal_cond2(raidPtr->iodone_cv);
    886 	rf_unlock_mutex2(raidPtr->iodone_lock);
    887 }
    888 
    889 static void
    890 raidstrategy(struct buf *bp)
    891 {
    892 	unsigned int unit;
    893 	struct raid_softc *rs;
    894 	struct dk_softc *dksc;
    895 	RF_Raid_t *raidPtr;
    896 
    897 	unit = raidunit(bp->b_dev);
    898 	if ((rs = raidget(unit, false)) == NULL) {
    899 		bp->b_error = ENXIO;
    900 		goto fail;
    901 	}
    902 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	dksc = &rs->sc_dksc;
    907 	raidPtr = &rs->sc_r;
    908 
    909 	/* Queue IO only */
    910 	if (dk_strategy_defer(dksc, bp))
    911 		goto done;
    912 
    913 	/* schedule the IO to happen at the next convenient time */
    914 	raid_wakeup(raidPtr);
    915 
    916 done:
    917 	return;
    918 
    919 fail:
    920 	bp->b_resid = bp->b_bcount;
    921 	biodone(bp);
    922 }
    923 
    924 static int
    925 raid_diskstart(device_t dev, struct buf *bp)
    926 {
    927 	struct raid_softc *rs = raidsoftc(dev);
    928 	RF_Raid_t *raidPtr;
    929 
    930 	raidPtr = &rs->sc_r;
    931 	if (!raidPtr->valid) {
    932 		db1_printf(("raid is not valid..\n"));
    933 		return ENODEV;
    934 	}
    935 
    936 	/* XXX */
    937 	bp->b_resid = 0;
    938 
    939 	return raiddoaccess(raidPtr, bp);
    940 }
    941 
    942 void
    943 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    944 {
    945 	struct raid_softc *rs;
    946 	struct dk_softc *dksc;
    947 
    948 	rs = raidPtr->softc;
    949 	dksc = &rs->sc_dksc;
    950 
    951 	dk_done(dksc, bp);
    952 
    953 	rf_lock_mutex2(raidPtr->mutex);
    954 	raidPtr->openings++;
    955 	rf_unlock_mutex2(raidPtr->mutex);
    956 
    957 	/* schedule more IO */
    958 	raid_wakeup(raidPtr);
    959 }
    960 
    961 /* ARGSUSED */
    962 static int
    963 raidread(dev_t dev, struct uio *uio, int flags)
    964 {
    965 	int     unit = raidunit(dev);
    966 	struct raid_softc *rs;
    967 
    968 	if ((rs = raidget(unit, false)) == NULL)
    969 		return ENXIO;
    970 
    971 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    972 		return (ENXIO);
    973 
    974 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    975 
    976 }
    977 
    978 /* ARGSUSED */
    979 static int
    980 raidwrite(dev_t dev, struct uio *uio, int flags)
    981 {
    982 	int     unit = raidunit(dev);
    983 	struct raid_softc *rs;
    984 
    985 	if ((rs = raidget(unit, false)) == NULL)
    986 		return ENXIO;
    987 
    988 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    989 		return (ENXIO);
    990 
    991 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    992 
    993 }
    994 
    995 static int
    996 raid_detach_unlocked(struct raid_softc *rs)
    997 {
    998 	struct dk_softc *dksc = &rs->sc_dksc;
    999 	RF_Raid_t *raidPtr;
   1000 	int error;
   1001 
   1002 	raidPtr = &rs->sc_r;
   1003 
   1004 	if (DK_BUSY(dksc, 0) ||
   1005 	    raidPtr->recon_in_progress != 0 ||
   1006 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1007 	    raidPtr->copyback_in_progress != 0)
   1008 		return EBUSY;
   1009 
   1010 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1011 		return 0;
   1012 
   1013 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1014 
   1015 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1016 		return error;
   1017 
   1018 	rs->sc_flags &= ~RAIDF_INITED;
   1019 
   1020 	/* Kill off any queued buffers */
   1021 	dk_drain(dksc);
   1022 	bufq_free(dksc->sc_bufq);
   1023 
   1024 	/* Detach the disk. */
   1025 	dkwedge_delall(&dksc->sc_dkdev);
   1026 	disk_detach(&dksc->sc_dkdev);
   1027 	disk_destroy(&dksc->sc_dkdev);
   1028 	dk_detach(dksc);
   1029 
   1030 	return 0;
   1031 }
   1032 
   1033 static int
   1034 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1035 {
   1036 	int     unit = raidunit(dev);
   1037 	int     error = 0;
   1038 	int     part, pmask;
   1039 	struct raid_softc *rs;
   1040 	struct dk_softc *dksc;
   1041 	RF_Config_t *k_cfg, *u_cfg;
   1042 	RF_Raid_t *raidPtr;
   1043 	RF_RaidDisk_t *diskPtr;
   1044 	RF_AccTotals_t *totals;
   1045 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1046 	u_char *specific_buf;
   1047 	int retcode = 0;
   1048 	int column;
   1049 /*	int raidid; */
   1050 	struct rf_recon_req *rrcopy, *rr;
   1051 	RF_ComponentLabel_t *clabel;
   1052 	RF_ComponentLabel_t *ci_label;
   1053 	RF_ComponentLabel_t **clabel_ptr;
   1054 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1055 	RF_SingleComponent_t component;
   1056 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1057 	int i, j, d;
   1058 
   1059 	if ((rs = raidget(unit, false)) == NULL)
   1060 		return ENXIO;
   1061 	dksc = &rs->sc_dksc;
   1062 	raidPtr = &rs->sc_r;
   1063 
   1064 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1065 		(int) DISKPART(dev), (int) unit, cmd));
   1066 
   1067 	/* Must be initialized for these... */
   1068 	switch (cmd) {
   1069 	case RAIDFRAME_REWRITEPARITY:
   1070 	case RAIDFRAME_GET_INFO:
   1071 	case RAIDFRAME_RESET_ACCTOTALS:
   1072 	case RAIDFRAME_GET_ACCTOTALS:
   1073 	case RAIDFRAME_KEEP_ACCTOTALS:
   1074 	case RAIDFRAME_GET_SIZE:
   1075 	case RAIDFRAME_FAIL_DISK:
   1076 	case RAIDFRAME_COPYBACK:
   1077 	case RAIDFRAME_CHECK_RECON_STATUS:
   1078 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1079 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1080 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1081 	case RAIDFRAME_ADD_HOT_SPARE:
   1082 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1083 	case RAIDFRAME_INIT_LABELS:
   1084 	case RAIDFRAME_REBUILD_IN_PLACE:
   1085 	case RAIDFRAME_CHECK_PARITY:
   1086 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1087 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1088 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1089 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1090 	case RAIDFRAME_SET_AUTOCONFIG:
   1091 	case RAIDFRAME_SET_ROOT:
   1092 	case RAIDFRAME_DELETE_COMPONENT:
   1093 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1094 	case RAIDFRAME_PARITYMAP_STATUS:
   1095 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1096 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1097 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1098 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1099 			return (ENXIO);
   1100 	}
   1101 
   1102 	switch (cmd) {
   1103 #ifdef COMPAT_50
   1104 	case RAIDFRAME_GET_INFO50:
   1105 		return rf_get_info50(raidPtr, data);
   1106 
   1107 	case RAIDFRAME_CONFIGURE50:
   1108 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1109 			return retcode;
   1110 		goto config;
   1111 #endif
   1112 		/* configure the system */
   1113 	case RAIDFRAME_CONFIGURE:
   1114 
   1115 		if (raidPtr->valid) {
   1116 			/* There is a valid RAID set running on this unit! */
   1117 			printf("raid%d: Device already configured!\n",unit);
   1118 			return(EINVAL);
   1119 		}
   1120 
   1121 		/* copy-in the configuration information */
   1122 		/* data points to a pointer to the configuration structure */
   1123 
   1124 		u_cfg = *((RF_Config_t **) data);
   1125 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1126 		if (k_cfg == NULL) {
   1127 			return (ENOMEM);
   1128 		}
   1129 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1130 		if (retcode) {
   1131 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1132 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1133 				retcode));
   1134 			goto no_config;
   1135 		}
   1136 		goto config;
   1137 	config:
   1138 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1139 
   1140 		/* allocate a buffer for the layout-specific data, and copy it
   1141 		 * in */
   1142 		if (k_cfg->layoutSpecificSize) {
   1143 			if (k_cfg->layoutSpecificSize > 10000) {
   1144 				/* sanity check */
   1145 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1146 				retcode = EINVAL;
   1147 				goto no_config;
   1148 			}
   1149 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1150 			    (u_char *));
   1151 			if (specific_buf == NULL) {
   1152 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1153 				retcode = ENOMEM;
   1154 				goto no_config;
   1155 			}
   1156 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1157 			    k_cfg->layoutSpecificSize);
   1158 			if (retcode) {
   1159 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1160 				RF_Free(specific_buf,
   1161 					k_cfg->layoutSpecificSize);
   1162 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1163 					retcode));
   1164 				goto no_config;
   1165 			}
   1166 		} else
   1167 			specific_buf = NULL;
   1168 		k_cfg->layoutSpecific = specific_buf;
   1169 
   1170 		/* should do some kind of sanity check on the configuration.
   1171 		 * Store the sum of all the bytes in the last byte? */
   1172 
   1173 		/* configure the system */
   1174 
   1175 		/*
   1176 		 * Clear the entire RAID descriptor, just to make sure
   1177 		 *  there is no stale data left in the case of a
   1178 		 *  reconfiguration
   1179 		 */
   1180 		memset(raidPtr, 0, sizeof(*raidPtr));
   1181 		raidPtr->softc = rs;
   1182 		raidPtr->raidid = unit;
   1183 
   1184 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1185 
   1186 		if (retcode == 0) {
   1187 
   1188 			/* allow this many simultaneous IO's to
   1189 			   this RAID device */
   1190 			raidPtr->openings = RAIDOUTSTANDING;
   1191 
   1192 			raidinit(rs);
   1193 			raid_wakeup(raidPtr);
   1194 			rf_markalldirty(raidPtr);
   1195 		}
   1196 		/* free the buffers.  No return code here. */
   1197 		if (k_cfg->layoutSpecificSize) {
   1198 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1199 		}
   1200 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1201 
   1202 	no_config:
   1203 		/*
   1204 		 * If configuration failed, set sc_flags so that we
   1205 		 * will detach the device when we close it.
   1206 		 */
   1207 		if (retcode != 0)
   1208 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1209 		return (retcode);
   1210 
   1211 		/* shutdown the system */
   1212 	case RAIDFRAME_SHUTDOWN:
   1213 
   1214 		part = DISKPART(dev);
   1215 		pmask = (1 << part);
   1216 
   1217 		if ((error = raidlock(rs)) != 0)
   1218 			return (error);
   1219 
   1220 		if (DK_BUSY(dksc, pmask) ||
   1221 		    raidPtr->recon_in_progress != 0 ||
   1222 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1223 		    raidPtr->copyback_in_progress != 0)
   1224 			retcode = EBUSY;
   1225 		else {
   1226 			/* detach and free on close */
   1227 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1228 			retcode = 0;
   1229 		}
   1230 
   1231 		raidunlock(rs);
   1232 
   1233 		return (retcode);
   1234 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1235 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1236 		/* need to read the component label for the disk indicated
   1237 		   by row,column in clabel */
   1238 
   1239 		/*
   1240 		 * Perhaps there should be an option to skip the in-core
   1241 		 * copy and hit the disk, as with disklabel(8).
   1242 		 */
   1243 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1244 
   1245 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1246 
   1247 		if (retcode) {
   1248 			RF_Free(clabel, sizeof(*clabel));
   1249 			return retcode;
   1250 		}
   1251 
   1252 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1253 
   1254 		column = clabel->column;
   1255 
   1256 		if ((column < 0) || (column >= raidPtr->numCol +
   1257 		    raidPtr->numSpare)) {
   1258 			RF_Free(clabel, sizeof(*clabel));
   1259 			return EINVAL;
   1260 		}
   1261 
   1262 		RF_Free(clabel, sizeof(*clabel));
   1263 
   1264 		clabel = raidget_component_label(raidPtr, column);
   1265 
   1266 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1267 
   1268 #if 0
   1269 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1270 		clabel = (RF_ComponentLabel_t *) data;
   1271 
   1272 		/* XXX check the label for valid stuff... */
   1273 		/* Note that some things *should not* get modified --
   1274 		   the user should be re-initing the labels instead of
   1275 		   trying to patch things.
   1276 		   */
   1277 
   1278 		raidid = raidPtr->raidid;
   1279 #ifdef DEBUG
   1280 		printf("raid%d: Got component label:\n", raidid);
   1281 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1282 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1283 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1284 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1285 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1286 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1287 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1288 #endif
   1289 		clabel->row = 0;
   1290 		column = clabel->column;
   1291 
   1292 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1293 			return(EINVAL);
   1294 		}
   1295 
   1296 		/* XXX this isn't allowed to do anything for now :-) */
   1297 
   1298 		/* XXX and before it is, we need to fill in the rest
   1299 		   of the fields!?!?!?! */
   1300 		memcpy(raidget_component_label(raidPtr, column),
   1301 		    clabel, sizeof(*clabel));
   1302 		raidflush_component_label(raidPtr, column);
   1303 		return (0);
   1304 #endif
   1305 
   1306 	case RAIDFRAME_INIT_LABELS:
   1307 		clabel = (RF_ComponentLabel_t *) data;
   1308 		/*
   1309 		   we only want the serial number from
   1310 		   the above.  We get all the rest of the information
   1311 		   from the config that was used to create this RAID
   1312 		   set.
   1313 		   */
   1314 
   1315 		raidPtr->serial_number = clabel->serial_number;
   1316 
   1317 		for(column=0;column<raidPtr->numCol;column++) {
   1318 			diskPtr = &raidPtr->Disks[column];
   1319 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1320 				ci_label = raidget_component_label(raidPtr,
   1321 				    column);
   1322 				/* Zeroing this is important. */
   1323 				memset(ci_label, 0, sizeof(*ci_label));
   1324 				raid_init_component_label(raidPtr, ci_label);
   1325 				ci_label->serial_number =
   1326 				    raidPtr->serial_number;
   1327 				ci_label->row = 0; /* we dont' pretend to support more */
   1328 				rf_component_label_set_partitionsize(ci_label,
   1329 				    diskPtr->partitionSize);
   1330 				ci_label->column = column;
   1331 				raidflush_component_label(raidPtr, column);
   1332 			}
   1333 			/* XXXjld what about the spares? */
   1334 		}
   1335 
   1336 		return (retcode);
   1337 	case RAIDFRAME_SET_AUTOCONFIG:
   1338 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1339 		printf("raid%d: New autoconfig value is: %d\n",
   1340 		       raidPtr->raidid, d);
   1341 		*(int *) data = d;
   1342 		return (retcode);
   1343 
   1344 	case RAIDFRAME_SET_ROOT:
   1345 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1346 		printf("raid%d: New rootpartition value is: %d\n",
   1347 		       raidPtr->raidid, d);
   1348 		*(int *) data = d;
   1349 		return (retcode);
   1350 
   1351 		/* initialize all parity */
   1352 	case RAIDFRAME_REWRITEPARITY:
   1353 
   1354 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1355 			/* Parity for RAID 0 is trivially correct */
   1356 			raidPtr->parity_good = RF_RAID_CLEAN;
   1357 			return(0);
   1358 		}
   1359 
   1360 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1361 			/* Re-write is already in progress! */
   1362 			return(EINVAL);
   1363 		}
   1364 
   1365 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1366 					   rf_RewriteParityThread,
   1367 					   raidPtr,"raid_parity");
   1368 		return (retcode);
   1369 
   1370 
   1371 	case RAIDFRAME_ADD_HOT_SPARE:
   1372 		sparePtr = (RF_SingleComponent_t *) data;
   1373 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1374 		retcode = rf_add_hot_spare(raidPtr, &component);
   1375 		return(retcode);
   1376 
   1377 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1378 		return(retcode);
   1379 
   1380 	case RAIDFRAME_DELETE_COMPONENT:
   1381 		componentPtr = (RF_SingleComponent_t *)data;
   1382 		memcpy( &component, componentPtr,
   1383 			sizeof(RF_SingleComponent_t));
   1384 		retcode = rf_delete_component(raidPtr, &component);
   1385 		return(retcode);
   1386 
   1387 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1388 		componentPtr = (RF_SingleComponent_t *)data;
   1389 		memcpy( &component, componentPtr,
   1390 			sizeof(RF_SingleComponent_t));
   1391 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1392 		return(retcode);
   1393 
   1394 	case RAIDFRAME_REBUILD_IN_PLACE:
   1395 
   1396 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1397 			/* Can't do this on a RAID 0!! */
   1398 			return(EINVAL);
   1399 		}
   1400 
   1401 		if (raidPtr->recon_in_progress == 1) {
   1402 			/* a reconstruct is already in progress! */
   1403 			return(EINVAL);
   1404 		}
   1405 
   1406 		componentPtr = (RF_SingleComponent_t *) data;
   1407 		memcpy( &component, componentPtr,
   1408 			sizeof(RF_SingleComponent_t));
   1409 		component.row = 0; /* we don't support any more */
   1410 		column = component.column;
   1411 
   1412 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1413 			return(EINVAL);
   1414 		}
   1415 
   1416 		rf_lock_mutex2(raidPtr->mutex);
   1417 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1418 		    (raidPtr->numFailures > 0)) {
   1419 			/* XXX 0 above shouldn't be constant!!! */
   1420 			/* some component other than this has failed.
   1421 			   Let's not make things worse than they already
   1422 			   are... */
   1423 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1424 			       raidPtr->raidid);
   1425 			printf("raid%d:     Col: %d   Too many failures.\n",
   1426 			       raidPtr->raidid, column);
   1427 			rf_unlock_mutex2(raidPtr->mutex);
   1428 			return (EINVAL);
   1429 		}
   1430 		if (raidPtr->Disks[column].status ==
   1431 		    rf_ds_reconstructing) {
   1432 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1433 			       raidPtr->raidid);
   1434 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1435 
   1436 			rf_unlock_mutex2(raidPtr->mutex);
   1437 			return (EINVAL);
   1438 		}
   1439 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1440 			rf_unlock_mutex2(raidPtr->mutex);
   1441 			return (EINVAL);
   1442 		}
   1443 		rf_unlock_mutex2(raidPtr->mutex);
   1444 
   1445 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1446 		if (rrcopy == NULL)
   1447 			return(ENOMEM);
   1448 
   1449 		rrcopy->raidPtr = (void *) raidPtr;
   1450 		rrcopy->col = column;
   1451 
   1452 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1453 					   rf_ReconstructInPlaceThread,
   1454 					   rrcopy,"raid_reconip");
   1455 		return(retcode);
   1456 
   1457 	case RAIDFRAME_GET_INFO:
   1458 		if (!raidPtr->valid)
   1459 			return (ENODEV);
   1460 		ucfgp = (RF_DeviceConfig_t **) data;
   1461 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1462 			  (RF_DeviceConfig_t *));
   1463 		if (d_cfg == NULL)
   1464 			return (ENOMEM);
   1465 		d_cfg->rows = 1; /* there is only 1 row now */
   1466 		d_cfg->cols = raidPtr->numCol;
   1467 		d_cfg->ndevs = raidPtr->numCol;
   1468 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1469 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1470 			return (ENOMEM);
   1471 		}
   1472 		d_cfg->nspares = raidPtr->numSpare;
   1473 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1474 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1475 			return (ENOMEM);
   1476 		}
   1477 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1478 		d = 0;
   1479 		for (j = 0; j < d_cfg->cols; j++) {
   1480 			d_cfg->devs[d] = raidPtr->Disks[j];
   1481 			d++;
   1482 		}
   1483 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1484 			d_cfg->spares[i] = raidPtr->Disks[j];
   1485 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1486 				/* XXX: raidctl(8) expects to see this as a used spare */
   1487 				d_cfg->spares[i].status = rf_ds_used_spare;
   1488 			}
   1489 		}
   1490 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1491 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1492 
   1493 		return (retcode);
   1494 
   1495 	case RAIDFRAME_CHECK_PARITY:
   1496 		*(int *) data = raidPtr->parity_good;
   1497 		return (0);
   1498 
   1499 	case RAIDFRAME_PARITYMAP_STATUS:
   1500 		if (rf_paritymap_ineligible(raidPtr))
   1501 			return EINVAL;
   1502 		rf_paritymap_status(raidPtr->parity_map,
   1503 		    (struct rf_pmstat *)data);
   1504 		return 0;
   1505 
   1506 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1507 		if (rf_paritymap_ineligible(raidPtr))
   1508 			return EINVAL;
   1509 		if (raidPtr->parity_map == NULL)
   1510 			return ENOENT; /* ??? */
   1511 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1512 			(struct rf_pmparams *)data, 1))
   1513 			return EINVAL;
   1514 		return 0;
   1515 
   1516 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1517 		if (rf_paritymap_ineligible(raidPtr))
   1518 			return EINVAL;
   1519 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1520 		return 0;
   1521 
   1522 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1523 		if (rf_paritymap_ineligible(raidPtr))
   1524 			return EINVAL;
   1525 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1526 		/* XXX should errors be passed up? */
   1527 		return 0;
   1528 
   1529 	case RAIDFRAME_RESET_ACCTOTALS:
   1530 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1531 		return (0);
   1532 
   1533 	case RAIDFRAME_GET_ACCTOTALS:
   1534 		totals = (RF_AccTotals_t *) data;
   1535 		*totals = raidPtr->acc_totals;
   1536 		return (0);
   1537 
   1538 	case RAIDFRAME_KEEP_ACCTOTALS:
   1539 		raidPtr->keep_acc_totals = *(int *)data;
   1540 		return (0);
   1541 
   1542 	case RAIDFRAME_GET_SIZE:
   1543 		*(int *) data = raidPtr->totalSectors;
   1544 		return (0);
   1545 
   1546 		/* fail a disk & optionally start reconstruction */
   1547 	case RAIDFRAME_FAIL_DISK:
   1548 
   1549 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1550 			/* Can't do this on a RAID 0!! */
   1551 			return(EINVAL);
   1552 		}
   1553 
   1554 		rr = (struct rf_recon_req *) data;
   1555 		rr->row = 0;
   1556 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1557 			return (EINVAL);
   1558 
   1559 
   1560 		rf_lock_mutex2(raidPtr->mutex);
   1561 		if (raidPtr->status == rf_rs_reconstructing) {
   1562 			/* you can't fail a disk while we're reconstructing! */
   1563 			/* XXX wrong for RAID6 */
   1564 			rf_unlock_mutex2(raidPtr->mutex);
   1565 			return (EINVAL);
   1566 		}
   1567 		if ((raidPtr->Disks[rr->col].status ==
   1568 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1569 			/* some other component has failed.  Let's not make
   1570 			   things worse. XXX wrong for RAID6 */
   1571 			rf_unlock_mutex2(raidPtr->mutex);
   1572 			return (EINVAL);
   1573 		}
   1574 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1575 			/* Can't fail a spared disk! */
   1576 			rf_unlock_mutex2(raidPtr->mutex);
   1577 			return (EINVAL);
   1578 		}
   1579 		rf_unlock_mutex2(raidPtr->mutex);
   1580 
   1581 		/* make a copy of the recon request so that we don't rely on
   1582 		 * the user's buffer */
   1583 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1584 		if (rrcopy == NULL)
   1585 			return(ENOMEM);
   1586 		memcpy(rrcopy, rr, sizeof(*rr));
   1587 		rrcopy->raidPtr = (void *) raidPtr;
   1588 
   1589 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1590 					   rf_ReconThread,
   1591 					   rrcopy,"raid_recon");
   1592 		return (0);
   1593 
   1594 		/* invoke a copyback operation after recon on whatever disk
   1595 		 * needs it, if any */
   1596 	case RAIDFRAME_COPYBACK:
   1597 
   1598 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1599 			/* This makes no sense on a RAID 0!! */
   1600 			return(EINVAL);
   1601 		}
   1602 
   1603 		if (raidPtr->copyback_in_progress == 1) {
   1604 			/* Copyback is already in progress! */
   1605 			return(EINVAL);
   1606 		}
   1607 
   1608 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1609 					   rf_CopybackThread,
   1610 					   raidPtr,"raid_copyback");
   1611 		return (retcode);
   1612 
   1613 		/* return the percentage completion of reconstruction */
   1614 	case RAIDFRAME_CHECK_RECON_STATUS:
   1615 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1616 			/* This makes no sense on a RAID 0, so tell the
   1617 			   user it's done. */
   1618 			*(int *) data = 100;
   1619 			return(0);
   1620 		}
   1621 		if (raidPtr->status != rf_rs_reconstructing)
   1622 			*(int *) data = 100;
   1623 		else {
   1624 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1625 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1626 			} else {
   1627 				*(int *) data = 0;
   1628 			}
   1629 		}
   1630 		return (0);
   1631 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1632 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1633 		if (raidPtr->status != rf_rs_reconstructing) {
   1634 			progressInfo.remaining = 0;
   1635 			progressInfo.completed = 100;
   1636 			progressInfo.total = 100;
   1637 		} else {
   1638 			progressInfo.total =
   1639 				raidPtr->reconControl->numRUsTotal;
   1640 			progressInfo.completed =
   1641 				raidPtr->reconControl->numRUsComplete;
   1642 			progressInfo.remaining = progressInfo.total -
   1643 				progressInfo.completed;
   1644 		}
   1645 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1646 				  sizeof(RF_ProgressInfo_t));
   1647 		return (retcode);
   1648 
   1649 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1650 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1651 			/* This makes no sense on a RAID 0, so tell the
   1652 			   user it's done. */
   1653 			*(int *) data = 100;
   1654 			return(0);
   1655 		}
   1656 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1657 			*(int *) data = 100 *
   1658 				raidPtr->parity_rewrite_stripes_done /
   1659 				raidPtr->Layout.numStripe;
   1660 		} else {
   1661 			*(int *) data = 100;
   1662 		}
   1663 		return (0);
   1664 
   1665 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1666 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1667 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1668 			progressInfo.total = raidPtr->Layout.numStripe;
   1669 			progressInfo.completed =
   1670 				raidPtr->parity_rewrite_stripes_done;
   1671 			progressInfo.remaining = progressInfo.total -
   1672 				progressInfo.completed;
   1673 		} else {
   1674 			progressInfo.remaining = 0;
   1675 			progressInfo.completed = 100;
   1676 			progressInfo.total = 100;
   1677 		}
   1678 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1679 				  sizeof(RF_ProgressInfo_t));
   1680 		return (retcode);
   1681 
   1682 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1683 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1684 			/* This makes no sense on a RAID 0 */
   1685 			*(int *) data = 100;
   1686 			return(0);
   1687 		}
   1688 		if (raidPtr->copyback_in_progress == 1) {
   1689 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1690 				raidPtr->Layout.numStripe;
   1691 		} else {
   1692 			*(int *) data = 100;
   1693 		}
   1694 		return (0);
   1695 
   1696 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1697 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1698 		if (raidPtr->copyback_in_progress == 1) {
   1699 			progressInfo.total = raidPtr->Layout.numStripe;
   1700 			progressInfo.completed =
   1701 				raidPtr->copyback_stripes_done;
   1702 			progressInfo.remaining = progressInfo.total -
   1703 				progressInfo.completed;
   1704 		} else {
   1705 			progressInfo.remaining = 0;
   1706 			progressInfo.completed = 100;
   1707 			progressInfo.total = 100;
   1708 		}
   1709 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1710 				  sizeof(RF_ProgressInfo_t));
   1711 		return (retcode);
   1712 
   1713 	case RAIDFRAME_SET_LAST_UNIT:
   1714 		for (column = 0; column < raidPtr->numCol; column++)
   1715 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1716 				return EBUSY;
   1717 
   1718 		for (column = 0; column < raidPtr->numCol; column++) {
   1719 			clabel = raidget_component_label(raidPtr, column);
   1720 			clabel->last_unit = *(int *)data;
   1721 			raidflush_component_label(raidPtr, column);
   1722 		}
   1723 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1724 		return 0;
   1725 
   1726 		/* the sparetable daemon calls this to wait for the kernel to
   1727 		 * need a spare table. this ioctl does not return until a
   1728 		 * spare table is needed. XXX -- calling mpsleep here in the
   1729 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1730 		 * -- I should either compute the spare table in the kernel,
   1731 		 * or have a different -- XXX XXX -- interface (a different
   1732 		 * character device) for delivering the table     -- XXX */
   1733 #if 0
   1734 	case RAIDFRAME_SPARET_WAIT:
   1735 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1736 		while (!rf_sparet_wait_queue)
   1737 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1738 		waitreq = rf_sparet_wait_queue;
   1739 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1740 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1741 
   1742 		/* structure assignment */
   1743 		*((RF_SparetWait_t *) data) = *waitreq;
   1744 
   1745 		RF_Free(waitreq, sizeof(*waitreq));
   1746 		return (0);
   1747 
   1748 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1749 		 * code in it that will cause the dameon to exit */
   1750 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1751 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1752 		waitreq->fcol = -1;
   1753 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1754 		waitreq->next = rf_sparet_wait_queue;
   1755 		rf_sparet_wait_queue = waitreq;
   1756 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1757 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1758 		return (0);
   1759 
   1760 		/* used by the spare table daemon to deliver a spare table
   1761 		 * into the kernel */
   1762 	case RAIDFRAME_SEND_SPARET:
   1763 
   1764 		/* install the spare table */
   1765 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1766 
   1767 		/* respond to the requestor.  the return status of the spare
   1768 		 * table installation is passed in the "fcol" field */
   1769 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1770 		waitreq->fcol = retcode;
   1771 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1772 		waitreq->next = rf_sparet_resp_queue;
   1773 		rf_sparet_resp_queue = waitreq;
   1774 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1775 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1776 
   1777 		return (retcode);
   1778 #endif
   1779 
   1780 	default:
   1781 		break; /* fall through to the os-specific code below */
   1782 
   1783 	}
   1784 
   1785 	if (!raidPtr->valid)
   1786 		return (EINVAL);
   1787 
   1788 	/*
   1789 	 * Add support for "regular" device ioctls here.
   1790 	 */
   1791 
   1792 	switch (cmd) {
   1793 	case DIOCGCACHE:
   1794 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1795 		break;
   1796 
   1797 	case DIOCCACHESYNC:
   1798 		retcode = rf_sync_component_caches(raidPtr);
   1799 		break;
   1800 
   1801 	default:
   1802 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1803 		break;
   1804 	}
   1805 
   1806 	return (retcode);
   1807 
   1808 }
   1809 
   1810 
   1811 /* raidinit -- complete the rest of the initialization for the
   1812    RAIDframe device.  */
   1813 
   1814 
   1815 static void
   1816 raidinit(struct raid_softc *rs)
   1817 {
   1818 	cfdata_t cf;
   1819 	unsigned int unit;
   1820 	struct dk_softc *dksc = &rs->sc_dksc;
   1821 	RF_Raid_t *raidPtr = &rs->sc_r;
   1822 	device_t dev;
   1823 
   1824 	unit = raidPtr->raidid;
   1825 
   1826 	/* XXX doesn't check bounds. */
   1827 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1828 
   1829 	/* attach the pseudo device */
   1830 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1831 	cf->cf_name = raid_cd.cd_name;
   1832 	cf->cf_atname = raid_cd.cd_name;
   1833 	cf->cf_unit = unit;
   1834 	cf->cf_fstate = FSTATE_STAR;
   1835 
   1836 	dev = config_attach_pseudo(cf);
   1837 	if (dev == NULL) {
   1838 		printf("raid%d: config_attach_pseudo failed\n",
   1839 		    raidPtr->raidid);
   1840 		free(cf, M_RAIDFRAME);
   1841 		return;
   1842 	}
   1843 
   1844 	/* provide a backpointer to the real softc */
   1845 	raidsoftc(dev) = rs;
   1846 
   1847 	/* disk_attach actually creates space for the CPU disklabel, among
   1848 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1849 	 * with disklabels. */
   1850 	dk_init(dksc, dev, DKTYPE_RAID);
   1851 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1852 
   1853 	/* XXX There may be a weird interaction here between this, and
   1854 	 * protectedSectors, as used in RAIDframe.  */
   1855 
   1856 	rs->sc_size = raidPtr->totalSectors;
   1857 
   1858 	/* Attach dk and disk subsystems */
   1859 	dk_attach(dksc);
   1860 	disk_attach(&dksc->sc_dkdev);
   1861 	rf_set_geometry(rs, raidPtr);
   1862 
   1863 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1864 
   1865 	/* mark unit as usuable */
   1866 	rs->sc_flags |= RAIDF_INITED;
   1867 
   1868 	dkwedge_discover(&dksc->sc_dkdev);
   1869 	device_release(dev);
   1870 }
   1871 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * rf_GetSpareTableFromDaemon -- wake up the daemon & tell it to get us a
 * spare table.
 *
 * The request is pushed onto rf_sparet_wait_queue, the waiters on
 * rf_sparet_wait_cv are woken, and the caller then sleeps until something
 * shows up on rf_sparet_resp_queue.  The "fcol" field of the response
 * entry is returned and the response entry is freed.
 *
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* post the request and poke the daemon */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (rf_sparet_resp_queue == NULL) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}

	/* pop the response off the head of the response queue */
	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = resp->fcol;
	/* this is not the same entry as the one the caller passed in */
	RF_Free(resp, sizeof(*resp));
	return (status);
}
#endif
   1906 
   1907 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1908  * bp & passes it down.
   1909  * any calls originating in the kernel must use non-blocking I/O
   1910  * do some extra sanity checking to return "appropriate" error values for
   1911  * certain conditions (to make some standard utilities work)
   1912  *
   1913  * Formerly known as: rf_DoAccessKernel
   1914  */
   1915 void
   1916 raidstart(RF_Raid_t *raidPtr)
   1917 {
   1918 	struct raid_softc *rs;
   1919 	struct dk_softc *dksc;
   1920 
   1921 	rs = raidPtr->softc;
   1922 	dksc = &rs->sc_dksc;
   1923 	/* quick check to see if anything has died recently */
   1924 	rf_lock_mutex2(raidPtr->mutex);
   1925 	if (raidPtr->numNewFailures > 0) {
   1926 		rf_unlock_mutex2(raidPtr->mutex);
   1927 		rf_update_component_labels(raidPtr,
   1928 					   RF_NORMAL_COMPONENT_UPDATE);
   1929 		rf_lock_mutex2(raidPtr->mutex);
   1930 		raidPtr->numNewFailures--;
   1931 	}
   1932 	rf_unlock_mutex2(raidPtr->mutex);
   1933 
   1934 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1935 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1936 		return;
   1937 	}
   1938 
   1939 	dk_start(dksc, NULL);
   1940 }
   1941 
   1942 static int
   1943 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1944 {
   1945 	RF_SectorCount_t num_blocks, pb, sum;
   1946 	RF_RaidAddr_t raid_addr;
   1947 	daddr_t blocknum;
   1948 	int     do_async;
   1949 	int rc;
   1950 
   1951 	rf_lock_mutex2(raidPtr->mutex);
   1952 	if (raidPtr->openings == 0) {
   1953 		rf_unlock_mutex2(raidPtr->mutex);
   1954 		return EAGAIN;
   1955 	}
   1956 	rf_unlock_mutex2(raidPtr->mutex);
   1957 
   1958 	blocknum = bp->b_rawblkno;
   1959 
   1960 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1961 		    (int) blocknum));
   1962 
   1963 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1964 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1965 
   1966 	/* *THIS* is where we adjust what block we're going to...
   1967 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1968 	raid_addr = blocknum;
   1969 
   1970 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1971 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1972 	sum = raid_addr + num_blocks + pb;
   1973 	if (1 || rf_debugKernelAccess) {
   1974 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1975 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1976 			    (int) pb, (int) bp->b_resid));
   1977 	}
   1978 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1979 	    || (sum < num_blocks) || (sum < pb)) {
   1980 		rc = ENOSPC;
   1981 		goto done;
   1982 	}
   1983 	/*
   1984 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1985 	 */
   1986 
   1987 	if (bp->b_bcount & raidPtr->sectorMask) {
   1988 		rc = ENOSPC;
   1989 		goto done;
   1990 	}
   1991 	db1_printf(("Calling DoAccess..\n"));
   1992 
   1993 
   1994 	rf_lock_mutex2(raidPtr->mutex);
   1995 	raidPtr->openings--;
   1996 	rf_unlock_mutex2(raidPtr->mutex);
   1997 
   1998 	/*
   1999 	 * Everything is async.
   2000 	 */
   2001 	do_async = 1;
   2002 
   2003 	/* don't ever condition on bp->b_flags & B_WRITE.
   2004 	 * always condition on B_READ instead */
   2005 
   2006 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2007 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2008 			 do_async, raid_addr, num_blocks,
   2009 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2010 
   2011 done:
   2012 	return rc;
   2013 }
   2014 
   2015 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2016 
   2017 int
   2018 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2019 {
   2020 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2021 	struct buf *bp;
   2022 
   2023 	req->queue = queue;
   2024 	bp = req->bp;
   2025 
   2026 	switch (req->type) {
   2027 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2028 		/* XXX need to do something extra here.. */
   2029 		/* I'm leaving this in, as I've never actually seen it used,
   2030 		 * and I'd like folks to report it... GO */
   2031 		printf(("WAKEUP CALLED\n"));
   2032 		queue->numOutstanding++;
   2033 
   2034 		bp->b_flags = 0;
   2035 		bp->b_private = req;
   2036 
   2037 		KernelWakeupFunc(bp);
   2038 		break;
   2039 
   2040 	case RF_IO_TYPE_READ:
   2041 	case RF_IO_TYPE_WRITE:
   2042 #if RF_ACC_TRACE > 0
   2043 		if (req->tracerec) {
   2044 			RF_ETIMER_START(req->tracerec->timer);
   2045 		}
   2046 #endif
   2047 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2048 		    op, queue->rf_cinfo->ci_dev,
   2049 		    req->sectorOffset, req->numSector,
   2050 		    req->buf, KernelWakeupFunc, (void *) req,
   2051 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2052 
   2053 		if (rf_debugKernelAccess) {
   2054 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2055 				(long) bp->b_blkno));
   2056 		}
   2057 		queue->numOutstanding++;
   2058 		queue->last_deq_sector = req->sectorOffset;
   2059 		/* acc wouldn't have been let in if there were any pending
   2060 		 * reqs at any other priority */
   2061 		queue->curPriority = req->priority;
   2062 
   2063 		db1_printf(("Going for %c to unit %d col %d\n",
   2064 			    req->type, queue->raidPtr->raidid,
   2065 			    queue->col));
   2066 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2067 			(int) req->sectorOffset, (int) req->numSector,
   2068 			(int) (req->numSector <<
   2069 			    queue->raidPtr->logBytesPerSector),
   2070 			(int) queue->raidPtr->logBytesPerSector));
   2071 
   2072 		/*
   2073 		 * XXX: drop lock here since this can block at
   2074 		 * least with backing SCSI devices.  Retake it
   2075 		 * to minimize fuss with calling interfaces.
   2076 		 */
   2077 
   2078 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2079 		bdev_strategy(bp);
   2080 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2081 		break;
   2082 
   2083 	default:
   2084 		panic("bad req->type in rf_DispatchKernelIO");
   2085 	}
   2086 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2087 
   2088 	return (0);
   2089 }
   2090 /* this is the callback function associated with a I/O invoked from
   2091    kernel code.
   2092  */
   2093 static void
   2094 KernelWakeupFunc(struct buf *bp)
   2095 {
   2096 	RF_DiskQueueData_t *req = NULL;
   2097 	RF_DiskQueue_t *queue;
   2098 
   2099 	db1_printf(("recovering the request queue:\n"));
   2100 
   2101 	req = bp->b_private;
   2102 
   2103 	queue = (RF_DiskQueue_t *) req->queue;
   2104 
   2105 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2106 
   2107 #if RF_ACC_TRACE > 0
   2108 	if (req->tracerec) {
   2109 		RF_ETIMER_STOP(req->tracerec->timer);
   2110 		RF_ETIMER_EVAL(req->tracerec->timer);
   2111 		rf_lock_mutex2(rf_tracing_mutex);
   2112 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2113 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2114 		req->tracerec->num_phys_ios++;
   2115 		rf_unlock_mutex2(rf_tracing_mutex);
   2116 	}
   2117 #endif
   2118 
   2119 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2120 	 * ballistic, and mark the component as hosed... */
   2121 
   2122 	if (bp->b_error != 0) {
   2123 		/* Mark the disk as dead */
   2124 		/* but only mark it once... */
   2125 		/* and only if it wouldn't leave this RAID set
   2126 		   completely broken */
   2127 		if (((queue->raidPtr->Disks[queue->col].status ==
   2128 		      rf_ds_optimal) ||
   2129 		     (queue->raidPtr->Disks[queue->col].status ==
   2130 		      rf_ds_used_spare)) &&
   2131 		     (queue->raidPtr->numFailures <
   2132 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2133 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2134 			       queue->raidPtr->raidid,
   2135 			       bp->b_error,
   2136 			       queue->raidPtr->Disks[queue->col].devname);
   2137 			queue->raidPtr->Disks[queue->col].status =
   2138 			    rf_ds_failed;
   2139 			queue->raidPtr->status = rf_rs_degraded;
   2140 			queue->raidPtr->numFailures++;
   2141 			queue->raidPtr->numNewFailures++;
   2142 		} else {	/* Disk is already dead... */
   2143 			/* printf("Disk already marked as dead!\n"); */
   2144 		}
   2145 
   2146 	}
   2147 
   2148 	/* Fill in the error value */
   2149 	req->error = bp->b_error;
   2150 
   2151 	/* Drop this one on the "finished" queue... */
   2152 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2153 
   2154 	/* Let the raidio thread know there is work to be done. */
   2155 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2156 
   2157 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2158 }
   2159 
   2160 
   2161 /*
   2162  * initialize a buf structure for doing an I/O in the kernel.
   2163  */
   2164 static void
   2165 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2166        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2167        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2168        struct proc *b_proc)
   2169 {
   2170 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2171 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2172 	bp->b_oflags = 0;
   2173 	bp->b_cflags = 0;
   2174 	bp->b_bcount = numSect << logBytesPerSector;
   2175 	bp->b_bufsize = bp->b_bcount;
   2176 	bp->b_error = 0;
   2177 	bp->b_dev = dev;
   2178 	bp->b_data = bf;
   2179 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2180 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2181 	if (bp->b_bcount == 0) {
   2182 		panic("bp->b_bcount is zero in InitBP!!");
   2183 	}
   2184 	bp->b_proc = b_proc;
   2185 	bp->b_iodone = cbFunc;
   2186 	bp->b_private = cbArg;
   2187 }
   2188 
   2189 /*
   2190  * Wait interruptibly for an exclusive lock.
   2191  *
   2192  * XXX
   2193  * Several drivers do this; it should be abstracted and made MP-safe.
   2194  * (Hmm... where have we seen this warning before :->  GO )
   2195  */
   2196 static int
   2197 raidlock(struct raid_softc *rs)
   2198 {
   2199 	int     error;
   2200 
   2201 	error = 0;
   2202 	mutex_enter(&rs->sc_mutex);
   2203 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2204 		rs->sc_flags |= RAIDF_WANTED;
   2205 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2206 		if (error != 0)
   2207 			goto done;
   2208 	}
   2209 	rs->sc_flags |= RAIDF_LOCKED;
   2210 done:
   2211 	mutex_exit(&rs->sc_mutex);
   2212 	return (error);
   2213 }
   2214 /*
   2215  * Unlock and wake up any waiters.
   2216  */
   2217 static void
   2218 raidunlock(struct raid_softc *rs)
   2219 {
   2220 
   2221 	mutex_enter(&rs->sc_mutex);
   2222 	rs->sc_flags &= ~RAIDF_LOCKED;
   2223 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2224 		rs->sc_flags &= ~RAIDF_WANTED;
   2225 		cv_broadcast(&rs->sc_cv);
   2226 	}
   2227 	mutex_exit(&rs->sc_mutex);
   2228 }
   2229 
   2230 
   2231 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2232 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2233 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2234 
   2235 static daddr_t
   2236 rf_component_info_offset(void)
   2237 {
   2238 
   2239 	return RF_COMPONENT_INFO_OFFSET;
   2240 }
   2241 
   2242 static daddr_t
   2243 rf_component_info_size(unsigned secsize)
   2244 {
   2245 	daddr_t info_size;
   2246 
   2247 	KASSERT(secsize);
   2248 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2249 		info_size = secsize;
   2250 	else
   2251 		info_size = RF_COMPONENT_INFO_SIZE;
   2252 
   2253 	return info_size;
   2254 }
   2255 
   2256 static daddr_t
   2257 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2258 {
   2259 	daddr_t map_offset;
   2260 
   2261 	KASSERT(raidPtr->bytesPerSector);
   2262 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2263 		map_offset = raidPtr->bytesPerSector;
   2264 	else
   2265 		map_offset = RF_COMPONENT_INFO_SIZE;
   2266 	map_offset += rf_component_info_offset();
   2267 
   2268 	return map_offset;
   2269 }
   2270 
   2271 static daddr_t
   2272 rf_parity_map_size(RF_Raid_t *raidPtr)
   2273 {
   2274 	daddr_t map_size;
   2275 
   2276 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2277 		map_size = raidPtr->bytesPerSector;
   2278 	else
   2279 		map_size = RF_PARITY_MAP_SIZE;
   2280 
   2281 	return map_size;
   2282 }
   2283 
   2284 int
   2285 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2286 {
   2287 	RF_ComponentLabel_t *clabel;
   2288 
   2289 	clabel = raidget_component_label(raidPtr, col);
   2290 	clabel->clean = RF_RAID_CLEAN;
   2291 	raidflush_component_label(raidPtr, col);
   2292 	return(0);
   2293 }
   2294 
   2295 
   2296 int
   2297 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2298 {
   2299 	RF_ComponentLabel_t *clabel;
   2300 
   2301 	clabel = raidget_component_label(raidPtr, col);
   2302 	clabel->clean = RF_RAID_DIRTY;
   2303 	raidflush_component_label(raidPtr, col);
   2304 	return(0);
   2305 }
   2306 
   2307 int
   2308 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2309 {
   2310 	KASSERT(raidPtr->bytesPerSector);
   2311 	return raidread_component_label(raidPtr->bytesPerSector,
   2312 	    raidPtr->Disks[col].dev,
   2313 	    raidPtr->raid_cinfo[col].ci_vp,
   2314 	    &raidPtr->raid_cinfo[col].ci_label);
   2315 }
   2316 
   2317 RF_ComponentLabel_t *
   2318 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2319 {
   2320 	return &raidPtr->raid_cinfo[col].ci_label;
   2321 }
   2322 
   2323 int
   2324 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2325 {
   2326 	RF_ComponentLabel_t *label;
   2327 
   2328 	label = &raidPtr->raid_cinfo[col].ci_label;
   2329 	label->mod_counter = raidPtr->mod_counter;
   2330 #ifndef RF_NO_PARITY_MAP
   2331 	label->parity_map_modcount = label->mod_counter;
   2332 #endif
   2333 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2334 	    raidPtr->Disks[col].dev,
   2335 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2336 }
   2337 
   2338 
   2339 static int
   2340 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2341     RF_ComponentLabel_t *clabel)
   2342 {
   2343 	return raidread_component_area(dev, b_vp, clabel,
   2344 	    sizeof(RF_ComponentLabel_t),
   2345 	    rf_component_info_offset(),
   2346 	    rf_component_info_size(secsize));
   2347 }
   2348 
   2349 /* ARGSUSED */
   2350 static int
   2351 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2352     size_t msize, daddr_t offset, daddr_t dsize)
   2353 {
   2354 	struct buf *bp;
   2355 	int error;
   2356 
   2357 	/* XXX should probably ensure that we don't try to do this if
   2358 	   someone has changed rf_protected_sectors. */
   2359 
   2360 	if (b_vp == NULL) {
   2361 		/* For whatever reason, this component is not valid.
   2362 		   Don't try to read a component label from it. */
   2363 		return(EINVAL);
   2364 	}
   2365 
   2366 	/* get a block of the appropriate size... */
   2367 	bp = geteblk((int)dsize);
   2368 	bp->b_dev = dev;
   2369 
   2370 	/* get our ducks in a row for the read */
   2371 	bp->b_blkno = offset / DEV_BSIZE;
   2372 	bp->b_bcount = dsize;
   2373 	bp->b_flags |= B_READ;
   2374  	bp->b_resid = dsize;
   2375 
   2376 	bdev_strategy(bp);
   2377 	error = biowait(bp);
   2378 
   2379 	if (!error) {
   2380 		memcpy(data, bp->b_data, msize);
   2381 	}
   2382 
   2383 	brelse(bp, 0);
   2384 	return(error);
   2385 }
   2386 
   2387 
   2388 static int
   2389 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2390     RF_ComponentLabel_t *clabel)
   2391 {
   2392 	return raidwrite_component_area(dev, b_vp, clabel,
   2393 	    sizeof(RF_ComponentLabel_t),
   2394 	    rf_component_info_offset(),
   2395 	    rf_component_info_size(secsize), 0);
   2396 }
   2397 
   2398 /* ARGSUSED */
   2399 static int
   2400 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2401     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2402 {
   2403 	struct buf *bp;
   2404 	int error;
   2405 
   2406 	/* get a block of the appropriate size... */
   2407 	bp = geteblk((int)dsize);
   2408 	bp->b_dev = dev;
   2409 
   2410 	/* get our ducks in a row for the write */
   2411 	bp->b_blkno = offset / DEV_BSIZE;
   2412 	bp->b_bcount = dsize;
   2413 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2414  	bp->b_resid = dsize;
   2415 
   2416 	memset(bp->b_data, 0, dsize);
   2417 	memcpy(bp->b_data, data, msize);
   2418 
   2419 	bdev_strategy(bp);
   2420 	if (asyncp)
   2421 		return 0;
   2422 	error = biowait(bp);
   2423 	brelse(bp, 0);
   2424 	if (error) {
   2425 #if 1
   2426 		printf("Failed to write RAID component info!\n");
   2427 #endif
   2428 	}
   2429 
   2430 	return(error);
   2431 }
   2432 
   2433 void
   2434 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2435 {
   2436 	int c;
   2437 
   2438 	for (c = 0; c < raidPtr->numCol; c++) {
   2439 		/* Skip dead disks. */
   2440 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2441 			continue;
   2442 		/* XXXjld: what if an error occurs here? */
   2443 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2444 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2445 		    RF_PARITYMAP_NBYTE,
   2446 		    rf_parity_map_offset(raidPtr),
   2447 		    rf_parity_map_size(raidPtr), 0);
   2448 	}
   2449 }
   2450 
   2451 void
   2452 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2453 {
   2454 	struct rf_paritymap_ondisk tmp;
   2455 	int c,first;
   2456 
   2457 	first=1;
   2458 	for (c = 0; c < raidPtr->numCol; c++) {
   2459 		/* Skip dead disks. */
   2460 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2461 			continue;
   2462 		raidread_component_area(raidPtr->Disks[c].dev,
   2463 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2464 		    RF_PARITYMAP_NBYTE,
   2465 		    rf_parity_map_offset(raidPtr),
   2466 		    rf_parity_map_size(raidPtr));
   2467 		if (first) {
   2468 			memcpy(map, &tmp, sizeof(*map));
   2469 			first = 0;
   2470 		} else {
   2471 			rf_paritymap_merge(map, &tmp);
   2472 		}
   2473 	}
   2474 }
   2475 
   2476 void
   2477 rf_markalldirty(RF_Raid_t *raidPtr)
   2478 {
   2479 	RF_ComponentLabel_t *clabel;
   2480 	int sparecol;
   2481 	int c;
   2482 	int j;
   2483 	int scol = -1;
   2484 
   2485 	raidPtr->mod_counter++;
   2486 	for (c = 0; c < raidPtr->numCol; c++) {
   2487 		/* we don't want to touch (at all) a disk that has
   2488 		   failed */
   2489 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2490 			clabel = raidget_component_label(raidPtr, c);
   2491 			if (clabel->status == rf_ds_spared) {
   2492 				/* XXX do something special...
   2493 				   but whatever you do, don't
   2494 				   try to access it!! */
   2495 			} else {
   2496 				raidmarkdirty(raidPtr, c);
   2497 			}
   2498 		}
   2499 	}
   2500 
   2501 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2502 		sparecol = raidPtr->numCol + c;
   2503 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2504 			/*
   2505 
   2506 			   we claim this disk is "optimal" if it's
   2507 			   rf_ds_used_spare, as that means it should be
   2508 			   directly substitutable for the disk it replaced.
   2509 			   We note that too...
   2510 
   2511 			 */
   2512 
   2513 			for(j=0;j<raidPtr->numCol;j++) {
   2514 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2515 					scol = j;
   2516 					break;
   2517 				}
   2518 			}
   2519 
   2520 			clabel = raidget_component_label(raidPtr, sparecol);
   2521 			/* make sure status is noted */
   2522 
   2523 			raid_init_component_label(raidPtr, clabel);
   2524 
   2525 			clabel->row = 0;
   2526 			clabel->column = scol;
   2527 			/* Note: we *don't* change status from rf_ds_used_spare
   2528 			   to rf_ds_optimal */
   2529 			/* clabel.status = rf_ds_optimal; */
   2530 
   2531 			raidmarkdirty(raidPtr, sparecol);
   2532 		}
   2533 	}
   2534 }
   2535 
   2536 
   2537 void
   2538 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2539 {
   2540 	RF_ComponentLabel_t *clabel;
   2541 	int sparecol;
   2542 	int c;
   2543 	int j;
   2544 	int scol;
   2545 	struct raid_softc *rs = raidPtr->softc;
   2546 
   2547 	scol = -1;
   2548 
   2549 	/* XXX should do extra checks to make sure things really are clean,
   2550 	   rather than blindly setting the clean bit... */
   2551 
   2552 	raidPtr->mod_counter++;
   2553 
   2554 	for (c = 0; c < raidPtr->numCol; c++) {
   2555 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2556 			clabel = raidget_component_label(raidPtr, c);
   2557 			/* make sure status is noted */
   2558 			clabel->status = rf_ds_optimal;
   2559 
   2560 			/* note what unit we are configured as */
   2561 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2562 				clabel->last_unit = raidPtr->raidid;
   2563 
   2564 			raidflush_component_label(raidPtr, c);
   2565 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2566 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2567 					raidmarkclean(raidPtr, c);
   2568 				}
   2569 			}
   2570 		}
   2571 		/* else we don't touch it.. */
   2572 	}
   2573 
   2574 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2575 		sparecol = raidPtr->numCol + c;
   2576 		/* Need to ensure that the reconstruct actually completed! */
   2577 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2578 			/*
   2579 
   2580 			   we claim this disk is "optimal" if it's
   2581 			   rf_ds_used_spare, as that means it should be
   2582 			   directly substitutable for the disk it replaced.
   2583 			   We note that too...
   2584 
   2585 			 */
   2586 
   2587 			for(j=0;j<raidPtr->numCol;j++) {
   2588 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2589 					scol = j;
   2590 					break;
   2591 				}
   2592 			}
   2593 
   2594 			/* XXX shouldn't *really* need this... */
   2595 			clabel = raidget_component_label(raidPtr, sparecol);
   2596 			/* make sure status is noted */
   2597 
   2598 			raid_init_component_label(raidPtr, clabel);
   2599 
   2600 			clabel->column = scol;
   2601 			clabel->status = rf_ds_optimal;
   2602 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2603 				clabel->last_unit = raidPtr->raidid;
   2604 
   2605 			raidflush_component_label(raidPtr, sparecol);
   2606 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2607 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2608 					raidmarkclean(raidPtr, sparecol);
   2609 				}
   2610 			}
   2611 		}
   2612 	}
   2613 }
   2614 
   2615 void
   2616 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2617 {
   2618 
   2619 	if (vp != NULL) {
   2620 		if (auto_configured == 1) {
   2621 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2622 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2623 			vput(vp);
   2624 
   2625 		} else {
   2626 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2627 		}
   2628 	}
   2629 }
   2630 
   2631 
   2632 void
   2633 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2634 {
   2635 	int r,c;
   2636 	struct vnode *vp;
   2637 	int acd;
   2638 
   2639 
   2640 	/* We take this opportunity to close the vnodes like we should.. */
   2641 
   2642 	for (c = 0; c < raidPtr->numCol; c++) {
   2643 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2644 		acd = raidPtr->Disks[c].auto_configured;
   2645 		rf_close_component(raidPtr, vp, acd);
   2646 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2647 		raidPtr->Disks[c].auto_configured = 0;
   2648 	}
   2649 
   2650 	for (r = 0; r < raidPtr->numSpare; r++) {
   2651 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2652 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2653 		rf_close_component(raidPtr, vp, acd);
   2654 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2655 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2656 	}
   2657 }
   2658 
   2659 
   2660 void
   2661 rf_ReconThread(struct rf_recon_req *req)
   2662 {
   2663 	int     s;
   2664 	RF_Raid_t *raidPtr;
   2665 
   2666 	s = splbio();
   2667 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2668 	raidPtr->recon_in_progress = 1;
   2669 
   2670 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2671 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2672 
   2673 	RF_Free(req, sizeof(*req));
   2674 
   2675 	raidPtr->recon_in_progress = 0;
   2676 	splx(s);
   2677 
   2678 	/* That's all... */
   2679 	kthread_exit(0);	/* does not return */
   2680 }
   2681 
   2682 void
   2683 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2684 {
   2685 	int retcode;
   2686 	int s;
   2687 
   2688 	raidPtr->parity_rewrite_stripes_done = 0;
   2689 	raidPtr->parity_rewrite_in_progress = 1;
   2690 	s = splbio();
   2691 	retcode = rf_RewriteParity(raidPtr);
   2692 	splx(s);
   2693 	if (retcode) {
   2694 		printf("raid%d: Error re-writing parity (%d)!\n",
   2695 		    raidPtr->raidid, retcode);
   2696 	} else {
   2697 		/* set the clean bit!  If we shutdown correctly,
   2698 		   the clean bit on each component label will get
   2699 		   set */
   2700 		raidPtr->parity_good = RF_RAID_CLEAN;
   2701 	}
   2702 	raidPtr->parity_rewrite_in_progress = 0;
   2703 
   2704 	/* Anyone waiting for us to stop?  If so, inform them... */
   2705 	if (raidPtr->waitShutdown) {
   2706 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2707 	}
   2708 
   2709 	/* That's all... */
   2710 	kthread_exit(0);	/* does not return */
   2711 }
   2712 
   2713 
   2714 void
   2715 rf_CopybackThread(RF_Raid_t *raidPtr)
   2716 {
   2717 	int s;
   2718 
   2719 	raidPtr->copyback_in_progress = 1;
   2720 	s = splbio();
   2721 	rf_CopybackReconstructedData(raidPtr);
   2722 	splx(s);
   2723 	raidPtr->copyback_in_progress = 0;
   2724 
   2725 	/* That's all... */
   2726 	kthread_exit(0);	/* does not return */
   2727 }
   2728 
   2729 
   2730 void
   2731 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2732 {
   2733 	int s;
   2734 	RF_Raid_t *raidPtr;
   2735 
   2736 	s = splbio();
   2737 	raidPtr = req->raidPtr;
   2738 	raidPtr->recon_in_progress = 1;
   2739 	rf_ReconstructInPlace(raidPtr, req->col);
   2740 	RF_Free(req, sizeof(*req));
   2741 	raidPtr->recon_in_progress = 0;
   2742 	splx(s);
   2743 
   2744 	/* That's all... */
   2745 	kthread_exit(0);	/* does not return */
   2746 }
   2747 
   2748 static RF_AutoConfig_t *
   2749 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2750     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2751     unsigned secsize)
   2752 {
   2753 	int good_one = 0;
   2754 	RF_ComponentLabel_t *clabel;
   2755 	RF_AutoConfig_t *ac;
   2756 
   2757 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2758 	if (clabel == NULL) {
   2759 oomem:
   2760 		    while(ac_list) {
   2761 			    ac = ac_list;
   2762 			    if (ac->clabel)
   2763 				    free(ac->clabel, M_RAIDFRAME);
   2764 			    ac_list = ac_list->next;
   2765 			    free(ac, M_RAIDFRAME);
   2766 		    }
   2767 		    printf("RAID auto config: out of memory!\n");
   2768 		    return NULL; /* XXX probably should panic? */
   2769 	}
   2770 
   2771 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2772 		/* Got the label.  Does it look reasonable? */
   2773 		if (rf_reasonable_label(clabel, numsecs) &&
   2774 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2775 #ifdef DEBUG
   2776 			printf("Component on: %s: %llu\n",
   2777 				cname, (unsigned long long)size);
   2778 			rf_print_component_label(clabel);
   2779 #endif
   2780 			/* if it's reasonable, add it, else ignore it. */
   2781 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2782 				M_NOWAIT);
   2783 			if (ac == NULL) {
   2784 				free(clabel, M_RAIDFRAME);
   2785 				goto oomem;
   2786 			}
   2787 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2788 			ac->dev = dev;
   2789 			ac->vp = vp;
   2790 			ac->clabel = clabel;
   2791 			ac->next = ac_list;
   2792 			ac_list = ac;
   2793 			good_one = 1;
   2794 		}
   2795 	}
   2796 	if (!good_one) {
   2797 		/* cleanup */
   2798 		free(clabel, M_RAIDFRAME);
   2799 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2800 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2801 		vput(vp);
   2802 	}
   2803 	return ac_list;
   2804 }
   2805 
   2806 RF_AutoConfig_t *
   2807 rf_find_raid_components(void)
   2808 {
   2809 	struct vnode *vp;
   2810 	struct disklabel label;
   2811 	device_t dv;
   2812 	deviter_t di;
   2813 	dev_t dev;
   2814 	int bmajor, bminor, wedge, rf_part_found;
   2815 	int error;
   2816 	int i;
   2817 	RF_AutoConfig_t *ac_list;
   2818 	uint64_t numsecs;
   2819 	unsigned secsize;
   2820 	int dowedges;
   2821 
   2822 	/* initialize the AutoConfig list */
   2823 	ac_list = NULL;
   2824 
   2825 	/*
   2826 	 * we begin by trolling through *all* the devices on the system *twice*
   2827 	 * first we scan for wedges, second for other devices. This avoids
   2828 	 * using a raw partition instead of a wedge that covers the whole disk
   2829 	 */
   2830 
   2831 	for (dowedges=1; dowedges>=0; --dowedges) {
   2832 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2833 		     dv = deviter_next(&di)) {
   2834 
   2835 			/* we are only interested in disks... */
   2836 			if (device_class(dv) != DV_DISK)
   2837 				continue;
   2838 
   2839 			/* we don't care about floppies... */
   2840 			if (device_is_a(dv, "fd")) {
   2841 				continue;
   2842 			}
   2843 
   2844 			/* we don't care about CD's... */
   2845 			if (device_is_a(dv, "cd")) {
   2846 				continue;
   2847 			}
   2848 
   2849 			/* we don't care about md's... */
   2850 			if (device_is_a(dv, "md")) {
   2851 				continue;
   2852 			}
   2853 
   2854 			/* hdfd is the Atari/Hades floppy driver */
   2855 			if (device_is_a(dv, "hdfd")) {
   2856 				continue;
   2857 			}
   2858 
   2859 			/* fdisa is the Atari/Milan floppy driver */
   2860 			if (device_is_a(dv, "fdisa")) {
   2861 				continue;
   2862 			}
   2863 
   2864 			/* are we in the wedges pass ? */
   2865 			wedge = device_is_a(dv, "dk");
   2866 			if (wedge != dowedges) {
   2867 				continue;
   2868 			}
   2869 
   2870 			/* need to find the device_name_to_block_device_major stuff */
   2871 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2872 
   2873 			rf_part_found = 0; /*No raid partition as yet*/
   2874 
   2875 			/* get a vnode for the raw partition of this disk */
   2876 			bminor = minor(device_unit(dv));
   2877 			dev = wedge ? makedev(bmajor, bminor) :
   2878 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2879 			if (bdevvp(dev, &vp))
   2880 				panic("RAID can't alloc vnode");
   2881 
   2882 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2883 
   2884 			if (error) {
   2885 				/* "Who cares."  Continue looking
   2886 				   for something that exists*/
   2887 				vput(vp);
   2888 				continue;
   2889 			}
   2890 
   2891 			error = getdisksize(vp, &numsecs, &secsize);
   2892 			if (error) {
   2893 				/*
   2894 				 * Pseudo devices like vnd and cgd can be
   2895 				 * opened but may still need some configuration.
   2896 				 * Ignore these quietly.
   2897 				 */
   2898 				if (error != ENXIO)
   2899 					printf("RAIDframe: can't get disk size"
   2900 					    " for dev %s (%d)\n",
   2901 					    device_xname(dv), error);
   2902 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2903 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2904 				vput(vp);
   2905 				continue;
   2906 			}
   2907 			if (wedge) {
   2908 				struct dkwedge_info dkw;
   2909 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2910 				    NOCRED);
   2911 				if (error) {
   2912 					printf("RAIDframe: can't get wedge info for "
   2913 					    "dev %s (%d)\n", device_xname(dv), error);
   2914 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2915 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2916 					vput(vp);
   2917 					continue;
   2918 				}
   2919 
   2920 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2921 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2922 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2923 					vput(vp);
   2924 					continue;
   2925 				}
   2926 
   2927 				ac_list = rf_get_component(ac_list, dev, vp,
   2928 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2929 				rf_part_found = 1; /*There is a raid component on this disk*/
   2930 				continue;
   2931 			}
   2932 
   2933 			/* Ok, the disk exists.  Go get the disklabel. */
   2934 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2935 			if (error) {
   2936 				/*
   2937 				 * XXX can't happen - open() would
   2938 				 * have errored out (or faked up one)
   2939 				 */
   2940 				if (error != ENOTTY)
   2941 					printf("RAIDframe: can't get label for dev "
   2942 					    "%s (%d)\n", device_xname(dv), error);
   2943 			}
   2944 
   2945 			/* don't need this any more.  We'll allocate it again
   2946 			   a little later if we really do... */
   2947 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2948 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2949 			vput(vp);
   2950 
   2951 			if (error)
   2952 				continue;
   2953 
   2954 			rf_part_found = 0; /*No raid partitions yet*/
   2955 			for (i = 0; i < label.d_npartitions; i++) {
   2956 				char cname[sizeof(ac_list->devname)];
   2957 
   2958 				/* We only support partitions marked as RAID */
   2959 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2960 					continue;
   2961 
   2962 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2963 				if (bdevvp(dev, &vp))
   2964 					panic("RAID can't alloc vnode");
   2965 
   2966 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2967 				if (error) {
   2968 					/* Whatever... */
   2969 					vput(vp);
   2970 					continue;
   2971 				}
   2972 				snprintf(cname, sizeof(cname), "%s%c",
   2973 				    device_xname(dv), 'a' + i);
   2974 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2975 					label.d_partitions[i].p_size, numsecs, secsize);
   2976 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2977 			}
   2978 
   2979 			/*
   2980 			 *If there is no raid component on this disk, either in a
   2981 			 *disklabel or inside a wedge, check the raw partition as well,
   2982 			 *as it is possible to configure raid components on raw disk
   2983 			 *devices.
   2984 			 */
   2985 
   2986 			if (!rf_part_found) {
   2987 				char cname[sizeof(ac_list->devname)];
   2988 
   2989 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2990 				if (bdevvp(dev, &vp))
   2991 					panic("RAID can't alloc vnode");
   2992 
   2993 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2994 				if (error) {
   2995 					/* Whatever... */
   2996 					vput(vp);
   2997 					continue;
   2998 				}
   2999 				snprintf(cname, sizeof(cname), "%s%c",
   3000 				    device_xname(dv), 'a' + RAW_PART);
   3001 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   3002 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3003 			}
   3004 		}
   3005 		deviter_release(&di);
   3006 	}
   3007 	return ac_list;
   3008 }
   3009 
   3010 
   3011 int
   3012 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3013 {
   3014 
   3015 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3016 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3017 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3018 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3019 	    clabel->row >=0 &&
   3020 	    clabel->column >= 0 &&
   3021 	    clabel->num_rows > 0 &&
   3022 	    clabel->num_columns > 0 &&
   3023 	    clabel->row < clabel->num_rows &&
   3024 	    clabel->column < clabel->num_columns &&
   3025 	    clabel->blockSize > 0 &&
   3026 	    /*
   3027 	     * numBlocksHi may contain garbage, but it is ok since
   3028 	     * the type is unsigned.  If it is really garbage,
   3029 	     * rf_fix_old_label_size() will fix it.
   3030 	     */
   3031 	    rf_component_label_numblocks(clabel) > 0) {
   3032 		/*
   3033 		 * label looks reasonable enough...
   3034 		 * let's make sure it has no old garbage.
   3035 		 */
   3036 		if (numsecs)
   3037 			rf_fix_old_label_size(clabel, numsecs);
   3038 		return(1);
   3039 	}
   3040 	return(0);
   3041 }
   3042 
   3043 
   3044 /*
   3045  * For reasons yet unknown, some old component labels have garbage in
   3046  * the newer numBlocksHi region, and this causes lossage.  Since those
   3047  * disks will also have numsecs set to less than 32 bits of sectors,
   3048  * we can determine when this corruption has occurred, and fix it.
   3049  *
   3050  * The exact same problem, with the same unknown reason, happens to
   3051  * the partitionSizeHi member as well.
   3052  */
   3053 static void
   3054 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3055 {
   3056 
   3057 	if (numsecs < ((uint64_t)1 << 32)) {
   3058 		if (clabel->numBlocksHi) {
   3059 			printf("WARNING: total sectors < 32 bits, yet "
   3060 			       "numBlocksHi set\n"
   3061 			       "WARNING: resetting numBlocksHi to zero.\n");
   3062 			clabel->numBlocksHi = 0;
   3063 		}
   3064 
   3065 		if (clabel->partitionSizeHi) {
   3066 			printf("WARNING: total sectors < 32 bits, yet "
   3067 			       "partitionSizeHi set\n"
   3068 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3069 			clabel->partitionSizeHi = 0;
   3070 		}
   3071 	}
   3072 }
   3073 
   3074 
#ifdef DEBUG
/*
 * Print the contents of a component label (DEBUG kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* indexed by the low two bits of root_partition */
	static const char *root_mode[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	const char *is_clean = clabel->clean ? "Yes" : "No";
	const char *is_auto = clabel->autoconfigure ? "Yes" : "No";
	uint64_t nblocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number, clabel->mod_counter);
	printf("   Clean: %s Status: %d\n", is_clean, clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, nblocks);
	printf("   Autoconfig: %s\n", is_auto);
	printf("   Root partition: %s\n",
	    root_mode[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
   3108 
   3109 RF_ConfigSet_t *
   3110 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3111 {
   3112 	RF_AutoConfig_t *ac;
   3113 	RF_ConfigSet_t *config_sets;
   3114 	RF_ConfigSet_t *cset;
   3115 	RF_AutoConfig_t *ac_next;
   3116 
   3117 
   3118 	config_sets = NULL;
   3119 
   3120 	/* Go through the AutoConfig list, and figure out which components
   3121 	   belong to what sets.  */
   3122 	ac = ac_list;
   3123 	while(ac!=NULL) {
   3124 		/* we're going to putz with ac->next, so save it here
   3125 		   for use at the end of the loop */
   3126 		ac_next = ac->next;
   3127 
   3128 		if (config_sets == NULL) {
   3129 			/* will need at least this one... */
   3130 			config_sets = (RF_ConfigSet_t *)
   3131 				malloc(sizeof(RF_ConfigSet_t),
   3132 				       M_RAIDFRAME, M_NOWAIT);
   3133 			if (config_sets == NULL) {
   3134 				panic("rf_create_auto_sets: No memory!");
   3135 			}
   3136 			/* this one is easy :) */
   3137 			config_sets->ac = ac;
   3138 			config_sets->next = NULL;
   3139 			config_sets->rootable = 0;
   3140 			ac->next = NULL;
   3141 		} else {
   3142 			/* which set does this component fit into? */
   3143 			cset = config_sets;
   3144 			while(cset!=NULL) {
   3145 				if (rf_does_it_fit(cset, ac)) {
   3146 					/* looks like it matches... */
   3147 					ac->next = cset->ac;
   3148 					cset->ac = ac;
   3149 					break;
   3150 				}
   3151 				cset = cset->next;
   3152 			}
   3153 			if (cset==NULL) {
   3154 				/* didn't find a match above... new set..*/
   3155 				cset = (RF_ConfigSet_t *)
   3156 					malloc(sizeof(RF_ConfigSet_t),
   3157 					       M_RAIDFRAME, M_NOWAIT);
   3158 				if (cset == NULL) {
   3159 					panic("rf_create_auto_sets: No memory!");
   3160 				}
   3161 				cset->ac = ac;
   3162 				ac->next = NULL;
   3163 				cset->next = config_sets;
   3164 				cset->rootable = 0;
   3165 				config_sets = cset;
   3166 			}
   3167 		}
   3168 		ac = ac_next;
   3169 	}
   3170 
   3171 
   3172 	return(config_sets);
   3173 }
   3174 
   3175 static int
   3176 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3177 {
   3178 	RF_ComponentLabel_t *clabel1, *clabel2;
   3179 
   3180 	/* If this one matches the *first* one in the set, that's good
   3181 	   enough, since the other members of the set would have been
   3182 	   through here too... */
   3183 	/* note that we are not checking partitionSize here..
   3184 
   3185 	   Note that we are also not checking the mod_counters here.
   3186 	   If everything else matches except the mod_counter, that's
   3187 	   good enough for this test.  We will deal with the mod_counters
   3188 	   a little later in the autoconfiguration process.
   3189 
   3190 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3191 
   3192 	   The reason we don't check for this is that failed disks
   3193 	   will have lower modification counts.  If those disks are
   3194 	   not added to the set they used to belong to, then they will
   3195 	   form their own set, which may result in 2 different sets,
   3196 	   for example, competing to be configured at raid0, and
   3197 	   perhaps competing to be the root filesystem set.  If the
   3198 	   wrong ones get configured, or both attempt to become /,
   3199 	   weird behaviour and or serious lossage will occur.  Thus we
   3200 	   need to bring them into the fold here, and kick them out at
   3201 	   a later point.
   3202 
   3203 	*/
   3204 
   3205 	clabel1 = cset->ac->clabel;
   3206 	clabel2 = ac->clabel;
   3207 	if ((clabel1->version == clabel2->version) &&
   3208 	    (clabel1->serial_number == clabel2->serial_number) &&
   3209 	    (clabel1->num_rows == clabel2->num_rows) &&
   3210 	    (clabel1->num_columns == clabel2->num_columns) &&
   3211 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3212 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3213 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3214 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3215 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3216 	    (clabel1->blockSize == clabel2->blockSize) &&
   3217 	    rf_component_label_numblocks(clabel1) ==
   3218 	    rf_component_label_numblocks(clabel2) &&
   3219 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3220 	    (clabel1->root_partition == clabel2->root_partition) &&
   3221 	    (clabel1->last_unit == clabel2->last_unit) &&
   3222 	    (clabel1->config_order == clabel2->config_order)) {
   3223 		/* if it get's here, it almost *has* to be a match */
   3224 	} else {
   3225 		/* it's not consistent with somebody in the set..
   3226 		   punt */
   3227 		return(0);
   3228 	}
   3229 	/* all was fine.. it must fit... */
   3230 	return(1);
   3231 }
   3232 
   3233 int
   3234 rf_have_enough_components(RF_ConfigSet_t *cset)
   3235 {
   3236 	RF_AutoConfig_t *ac;
   3237 	RF_AutoConfig_t *auto_config;
   3238 	RF_ComponentLabel_t *clabel;
   3239 	int c;
   3240 	int num_cols;
   3241 	int num_missing;
   3242 	int mod_counter;
   3243 	int mod_counter_found;
   3244 	int even_pair_failed;
   3245 	char parity_type;
   3246 
   3247 
   3248 	/* check to see that we have enough 'live' components
   3249 	   of this set.  If so, we can configure it if necessary */
   3250 
   3251 	num_cols = cset->ac->clabel->num_columns;
   3252 	parity_type = cset->ac->clabel->parityConfig;
   3253 
   3254 	/* XXX Check for duplicate components!?!?!? */
   3255 
   3256 	/* Determine what the mod_counter is supposed to be for this set. */
   3257 
   3258 	mod_counter_found = 0;
   3259 	mod_counter = 0;
   3260 	ac = cset->ac;
   3261 	while(ac!=NULL) {
   3262 		if (mod_counter_found==0) {
   3263 			mod_counter = ac->clabel->mod_counter;
   3264 			mod_counter_found = 1;
   3265 		} else {
   3266 			if (ac->clabel->mod_counter > mod_counter) {
   3267 				mod_counter = ac->clabel->mod_counter;
   3268 			}
   3269 		}
   3270 		ac = ac->next;
   3271 	}
   3272 
   3273 	num_missing = 0;
   3274 	auto_config = cset->ac;
   3275 
   3276 	even_pair_failed = 0;
   3277 	for(c=0; c<num_cols; c++) {
   3278 		ac = auto_config;
   3279 		while(ac!=NULL) {
   3280 			if ((ac->clabel->column == c) &&
   3281 			    (ac->clabel->mod_counter == mod_counter)) {
   3282 				/* it's this one... */
   3283 #ifdef DEBUG
   3284 				printf("Found: %s at %d\n",
   3285 				       ac->devname,c);
   3286 #endif
   3287 				break;
   3288 			}
   3289 			ac=ac->next;
   3290 		}
   3291 		if (ac==NULL) {
   3292 				/* Didn't find one here! */
   3293 				/* special case for RAID 1, especially
   3294 				   where there are more than 2
   3295 				   components (where RAIDframe treats
   3296 				   things a little differently :( ) */
   3297 			if (parity_type == '1') {
   3298 				if (c%2 == 0) { /* even component */
   3299 					even_pair_failed = 1;
   3300 				} else { /* odd component.  If
   3301 					    we're failed, and
   3302 					    so is the even
   3303 					    component, it's
   3304 					    "Good Night, Charlie" */
   3305 					if (even_pair_failed == 1) {
   3306 						return(0);
   3307 					}
   3308 				}
   3309 			} else {
   3310 				/* normal accounting */
   3311 				num_missing++;
   3312 			}
   3313 		}
   3314 		if ((parity_type == '1') && (c%2 == 1)) {
   3315 				/* Just did an even component, and we didn't
   3316 				   bail.. reset the even_pair_failed flag,
   3317 				   and go on to the next component.... */
   3318 			even_pair_failed = 0;
   3319 		}
   3320 	}
   3321 
   3322 	clabel = cset->ac->clabel;
   3323 
   3324 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3325 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3326 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3327 		/* XXX this needs to be made *much* more general */
   3328 		/* Too many failures */
   3329 		return(0);
   3330 	}
   3331 	/* otherwise, all is well, and we've got enough to take a kick
   3332 	   at autoconfiguring this set */
   3333 	return(1);
   3334 }
   3335 
   3336 void
   3337 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3338 			RF_Raid_t *raidPtr)
   3339 {
   3340 	RF_ComponentLabel_t *clabel;
   3341 	int i;
   3342 
   3343 	clabel = ac->clabel;
   3344 
   3345 	/* 1. Fill in the common stuff */
   3346 	config->numRow = clabel->num_rows = 1;
   3347 	config->numCol = clabel->num_columns;
   3348 	config->numSpare = 0; /* XXX should this be set here? */
   3349 	config->sectPerSU = clabel->sectPerSU;
   3350 	config->SUsPerPU = clabel->SUsPerPU;
   3351 	config->SUsPerRU = clabel->SUsPerRU;
   3352 	config->parityConfig = clabel->parityConfig;
   3353 	/* XXX... */
   3354 	strcpy(config->diskQueueType,"fifo");
   3355 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3356 	config->layoutSpecificSize = 0; /* XXX ?? */
   3357 
   3358 	while(ac!=NULL) {
   3359 		/* row/col values will be in range due to the checks
   3360 		   in reasonable_label() */
   3361 		strcpy(config->devnames[0][ac->clabel->column],
   3362 		       ac->devname);
   3363 		ac = ac->next;
   3364 	}
   3365 
   3366 	for(i=0;i<RF_MAXDBGV;i++) {
   3367 		config->debugVars[i][0] = 0;
   3368 	}
   3369 }
   3370 
   3371 int
   3372 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3373 {
   3374 	RF_ComponentLabel_t *clabel;
   3375 	int column;
   3376 	int sparecol;
   3377 
   3378 	raidPtr->autoconfigure = new_value;
   3379 
   3380 	for(column=0; column<raidPtr->numCol; column++) {
   3381 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3382 			clabel = raidget_component_label(raidPtr, column);
   3383 			clabel->autoconfigure = new_value;
   3384 			raidflush_component_label(raidPtr, column);
   3385 		}
   3386 	}
   3387 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3388 		sparecol = raidPtr->numCol + column;
   3389 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3390 			clabel = raidget_component_label(raidPtr, sparecol);
   3391 			clabel->autoconfigure = new_value;
   3392 			raidflush_component_label(raidPtr, sparecol);
   3393 		}
   3394 	}
   3395 	return(new_value);
   3396 }
   3397 
   3398 int
   3399 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3400 {
   3401 	RF_ComponentLabel_t *clabel;
   3402 	int column;
   3403 	int sparecol;
   3404 
   3405 	raidPtr->root_partition = new_value;
   3406 	for(column=0; column<raidPtr->numCol; column++) {
   3407 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3408 			clabel = raidget_component_label(raidPtr, column);
   3409 			clabel->root_partition = new_value;
   3410 			raidflush_component_label(raidPtr, column);
   3411 		}
   3412 	}
   3413 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3414 		sparecol = raidPtr->numCol + column;
   3415 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3416 			clabel = raidget_component_label(raidPtr, sparecol);
   3417 			clabel->root_partition = new_value;
   3418 			raidflush_component_label(raidPtr, sparecol);
   3419 		}
   3420 	}
   3421 	return(new_value);
   3422 }
   3423 
   3424 void
   3425 rf_release_all_vps(RF_ConfigSet_t *cset)
   3426 {
   3427 	RF_AutoConfig_t *ac;
   3428 
   3429 	ac = cset->ac;
   3430 	while(ac!=NULL) {
   3431 		/* Close the vp, and give it back */
   3432 		if (ac->vp) {
   3433 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3434 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3435 			vput(ac->vp);
   3436 			ac->vp = NULL;
   3437 		}
   3438 		ac = ac->next;
   3439 	}
   3440 }
   3441 
   3442 
   3443 void
   3444 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3445 {
   3446 	RF_AutoConfig_t *ac;
   3447 	RF_AutoConfig_t *next_ac;
   3448 
   3449 	ac = cset->ac;
   3450 	while(ac!=NULL) {
   3451 		next_ac = ac->next;
   3452 		/* nuke the label */
   3453 		free(ac->clabel, M_RAIDFRAME);
   3454 		/* cleanup the config structure */
   3455 		free(ac, M_RAIDFRAME);
   3456 		/* "next.." */
   3457 		ac = next_ac;
   3458 	}
   3459 	/* and, finally, nuke the config set */
   3460 	free(cset, M_RAIDFRAME);
   3461 }
   3462 
   3463 
   3464 void
   3465 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3466 {
   3467 	/* current version number */
   3468 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3469 	clabel->serial_number = raidPtr->serial_number;
   3470 	clabel->mod_counter = raidPtr->mod_counter;
   3471 
   3472 	clabel->num_rows = 1;
   3473 	clabel->num_columns = raidPtr->numCol;
   3474 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3475 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3476 
   3477 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3478 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3479 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3480 
   3481 	clabel->blockSize = raidPtr->bytesPerSector;
   3482 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3483 
   3484 	/* XXX not portable */
   3485 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3486 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3487 	clabel->autoconfigure = raidPtr->autoconfigure;
   3488 	clabel->root_partition = raidPtr->root_partition;
   3489 	clabel->last_unit = raidPtr->raidid;
   3490 	clabel->config_order = raidPtr->config_order;
   3491 
   3492 #ifndef RF_NO_PARITY_MAP
   3493 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3494 #endif
   3495 }
   3496 
   3497 struct raid_softc *
   3498 rf_auto_config_set(RF_ConfigSet_t *cset)
   3499 {
   3500 	RF_Raid_t *raidPtr;
   3501 	RF_Config_t *config;
   3502 	int raidID;
   3503 	struct raid_softc *sc;
   3504 
   3505 #ifdef DEBUG
   3506 	printf("RAID autoconfigure\n");
   3507 #endif
   3508 
   3509 	/* 1. Create a config structure */
   3510 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3511 	if (config == NULL) {
   3512 		printf("%s: Out of mem - config!?!?\n", __func__);
   3513 				/* XXX do something more intelligent here. */
   3514 		return NULL;
   3515 	}
   3516 
   3517 	/*
   3518 	   2. Figure out what RAID ID this one is supposed to live at
   3519 	   See if we can get the same RAID dev that it was configured
   3520 	   on last time..
   3521 	*/
   3522 
   3523 	raidID = cset->ac->clabel->last_unit;
   3524 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
   3525 	     sc = raidget(++raidID, false))
   3526 		continue;
   3527 #ifdef DEBUG
   3528 	printf("Configuring raid%d:\n",raidID);
   3529 #endif
   3530 
   3531 	if (sc == NULL)
   3532 		sc = raidget(raidID, true);
   3533 	if (sc == NULL) {
   3534 		printf("%s: Out of mem - softc!?!?\n", __func__);
   3535 				/* XXX do something more intelligent here. */
   3536 		free(config, M_RAIDFRAME);
   3537 		return NULL;
   3538 	}
   3539 
   3540 	raidPtr = &sc->sc_r;
   3541 
   3542 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3543 	raidPtr->softc = sc;
   3544 	raidPtr->raidid = raidID;
   3545 	raidPtr->openings = RAIDOUTSTANDING;
   3546 
   3547 	/* 3. Build the configuration structure */
   3548 	rf_create_configuration(cset->ac, config, raidPtr);
   3549 
   3550 	/* 4. Do the configuration */
   3551 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3552 		raidinit(sc);
   3553 
   3554 		rf_markalldirty(raidPtr);
   3555 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3556 		switch (cset->ac->clabel->root_partition) {
   3557 		case 1:	/* Force Root */
   3558 		case 2:	/* Soft Root: root when boot partition part of raid */
   3559 			/*
   3560 			 * everything configured just fine.  Make a note
   3561 			 * that this set is eligible to be root,
   3562 			 * or forced to be root
   3563 			 */
   3564 			cset->rootable = cset->ac->clabel->root_partition;
   3565 			/* XXX do this here? */
   3566 			raidPtr->root_partition = cset->rootable;
   3567 			break;
   3568 		default:
   3569 			break;
   3570 		}
   3571 	} else {
   3572 		raidput(sc);
   3573 		sc = NULL;
   3574 	}
   3575 
   3576 	/* 5. Cleanup */
   3577 	free(config, M_RAIDFRAME);
   3578 	return sc;
   3579 }
   3580 
   3581 void
   3582 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3583 	     size_t xmin, size_t xmax)
   3584 {
   3585 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3586 	pool_sethiwat(p, xmax);
   3587 	pool_prime(p, xmin);
   3588 	pool_setlowat(p, xmin);
   3589 }
   3590 
   3591 /*
   3592  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3593  * to see if there is IO pending and if that IO could possibly be done
   3594  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3595  * otherwise.
   3596  *
   3597  */
   3598 int
   3599 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3600 {
   3601 	struct raid_softc *rs;
   3602 	struct dk_softc *dksc;
   3603 
   3604 	rs = raidPtr->softc;
   3605 	dksc = &rs->sc_dksc;
   3606 
   3607 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3608 		return 1;
   3609 
   3610 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3611 		/* there is work to do */
   3612 		return 0;
   3613 	}
   3614 	/* default is nothing to do */
   3615 	return 1;
   3616 }
   3617 
   3618 int
   3619 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3620 {
   3621 	uint64_t numsecs;
   3622 	unsigned secsize;
   3623 	int error;
   3624 
   3625 	error = getdisksize(vp, &numsecs, &secsize);
   3626 	if (error == 0) {
   3627 		diskPtr->blockSize = secsize;
   3628 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3629 		diskPtr->partitionSize = numsecs;
   3630 		return 0;
   3631 	}
   3632 	return error;
   3633 }
   3634 
   3635 static int
   3636 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3637 {
   3638 	return 1;
   3639 }
   3640 
   3641 static void
   3642 raid_attach(device_t parent, device_t self, void *aux)
   3643 {
   3644 }
   3645 
   3646 
   3647 static int
   3648 raid_detach(device_t self, int flags)
   3649 {
   3650 	int error;
   3651 	struct raid_softc *rs = raidsoftc(self);
   3652 
   3653 	if (rs == NULL)
   3654 		return ENXIO;
   3655 
   3656 	if ((error = raidlock(rs)) != 0)
   3657 		return (error);
   3658 
   3659 	error = raid_detach_unlocked(rs);
   3660 
   3661 	raidunlock(rs);
   3662 
   3663 	/* XXX raid can be referenced here */
   3664 
   3665 	if (error)
   3666 		return error;
   3667 
   3668 	/* Free the softc */
   3669 	raidput(rs);
   3670 
   3671 	return 0;
   3672 }
   3673 
   3674 static void
   3675 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3676 {
   3677 	struct dk_softc *dksc = &rs->sc_dksc;
   3678 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3679 
   3680 	memset(dg, 0, sizeof(*dg));
   3681 
   3682 	dg->dg_secperunit = raidPtr->totalSectors;
   3683 	dg->dg_secsize = raidPtr->bytesPerSector;
   3684 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3685 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3686 
   3687 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3688 }
   3689 
   3690 /*
   3691  * Get cache info for all the components (including spares).
   3692  * Returns intersection of all the cache flags of all disks, or first
   3693  * error if any encountered.
   3694  * XXXfua feature flags can change as spares are added - lock down somehow
   3695  */
   3696 static int
   3697 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3698 {
   3699 	int c;
   3700 	int error;
   3701 	int dkwhole = 0, dkpart;
   3702 
   3703 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3704 		/*
   3705 		 * Check any non-dead disk, even when currently being
   3706 		 * reconstructed.
   3707 		 */
   3708 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3709 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3710 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3711 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3712 			if (error) {
   3713 				if (error != ENODEV) {
   3714 					printf("raid%d: get cache for component %s failed\n",
   3715 					    raidPtr->raidid,
   3716 					    raidPtr->Disks[c].devname);
   3717 				}
   3718 
   3719 				return error;
   3720 			}
   3721 
   3722 			if (c == 0)
   3723 				dkwhole = dkpart;
   3724 			else
   3725 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3726 		}
   3727 	}
   3728 
   3729 	*data = dkwhole;
   3730 
   3731 	return 0;
   3732 }
   3733 
   3734 /*
   3735  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3736  * We end up returning whatever error was returned by the first cache flush
   3737  * that fails.
   3738  */
   3739 
   3740 int
   3741 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3742 {
   3743 	int c, sparecol;
   3744 	int e,error;
   3745 	int force = 1;
   3746 
   3747 	error = 0;
   3748 	for (c = 0; c < raidPtr->numCol; c++) {
   3749 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3750 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3751 					  &force, FWRITE, NOCRED);
   3752 			if (e) {
   3753 				if (e != ENODEV)
   3754 					printf("raid%d: cache flush to component %s failed.\n",
   3755 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3756 				if (error == 0) {
   3757 					error = e;
   3758 				}
   3759 			}
   3760 		}
   3761 	}
   3762 
   3763 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3764 		sparecol = raidPtr->numCol + c;
   3765 		/* Need to ensure that the reconstruct actually completed! */
   3766 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3767 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3768 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3769 			if (e) {
   3770 				if (e != ENODEV)
   3771 					printf("raid%d: cache flush to component %s failed.\n",
   3772 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3773 				if (error == 0) {
   3774 					error = e;
   3775 				}
   3776 			}
   3777 		}
   3778 	}
   3779 	return error;
   3780 }
   3781 
   3782 /*
   3783  * Module interface
   3784  */
   3785 
/*
 * Declare the "raid" module in the driver class.  The string argument
 * presumably lists the modules this one requires - see module(9) to
 * confirm the MODULE() argument semantics.
 */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* The cfdriver is declared here only for builds with _MODULE defined. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module command dispatcher and its init/fini handlers, defined below. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3795 
   3796 static int
   3797 raid_modcmd(modcmd_t cmd, void *data)
   3798 {
   3799 	int error;
   3800 
   3801 	error = 0;
   3802 	switch (cmd) {
   3803 	case MODULE_CMD_INIT:
   3804 		error = raid_modcmd_init();
   3805 		break;
   3806 	case MODULE_CMD_FINI:
   3807 		error = raid_modcmd_fini();
   3808 		break;
   3809 	default:
   3810 		error = ENOTTY;
   3811 		break;
   3812 	}
   3813 	return error;
   3814 }
   3815 
   3816 static int
   3817 raid_modcmd_init(void)
   3818 {
   3819 	int error;
   3820 #ifdef _MODULE
   3821 	int bmajor, cmajor;
   3822 #endif
   3823 
   3824 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3825 	mutex_enter(&raid_lock);
   3826 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3827 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3828 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3829 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3830 
   3831 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3832 #endif
   3833 
   3834 #ifdef _MODULE
   3835 	bmajor = cmajor = -1;
   3836 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3837 	    &raid_cdevsw, &cmajor);
   3838 	if (error != 0) {
   3839 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3840 		mutex_exit(&raid_lock);
   3841 		return error;
   3842 	}
   3843 	error = config_cfdriver_attach(&raid_cd);
   3844 	if (error != 0) {
   3845 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3846 		    __func__, error);
   3847 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3848 		mutex_exit(&raid_lock);
   3849 		return error;
   3850 	}
   3851 #endif
   3852 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3853 	if (error != 0) {
   3854 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3855 		    __func__, error);
   3856 #ifdef _MODULE
   3857 		config_cfdriver_detach(&raid_cd);
   3858 #endif
   3859 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3860 		mutex_exit(&raid_lock);
   3861 		return error;
   3862 	}
   3863 
   3864 	raidautoconfigdone = false;
   3865 
   3866 	mutex_exit(&raid_lock);
   3867 
   3868 	if (error == 0) {
   3869 		if (rf_BootRaidframe(true) == 0)
   3870 			aprint_verbose("Kernelized RAIDframe activated\n");
   3871 		else
   3872 			panic("Serious error activating RAID!!");
   3873 	}
   3874 
   3875 	/*
   3876 	 * Register a finalizer which will be used to auto-config RAID
   3877 	 * sets once all real hardware devices have been found.
   3878 	 */
   3879 	error = config_finalize_register(NULL, rf_autoconfig);
   3880 	if (error != 0) {
   3881 		aprint_error("WARNING: unable to register RAIDframe "
   3882 		    "finalizer\n");
   3883 		error = 0;
   3884 	}
   3885 
   3886 	return error;
   3887 }
   3888 
   3889 static int
   3890 raid_modcmd_fini(void)
   3891 {
   3892 	int error;
   3893 
   3894 	mutex_enter(&raid_lock);
   3895 
   3896 	/* Don't allow unload if raid device(s) exist.  */
   3897 	if (!LIST_EMPTY(&raids)) {
   3898 		mutex_exit(&raid_lock);
   3899 		return EBUSY;
   3900 	}
   3901 
   3902 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
   3903 	if (error != 0) {
   3904 		aprint_error("%s: cannot detach cfattach\n",__func__);
   3905 		mutex_exit(&raid_lock);
   3906 		return error;
   3907 	}
   3908 #ifdef _MODULE
   3909 	error = config_cfdriver_detach(&raid_cd);
   3910 	if (error != 0) {
   3911 		aprint_error("%s: cannot detach cfdriver\n",__func__);
   3912 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3913 		mutex_exit(&raid_lock);
   3914 		return error;
   3915 	}
   3916 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3917 	if (error != 0) {
   3918 		aprint_error("%s: cannot detach devsw\n",__func__);
   3919 		config_cfdriver_attach(&raid_cd);
   3920 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3921 		mutex_exit(&raid_lock);
   3922 		return error;
   3923 	}
   3924 #endif
   3925 	rf_BootRaidframe(false);
   3926 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3927 	rf_destroy_mutex2(rf_sparet_wait_mutex);
   3928 	rf_destroy_cond2(rf_sparet_wait_cv);
   3929 	rf_destroy_cond2(rf_sparet_resp_cv);
   3930 #endif
   3931 	mutex_exit(&raid_lock);
   3932 	mutex_destroy(&raid_lock);
   3933 
   3934 	return error;
   3935 }
   3936