Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.368
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.368 2019/02/06 02:49:09 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.368 2019/02/06 02:49:09 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    179 
    180 /* prototypes */
    181 static void KernelWakeupFunc(struct buf *);
    182 static void InitBP(struct buf *, struct vnode *, unsigned,
    183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    184     void *, int, struct proc *);
    185 static void raidinit(struct raid_softc *);
    186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    188 
    189 static int raid_match(device_t, cfdata_t, void *);
    190 static void raid_attach(device_t, device_t, void *);
    191 static int raid_detach(device_t, int);
    192 
    193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    194     daddr_t, daddr_t);
    195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t, int);
    197 
    198 static int raidwrite_component_label(unsigned,
    199     dev_t, struct vnode *, RF_ComponentLabel_t *);
    200 static int raidread_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 
    203 static int raid_diskstart(device_t, struct buf *bp);
    204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    205 static int raid_lastclose(device_t);
    206 
    207 static dev_type_open(raidopen);
    208 static dev_type_close(raidclose);
    209 static dev_type_read(raidread);
    210 static dev_type_write(raidwrite);
    211 static dev_type_ioctl(raidioctl);
    212 static dev_type_strategy(raidstrategy);
    213 static dev_type_dump(raiddump);
    214 static dev_type_size(raidsize);
    215 
/*
 * Block-device switch for raid(4).  Reads and writes go through
 * raidstrategy(); crash dumps go through raiddump().  D_DISK marks
 * the device as a disk for the devsw framework.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    226 
/*
 * Character-device (raw) switch for raid(4).  Raw reads/writes are
 * handled by raidread()/raidwrite(); operations that make no sense
 * for a disk (tty, poll, mmap, ...) use the standard no-op stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    241 
/*
 * dkdriver glue: callbacks the generic dk(9) disk framework uses to
 * drive this device (start I/O, dump blocks, last-close notification).
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    251 
    252 #define	raidunit(x)	DISKUNIT(x)
    253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column being reconstructed/failed */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* RF_Raid_t * this request targets */
};
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
    276  * Even it if is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
    284 #ifndef RAIDOUTSTANDING
    285 #define RAIDOUTSTANDING   6
    286 #endif
    287 
    288 #define RAIDLABELDEV(dev)	\
    289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req_internal *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	sc->sc_unit = unit;
    342 	cv_init(&sc->sc_cv, "raidunit");
    343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    344 	return sc;
    345 }
    346 
    347 static void
    348 raiddestroy(struct raid_softc *sc) {
    349 	cv_destroy(&sc->sc_cv);
    350 	mutex_destroy(&sc->sc_mutex);
    351 	kmem_free(sc, sizeof(*sc));
    352 }
    353 
    354 static struct raid_softc *
    355 raidget(int unit, bool create) {
    356 	struct raid_softc *sc;
    357 	if (unit < 0) {
    358 #ifdef DIAGNOSTIC
    359 		panic("%s: unit %d!", __func__, unit);
    360 #endif
    361 		return NULL;
    362 	}
    363 	mutex_enter(&raid_lock);
    364 	LIST_FOREACH(sc, &raids, sc_link) {
    365 		if (sc->sc_unit == unit) {
    366 			mutex_exit(&raid_lock);
    367 			return sc;
    368 		}
    369 	}
    370 	mutex_exit(&raid_lock);
    371 	if (!create)
    372 		return NULL;
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
/*
 * Unlink a softc from the global list of RAID units and destroy it.
 * Caller must ensure no other references to sc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    388 
/*
 * Legacy pseudo-device attach entry point; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
    399 int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return (0);
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
/*
 * Accessor: return the RF_Raid_t embedded in the given softc.
 */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    446 
/*
 * Accessor: return the unit number of the given softc.
 */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname;
    455 	size_t len;
    456 
    457 	/* if bdv is NULL, the set can't contain it. exit early. */
    458 	if (bdv == NULL)
    459 		return 0;
    460 
    461 	bootname = device_xname(bdv);
    462 	len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
    483 void
    484 rf_buildroothack(RF_ConfigSet_t *config_sets)
    485 {
    486 	RF_ConfigSet_t *cset;
    487 	RF_ConfigSet_t *next_cset;
    488 	int num_root;
    489 	struct raid_softc *sc, *rsc;
    490 	struct dk_softc *dksc;
    491 
    492 	sc = rsc = NULL;
    493 	num_root = 0;
    494 	cset = config_sets;
    495 	while (cset != NULL) {
    496 		next_cset = cset->next;
    497 		if (rf_have_enough_components(cset) &&
    498 		    cset->ac->clabel->autoconfigure == 1) {
    499 			sc = rf_auto_config_set(cset);
    500 			if (sc != NULL) {
    501 				aprint_debug("raid%d: configured ok, rootable %d\n",
    502 				    sc->sc_unit, cset->rootable);
    503 				if (cset->rootable) {
    504 					rsc = sc;
    505 					num_root++;
    506 				}
    507 			} else {
    508 				/* The autoconfig didn't work :( */
    509 				aprint_debug("Autoconfig failed\n");
    510 				rf_release_all_vps(cset);
    511 			}
    512 		} else {
    513 			/* we're not autoconfiguring this set...
    514 			   release the associated resources */
    515 			rf_release_all_vps(cset);
    516 		}
    517 		/* cleanup */
    518 		rf_cleanup_config_set(cset);
    519 		cset = next_cset;
    520 	}
    521 	dksc = &rsc->sc_dksc;
    522 
    523 	/* if the user has specified what the root device should be
    524 	   then we don't touch booted_device or boothowto... */
    525 
    526 	if (rootspec != NULL) {
    527 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    528 		return;
    529 	}
    530 
    531 	/* we found something bootable... */
    532 
    533 	/*
    534 	 * XXX: The following code assumes that the root raid
    535 	 * is the first ('a') partition. This is about the best
    536 	 * we can do with a BSD disklabel, but we might be able
    537 	 * to do better with a GPT label, by setting a specified
    538 	 * attribute to indicate the root partition. We can then
    539 	 * stash the partition number in the r->root_partition
    540 	 * high bits (the bottom 2 bits are already used). For
    541 	 * now we just set booted_partition to 0 when we override
    542 	 * root.
    543 	 */
    544 	if (num_root == 1) {
    545 		device_t candidate_root;
    546 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    547 			char cname[sizeof(cset->ac->devname)];
    548 			/* XXX: assume partition 'a' first */
    549 			snprintf(cname, sizeof(cname), "%s%c",
    550 			    device_xname(dksc->sc_dev), 'a');
    551 			candidate_root = dkwedge_find_by_wname(cname);
    552 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    553 			    cname);
    554 			if (candidate_root == NULL) {
    555 				/*
    556 				 * If that is not found, because we don't use
    557 				 * disklabel, return the first dk child
    558 				 * XXX: we can skip the 'a' check above
    559 				 * and always do this...
    560 				 */
    561 				size_t i = 0;
    562 				candidate_root = dkwedge_find_by_parent(
    563 				    device_xname(dksc->sc_dev), &i);
    564 			}
    565 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    566 			    candidate_root);
    567 		} else
    568 			candidate_root = dksc->sc_dev;
    569 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    570 		DPRINTF("%s: booted_device=%p root_partition=%d "
    571 			"contains_boot=%d",
    572 		    __func__, booted_device, rsc->sc_r.root_partition,
    573 			   rf_containsboot(&rsc->sc_r, booted_device));
    574 		/* XXX the check for booted_device == NULL can probably be
    575 		 * dropped, now that rf_containsboot handles that case.
    576 		 */
    577 		if (booted_device == NULL ||
    578 		    rsc->sc_r.root_partition == 1 ||
    579 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    580 			booted_device = candidate_root;
    581 			booted_method = "raidframe/single";
    582 			booted_partition = 0;	/* XXX assume 'a' */
    583 		}
    584 	} else if (num_root > 1) {
    585 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    586 		    booted_device);
    587 
    588 		/*
    589 		 * Maybe the MD code can help. If it cannot, then
    590 		 * setroot() will discover that we have no
    591 		 * booted_device and will ask the user if nothing was
    592 		 * hardwired in the kernel config file
    593 		 */
    594 		if (booted_device == NULL)
    595 			return;
    596 
    597 		num_root = 0;
    598 		mutex_enter(&raid_lock);
    599 		LIST_FOREACH(sc, &raids, sc_link) {
    600 			RF_Raid_t *r = &sc->sc_r;
    601 			if (r->valid == 0)
    602 				continue;
    603 
    604 			if (r->root_partition == 0)
    605 				continue;
    606 
    607 			if (rf_containsboot(r, booted_device)) {
    608 				num_root++;
    609 				rsc = sc;
    610 				dksc = &rsc->sc_dksc;
    611 			}
    612 		}
    613 		mutex_exit(&raid_lock);
    614 
    615 		if (num_root == 1) {
    616 			booted_device = dksc->sc_dev;
    617 			booted_method = "raidframe/multi";
    618 			booted_partition = 0;	/* XXX assume 'a' */
    619 		} else {
    620 			/* we can't guess.. require the user to answer... */
    621 			boothowto |= RB_ASKNAME;
    622 		}
    623 	}
    624 }
    625 
    626 static int
    627 raidsize(dev_t dev)
    628 {
    629 	struct raid_softc *rs;
    630 	struct dk_softc *dksc;
    631 	unsigned int unit;
    632 
    633 	unit = raidunit(dev);
    634 	if ((rs = raidget(unit, false)) == NULL)
    635 		return -1;
    636 	dksc = &rs->sc_dksc;
    637 
    638 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    639 		return -1;
    640 
    641 	return dk_size(dksc, dev);
    642 }
    643 
    644 static int
    645 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    646 {
    647 	unsigned int unit;
    648 	struct raid_softc *rs;
    649 	struct dk_softc *dksc;
    650 
    651 	unit = raidunit(dev);
    652 	if ((rs = raidget(unit, false)) == NULL)
    653 		return ENXIO;
    654 	dksc = &rs->sc_dksc;
    655 
    656 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    657 		return ENODEV;
    658 
    659         /*
    660            Note that blkno is relative to this particular partition.
    661            By adding adding RF_PROTECTED_SECTORS, we get a value that
    662 	   is relative to the partition used for the underlying component.
    663         */
    664 	blkno += RF_PROTECTED_SECTORS;
    665 
    666 	return dk_dump(dksc, dev, blkno, va, size);
    667 }
    668 
/*
 * dkdriver d_dumpblocks callback: write nblk blocks from va at blkno
 * directly to one live component of this set.  Only RAID 1 sets
 * (one data + one parity column) are supported; the component chosen
 * follows the preference order documented below.  Returns 0 on
 * success or an errno (EINVAL if no usable component, ENXIO if the
 * chosen component's bdevsw cannot be found).
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight through the chosen component's block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    774 
    775 /* ARGSUSED */
    776 static int
    777 raidopen(dev_t dev, int flags, int fmt,
    778     struct lwp *l)
    779 {
    780 	int     unit = raidunit(dev);
    781 	struct raid_softc *rs;
    782 	struct dk_softc *dksc;
    783 	int     error = 0;
    784 	int     part, pmask;
    785 
    786 	if ((rs = raidget(unit, true)) == NULL)
    787 		return ENXIO;
    788 	if ((error = raidlock(rs)) != 0)
    789 		return (error);
    790 
    791 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    792 		error = EBUSY;
    793 		goto bad;
    794 	}
    795 
    796 	dksc = &rs->sc_dksc;
    797 
    798 	part = DISKPART(dev);
    799 	pmask = (1 << part);
    800 
    801 	if (!DK_BUSY(dksc, pmask) &&
    802 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    803 		/* First one... mark things as dirty... Note that we *MUST*
    804 		 have done a configure before this.  I DO NOT WANT TO BE
    805 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    806 		 THAT THEY BELONG TOGETHER!!!!! */
    807 		/* XXX should check to see if we're only open for reading
    808 		   here... If so, we needn't do this, but then need some
    809 		   other way of keeping track of what's happened.. */
    810 
    811 		rf_markalldirty(&rs->sc_r);
    812 	}
    813 
    814 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    815 		error = dk_open(dksc, dev, flags, fmt, l);
    816 
    817 bad:
    818 	raidunlock(rs);
    819 
    820 	return (error);
    821 
    822 
    823 }
    824 
    825 static int
    826 raid_lastclose(device_t self)
    827 {
    828 	struct raid_softc *rs = raidsoftc(self);
    829 
    830 	/* Last one... device is not unconfigured yet.
    831 	   Device shutdown has taken care of setting the
    832 	   clean bits if RAIDF_INITED is not set
    833 	   mark things as clean... */
    834 
    835 	rf_update_component_labels(&rs->sc_r,
    836 	    RF_FINAL_COMPONENT_UPDATE);
    837 
    838 	/* pass to unlocked code */
    839 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    840 		rs->sc_flags |= RAIDF_DETACH;
    841 
    842 	return 0;
    843 }
    844 
/*
 * d_close entry point.  Closes via dk_close() when configured; if a
 * detach was requested (RAIDF_DETACH, set by raid_lastclose()), the
 * device is config_detach()ed here, outside the unit lock.  An
 * unconfigured unit pending shutdown is simply destroyed.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have flagged a deferred detach */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* detach/put must happen after the unit lock is dropped */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    884 
/*
 * Wake the RAIDframe engine waiting on iodone_cv so it picks up
 * newly queued or completed I/O.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    892 
    893 static void
    894 raidstrategy(struct buf *bp)
    895 {
    896 	unsigned int unit;
    897 	struct raid_softc *rs;
    898 	struct dk_softc *dksc;
    899 	RF_Raid_t *raidPtr;
    900 
    901 	unit = raidunit(bp->b_dev);
    902 	if ((rs = raidget(unit, false)) == NULL) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    907 		bp->b_error = ENXIO;
    908 		goto fail;
    909 	}
    910 	dksc = &rs->sc_dksc;
    911 	raidPtr = &rs->sc_r;
    912 
    913 	/* Queue IO only */
    914 	if (dk_strategy_defer(dksc, bp))
    915 		goto done;
    916 
    917 	/* schedule the IO to happen at the next convenient time */
    918 	raid_wakeup(raidPtr);
    919 
    920 done:
    921 	return;
    922 
    923 fail:
    924 	bp->b_resid = bp->b_bcount;
    925 	biodone(bp);
    926 }
    927 
    928 static int
    929 raid_diskstart(device_t dev, struct buf *bp)
    930 {
    931 	struct raid_softc *rs = raidsoftc(dev);
    932 	RF_Raid_t *raidPtr;
    933 
    934 	raidPtr = &rs->sc_r;
    935 	if (!raidPtr->valid) {
    936 		db1_printf(("raid is not valid..\n"));
    937 		return ENODEV;
    938 	}
    939 
    940 	/* XXX */
    941 	bp->b_resid = 0;
    942 
    943 	return raiddoaccess(raidPtr, bp);
    944 }
    945 
/*
 * I/O completion callback from RAIDframe: hand the finished buffer
 * back to the dk(4) layer, return one of the limited concurrent-I/O
 * slots ("openings") and kick the I/O thread so further queued work
 * gets scheduled.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* openings is protected by the raid mutex */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    964 
    965 /* ARGSUSED */
    966 static int
    967 raidread(dev_t dev, struct uio *uio, int flags)
    968 {
    969 	int     unit = raidunit(dev);
    970 	struct raid_softc *rs;
    971 
    972 	if ((rs = raidget(unit, false)) == NULL)
    973 		return ENXIO;
    974 
    975 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    976 		return (ENXIO);
    977 
    978 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    979 
    980 }
    981 
    982 /* ARGSUSED */
    983 static int
    984 raidwrite(dev_t dev, struct uio *uio, int flags)
    985 {
    986 	int     unit = raidunit(dev);
    987 	struct raid_softc *rs;
    988 
    989 	if ((rs = raidget(unit, false)) == NULL)
    990 		return ENXIO;
    991 
    992 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    993 		return (ENXIO);
    994 
    995 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    996 
    997 }
    998 
/*
 * Tear down a configured RAID set: shut RAIDframe down, drain and free
 * the buffer queue, and detach the disk/dk structures.  Fails with
 * EBUSY while the device is open or a reconstruction, parity rewrite
 * or copyback is running.  NOTE(review): the "_unlocked" suffix
 * suggests the caller is responsible for serialization — confirm
 * against the call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1036 
   1037 static bool
   1038 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1039 {
   1040 	switch (cmd) {
   1041 	case RAIDFRAME_ADD_HOT_SPARE:
   1042 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1043 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1044 	case RAIDFRAME_CHECK_PARITY:
   1045 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1046 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1047 	case RAIDFRAME_CHECK_RECON_STATUS:
   1048 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1049 	case RAIDFRAME_COPYBACK:
   1050 	case RAIDFRAME_DELETE_COMPONENT:
   1051 	case RAIDFRAME_FAIL_DISK:
   1052 	case RAIDFRAME_GET_ACCTOTALS:
   1053 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1054 	case RAIDFRAME_GET_INFO:
   1055 	case RAIDFRAME_GET_SIZE:
   1056 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1057 	case RAIDFRAME_INIT_LABELS:
   1058 	case RAIDFRAME_KEEP_ACCTOTALS:
   1059 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1060 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1061 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1062 	case RAIDFRAME_PARITYMAP_STATUS:
   1063 	case RAIDFRAME_REBUILD_IN_PLACE:
   1064 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1065 	case RAIDFRAME_RESET_ACCTOTALS:
   1066 	case RAIDFRAME_REWRITEPARITY:
   1067 	case RAIDFRAME_SET_AUTOCONFIG:
   1068 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1069 	case RAIDFRAME_SET_ROOT:
   1070 		return (rs->sc_flags & RAIDF_INITED) != 0;
   1071 	}
   1072 	return false;
   1073 }
   1074 
/*
 * Mark a component (column) of the array as failed and kick off a
 * reconstruction thread.  Returns EINVAL on RAID 0, for an
 * out-of-range column, while a reconstruction is already running,
 * when some other component has already failed, or when the target
 * is a spared disk; ENOMEM if the request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* presumably rrint is freed by the recon thread — confirm */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* error path: the raid mutex is still held here */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1123 
   1124 static int
   1125 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1126 {
   1127 	/* allocate a buffer for the layout-specific data, and copy it in */
   1128 	if (k_cfg->layoutSpecificSize == 0)
   1129 		return 0;
   1130 
   1131 	if (k_cfg->layoutSpecificSize > 10000) {
   1132 	    /* sanity check */
   1133 	    return EINVAL;
   1134 	}
   1135 
   1136 	u_char *specific_buf;
   1137 	RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, (u_char *));
   1138 	if (specific_buf == NULL)
   1139 		return ENOMEM;
   1140 
   1141 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1142 	    k_cfg->layoutSpecificSize);
   1143 	if (retcode) {
   1144 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1145 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1146 		return retcode;
   1147 	}
   1148 
   1149 	k_cfg->layoutSpecific = specific_buf;
   1150 	return 0;
   1151 }
   1152 
   1153 static int
   1154 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1155 {
   1156 	if (rs->sc_r.valid) {
   1157 		/* There is a valid RAID set running on this unit! */
   1158 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1159 		return EINVAL;
   1160 	}
   1161 
   1162 	/* copy-in the configuration information */
   1163 	/* data points to a pointer to the configuration structure */
   1164 	RF_Malloc(*k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1165 	if (*k_cfg == NULL) {
   1166 		return ENOMEM;
   1167 	}
   1168 	int retcode = copyin(data, k_cfg, sizeof(RF_Config_t));
   1169 	if (retcode == 0)
   1170 		return 0;
   1171 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1172 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1173 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1174 	return retcode;
   1175 }
   1176 
/*
 * Configure a RAID set from the already copied-in configuration
 * *k_cfg.  Consumes k_cfg (and its layout-specific buffer): both are
 * freed before returning, whatever the outcome.  On failure the unit
 * is flagged RAIDF_SHUTDOWN so the close path will detach it.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Bring the layout-specific blob into kernel space first. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1229 
   1230 #if RF_DISABLED
/*
 * Overwrite a component label with user-supplied data.  Compiled out
 * (RF_DISABLED); kept for reference.  Only the column range is
 * validated — see the XXX notes about the missing field checks.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* rows are not supported; force row 0 */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1267 #endif
   1268 
/*
 * (Re)initialize the component labels of every live component.  Only
 * the serial number is taken from the user-supplied label; everything
 * else comes from the configuration this RAID set was created with.
 * Always returns 0.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* skip failed/absent components */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we dont' pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		/* write the label back out to the component */
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
   1301 
/*
 * Rebuild a component in place (back onto the same disk slot).
 * Validates that this is not a RAID 0, that no other reconstruction
 * is running, that the column is in range, and that the component's
 * state permits a rebuild, then spawns the reconstruct-in-place
 * thread.  Returns EINVAL on any validation failure, ENOMEM if the
 * request cannot be allocated.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* copy the request so we don't depend on the user's buffer */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* state checks below are done under the raid mutex */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* can't rebuild a spared component in place */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* presumably rrint is freed by the recon thread — confirm */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1369 
   1370 static int
   1371 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1372 {
   1373 	/*
   1374 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1375 	 * so tell the user it's done.
   1376 	 */
   1377 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1378 	    raidPtr->status != rf_rs_reconstructing) {
   1379 		*data = 100;
   1380 		return 0;
   1381 	}
   1382 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1383 		*data = 0;
   1384 		return 0;
   1385 	}
   1386 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1387 	    / raidPtr->reconControl->numRUsTotal);
   1388 	return 0;
   1389 }
   1390 
/*
 * ioctl entry point for the raid device.  RAIDframe-specific commands
 * are handled (or dispatched to helpers) in the first switch; anything
 * left over falls through to compat hooks and finally to the generic
 * dk_ioctl() handling for regular disk ioctls.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp = data;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		/* rf_construct() consumes and frees k_cfg */
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		/* refuse while open (other than us) or busy rebuilding */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr,"raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		/* copy the request so we don't rely on the user buffer */
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* not implemented; returns 0 */
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		/* staged through a kernel buffer, then copied out */
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
		    retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		/* only allowed when every component is optimal */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/* give compat modules a chance to handle the request */
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_CALL_HOOK(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1758 
   1759 
   1760 /* raidinit -- complete the rest of the initialization for the
   1761    RAIDframe device.  */
   1762 
   1763 
/*
 * Complete initialization of a freshly configured RAID set: attach
 * the pseudo-device, wire up the dk(4)/disk(9) structures, allocate
 * the buffer queue, mark the unit usable and kick off wedge
 * discovery.  On config_attach_pseudo() failure the unit is left
 * without RAIDF_INITED set.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usuable */
	rs->sc_flags |= RAIDF_INITED;

	/* probe for wedges (GPT partitions etc.) on the new disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1819 
   1820 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1821 /* wake up the daemon & tell it to get us a spare table
   1822  * XXX
   1823  * the entries in the queues should be tagged with the raidPtr
   1824  * so that in the extremely rare case that two recons happen at once,
   1825  * we know for which device were requesting a spare table
   1826  * XXX
   1827  *
   1828  * XXX This code is not currently used. GO
   1829  */
/*
 * Hand req to the spare-table daemon via rf_sparet_wait_queue and
 * block until a response appears on rf_sparet_resp_queue.  The
 * response's fcol field carries the status code we return; the
 * response structure (a different allocation than req) is freed here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1853 #endif
   1854 
   1855 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1856  * bp & passes it down.
   1857  * any calls originating in the kernel must use non-blocking I/O
   1858  * do some extra sanity checking to return "appropriate" error values for
   1859  * certain conditions (to make some standard utilities work)
   1860  *
   1861  * Formerly known as: rf_DoAccessKernel
   1862  */
   1863 void
   1864 raidstart(RF_Raid_t *raidPtr)
   1865 {
   1866 	struct raid_softc *rs;
   1867 	struct dk_softc *dksc;
   1868 
   1869 	rs = raidPtr->softc;
   1870 	dksc = &rs->sc_dksc;
   1871 	/* quick check to see if anything has died recently */
   1872 	rf_lock_mutex2(raidPtr->mutex);
   1873 	if (raidPtr->numNewFailures > 0) {
   1874 		rf_unlock_mutex2(raidPtr->mutex);
   1875 		rf_update_component_labels(raidPtr,
   1876 					   RF_NORMAL_COMPONENT_UPDATE);
   1877 		rf_lock_mutex2(raidPtr->mutex);
   1878 		raidPtr->numNewFailures--;
   1879 	}
   1880 	rf_unlock_mutex2(raidPtr->mutex);
   1881 
   1882 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1883 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1884 		return;
   1885 	}
   1886 
   1887 	dk_start(dksc, NULL);
   1888 }
   1889 
   1890 static int
   1891 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1892 {
   1893 	RF_SectorCount_t num_blocks, pb, sum;
   1894 	RF_RaidAddr_t raid_addr;
   1895 	daddr_t blocknum;
   1896 	int     do_async;
   1897 	int rc;
   1898 
   1899 	rf_lock_mutex2(raidPtr->mutex);
   1900 	if (raidPtr->openings == 0) {
   1901 		rf_unlock_mutex2(raidPtr->mutex);
   1902 		return EAGAIN;
   1903 	}
   1904 	rf_unlock_mutex2(raidPtr->mutex);
   1905 
   1906 	blocknum = bp->b_rawblkno;
   1907 
   1908 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1909 		    (int) blocknum));
   1910 
   1911 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1912 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1913 
   1914 	/* *THIS* is where we adjust what block we're going to...
   1915 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1916 	raid_addr = blocknum;
   1917 
   1918 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1919 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1920 	sum = raid_addr + num_blocks + pb;
   1921 	if (1 || rf_debugKernelAccess) {
   1922 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1923 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1924 			    (int) pb, (int) bp->b_resid));
   1925 	}
   1926 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1927 	    || (sum < num_blocks) || (sum < pb)) {
   1928 		rc = ENOSPC;
   1929 		goto done;
   1930 	}
   1931 	/*
   1932 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1933 	 */
   1934 
   1935 	if (bp->b_bcount & raidPtr->sectorMask) {
   1936 		rc = ENOSPC;
   1937 		goto done;
   1938 	}
   1939 	db1_printf(("Calling DoAccess..\n"));
   1940 
   1941 
   1942 	rf_lock_mutex2(raidPtr->mutex);
   1943 	raidPtr->openings--;
   1944 	rf_unlock_mutex2(raidPtr->mutex);
   1945 
   1946 	/*
   1947 	 * Everything is async.
   1948 	 */
   1949 	do_async = 1;
   1950 
   1951 	/* don't ever condition on bp->b_flags & B_WRITE.
   1952 	 * always condition on B_READ instead */
   1953 
   1954 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1955 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1956 			 do_async, raid_addr, num_blocks,
   1957 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1958 
   1959 done:
   1960 	return rc;
   1961 }
   1962 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one RAIDframe disk-queue request to the underlying
 * component driver.  For READ/WRITE requests a buf is initialized via
 * InitBP() and handed to bdev_strategy(); completion is reported
 * asynchronously through KernelWakeupFunc().  NOP requests just bump
 * the outstanding count and fake an immediate completion.
 * Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the doubled parens suggest this was once
		 * db1_printf(); as written it prints unconditionally. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* fake an immediate completion for the NOP */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * b_iodone callback for component I/O issued from kernel code
 * (installed by InitBP() in rf_DispatchKernelIO()).
 *
 * Records the buf's error status in the request.  On error, the
 * component is marked failed -- but only if it was optimal or an
 * in-use spare, and only if failing it would not exceed the number of
 * faults the layout can tolerate.  The request is then appended to
 * the raidPtr's iodone queue and the raidio thread is signalled.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the time this I/O spent at the physical layer */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures makes raidstart() refresh labels */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2107 
   2108 
   2109 /*
   2110  * initialize a buf structure for doing an I/O in the kernel.
   2111  */
   2112 static void
   2113 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2114        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2115        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2116        struct proc *b_proc)
   2117 {
   2118 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2119 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2120 	bp->b_oflags = 0;
   2121 	bp->b_cflags = 0;
   2122 	bp->b_bcount = numSect << logBytesPerSector;
   2123 	bp->b_bufsize = bp->b_bcount;
   2124 	bp->b_error = 0;
   2125 	bp->b_dev = dev;
   2126 	bp->b_data = bf;
   2127 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2128 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2129 	if (bp->b_bcount == 0) {
   2130 		panic("bp->b_bcount is zero in InitBP!!");
   2131 	}
   2132 	bp->b_proc = b_proc;
   2133 	bp->b_iodone = cbFunc;
   2134 	bp->b_private = cbArg;
   2135 }
   2136 
   2137 /*
   2138  * Wait interruptibly for an exclusive lock.
   2139  *
   2140  * XXX
   2141  * Several drivers do this; it should be abstracted and made MP-safe.
   2142  * (Hmm... where have we seen this warning before :->  GO )
   2143  */
   2144 static int
   2145 raidlock(struct raid_softc *rs)
   2146 {
   2147 	int     error;
   2148 
   2149 	error = 0;
   2150 	mutex_enter(&rs->sc_mutex);
   2151 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2152 		rs->sc_flags |= RAIDF_WANTED;
   2153 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2154 		if (error != 0)
   2155 			goto done;
   2156 	}
   2157 	rs->sc_flags |= RAIDF_LOCKED;
   2158 done:
   2159 	mutex_exit(&rs->sc_mutex);
   2160 	return (error);
   2161 }
   2162 /*
   2163  * Unlock and wake up any waiters.
   2164  */
   2165 static void
   2166 raidunlock(struct raid_softc *rs)
   2167 {
   2168 
   2169 	mutex_enter(&rs->sc_mutex);
   2170 	rs->sc_flags &= ~RAIDF_LOCKED;
   2171 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2172 		rs->sc_flags &= ~RAIDF_WANTED;
   2173 		cv_broadcast(&rs->sc_cv);
   2174 	}
   2175 	mutex_exit(&rs->sc_mutex);
   2176 }
   2177 
   2178 
   2179 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2180 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2181 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2182 
/*
 * Byte offset from the start of a component at which the component
 * label area begins.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2189 
   2190 static daddr_t
   2191 rf_component_info_size(unsigned secsize)
   2192 {
   2193 	daddr_t info_size;
   2194 
   2195 	KASSERT(secsize);
   2196 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2197 		info_size = secsize;
   2198 	else
   2199 		info_size = RF_COMPONENT_INFO_SIZE;
   2200 
   2201 	return info_size;
   2202 }
   2203 
   2204 static daddr_t
   2205 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2206 {
   2207 	daddr_t map_offset;
   2208 
   2209 	KASSERT(raidPtr->bytesPerSector);
   2210 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2211 		map_offset = raidPtr->bytesPerSector;
   2212 	else
   2213 		map_offset = RF_COMPONENT_INFO_SIZE;
   2214 	map_offset += rf_component_info_offset();
   2215 
   2216 	return map_offset;
   2217 }
   2218 
   2219 static daddr_t
   2220 rf_parity_map_size(RF_Raid_t *raidPtr)
   2221 {
   2222 	daddr_t map_size;
   2223 
   2224 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2225 		map_size = raidPtr->bytesPerSector;
   2226 	else
   2227 		map_size = RF_PARITY_MAP_SIZE;
   2228 
   2229 	return map_size;
   2230 }
   2231 
   2232 int
   2233 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2234 {
   2235 	RF_ComponentLabel_t *clabel;
   2236 
   2237 	clabel = raidget_component_label(raidPtr, col);
   2238 	clabel->clean = RF_RAID_CLEAN;
   2239 	raidflush_component_label(raidPtr, col);
   2240 	return(0);
   2241 }
   2242 
   2243 
   2244 int
   2245 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2246 {
   2247 	RF_ComponentLabel_t *clabel;
   2248 
   2249 	clabel = raidget_component_label(raidPtr, col);
   2250 	clabel->clean = RF_RAID_DIRTY;
   2251 	raidflush_component_label(raidPtr, col);
   2252 	return(0);
   2253 }
   2254 
/*
 * Read the component label for column 'col' from disk into the
 * in-core copy (raid_cinfo[col].ci_label).  Returns 0 on success or
 * the error from raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2264 
/* Return a pointer to the in-core component label for column 'col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2270 
/*
 * Write the in-core component label for column 'col' out to disk,
 * stamping it with the array's current mod_counter first (and, when
 * parity maps are compiled in, the parity-map mod counter as well).
 * Returns 0 on success or the error from raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2285 
   2286 
/*
 * Read the component label from 'dev'/'b_vp' into *clabel.  Thin
 * wrapper around raidread_component_area() using the label's standard
 * offset and its (sector-size dependent) on-disk size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2296 
/* ARGSUSED */
/*
 * Synchronously read 'dsize' bytes of component metadata from 'dev'
 * at byte offset 'offset', copying the first 'msize' bytes out to
 * 'data' on success.  A scratch buffer of the full on-disk area size
 * is used for the transfer.
 *
 * Returns 0 on success, EINVAL if the component has no vnode, or the
 * error reported by biowait().
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read: issue, then wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		/* only the caller's 'msize' bytes are meaningful */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2334 
   2335 
/*
 * Write *clabel as the component label of 'dev'/'b_vp'.  Synchronous;
 * thin wrapper around raidwrite_component_area() using the label's
 * standard offset and its (sector-size dependent) on-disk size.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2345 
/* ARGSUSED */
/*
 * Write 'msize' bytes of component metadata from 'data' to 'dev' at
 * byte offset 'offset', zero-padding the on-disk area out to 'dsize'
 * bytes.  If 'asyncp' is nonzero the write is issued B_ASYNC and 0 is
 * returned immediately; otherwise we wait and return biowait()'s
 * error (printing a warning on failure).
 *
 * NOTE(review): on the async path this function neither checks the
 * result nor calls brelse() itself -- presumably the buffer is
 * released at completion via the B_ASYNC machinery; verify.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad the area, then lay the caller's data over it */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2380 
   2381 void
   2382 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2383 {
   2384 	int c;
   2385 
   2386 	for (c = 0; c < raidPtr->numCol; c++) {
   2387 		/* Skip dead disks. */
   2388 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2389 			continue;
   2390 		/* XXXjld: what if an error occurs here? */
   2391 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2392 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2393 		    RF_PARITYMAP_NBYTE,
   2394 		    rf_parity_map_offset(raidPtr),
   2395 		    rf_parity_map_size(raidPtr), 0);
   2396 	}
   2397 }
   2398 
   2399 void
   2400 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2401 {
   2402 	struct rf_paritymap_ondisk tmp;
   2403 	int c,first;
   2404 
   2405 	first=1;
   2406 	for (c = 0; c < raidPtr->numCol; c++) {
   2407 		/* Skip dead disks. */
   2408 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2409 			continue;
   2410 		raidread_component_area(raidPtr->Disks[c].dev,
   2411 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2412 		    RF_PARITYMAP_NBYTE,
   2413 		    rf_parity_map_offset(raidPtr),
   2414 		    rf_parity_map_size(raidPtr));
   2415 		if (first) {
   2416 			memcpy(map, &tmp, sizeof(*map));
   2417 			first = 0;
   2418 		} else {
   2419 			rf_paritymap_merge(map, &tmp);
   2420 		}
   2421 	}
   2422 }
   2423 
/*
 * Mark the component labels of all non-failed components (and all
 * in-use spares) dirty, bumping the array's mod_counter first.  Done
 * when the set goes into use so an unclean shutdown can be detected
 * later from the on-disk labels.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matches, scol keeps its
			   previous value (initially -1) -- confirm an in-use
			   spare always has a matching spareCol entry. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2483 
   2484 
/*
 * Refresh the on-disk component labels of every optimal component and
 * every in-use spare: bump mod_counter, record current status, and
 * (unless the unit number was changed) the unit we are configured as.
 * When 'final' is RF_FINAL_COMPONENT_UPDATE and the parity is known
 * good, the labels are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2562 
   2563 void
   2564 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2565 {
   2566 
   2567 	if (vp != NULL) {
   2568 		if (auto_configured == 1) {
   2569 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2570 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2571 			vput(vp);
   2572 
   2573 		} else {
   2574 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2575 		}
   2576 	}
   2577 }
   2578 
   2579 
   2580 void
   2581 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2582 {
   2583 	int r,c;
   2584 	struct vnode *vp;
   2585 	int acd;
   2586 
   2587 
   2588 	/* We take this opportunity to close the vnodes like we should.. */
   2589 
   2590 	for (c = 0; c < raidPtr->numCol; c++) {
   2591 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2592 		acd = raidPtr->Disks[c].auto_configured;
   2593 		rf_close_component(raidPtr, vp, acd);
   2594 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2595 		raidPtr->Disks[c].auto_configured = 0;
   2596 	}
   2597 
   2598 	for (r = 0; r < raidPtr->numSpare; r++) {
   2599 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2600 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2601 		rf_close_component(raidPtr, vp, acd);
   2602 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2603 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2604 	}
   2605 }
   2606 
   2607 
   2608 void
   2609 rf_ReconThread(struct rf_recon_req_internal *req)
   2610 {
   2611 	int     s;
   2612 	RF_Raid_t *raidPtr;
   2613 
   2614 	s = splbio();
   2615 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2616 	raidPtr->recon_in_progress = 1;
   2617 
   2618 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2619 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2620 
   2621 	RF_Free(req, sizeof(*req));
   2622 
   2623 	raidPtr->recon_in_progress = 0;
   2624 	splx(s);
   2625 
   2626 	/* That's all... */
   2627 	kthread_exit(0);	/* does not return */
   2628 }
   2629 
   2630 void
   2631 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2632 {
   2633 	int retcode;
   2634 	int s;
   2635 
   2636 	raidPtr->parity_rewrite_stripes_done = 0;
   2637 	raidPtr->parity_rewrite_in_progress = 1;
   2638 	s = splbio();
   2639 	retcode = rf_RewriteParity(raidPtr);
   2640 	splx(s);
   2641 	if (retcode) {
   2642 		printf("raid%d: Error re-writing parity (%d)!\n",
   2643 		    raidPtr->raidid, retcode);
   2644 	} else {
   2645 		/* set the clean bit!  If we shutdown correctly,
   2646 		   the clean bit on each component label will get
   2647 		   set */
   2648 		raidPtr->parity_good = RF_RAID_CLEAN;
   2649 	}
   2650 	raidPtr->parity_rewrite_in_progress = 0;
   2651 
   2652 	/* Anyone waiting for us to stop?  If so, inform them... */
   2653 	if (raidPtr->waitShutdown) {
   2654 		rf_lock_mutex2(raidPtr->rad_lock);
   2655 		cv_broadcast(&raidPtr->parity_rewrite_cv);
   2656 		rf_unlock_mutex2(raidPtr->rad_lock);
   2657 	}
   2658 
   2659 	/* That's all... */
   2660 	kthread_exit(0);	/* does not return */
   2661 }
   2662 
   2663 
   2664 void
   2665 rf_CopybackThread(RF_Raid_t *raidPtr)
   2666 {
   2667 	int s;
   2668 
   2669 	raidPtr->copyback_in_progress = 1;
   2670 	s = splbio();
   2671 	rf_CopybackReconstructedData(raidPtr);
   2672 	splx(s);
   2673 	raidPtr->copyback_in_progress = 0;
   2674 
   2675 	/* That's all... */
   2676 	kthread_exit(0);	/* does not return */
   2677 }
   2678 
   2679 
   2680 void
   2681 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2682 {
   2683 	int s;
   2684 	RF_Raid_t *raidPtr;
   2685 
   2686 	s = splbio();
   2687 	raidPtr = req->raidPtr;
   2688 	raidPtr->recon_in_progress = 1;
   2689 	rf_ReconstructInPlace(raidPtr, req->col);
   2690 	RF_Free(req, sizeof(*req));
   2691 	raidPtr->recon_in_progress = 0;
   2692 	splx(s);
   2693 
   2694 	/* That's all... */
   2695 	kthread_exit(0);	/* does not return */
   2696 }
   2697 
/*
 * Try to read a component label from (dev, vp).  If the label looks
 * reasonable -- per rf_reasonable_label(), and its recorded partition
 * size fits within 'size' -- a new RF_AutoConfig_t describing the
 * component is prepended to 'ac_list'.  Otherwise the vnode is closed
 * and released.
 *
 * Returns the (possibly updated) head of the autoconfig list, or
 * NULL on out-of-memory (in which case the entire list is freed).
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* tear down everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;	/* ac now owns the label */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and release the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2755 
   2756 RF_AutoConfig_t *
   2757 rf_find_raid_components(void)
   2758 {
   2759 	struct vnode *vp;
   2760 	struct disklabel label;
   2761 	device_t dv;
   2762 	deviter_t di;
   2763 	dev_t dev;
   2764 	int bmajor, bminor, wedge, rf_part_found;
   2765 	int error;
   2766 	int i;
   2767 	RF_AutoConfig_t *ac_list;
   2768 	uint64_t numsecs;
   2769 	unsigned secsize;
   2770 	int dowedges;
   2771 
   2772 	/* initialize the AutoConfig list */
   2773 	ac_list = NULL;
   2774 
   2775 	/*
   2776 	 * we begin by trolling through *all* the devices on the system *twice*
   2777 	 * first we scan for wedges, second for other devices. This avoids
   2778 	 * using a raw partition instead of a wedge that covers the whole disk
   2779 	 */
   2780 
   2781 	for (dowedges=1; dowedges>=0; --dowedges) {
   2782 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2783 		     dv = deviter_next(&di)) {
   2784 
   2785 			/* we are only interested in disks... */
   2786 			if (device_class(dv) != DV_DISK)
   2787 				continue;
   2788 
   2789 			/* we don't care about floppies... */
   2790 			if (device_is_a(dv, "fd")) {
   2791 				continue;
   2792 			}
   2793 
   2794 			/* we don't care about CD's... */
   2795 			if (device_is_a(dv, "cd")) {
   2796 				continue;
   2797 			}
   2798 
   2799 			/* we don't care about md's... */
   2800 			if (device_is_a(dv, "md")) {
   2801 				continue;
   2802 			}
   2803 
   2804 			/* hdfd is the Atari/Hades floppy driver */
   2805 			if (device_is_a(dv, "hdfd")) {
   2806 				continue;
   2807 			}
   2808 
   2809 			/* fdisa is the Atari/Milan floppy driver */
   2810 			if (device_is_a(dv, "fdisa")) {
   2811 				continue;
   2812 			}
   2813 
   2814 			/* are we in the wedges pass ? */
   2815 			wedge = device_is_a(dv, "dk");
   2816 			if (wedge != dowedges) {
   2817 				continue;
   2818 			}
   2819 
   2820 			/* need to find the device_name_to_block_device_major stuff */
   2821 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2822 
   2823 			rf_part_found = 0; /*No raid partition as yet*/
   2824 
   2825 			/* get a vnode for the raw partition of this disk */
   2826 			bminor = minor(device_unit(dv));
   2827 			dev = wedge ? makedev(bmajor, bminor) :
   2828 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2829 			if (bdevvp(dev, &vp))
   2830 				panic("RAID can't alloc vnode");
   2831 
   2832 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2833 
   2834 			if (error) {
   2835 				/* "Who cares."  Continue looking
   2836 				   for something that exists*/
   2837 				vput(vp);
   2838 				continue;
   2839 			}
   2840 
   2841 			error = getdisksize(vp, &numsecs, &secsize);
   2842 			if (error) {
   2843 				/*
   2844 				 * Pseudo devices like vnd and cgd can be
   2845 				 * opened but may still need some configuration.
   2846 				 * Ignore these quietly.
   2847 				 */
   2848 				if (error != ENXIO)
   2849 					printf("RAIDframe: can't get disk size"
   2850 					    " for dev %s (%d)\n",
   2851 					    device_xname(dv), error);
   2852 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2853 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2854 				vput(vp);
   2855 				continue;
   2856 			}
   2857 			if (wedge) {
   2858 				struct dkwedge_info dkw;
   2859 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2860 				    NOCRED);
   2861 				if (error) {
   2862 					printf("RAIDframe: can't get wedge info for "
   2863 					    "dev %s (%d)\n", device_xname(dv), error);
   2864 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2865 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2866 					vput(vp);
   2867 					continue;
   2868 				}
   2869 
   2870 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2871 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2872 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2873 					vput(vp);
   2874 					continue;
   2875 				}
   2876 
   2877 				ac_list = rf_get_component(ac_list, dev, vp,
   2878 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2879 				rf_part_found = 1; /*There is a raid component on this disk*/
   2880 				continue;
   2881 			}
   2882 
   2883 			/* Ok, the disk exists.  Go get the disklabel. */
   2884 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2885 			if (error) {
   2886 				/*
   2887 				 * XXX can't happen - open() would
   2888 				 * have errored out (or faked up one)
   2889 				 */
   2890 				if (error != ENOTTY)
   2891 					printf("RAIDframe: can't get label for dev "
   2892 					    "%s (%d)\n", device_xname(dv), error);
   2893 			}
   2894 
   2895 			/* don't need this any more.  We'll allocate it again
   2896 			   a little later if we really do... */
   2897 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2898 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2899 			vput(vp);
   2900 
   2901 			if (error)
   2902 				continue;
   2903 
   2904 			rf_part_found = 0; /*No raid partitions yet*/
   2905 			for (i = 0; i < label.d_npartitions; i++) {
   2906 				char cname[sizeof(ac_list->devname)];
   2907 
   2908 				/* We only support partitions marked as RAID */
   2909 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2910 					continue;
   2911 
   2912 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2913 				if (bdevvp(dev, &vp))
   2914 					panic("RAID can't alloc vnode");
   2915 
   2916 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2917 				if (error) {
   2918 					/* Whatever... */
   2919 					vput(vp);
   2920 					continue;
   2921 				}
   2922 				snprintf(cname, sizeof(cname), "%s%c",
   2923 				    device_xname(dv), 'a' + i);
   2924 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2925 					label.d_partitions[i].p_size, numsecs, secsize);
   2926 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2927 			}
   2928 
   2929 			/*
   2930 			 *If there is no raid component on this disk, either in a
   2931 			 *disklabel or inside a wedge, check the raw partition as well,
   2932 			 *as it is possible to configure raid components on raw disk
   2933 			 *devices.
   2934 			 */
   2935 
   2936 			if (!rf_part_found) {
   2937 				char cname[sizeof(ac_list->devname)];
   2938 
   2939 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2940 				if (bdevvp(dev, &vp))
   2941 					panic("RAID can't alloc vnode");
   2942 
   2943 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2944 				if (error) {
   2945 					/* Whatever... */
   2946 					vput(vp);
   2947 					continue;
   2948 				}
   2949 				snprintf(cname, sizeof(cname), "%s%c",
   2950 				    device_xname(dv), 'a' + RAW_PART);
   2951 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2952 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2953 			}
   2954 		}
   2955 		deviter_release(&di);
   2956 	}
   2957 	return ac_list;
   2958 }
   2959 
   2960 
   2961 int
   2962 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2963 {
   2964 
   2965 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2966 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2967 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2968 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2969 	    clabel->row >=0 &&
   2970 	    clabel->column >= 0 &&
   2971 	    clabel->num_rows > 0 &&
   2972 	    clabel->num_columns > 0 &&
   2973 	    clabel->row < clabel->num_rows &&
   2974 	    clabel->column < clabel->num_columns &&
   2975 	    clabel->blockSize > 0 &&
   2976 	    /*
   2977 	     * numBlocksHi may contain garbage, but it is ok since
   2978 	     * the type is unsigned.  If it is really garbage,
   2979 	     * rf_fix_old_label_size() will fix it.
   2980 	     */
   2981 	    rf_component_label_numblocks(clabel) > 0) {
   2982 		/*
   2983 		 * label looks reasonable enough...
   2984 		 * let's make sure it has no old garbage.
   2985 		 */
   2986 		if (numsecs)
   2987 			rf_fix_old_label_size(clabel, numsecs);
   2988 		return(1);
   2989 	}
   2990 	return(0);
   2991 }
   2992 
   2993 
   2994 /*
   2995  * For reasons yet unknown, some old component labels have garbage in
   2996  * the newer numBlocksHi region, and this causes lossage.  Since those
   2997  * disks will also have numsecs set to less than 32 bits of sectors,
   2998  * we can determine when this corruption has occurred, and fix it.
   2999  *
   3000  * The exact same problem, with the same unknown reason, happens to
   3001  * the partitionSizeHi member as well.
   3002  */
   3003 static void
   3004 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3005 {
   3006 
   3007 	if (numsecs < ((uint64_t)1 << 32)) {
   3008 		if (clabel->numBlocksHi) {
   3009 			printf("WARNING: total sectors < 32 bits, yet "
   3010 			       "numBlocksHi set\n"
   3011 			       "WARNING: resetting numBlocksHi to zero.\n");
   3012 			clabel->numBlocksHi = 0;
   3013 		}
   3014 
   3015 		if (clabel->partitionSizeHi) {
   3016 			printf("WARNING: total sectors < 32 bits, yet "
   3017 			       "partitionSizeHi set\n"
   3018 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3019 			clabel->partitionSizeHi = 0;
   3020 		}
   3021 	}
   3022 }
   3023 
   3024 
#ifdef DEBUG
/* Dump the interesting fields of a component label to the console
   (debug builds only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked with 3 below, so index 3 catches
	   any out-of-range value. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3058 
   3059 RF_ConfigSet_t *
   3060 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3061 {
   3062 	RF_AutoConfig_t *ac;
   3063 	RF_ConfigSet_t *config_sets;
   3064 	RF_ConfigSet_t *cset;
   3065 	RF_AutoConfig_t *ac_next;
   3066 
   3067 
   3068 	config_sets = NULL;
   3069 
   3070 	/* Go through the AutoConfig list, and figure out which components
   3071 	   belong to what sets.  */
   3072 	ac = ac_list;
   3073 	while(ac!=NULL) {
   3074 		/* we're going to putz with ac->next, so save it here
   3075 		   for use at the end of the loop */
   3076 		ac_next = ac->next;
   3077 
   3078 		if (config_sets == NULL) {
   3079 			/* will need at least this one... */
   3080 			config_sets = (RF_ConfigSet_t *)
   3081 				malloc(sizeof(RF_ConfigSet_t),
   3082 				       M_RAIDFRAME, M_NOWAIT);
   3083 			if (config_sets == NULL) {
   3084 				panic("rf_create_auto_sets: No memory!");
   3085 			}
   3086 			/* this one is easy :) */
   3087 			config_sets->ac = ac;
   3088 			config_sets->next = NULL;
   3089 			config_sets->rootable = 0;
   3090 			ac->next = NULL;
   3091 		} else {
   3092 			/* which set does this component fit into? */
   3093 			cset = config_sets;
   3094 			while(cset!=NULL) {
   3095 				if (rf_does_it_fit(cset, ac)) {
   3096 					/* looks like it matches... */
   3097 					ac->next = cset->ac;
   3098 					cset->ac = ac;
   3099 					break;
   3100 				}
   3101 				cset = cset->next;
   3102 			}
   3103 			if (cset==NULL) {
   3104 				/* didn't find a match above... new set..*/
   3105 				cset = (RF_ConfigSet_t *)
   3106 					malloc(sizeof(RF_ConfigSet_t),
   3107 					       M_RAIDFRAME, M_NOWAIT);
   3108 				if (cset == NULL) {
   3109 					panic("rf_create_auto_sets: No memory!");
   3110 				}
   3111 				cset->ac = ac;
   3112 				ac->next = NULL;
   3113 				cset->next = config_sets;
   3114 				cset->rootable = 0;
   3115 				config_sets = cset;
   3116 			}
   3117 		}
   3118 		ac = ac_next;
   3119 	}
   3120 
   3121 
   3122 	return(config_sets);
   3123 }
   3124 
   3125 static int
   3126 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3127 {
   3128 	RF_ComponentLabel_t *clabel1, *clabel2;
   3129 
   3130 	/* If this one matches the *first* one in the set, that's good
   3131 	   enough, since the other members of the set would have been
   3132 	   through here too... */
   3133 	/* note that we are not checking partitionSize here..
   3134 
   3135 	   Note that we are also not checking the mod_counters here.
   3136 	   If everything else matches except the mod_counter, that's
   3137 	   good enough for this test.  We will deal with the mod_counters
   3138 	   a little later in the autoconfiguration process.
   3139 
   3140 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3141 
   3142 	   The reason we don't check for this is that failed disks
   3143 	   will have lower modification counts.  If those disks are
   3144 	   not added to the set they used to belong to, then they will
   3145 	   form their own set, which may result in 2 different sets,
   3146 	   for example, competing to be configured at raid0, and
   3147 	   perhaps competing to be the root filesystem set.  If the
   3148 	   wrong ones get configured, or both attempt to become /,
   3149 	   weird behaviour and or serious lossage will occur.  Thus we
   3150 	   need to bring them into the fold here, and kick them out at
   3151 	   a later point.
   3152 
   3153 	*/
   3154 
   3155 	clabel1 = cset->ac->clabel;
   3156 	clabel2 = ac->clabel;
   3157 	if ((clabel1->version == clabel2->version) &&
   3158 	    (clabel1->serial_number == clabel2->serial_number) &&
   3159 	    (clabel1->num_rows == clabel2->num_rows) &&
   3160 	    (clabel1->num_columns == clabel2->num_columns) &&
   3161 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3162 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3163 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3164 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3165 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3166 	    (clabel1->blockSize == clabel2->blockSize) &&
   3167 	    rf_component_label_numblocks(clabel1) ==
   3168 	    rf_component_label_numblocks(clabel2) &&
   3169 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3170 	    (clabel1->root_partition == clabel2->root_partition) &&
   3171 	    (clabel1->last_unit == clabel2->last_unit) &&
   3172 	    (clabel1->config_order == clabel2->config_order)) {
   3173 		/* if it get's here, it almost *has* to be a match */
   3174 	} else {
   3175 		/* it's not consistent with somebody in the set..
   3176 		   punt */
   3177 		return(0);
   3178 	}
   3179 	/* all was fine.. it must fit... */
   3180 	return(1);
   3181 }
   3182 
/*
 * Decide whether config set 'cset' has enough live, current
 * components to be configured.  Returns 1 if so, 0 if too many
 * components are missing for the set's parity level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	   The highest value seen wins; components carrying a lower
	   mod_counter are stale and are ignored below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (matching mod_counter) component
		   for column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd member of a
				   mirror pair without bailing..  reset
				   the even_pair_failed flag, and go on
				   to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3285 
   3286 void
   3287 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3288 			RF_Raid_t *raidPtr)
   3289 {
   3290 	RF_ComponentLabel_t *clabel;
   3291 	int i;
   3292 
   3293 	clabel = ac->clabel;
   3294 
   3295 	/* 1. Fill in the common stuff */
   3296 	config->numCol = clabel->num_columns;
   3297 	config->numSpare = 0; /* XXX should this be set here? */
   3298 	config->sectPerSU = clabel->sectPerSU;
   3299 	config->SUsPerPU = clabel->SUsPerPU;
   3300 	config->SUsPerRU = clabel->SUsPerRU;
   3301 	config->parityConfig = clabel->parityConfig;
   3302 	/* XXX... */
   3303 	strcpy(config->diskQueueType,"fifo");
   3304 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3305 	config->layoutSpecificSize = 0; /* XXX ?? */
   3306 
   3307 	while(ac!=NULL) {
   3308 		/* row/col values will be in range due to the checks
   3309 		   in reasonable_label() */
   3310 		strcpy(config->devnames[0][ac->clabel->column],
   3311 		       ac->devname);
   3312 		ac = ac->next;
   3313 	}
   3314 
   3315 	for(i=0;i<RF_MAXDBGV;i++) {
   3316 		config->debugVars[i][0] = 0;
   3317 	}
   3318 }
   3319 
   3320 int
   3321 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3322 {
   3323 	RF_ComponentLabel_t *clabel;
   3324 	int column;
   3325 	int sparecol;
   3326 
   3327 	raidPtr->autoconfigure = new_value;
   3328 
   3329 	for(column=0; column<raidPtr->numCol; column++) {
   3330 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3331 			clabel = raidget_component_label(raidPtr, column);
   3332 			clabel->autoconfigure = new_value;
   3333 			raidflush_component_label(raidPtr, column);
   3334 		}
   3335 	}
   3336 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3337 		sparecol = raidPtr->numCol + column;
   3338 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3339 			clabel = raidget_component_label(raidPtr, sparecol);
   3340 			clabel->autoconfigure = new_value;
   3341 			raidflush_component_label(raidPtr, sparecol);
   3342 		}
   3343 	}
   3344 	return(new_value);
   3345 }
   3346 
   3347 int
   3348 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3349 {
   3350 	RF_ComponentLabel_t *clabel;
   3351 	int column;
   3352 	int sparecol;
   3353 
   3354 	raidPtr->root_partition = new_value;
   3355 	for(column=0; column<raidPtr->numCol; column++) {
   3356 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3357 			clabel = raidget_component_label(raidPtr, column);
   3358 			clabel->root_partition = new_value;
   3359 			raidflush_component_label(raidPtr, column);
   3360 		}
   3361 	}
   3362 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3363 		sparecol = raidPtr->numCol + column;
   3364 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3365 			clabel = raidget_component_label(raidPtr, sparecol);
   3366 			clabel->root_partition = new_value;
   3367 			raidflush_component_label(raidPtr, sparecol);
   3368 		}
   3369 	}
   3370 	return(new_value);
   3371 }
   3372 
   3373 void
   3374 rf_release_all_vps(RF_ConfigSet_t *cset)
   3375 {
   3376 	RF_AutoConfig_t *ac;
   3377 
   3378 	ac = cset->ac;
   3379 	while(ac!=NULL) {
   3380 		/* Close the vp, and give it back */
   3381 		if (ac->vp) {
   3382 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3383 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3384 			vput(ac->vp);
   3385 			ac->vp = NULL;
   3386 		}
   3387 		ac = ac->next;
   3388 	}
   3389 }
   3390 
   3391 
   3392 void
   3393 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3394 {
   3395 	RF_AutoConfig_t *ac;
   3396 	RF_AutoConfig_t *next_ac;
   3397 
   3398 	ac = cset->ac;
   3399 	while(ac!=NULL) {
   3400 		next_ac = ac->next;
   3401 		/* nuke the label */
   3402 		free(ac->clabel, M_RAIDFRAME);
   3403 		/* cleanup the config structure */
   3404 		free(ac, M_RAIDFRAME);
   3405 		/* "next.." */
   3406 		ac = next_ac;
   3407 	}
   3408 	/* and, finally, nuke the config set */
   3409 	free(cset, M_RAIDFRAME);
   3410 }
   3411 
   3412 
/*
 * Fill in a component label from the current state of the array.
 * All fields set here are common to every component of the set;
 * per-component fields are not touched.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* geometry */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* layout parameters */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3445 
/*
 * Configure one autodetected config set: find a free raid unit
 * (preferring the unit the set was last configured as), build an
 * RF_Config_t from the component labels, and configure the array.
 * Returns the softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until an unused unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* sc == NULL means the unit did not exist yet; create it now */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit again */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3529 
   3530 void
   3531 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3532 	     size_t xmin, size_t xmax)
   3533 {
   3534 	int error;
   3535 
   3536 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3537 	pool_sethiwat(p, xmax);
   3538 	if ((error = pool_prime(p, xmin)) != 0)
   3539 		panic("%s: failed to prime pool: %d", __func__, error);
   3540 	pool_setlowat(p, xmin);
   3541 }
   3542 
   3543 /*
   3544  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3545  * to see if there is IO pending and if that IO could possibly be done
   3546  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3547  * otherwise.
   3548  *
   3549  */
   3550 int
   3551 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3552 {
   3553 	struct raid_softc *rs;
   3554 	struct dk_softc *dksc;
   3555 
   3556 	rs = raidPtr->softc;
   3557 	dksc = &rs->sc_dksc;
   3558 
   3559 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3560 		return 1;
   3561 
   3562 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3563 		/* there is work to do */
   3564 		return 0;
   3565 	}
   3566 	/* default is nothing to do */
   3567 	return 1;
   3568 }
   3569 
   3570 int
   3571 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3572 {
   3573 	uint64_t numsecs;
   3574 	unsigned secsize;
   3575 	int error;
   3576 
   3577 	error = getdisksize(vp, &numsecs, &secsize);
   3578 	if (error == 0) {
   3579 		diskPtr->blockSize = secsize;
   3580 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3581 		diskPtr->partitionSize = numsecs;
   3582 		return 0;
   3583 	}
   3584 	return error;
   3585 }
   3586 
/* Autoconf match function: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3592 
/* Autoconf attach function: nothing to do at attach time; raid
   units are set up later when they are actually configured. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3597 
   3598 
   3599 static int
   3600 raid_detach(device_t self, int flags)
   3601 {
   3602 	int error;
   3603 	struct raid_softc *rs = raidsoftc(self);
   3604 
   3605 	if (rs == NULL)
   3606 		return ENXIO;
   3607 
   3608 	if ((error = raidlock(rs)) != 0)
   3609 		return (error);
   3610 
   3611 	error = raid_detach_unlocked(rs);
   3612 
   3613 	raidunlock(rs);
   3614 
   3615 	/* XXX raid can be referenced here */
   3616 
   3617 	if (error)
   3618 		return error;
   3619 
   3620 	/* Free the softc */
   3621 	raidput(rs);
   3622 
   3623 	return 0;
   3624 }
   3625 
   3626 static void
   3627 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3628 {
   3629 	struct dk_softc *dksc = &rs->sc_dksc;
   3630 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3631 
   3632 	memset(dg, 0, sizeof(*dg));
   3633 
   3634 	dg->dg_secperunit = raidPtr->totalSectors;
   3635 	dg->dg_secsize = raidPtr->bytesPerSector;
   3636 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3637 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3638 
   3639 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3640 }
   3641 
   3642 /*
   3643  * Get cache info for all the components (including spares).
   3644  * Returns intersection of all the cache flags of all disks, or first
   3645  * error if any encountered.
   3646  * XXXfua feature flags can change as spares are added - lock down somehow
   3647  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* Query DIOCGCACHE on every component and spare, folding the
	   results together with DKCACHE_COMBINE. */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/*
			 * NOTE(review): the combined flags are seeded
			 * only when c == 0; if component 0 is dead, the
			 * first live component is combined with the
			 * initial dkwhole of 0 instead -- confirm this
			 * is intended given DKCACHE_COMBINE's semantics.
			 */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3685 
   3686 /*
   3687  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3688  * We end up returning whatever error was returned by the first cache flush
   3689  * that fails.
   3690  */
   3691 
   3692 int
   3693 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3694 {
   3695 	int c, sparecol;
   3696 	int e,error;
   3697 	int force = 1;
   3698 
   3699 	error = 0;
   3700 	for (c = 0; c < raidPtr->numCol; c++) {
   3701 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3702 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3703 					  &force, FWRITE, NOCRED);
   3704 			if (e) {
   3705 				if (e != ENODEV)
   3706 					printf("raid%d: cache flush to component %s failed.\n",
   3707 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3708 				if (error == 0) {
   3709 					error = e;
   3710 				}
   3711 			}
   3712 		}
   3713 	}
   3714 
   3715 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3716 		sparecol = raidPtr->numCol + c;
   3717 		/* Need to ensure that the reconstruct actually completed! */
   3718 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3719 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3720 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3721 			if (e) {
   3722 				if (e != ENODEV)
   3723 					printf("raid%d: cache flush to component %s failed.\n",
   3724 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3725 				if (error == 0) {
   3726 					error = e;
   3727 				}
   3728 			}
   3729 		}
   3730 	}
   3731 	return error;
   3732 }
   3733 
   3734 /* Fill in info with the current status */
   3735 void
   3736 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3737 {
   3738 
   3739 	if (raidPtr->status != rf_rs_reconstructing) {
   3740 		info->total = 100;
   3741 		info->completed = 100;
   3742 	} else {
   3743 		info->total = raidPtr->reconControl->numRUsTotal;
   3744 		info->completed = raidPtr->reconControl->numRUsComplete;
   3745 	}
   3746 	info->remaining = info->total - info->completed;
   3747 }
   3748 
   3749 /* Fill in info with the current status */
   3750 void
   3751 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3752 {
   3753 
   3754 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3755 		info->total = raidPtr->Layout.numStripe;
   3756 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3757 	} else {
   3758 		info->completed = 100;
   3759 		info->total = 100;
   3760 	}
   3761 	info->remaining = info->total - info->completed;
   3762 }
   3763 
   3764 /* Fill in info with the current status */
   3765 void
   3766 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3767 {
   3768 
   3769 	if (raidPtr->copyback_in_progress == 1) {
   3770 		info->total = raidPtr->Layout.numStripe;
   3771 		info->completed = raidPtr->copyback_stripes_done;
   3772 		info->remaining = info->total - info->completed;
   3773 	} else {
   3774 		info->remaining = 0;
   3775 		info->completed = 100;
   3776 		info->total = 100;
   3777 	}
   3778 }
   3779 
   3780 /* Fill in config with the current info */
   3781 int
   3782 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3783 {
   3784 	int	d, i, j;
   3785 
   3786 	if (!raidPtr->valid)
   3787 		return (ENODEV);
   3788 	config->cols = raidPtr->numCol;
   3789 	config->ndevs = raidPtr->numCol;
   3790 	if (config->ndevs >= RF_MAX_DISKS)
   3791 		return (ENOMEM);
   3792 	config->nspares = raidPtr->numSpare;
   3793 	if (config->nspares >= RF_MAX_DISKS)
   3794 		return (ENOMEM);
   3795 	config->maxqdepth = raidPtr->maxQueueDepth;
   3796 	d = 0;
   3797 	for (j = 0; j < config->cols; j++) {
   3798 		config->devs[d] = raidPtr->Disks[j];
   3799 		d++;
   3800 	}
   3801 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3802 		config->spares[i] = raidPtr->Disks[j];
   3803 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3804 			/* XXX: raidctl(8) expects to see this as a used spare */
   3805 			config->spares[i].status = rf_ds_used_spare;
   3806 		}
   3807 	}
   3808 	return 0;
   3809 }
   3810 
   3811 int
   3812 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3813 {
   3814 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3815 	RF_ComponentLabel_t *raid_clabel;
   3816 	int column = clabel->column;
   3817 
   3818 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3819 		return EINVAL;
   3820 	raid_clabel = raidget_component_label(raidPtr, column);
   3821 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3822 
   3823 	return 0;
   3824 }
   3825 
   3826 /*
   3827  * Module interface
   3828  */
   3829 
   3830 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3831 
   3832 #ifdef _MODULE
   3833 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3834 #endif
   3835 
   3836 static int raid_modcmd(modcmd_t, void *);
   3837 static int raid_modcmd_init(void);
   3838 static int raid_modcmd_fini(void);
   3839 
   3840 static int
   3841 raid_modcmd(modcmd_t cmd, void *data)
   3842 {
   3843 	int error;
   3844 
   3845 	error = 0;
   3846 	switch (cmd) {
   3847 	case MODULE_CMD_INIT:
   3848 		error = raid_modcmd_init();
   3849 		break;
   3850 	case MODULE_CMD_FINI:
   3851 		error = raid_modcmd_fini();
   3852 		break;
   3853 	default:
   3854 		error = ENOTTY;
   3855 		break;
   3856 	}
   3857 	return error;
   3858 }
   3859 
/*
 * Module initialization: create the driver lock, attach the block and
 * character device switch entries, the cfdriver (module build only),
 * and the cfattach, boot the RAIDframe core, and register a finalizer
 * that auto-configures RAID sets once device discovery is complete.
 *
 * On any attach failure the steps already completed are rolled back in
 * reverse order and the error is returned.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (e.g. built-in); OK. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the earlier attaches in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	/* Auto-configuration has not run yet; the finalizer below does it. */
	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here: every failure path above returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Non-fatal: the module still loads without autoconfig. */
		error = 0;
	}

	return error;
}
   3930 
/*
 * Module teardown: the mirror image of raid_modcmd_init().  Refuses to
 * unload (EBUSY) while any raid unit exists.  Detaches the cfattach,
 * cfdriver (module build only), and devsw in that order; if a later
 * detach fails, the earlier ones are re-attached so the module is left
 * in a consistent, still-loaded state and the error is returned.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: re-attach cfdriver and cfattach. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3980