Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.376.4.5
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.376.4.5 2023/09/18 19:00:21 martin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.376.4.5 2023/09/18 19:00:21 martin Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    179 
    180 /* prototypes */
    181 static void KernelWakeupFunc(struct buf *);
    182 static void InitBP(struct buf *, struct vnode *, unsigned,
    183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    184     void *, int, struct proc *);
    185 static void raidinit(struct raid_softc *);
    186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    188 
    189 static int raid_match(device_t, cfdata_t, void *);
    190 static void raid_attach(device_t, device_t, void *);
    191 static int raid_detach(device_t, int);
    192 
    193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    194     daddr_t, daddr_t);
    195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t, int);
    197 
    198 static int raidwrite_component_label(unsigned,
    199     dev_t, struct vnode *, RF_ComponentLabel_t *);
    200 static int raidread_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 
    203 static int raid_diskstart(device_t, struct buf *bp);
    204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    205 static int raid_lastclose(device_t);
    206 
    207 static dev_type_open(raidopen);
    208 static dev_type_close(raidclose);
    209 static dev_type_read(raidread);
    210 static dev_type_write(raidwrite);
    211 static dev_type_ioctl(raidioctl);
    212 static dev_type_strategy(raidstrategy);
    213 static dev_type_dump(raiddump);
    214 static dev_type_size(raidsize);
    215 
/* Block device switch: raid presents itself as a disk (D_DISK). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character (raw) device switch. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Hooks handed to the common disk-driver framework (dk_*). */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    251 
    252 #define	raidunit(x)	DISKUNIT(x)
    253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
    260 /* Internal representation of a rf_recon_req */
    261 struct rf_recon_req_internal {
    262 	RF_RowCol_t col;
    263 	RF_ReconReqFlags_t flags;
    264 	void   *raidPtr;
    265 };
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
    276  * Even it if is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
    284 #ifndef RAIDOUTSTANDING
    285 #define RAIDOUTSTANDING   6
    286 #endif
    287 
    288 #define RAIDLABELDEV(dev)	\
    289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req_internal *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	sc->sc_unit = unit;
    342 	cv_init(&sc->sc_cv, "raidunit");
    343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    344 	return sc;
    345 }
    346 
    347 static void
    348 raiddestroy(struct raid_softc *sc) {
    349 	cv_destroy(&sc->sc_cv);
    350 	mutex_destroy(&sc->sc_mutex);
    351 	kmem_free(sc, sizeof(*sc));
    352 }
    353 
    354 static struct raid_softc *
    355 raidget(int unit, bool create) {
    356 	struct raid_softc *sc;
    357 	if (unit < 0) {
    358 #ifdef DIAGNOSTIC
    359 		panic("%s: unit %d!", __func__, unit);
    360 #endif
    361 		return NULL;
    362 	}
    363 	mutex_enter(&raid_lock);
    364 	LIST_FOREACH(sc, &raids, sc_link) {
    365 		if (sc->sc_unit == unit) {
    366 			mutex_exit(&raid_lock);
    367 			return sc;
    368 		}
    369 	}
    370 	mutex_exit(&raid_lock);
    371 	if (!create)
    372 		return NULL;
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
/*
 * Pseudo-device attach hook called by the autoconf framework.
 * Intentionally a no-op: per-unit setup now happens during module
 * initialization instead.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
    399 int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return (0);
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
/* Return the RAIDframe state embedded in the softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    446 
/* Return the unit number of this raid set. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    451 
/*
 * Check whether RAID set 'r' contains the device we booted from.
 *
 * Compares the xname of 'bdv' (e.g. "wd0") against each component's
 * device name with its "/dev/" prefix stripped; for dk(4) wedges the
 * wedge's parent disk name is compared instead, since the boot device
 * is reported as the parent disk.  Returns 1 on a match, 0 otherwise
 * (including when bdv is NULL).
 *
 * NOTE(review): the comparison is strncmp() over strlen(bootname),
 * i.e. a prefix match -- "wd1" would also match a component whose
 * name starts "wd10".  Confirm intent before tightening.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" of the component's name */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* a wedge: compare against its parent disk */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    482 
    483 void
    484 rf_buildroothack(RF_ConfigSet_t *config_sets)
    485 {
    486 	RF_ConfigSet_t *cset;
    487 	RF_ConfigSet_t *next_cset;
    488 	int num_root;
    489 	struct raid_softc *sc, *rsc;
    490 	struct dk_softc *dksc;
    491 
    492 	sc = rsc = NULL;
    493 	num_root = 0;
    494 	cset = config_sets;
    495 	while (cset != NULL) {
    496 		next_cset = cset->next;
    497 		if (rf_have_enough_components(cset) &&
    498 		    cset->ac->clabel->autoconfigure == 1) {
    499 			sc = rf_auto_config_set(cset);
    500 			if (sc != NULL) {
    501 				aprint_debug("raid%d: configured ok, rootable %d\n",
    502 				    sc->sc_unit, cset->rootable);
    503 				if (cset->rootable) {
    504 					rsc = sc;
    505 					num_root++;
    506 				}
    507 			} else {
    508 				/* The autoconfig didn't work :( */
    509 				aprint_debug("Autoconfig failed\n");
    510 				rf_release_all_vps(cset);
    511 			}
    512 		} else {
    513 			/* we're not autoconfiguring this set...
    514 			   release the associated resources */
    515 			rf_release_all_vps(cset);
    516 		}
    517 		/* cleanup */
    518 		rf_cleanup_config_set(cset);
    519 		cset = next_cset;
    520 	}
    521 	dksc = &rsc->sc_dksc;
    522 
    523 	/* if the user has specified what the root device should be
    524 	   then we don't touch booted_device or boothowto... */
    525 
    526 	if (rootspec != NULL) {
    527 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    528 		return;
    529 	}
    530 
    531 	/* we found something bootable... */
    532 
    533 	/*
    534 	 * XXX: The following code assumes that the root raid
    535 	 * is the first ('a') partition. This is about the best
    536 	 * we can do with a BSD disklabel, but we might be able
    537 	 * to do better with a GPT label, by setting a specified
    538 	 * attribute to indicate the root partition. We can then
    539 	 * stash the partition number in the r->root_partition
    540 	 * high bits (the bottom 2 bits are already used). For
    541 	 * now we just set booted_partition to 0 when we override
    542 	 * root.
    543 	 */
    544 	if (num_root == 1) {
    545 		device_t candidate_root;
    546 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    547 			char cname[sizeof(cset->ac->devname)];
    548 			/* XXX: assume partition 'a' first */
    549 			snprintf(cname, sizeof(cname), "%s%c",
    550 			    device_xname(dksc->sc_dev), 'a');
    551 			candidate_root = dkwedge_find_by_wname(cname);
    552 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    553 			    cname);
    554 			if (candidate_root == NULL) {
    555 				/*
    556 				 * If that is not found, because we don't use
    557 				 * disklabel, return the first dk child
    558 				 * XXX: we can skip the 'a' check above
    559 				 * and always do this...
    560 				 */
    561 				size_t i = 0;
    562 				candidate_root = dkwedge_find_by_parent(
    563 				    device_xname(dksc->sc_dev), &i);
    564 			}
    565 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    566 			    candidate_root);
    567 		} else
    568 			candidate_root = dksc->sc_dev;
    569 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    570 		DPRINTF("%s: booted_device=%p root_partition=%d "
    571 			"contains_boot=%d",
    572 		    __func__, booted_device, rsc->sc_r.root_partition,
    573 			   rf_containsboot(&rsc->sc_r, booted_device));
    574 		/* XXX the check for booted_device == NULL can probably be
    575 		 * dropped, now that rf_containsboot handles that case.
    576 		 */
    577 		if (booted_device == NULL ||
    578 		    rsc->sc_r.root_partition == 1 ||
    579 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    580 			booted_device = candidate_root;
    581 			booted_method = "raidframe/single";
    582 			booted_partition = 0;	/* XXX assume 'a' */
    583 		}
    584 	} else if (num_root > 1) {
    585 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    586 		    booted_device);
    587 
    588 		/*
    589 		 * Maybe the MD code can help. If it cannot, then
    590 		 * setroot() will discover that we have no
    591 		 * booted_device and will ask the user if nothing was
    592 		 * hardwired in the kernel config file
    593 		 */
    594 		if (booted_device == NULL)
    595 			return;
    596 
    597 		num_root = 0;
    598 		mutex_enter(&raid_lock);
    599 		LIST_FOREACH(sc, &raids, sc_link) {
    600 			RF_Raid_t *r = &sc->sc_r;
    601 			if (r->valid == 0)
    602 				continue;
    603 
    604 			if (r->root_partition == 0)
    605 				continue;
    606 
    607 			if (rf_containsboot(r, booted_device)) {
    608 				num_root++;
    609 				rsc = sc;
    610 				dksc = &rsc->sc_dksc;
    611 			}
    612 		}
    613 		mutex_exit(&raid_lock);
    614 
    615 		if (num_root == 1) {
    616 			booted_device = dksc->sc_dev;
    617 			booted_method = "raidframe/multi";
    618 			booted_partition = 0;	/* XXX assume 'a' */
    619 		} else {
    620 			/* we can't guess.. require the user to answer... */
    621 			boothowto |= RB_ASKNAME;
    622 		}
    623 	}
    624 }
    625 
    626 static int
    627 raidsize(dev_t dev)
    628 {
    629 	struct raid_softc *rs;
    630 	struct dk_softc *dksc;
    631 	unsigned int unit;
    632 
    633 	unit = raidunit(dev);
    634 	if ((rs = raidget(unit, false)) == NULL)
    635 		return -1;
    636 	dksc = &rs->sc_dksc;
    637 
    638 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    639 		return -1;
    640 
    641 	return dk_size(dksc, dev);
    642 }
    643 
/*
 * Kernel crash-dump entry point (bdevsw d_dump).
 *
 * Offsets the partition-relative block number past the component
 * label area and hands off to dk_dump(); the actual writing happens
 * in raid_dumpblocks() below.  DK_DUMP_RECURSIVE is passed because
 * the dump ultimately recurses into a component disk's own d_dump
 * routine (see raid_dumpblocks).
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
    668 
/*
 * Write 'nblk' blocks at 'blkno' straight to one live component of a
 * RAID 1 set during a crash dump (dkdriver d_dumpblocks hook).
 *
 * Only two-column RAID 1 layouts (1 data + 1 parity column) are
 * supported.  A single target component is chosen in preference
 * order (master, spared master, slave, spared slave) and the write
 * goes through that component's own bdevsw d_dump entry.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand the dump off to the chosen component's block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    774 
    775 /* ARGSUSED */
    776 static int
    777 raidopen(dev_t dev, int flags, int fmt,
    778     struct lwp *l)
    779 {
    780 	int     unit = raidunit(dev);
    781 	struct raid_softc *rs;
    782 	struct dk_softc *dksc;
    783 	int     error = 0;
    784 	int     part, pmask;
    785 
    786 	if ((rs = raidget(unit, true)) == NULL)
    787 		return ENXIO;
    788 	if ((error = raidlock(rs)) != 0)
    789 		return (error);
    790 
    791 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    792 		error = EBUSY;
    793 		goto bad;
    794 	}
    795 
    796 	dksc = &rs->sc_dksc;
    797 
    798 	part = DISKPART(dev);
    799 	pmask = (1 << part);
    800 
    801 	if (!DK_BUSY(dksc, pmask) &&
    802 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    803 		/* First one... mark things as dirty... Note that we *MUST*
    804 		 have done a configure before this.  I DO NOT WANT TO BE
    805 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    806 		 THAT THEY BELONG TOGETHER!!!!! */
    807 		/* XXX should check to see if we're only open for reading
    808 		   here... If so, we needn't do this, but then need some
    809 		   other way of keeping track of what's happened.. */
    810 
    811 		rf_markalldirty(&rs->sc_r);
    812 	}
    813 
    814 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    815 		error = dk_open(dksc, dev, flags, fmt, l);
    816 
    817 bad:
    818 	raidunlock(rs);
    819 
    820 	return (error);
    821 
    822 
    823 }
    824 
    825 static int
    826 raid_lastclose(device_t self)
    827 {
    828 	struct raid_softc *rs = raidsoftc(self);
    829 
    830 	/* Last one... device is not unconfigured yet.
    831 	   Device shutdown has taken care of setting the
    832 	   clean bits if RAIDF_INITED is not set
    833 	   mark things as clean... */
    834 
    835 	rf_update_component_labels(&rs->sc_r,
    836 	    RF_FINAL_COMPONENT_UPDATE);
    837 
    838 	/* pass to unlocked code */
    839 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    840 		rs->sc_flags |= RAIDF_DETACH;
    841 
    842 	return 0;
    843 }
    844 
/*
 * Close the raid device (bdevsw/cdevsw d_close).
 *
 * dk_close() does the real work and invokes raid_lastclose() on the
 * final close, which may set RAIDF_DETACH.  An unconfigured unit that
 * is shutting down is simply dropped via raidput().
 *
 * NOTE(review): the detach/put is deliberately performed after
 * raidunlock() -- presumably config_detach() can re-enter paths that
 * take the unit lock; confirm before reordering.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    884 
/*
 * Poke the per-set I/O thread: signal iodone_cv under iodone_lock so
 * the thread wakes and re-examines its work queues.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    892 
    893 static void
    894 raidstrategy(struct buf *bp)
    895 {
    896 	unsigned int unit;
    897 	struct raid_softc *rs;
    898 	struct dk_softc *dksc;
    899 	RF_Raid_t *raidPtr;
    900 
    901 	unit = raidunit(bp->b_dev);
    902 	if ((rs = raidget(unit, false)) == NULL) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    907 		bp->b_error = ENXIO;
    908 		goto fail;
    909 	}
    910 	dksc = &rs->sc_dksc;
    911 	raidPtr = &rs->sc_r;
    912 
    913 	/* Queue IO only */
    914 	if (dk_strategy_defer(dksc, bp))
    915 		goto done;
    916 
    917 	/* schedule the IO to happen at the next convenient time */
    918 	raid_wakeup(raidPtr);
    919 
    920 done:
    921 	return;
    922 
    923 fail:
    924 	bp->b_resid = bp->b_bcount;
    925 	biodone(bp);
    926 }
    927 
    928 static int
    929 raid_diskstart(device_t dev, struct buf *bp)
    930 {
    931 	struct raid_softc *rs = raidsoftc(dev);
    932 	RF_Raid_t *raidPtr;
    933 
    934 	raidPtr = &rs->sc_r;
    935 	if (!raidPtr->valid) {
    936 		db1_printf(("raid is not valid..\n"));
    937 		return ENODEV;
    938 	}
    939 
    940 	/* XXX */
    941 	bp->b_resid = 0;
    942 
    943 	return raiddoaccess(raidPtr, bp);
    944 }
    945 
    946 void
    947 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    948 {
    949 	struct raid_softc *rs;
    950 	struct dk_softc *dksc;
    951 
    952 	rs = raidPtr->softc;
    953 	dksc = &rs->sc_dksc;
    954 
    955 	dk_done(dksc, bp);
    956 
    957 	rf_lock_mutex2(raidPtr->mutex);
    958 	raidPtr->openings++;
    959 	rf_unlock_mutex2(raidPtr->mutex);
    960 
    961 	/* schedule more IO */
    962 	raid_wakeup(raidPtr);
    963 }
    964 
    965 /* ARGSUSED */
    966 static int
    967 raidread(dev_t dev, struct uio *uio, int flags)
    968 {
    969 	int     unit = raidunit(dev);
    970 	struct raid_softc *rs;
    971 
    972 	if ((rs = raidget(unit, false)) == NULL)
    973 		return ENXIO;
    974 
    975 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    976 		return (ENXIO);
    977 
    978 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    979 
    980 }
    981 
    982 /* ARGSUSED */
    983 static int
    984 raidwrite(dev_t dev, struct uio *uio, int flags)
    985 {
    986 	int     unit = raidunit(dev);
    987 	struct raid_softc *rs;
    988 
    989 	if ((rs = raidget(unit, false)) == NULL)
    990 		return ENXIO;
    991 
    992 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    993 		return (ENXIO);
    994 
    995 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    996 
    997 }
    998 
/*
 * Tear down a RAID set without holding the softc lock: shut down the
 * RAIDframe engine, drain queued buffers and detach the dk/disk
 * subsystems.  Returns EBUSY while the unit is open or a background
 * operation is still running.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/*
	 * Refuse to detach while the device is open or while
	 * reconstruction, parity rewrite or copyback is in progress.
	 */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing more to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1036 
   1037 static bool
   1038 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1039 {
   1040 	switch (cmd) {
   1041 	case RAIDFRAME_ADD_HOT_SPARE:
   1042 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1043 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1044 	case RAIDFRAME_CHECK_PARITY:
   1045 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1046 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1047 	case RAIDFRAME_CHECK_RECON_STATUS:
   1048 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1049 	case RAIDFRAME_COPYBACK:
   1050 	case RAIDFRAME_DELETE_COMPONENT:
   1051 	case RAIDFRAME_FAIL_DISK:
   1052 	case RAIDFRAME_GET_ACCTOTALS:
   1053 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1054 	case RAIDFRAME_GET_INFO:
   1055 	case RAIDFRAME_GET_SIZE:
   1056 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1057 	case RAIDFRAME_INIT_LABELS:
   1058 	case RAIDFRAME_KEEP_ACCTOTALS:
   1059 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1060 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1061 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1062 	case RAIDFRAME_PARITYMAP_STATUS:
   1063 	case RAIDFRAME_REBUILD_IN_PLACE:
   1064 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1065 	case RAIDFRAME_RESET_ACCTOTALS:
   1066 	case RAIDFRAME_REWRITEPARITY:
   1067 	case RAIDFRAME_SET_AUTOCONFIG:
   1068 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1069 	case RAIDFRAME_SET_LAST_UNIT:
   1070 	case RAIDFRAME_SET_ROOT:
   1071 	case RAIDFRAME_SHUTDOWN:
   1072 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1073 	}
   1074 	return false;
   1075 }
   1076 
   1077 int
   1078 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
   1079 {
   1080 	struct rf_recon_req_internal *rrint;
   1081 
   1082 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1083 		/* Can't do this on a RAID 0!! */
   1084 		return EINVAL;
   1085 	}
   1086 
   1087 	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
   1088 		/* bad column */
   1089 		return EINVAL;
   1090 	}
   1091 
   1092 	rf_lock_mutex2(raidPtr->mutex);
   1093 	if (raidPtr->status == rf_rs_reconstructing) {
   1094 		/* you can't fail a disk while we're reconstructing! */
   1095 		/* XXX wrong for RAID6 */
   1096 		goto out;
   1097 	}
   1098 	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
   1099 	    (raidPtr->numFailures > 0)) {
   1100 		/* some other component has failed.  Let's not make
   1101 		   things worse. XXX wrong for RAID6 */
   1102 		goto out;
   1103 	}
   1104 	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1105 		/* Can't fail a spared disk! */
   1106 		goto out;
   1107 	}
   1108 	rf_unlock_mutex2(raidPtr->mutex);
   1109 
   1110 	/* make a copy of the recon request so that we don't rely on
   1111 	 * the user's buffer */
   1112 	rrint = RF_Malloc(sizeof(*rrint));
   1113 	if (rrint == NULL)
   1114 		return(ENOMEM);
   1115 	rrint->col = rr->col;
   1116 	rrint->flags = rr->flags;
   1117 	rrint->raidPtr = raidPtr;
   1118 
   1119 	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
   1120 	    rrint, "raid_recon");
   1121 out:
   1122 	rf_unlock_mutex2(raidPtr->mutex);
   1123 	return EINVAL;
   1124 }
   1125 
   1126 static int
   1127 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1128 {
   1129 	/* allocate a buffer for the layout-specific data, and copy it in */
   1130 	if (k_cfg->layoutSpecificSize == 0)
   1131 		return 0;
   1132 
   1133 	if (k_cfg->layoutSpecificSize > 10000) {
   1134 	    /* sanity check */
   1135 	    return EINVAL;
   1136 	}
   1137 
   1138 	u_char *specific_buf;
   1139 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1140 	if (specific_buf == NULL)
   1141 		return ENOMEM;
   1142 
   1143 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1144 	    k_cfg->layoutSpecificSize);
   1145 	if (retcode) {
   1146 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1147 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1148 		return retcode;
   1149 	}
   1150 
   1151 	k_cfg->layoutSpecific = specific_buf;
   1152 	return 0;
   1153 }
   1154 
   1155 static int
   1156 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1157 {
   1158 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1159 
   1160 	if (rs->sc_r.valid) {
   1161 		/* There is a valid RAID set running on this unit! */
   1162 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1163 		return EINVAL;
   1164 	}
   1165 
   1166 	/* copy-in the configuration information */
   1167 	/* data points to a pointer to the configuration structure */
   1168 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1169 	if (*k_cfg == NULL) {
   1170 		return ENOMEM;
   1171 	}
   1172 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1173 	if (retcode == 0)
   1174 		return 0;
   1175 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1176 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1177 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1178 	return retcode;
   1179 }
   1180 
   1181 int
   1182 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1183 {
   1184 	int retcode, i;
   1185 	RF_Raid_t *raidPtr = &rs->sc_r;
   1186 
   1187 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1188 
   1189 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1190 		goto out;
   1191 
   1192 	/* should do some kind of sanity check on the configuration.
   1193 	 * Store the sum of all the bytes in the last byte? */
   1194 
   1195 	/* Force nul-termination on all strings. */
   1196 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
   1197 	for (i = 0; i < RF_MAXCOL; i++) {
   1198 		ZERO_FINAL(k_cfg->devnames[0][i]);
   1199 	}
   1200 	for (i = 0; i < RF_MAXSPARE; i++) {
   1201 		ZERO_FINAL(k_cfg->spare_names[i]);
   1202 	}
   1203 	for (i = 0; i < RF_MAXDBGV; i++) {
   1204 		ZERO_FINAL(k_cfg->debugVars[i]);
   1205 	}
   1206 #undef ZERO_FINAL
   1207 
   1208 	/* Check some basic limits. */
   1209 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
   1210 		retcode = EINVAL;
   1211 		goto out;
   1212 	}
   1213 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
   1214 		retcode = EINVAL;
   1215 		goto out;
   1216 	}
   1217 
   1218 	/* configure the system */
   1219 
   1220 	/*
   1221 	 * Clear the entire RAID descriptor, just to make sure
   1222 	 *  there is no stale data left in the case of a
   1223 	 *  reconfiguration
   1224 	 */
   1225 	memset(raidPtr, 0, sizeof(*raidPtr));
   1226 	raidPtr->softc = rs;
   1227 	raidPtr->raidid = rs->sc_unit;
   1228 
   1229 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1230 
   1231 	if (retcode == 0) {
   1232 		/* allow this many simultaneous IO's to
   1233 		   this RAID device */
   1234 		raidPtr->openings = RAIDOUTSTANDING;
   1235 
   1236 		raidinit(rs);
   1237 		raid_wakeup(raidPtr);
   1238 		rf_markalldirty(raidPtr);
   1239 	}
   1240 
   1241 	/* free the buffers.  No return code here. */
   1242 	if (k_cfg->layoutSpecificSize) {
   1243 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1244 	}
   1245 out:
   1246 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1247 	if (retcode) {
   1248 		/*
   1249 		 * If configuration failed, set sc_flags so that we
   1250 		 * will detach the device when we close it.
   1251 		 */
   1252 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1253 	}
   1254 	return retcode;
   1255 }
   1256 
#if RF_DISABLED
/*
 * Install a user-supplied component label for one column.  Currently
 * compiled out (RF_DISABLED): it only validates the column number and
 * then copies the label verbatim, which is unsafe -- users should
 * re-init labels instead of patching them.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1295 
   1296 static int
   1297 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1298 {
   1299 	/*
   1300 	   we only want the serial number from
   1301 	   the above.  We get all the rest of the information
   1302 	   from the config that was used to create this RAID
   1303 	   set.
   1304 	   */
   1305 
   1306 	raidPtr->serial_number = clabel->serial_number;
   1307 
   1308 	for (int column = 0; column < raidPtr->numCol; column++) {
   1309 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1310 		if (RF_DEAD_DISK(diskPtr->status))
   1311 			continue;
   1312 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1313 		    raidPtr, column);
   1314 		/* Zeroing this is important. */
   1315 		memset(ci_label, 0, sizeof(*ci_label));
   1316 		raid_init_component_label(raidPtr, ci_label);
   1317 		ci_label->serial_number = raidPtr->serial_number;
   1318 		ci_label->row = 0; /* we dont' pretend to support more */
   1319 		rf_component_label_set_partitionsize(ci_label,
   1320 		    diskPtr->partitionSize);
   1321 		ci_label->column = column;
   1322 		raidflush_component_label(raidPtr, column);
   1323 		/* XXXjld what about the spares? */
   1324 	}
   1325 
   1326 	return 0;
   1327 }
   1328 
   1329 static int
   1330 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
   1331 {
   1332 
   1333 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1334 		/* Can't do this on a RAID 0!! */
   1335 		return EINVAL;
   1336 	}
   1337 
   1338 	if (raidPtr->recon_in_progress == 1) {
   1339 		/* a reconstruct is already in progress! */
   1340 		return EINVAL;
   1341 	}
   1342 
   1343 	RF_SingleComponent_t component;
   1344 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1345 	component.row = 0; /* we don't support any more */
   1346 	int column = component.column;
   1347 
   1348 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1349 		return EINVAL;
   1350 	}
   1351 
   1352 	rf_lock_mutex2(raidPtr->mutex);
   1353 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1354 	    (raidPtr->numFailures > 0)) {
   1355 		/* XXX 0 above shouldn't be constant!!! */
   1356 		/* some component other than this has failed.
   1357 		   Let's not make things worse than they already
   1358 		   are... */
   1359 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1360 		       raidPtr->raidid);
   1361 		printf("raid%d:     Col: %d   Too many failures.\n",
   1362 		       raidPtr->raidid, column);
   1363 		rf_unlock_mutex2(raidPtr->mutex);
   1364 		return EINVAL;
   1365 	}
   1366 
   1367 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
   1368 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1369 		       raidPtr->raidid);
   1370 		printf("raid%d:    Col: %d   "
   1371 		    "Reconstruction already occurring!\n",
   1372 		    raidPtr->raidid, column);
   1373 
   1374 		rf_unlock_mutex2(raidPtr->mutex);
   1375 		return EINVAL;
   1376 	}
   1377 
   1378 	if (raidPtr->Disks[column].status == rf_ds_spared) {
   1379 		rf_unlock_mutex2(raidPtr->mutex);
   1380 		return EINVAL;
   1381 	}
   1382 
   1383 	rf_unlock_mutex2(raidPtr->mutex);
   1384 
   1385 	struct rf_recon_req_internal *rrint;
   1386 	rrint = RF_Malloc(sizeof(*rrint));
   1387 	if (rrint == NULL)
   1388 		return ENOMEM;
   1389 
   1390 	rrint->col = column;
   1391 	rrint->raidPtr = raidPtr;
   1392 
   1393 	return RF_CREATE_THREAD(raidPtr->recon_thread,
   1394 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
   1395 }
   1396 
   1397 static int
   1398 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1399 {
   1400 	/*
   1401 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1402 	 * so tell the user it's done.
   1403 	 */
   1404 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1405 	    raidPtr->status != rf_rs_reconstructing) {
   1406 		*data = 100;
   1407 		return 0;
   1408 	}
   1409 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1410 		*data = 0;
   1411 		return 0;
   1412 	}
   1413 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1414 	    / raidPtr->reconControl->numRUsTotal);
   1415 	return 0;
   1416 }
   1417 
   1418 /*
   1419  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1420  * on the component_name[] array.
   1421  */
   1422 static void
   1423 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1424 {
   1425 
   1426 	memcpy(component, data, sizeof *component);
   1427 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1428 }
   1429 
   1430 static int
   1431 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1432 {
   1433 	int     unit = raidunit(dev);
   1434 	int     part, pmask;
   1435 	struct raid_softc *rs;
   1436 	struct dk_softc *dksc;
   1437 	RF_Config_t *k_cfg;
   1438 	RF_Raid_t *raidPtr;
   1439 	RF_AccTotals_t *totals;
   1440 	RF_SingleComponent_t component;
   1441 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1442 	int retcode = 0;
   1443 	int column;
   1444 	RF_ComponentLabel_t *clabel;
   1445 	int d;
   1446 
   1447 	if ((rs = raidget(unit, false)) == NULL)
   1448 		return ENXIO;
   1449 
   1450 	dksc = &rs->sc_dksc;
   1451 	raidPtr = &rs->sc_r;
   1452 
   1453 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1454 	    (int) DISKPART(dev), (int) unit, cmd));
   1455 
   1456 	/* Must be initialized for these... */
   1457 	if (rf_must_be_initialized(rs, cmd))
   1458 		return ENXIO;
   1459 
   1460 	switch (cmd) {
   1461 		/* configure the system */
   1462 	case RAIDFRAME_CONFIGURE:
   1463 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1464 			return retcode;
   1465 		return rf_construct(rs, k_cfg);
   1466 
   1467 		/* shutdown the system */
   1468 	case RAIDFRAME_SHUTDOWN:
   1469 
   1470 		part = DISKPART(dev);
   1471 		pmask = (1 << part);
   1472 
   1473 		if ((retcode = raidlock(rs)) != 0)
   1474 			return retcode;
   1475 
   1476 		if (DK_BUSY(dksc, pmask) ||
   1477 		    raidPtr->recon_in_progress != 0 ||
   1478 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1479 		    raidPtr->copyback_in_progress != 0)
   1480 			retcode = EBUSY;
   1481 		else {
   1482 			/* detach and free on close */
   1483 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1484 			retcode = 0;
   1485 		}
   1486 
   1487 		raidunlock(rs);
   1488 
   1489 		return retcode;
   1490 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1491 		return rf_get_component_label(raidPtr, data);
   1492 
   1493 #if RF_DISABLED
   1494 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1495 		return rf_set_component_label(raidPtr, data);
   1496 #endif
   1497 
   1498 	case RAIDFRAME_INIT_LABELS:
   1499 		return rf_init_component_label(raidPtr, data);
   1500 
   1501 	case RAIDFRAME_SET_AUTOCONFIG:
   1502 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1503 		printf("raid%d: New autoconfig value is: %d\n",
   1504 		       raidPtr->raidid, d);
   1505 		*(int *) data = d;
   1506 		return retcode;
   1507 
   1508 	case RAIDFRAME_SET_ROOT:
   1509 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1510 		printf("raid%d: New rootpartition value is: %d\n",
   1511 		       raidPtr->raidid, d);
   1512 		*(int *) data = d;
   1513 		return retcode;
   1514 
   1515 		/* initialize all parity */
   1516 	case RAIDFRAME_REWRITEPARITY:
   1517 
   1518 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1519 			/* Parity for RAID 0 is trivially correct */
   1520 			raidPtr->parity_good = RF_RAID_CLEAN;
   1521 			return 0;
   1522 		}
   1523 
   1524 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1525 			/* Re-write is already in progress! */
   1526 			return EINVAL;
   1527 		}
   1528 
   1529 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1530 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1531 
   1532 	case RAIDFRAME_ADD_HOT_SPARE:
   1533 		rf_copy_single_component(&component, data);
   1534 		return rf_add_hot_spare(raidPtr, &component);
   1535 
   1536 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1537 		return retcode;
   1538 
   1539 	case RAIDFRAME_DELETE_COMPONENT:
   1540 		rf_copy_single_component(&component, data);
   1541 		return rf_delete_component(raidPtr, &component);
   1542 
   1543 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1544 		rf_copy_single_component(&component, data);
   1545 		return rf_incorporate_hot_spare(raidPtr, &component);
   1546 
   1547 	case RAIDFRAME_REBUILD_IN_PLACE:
   1548 		return rf_rebuild_in_place(raidPtr, data);
   1549 
   1550 	case RAIDFRAME_GET_INFO:
   1551 		ucfgp = *(RF_DeviceConfig_t **)data;
   1552 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1553 		if (d_cfg == NULL)
   1554 			return ENOMEM;
   1555 		retcode = rf_get_info(raidPtr, d_cfg);
   1556 		if (retcode == 0) {
   1557 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1558 		}
   1559 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1560 		return retcode;
   1561 
   1562 	case RAIDFRAME_CHECK_PARITY:
   1563 		*(int *) data = raidPtr->parity_good;
   1564 		return 0;
   1565 
   1566 	case RAIDFRAME_PARITYMAP_STATUS:
   1567 		if (rf_paritymap_ineligible(raidPtr))
   1568 			return EINVAL;
   1569 		rf_paritymap_status(raidPtr->parity_map, data);
   1570 		return 0;
   1571 
   1572 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1573 		if (rf_paritymap_ineligible(raidPtr))
   1574 			return EINVAL;
   1575 		if (raidPtr->parity_map == NULL)
   1576 			return ENOENT; /* ??? */
   1577 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1578 			return EINVAL;
   1579 		return 0;
   1580 
   1581 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1582 		if (rf_paritymap_ineligible(raidPtr))
   1583 			return EINVAL;
   1584 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1585 		return 0;
   1586 
   1587 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1588 		if (rf_paritymap_ineligible(raidPtr))
   1589 			return EINVAL;
   1590 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1591 		/* XXX should errors be passed up? */
   1592 		return 0;
   1593 
   1594 	case RAIDFRAME_RESET_ACCTOTALS:
   1595 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1596 		return 0;
   1597 
   1598 	case RAIDFRAME_GET_ACCTOTALS:
   1599 		totals = (RF_AccTotals_t *) data;
   1600 		*totals = raidPtr->acc_totals;
   1601 		return 0;
   1602 
   1603 	case RAIDFRAME_KEEP_ACCTOTALS:
   1604 		raidPtr->keep_acc_totals = *(int *)data;
   1605 		return 0;
   1606 
   1607 	case RAIDFRAME_GET_SIZE:
   1608 		*(int *) data = raidPtr->totalSectors;
   1609 		return 0;
   1610 
   1611 	case RAIDFRAME_FAIL_DISK:
   1612 		return rf_fail_disk(raidPtr, data);
   1613 
   1614 		/* invoke a copyback operation after recon on whatever disk
   1615 		 * needs it, if any */
   1616 	case RAIDFRAME_COPYBACK:
   1617 
   1618 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1619 			/* This makes no sense on a RAID 0!! */
   1620 			return EINVAL;
   1621 		}
   1622 
   1623 		if (raidPtr->copyback_in_progress == 1) {
   1624 			/* Copyback is already in progress! */
   1625 			return EINVAL;
   1626 		}
   1627 
   1628 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1629 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1630 
   1631 		/* return the percentage completion of reconstruction */
   1632 	case RAIDFRAME_CHECK_RECON_STATUS:
   1633 		return rf_check_recon_status(raidPtr, data);
   1634 
   1635 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1636 		rf_check_recon_status_ext(raidPtr, data);
   1637 		return 0;
   1638 
   1639 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1640 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1641 			/* This makes no sense on a RAID 0, so tell the
   1642 			   user it's done. */
   1643 			*(int *) data = 100;
   1644 			return 0;
   1645 		}
   1646 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1647 			*(int *) data = 100 *
   1648 				raidPtr->parity_rewrite_stripes_done /
   1649 				raidPtr->Layout.numStripe;
   1650 		} else {
   1651 			*(int *) data = 100;
   1652 		}
   1653 		return 0;
   1654 
   1655 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1656 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1657 		return 0;
   1658 
   1659 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1660 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1661 			/* This makes no sense on a RAID 0 */
   1662 			*(int *) data = 100;
   1663 			return 0;
   1664 		}
   1665 		if (raidPtr->copyback_in_progress == 1) {
   1666 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1667 				raidPtr->Layout.numStripe;
   1668 		} else {
   1669 			*(int *) data = 100;
   1670 		}
   1671 		return 0;
   1672 
   1673 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1674 		rf_check_copyback_status_ext(raidPtr, data);
   1675 		return 0;
   1676 
   1677 	case RAIDFRAME_SET_LAST_UNIT:
   1678 		for (column = 0; column < raidPtr->numCol; column++)
   1679 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1680 				return EBUSY;
   1681 
   1682 		for (column = 0; column < raidPtr->numCol; column++) {
   1683 			clabel = raidget_component_label(raidPtr, column);
   1684 			clabel->last_unit = *(int *)data;
   1685 			raidflush_component_label(raidPtr, column);
   1686 		}
   1687 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1688 		return 0;
   1689 
   1690 		/* the sparetable daemon calls this to wait for the kernel to
   1691 		 * need a spare table. this ioctl does not return until a
   1692 		 * spare table is needed. XXX -- calling mpsleep here in the
   1693 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1694 		 * -- I should either compute the spare table in the kernel,
   1695 		 * or have a different -- XXX XXX -- interface (a different
   1696 		 * character device) for delivering the table     -- XXX */
   1697 #if RF_DISABLED
   1698 	case RAIDFRAME_SPARET_WAIT:
   1699 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1700 		while (!rf_sparet_wait_queue)
   1701 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1702 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1703 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1704 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1705 
   1706 		/* structure assignment */
   1707 		*((RF_SparetWait_t *) data) = *waitreq;
   1708 
   1709 		RF_Free(waitreq, sizeof(*waitreq));
   1710 		return 0;
   1711 
   1712 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1713 		 * code in it that will cause the dameon to exit */
   1714 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1715 		waitreq = RF_Malloc(sizeof(*waitreq));
   1716 		waitreq->fcol = -1;
   1717 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1718 		waitreq->next = rf_sparet_wait_queue;
   1719 		rf_sparet_wait_queue = waitreq;
   1720 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1721 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1722 		return 0;
   1723 
   1724 		/* used by the spare table daemon to deliver a spare table
   1725 		 * into the kernel */
   1726 	case RAIDFRAME_SEND_SPARET:
   1727 
   1728 		/* install the spare table */
   1729 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1730 
   1731 		/* respond to the requestor.  the return status of the spare
   1732 		 * table installation is passed in the "fcol" field */
   1733 		waitred = RF_Malloc(sizeof(*waitreq));
   1734 		waitreq->fcol = retcode;
   1735 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1736 		waitreq->next = rf_sparet_resp_queue;
   1737 		rf_sparet_resp_queue = waitreq;
   1738 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1739 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1740 
   1741 		return retcode;
   1742 #endif
   1743 	default:
   1744 		/*
   1745 		 * Don't bother trying to load compat modules
   1746 		 * if it is not our ioctl. This is more efficient
   1747 		 * and makes rump tests not depend on compat code
   1748 		 */
   1749 		if (IOCGROUP(cmd) != 'r')
   1750 			break;
   1751 #ifdef _LP64
   1752 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1753 			module_autoload("compat_netbsd32_raid",
   1754 			    MODULE_CLASS_EXEC);
   1755 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1756 			    (rs, cmd, data), enosys(), retcode);
   1757 			if (retcode != EPASSTHROUGH)
   1758 				return retcode;
   1759 		}
   1760 #endif
   1761 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1762 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1763 		    (rs, cmd, data), enosys(), retcode);
   1764 		if (retcode != EPASSTHROUGH)
   1765 			return retcode;
   1766 
   1767 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1768 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1769 		    (rs, cmd, data), enosys(), retcode);
   1770 		if (retcode != EPASSTHROUGH)
   1771 			return retcode;
   1772 		break; /* fall through to the os-specific code below */
   1773 
   1774 	}
   1775 
   1776 	if (!raidPtr->valid)
   1777 		return (EINVAL);
   1778 
   1779 	/*
   1780 	 * Add support for "regular" device ioctls here.
   1781 	 */
   1782 
   1783 	switch (cmd) {
   1784 	case DIOCGCACHE:
   1785 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1786 		break;
   1787 
   1788 	case DIOCCACHESYNC:
   1789 		retcode = rf_sync_component_caches(raidPtr);
   1790 		break;
   1791 
   1792 	default:
   1793 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1794 		break;
   1795 	}
   1796 
   1797 	return (retcode);
   1798 
   1799 }
   1800 
   1801 
   1802 /* raidinit -- complete the rest of the initialization for the
   1803    RAIDframe device.  */
   1804 
   1805 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: the unit stays unconfigured (RAIDF_INITED
		 * is never set), so callers will treat it as not present. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Look for wedges (dk(4) partitions) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1861 
   1862 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1863 /* wake up the daemon & tell it to get us a spare table
   1864  * XXX
   1865  * the entries in the queues should be tagged with the raidPtr
   1866  * so that in the extremely rare case that two recons happen at once,
   1867  * we know for which device were requesting a spare table
   1868  * XXX
   1869  *
   1870  * XXX This code is not currently used. GO
   1871  */
/*
 * Queue a spare-table request for the user-level daemon and block until
 * a response appears on the response queue.  Returns the failed column
 * (fcol) reported in the response.  The response req is freed here; it
 * is a different allocation than the one passed in.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* wait (interruptible only by a response) for the daemon's answer */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* pop the head of the response queue; this is NOT the req we queued */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1895 #endif
   1896 
   1897 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1898  * bp & passes it down.
   1899  * any calls originating in the kernel must use non-blocking I/O
   1900  * do some extra sanity checking to return "appropriate" error values for
   1901  * certain conditions (to make some standard utilities work)
   1902  *
   1903  * Formerly known as: rf_DoAccessKernel
   1904  */
/* a wrapper that kicks the dk(4) layer to start queued I/O.
 * Before starting, flush any pending component-label updates caused by
 * newly failed components (numNewFailures), since a failure must be
 * recorded on the surviving components before more I/O proceeds.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex: rf_update_component_labels does its own
		 * locking and may sleep for I/O */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* let the dk layer pull buffers off sc_bufq and call raiddoaccess */
	dk_start(dksc, NULL);
}
   1931 
/*
 * Translate one struct buf into a RAIDframe access and hand it to
 * rf_DoAccess() (always non-blocking from the kernel).  Returns EAGAIN
 * when no openings are available (the dk layer will retry), ENOSPC for
 * out-of-range or non-sector-multiple requests, else rf_DoAccess()'s
 * return value.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* back-pressure: don't admit the request if no openings remain */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* request length in sectors; pb is 1 if there is a partial sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this branch; harmless since
	 * db1_printf() is compiled out unless debugging is enabled */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject accesses past the end of the array; the "sum <" tests
	 * also catch arithmetic wrap-around of the unsigned sum */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject requests that are not a multiple of the sector size */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume an opening; KernelWakeupFunc/iodone path returns it */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2004 
   2005 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2006 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

/*
 * Dispatch one RF_DiskQueueData_t to the component device: NOPs are
 * completed immediately via KernelWakeupFunc(); reads/writes are set up
 * with InitBP() and pushed down with bdev_strategy().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately through the normal path */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp so that completion calls KernelWakeupFunc(req) */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2080 /* this is the callback function associated with a I/O invoked from
   2081    kernel code.
   2082  */
/* this is the callback function associated with a I/O invoked from
   kernel code.
   Runs in biodone context: records the error, possibly marks the
   component as failed, and hands the request to the raidio thread via
   the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* b_private was set to the request by InitBP()/dispatch */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		/* i.e. only fail a component that is currently healthy
		   (optimal or used_spare) AND while we can still tolerate
		   another failure */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in raidstart */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2149 
   2150 
   2151 /*
   2152  * initialize a buf structure for doing an I/O in the kernel.
   2153  */
/*
 * initialize a buf structure for doing an I/O in the kernel.
 *
 * startSect/numSect are in component sectors (2^logBytesPerSector bytes);
 * b_blkno is converted to DEV_BSIZE units for the strategy routine.
 * cbFunc/cbArg become b_iodone/b_private, so completion re-enters
 * RAIDframe with the original request.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* sectors -> bytes -> DEV_BSIZE blocks */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2178 
   2179 /*
   2180  * Wait interruptibly for an exclusive lock.
   2181  *
   2182  * XXX
   2183  * Several drivers do this; it should be abstracted and made MP-safe.
   2184  * (Hmm... where have we seen this warning before :->  GO )
   2185  */
   2186 static int
   2187 raidlock(struct raid_softc *rs)
   2188 {
   2189 	int     error;
   2190 
   2191 	error = 0;
   2192 	mutex_enter(&rs->sc_mutex);
   2193 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2194 		rs->sc_flags |= RAIDF_WANTED;
   2195 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2196 		if (error != 0)
   2197 			goto done;
   2198 	}
   2199 	rs->sc_flags |= RAIDF_LOCKED;
   2200 done:
   2201 	mutex_exit(&rs->sc_mutex);
   2202 	return (error);
   2203 }
   2204 /*
   2205  * Unlock and wake up any waiters.
   2206  */
   2207 static void
   2208 raidunlock(struct raid_softc *rs)
   2209 {
   2210 
   2211 	mutex_enter(&rs->sc_mutex);
   2212 	rs->sc_flags &= ~RAIDF_LOCKED;
   2213 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2214 		rs->sc_flags &= ~RAIDF_WANTED;
   2215 		cv_broadcast(&rs->sc_cv);
   2216 	}
   2217 	mutex_exit(&rs->sc_mutex);
   2218 }
   2219 
   2220 
   2221 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2222 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2223 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2224 
   2225 static daddr_t
   2226 rf_component_info_offset(void)
   2227 {
   2228 
   2229 	return RF_COMPONENT_INFO_OFFSET;
   2230 }
   2231 
   2232 static daddr_t
   2233 rf_component_info_size(unsigned secsize)
   2234 {
   2235 	daddr_t info_size;
   2236 
   2237 	KASSERT(secsize);
   2238 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2239 		info_size = secsize;
   2240 	else
   2241 		info_size = RF_COMPONENT_INFO_SIZE;
   2242 
   2243 	return info_size;
   2244 }
   2245 
   2246 static daddr_t
   2247 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2248 {
   2249 	daddr_t map_offset;
   2250 
   2251 	KASSERT(raidPtr->bytesPerSector);
   2252 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2253 		map_offset = raidPtr->bytesPerSector;
   2254 	else
   2255 		map_offset = RF_COMPONENT_INFO_SIZE;
   2256 	map_offset += rf_component_info_offset();
   2257 
   2258 	return map_offset;
   2259 }
   2260 
   2261 static daddr_t
   2262 rf_parity_map_size(RF_Raid_t *raidPtr)
   2263 {
   2264 	daddr_t map_size;
   2265 
   2266 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2267 		map_size = raidPtr->bytesPerSector;
   2268 	else
   2269 		map_size = RF_PARITY_MAP_SIZE;
   2270 
   2271 	return map_size;
   2272 }
   2273 
   2274 int
   2275 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2276 {
   2277 	RF_ComponentLabel_t *clabel;
   2278 
   2279 	clabel = raidget_component_label(raidPtr, col);
   2280 	clabel->clean = RF_RAID_CLEAN;
   2281 	raidflush_component_label(raidPtr, col);
   2282 	return(0);
   2283 }
   2284 
   2285 
   2286 int
   2287 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2288 {
   2289 	RF_ComponentLabel_t *clabel;
   2290 
   2291 	clabel = raidget_component_label(raidPtr, col);
   2292 	clabel->clean = RF_RAID_DIRTY;
   2293 	raidflush_component_label(raidPtr, col);
   2294 	return(0);
   2295 }
   2296 
/*
 * Read the on-disk component label for column `col' into the in-core
 * copy (raid_cinfo[col].ci_label).  Returns 0 or an errno from the
 * underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2306 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2312 
/*
 * Write the in-core component label for column `col' back to disk,
 * first stamping it with the set's current mod_counter (and matching
 * parity-map modcount).  Returns 0 or an errno from the write.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2327 
   2328 
/*
 * Read a component label from the fixed label area of a component.
 * Thin wrapper around raidread_component_area() with label-specific
 * offset/size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2338 
   2339 /* ARGSUSED */
   2340 static int
   2341 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2342     size_t msize, daddr_t offset, daddr_t dsize)
   2343 {
   2344 	struct buf *bp;
   2345 	int error;
   2346 
   2347 	/* XXX should probably ensure that we don't try to do this if
   2348 	   someone has changed rf_protected_sectors. */
   2349 
   2350 	if (b_vp == NULL) {
   2351 		/* For whatever reason, this component is not valid.
   2352 		   Don't try to read a component label from it. */
   2353 		return(EINVAL);
   2354 	}
   2355 
   2356 	/* get a block of the appropriate size... */
   2357 	bp = geteblk((int)dsize);
   2358 	bp->b_dev = dev;
   2359 
   2360 	/* get our ducks in a row for the read */
   2361 	bp->b_blkno = offset / DEV_BSIZE;
   2362 	bp->b_bcount = dsize;
   2363 	bp->b_flags |= B_READ;
   2364  	bp->b_resid = dsize;
   2365 
   2366 	bdev_strategy(bp);
   2367 	error = biowait(bp);
   2368 
   2369 	if (!error) {
   2370 		memcpy(data, bp->b_data, msize);
   2371 	}
   2372 
   2373 	brelse(bp, 0);
   2374 	return(error);
   2375 }
   2376 
   2377 
/*
 * Write a component label to the fixed label area of a component.
 * Thin wrapper around raidwrite_component_area() (synchronous:
 * asyncp == 0).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2387 
   2388 /* ARGSUSED */
   2389 static int
   2390 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2391     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2392 {
   2393 	struct buf *bp;
   2394 	int error;
   2395 
   2396 	/* get a block of the appropriate size... */
   2397 	bp = geteblk((int)dsize);
   2398 	bp->b_dev = dev;
   2399 
   2400 	/* get our ducks in a row for the write */
   2401 	bp->b_blkno = offset / DEV_BSIZE;
   2402 	bp->b_bcount = dsize;
   2403 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2404  	bp->b_resid = dsize;
   2405 
   2406 	memset(bp->b_data, 0, dsize);
   2407 	memcpy(bp->b_data, data, msize);
   2408 
   2409 	bdev_strategy(bp);
   2410 	if (asyncp)
   2411 		return 0;
   2412 	error = biowait(bp);
   2413 	brelse(bp, 0);
   2414 	if (error) {
   2415 #if 1
   2416 		printf("Failed to write RAID component info!\n");
   2417 #endif
   2418 	}
   2419 
   2420 	return(error);
   2421 }
   2422 
/*
 * Write the on-disk parity map to every live component of the set.
 * Writes are synchronous; per-component write errors are ignored (see
 * XXXjld below).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2440 
/*
 * Read the parity map from every live component and combine them into
 * `map': the first component read initializes the map, subsequent ones
 * are merged in (rf_paritymap_merge), so a region dirty on any
 * component ends up dirty in the result.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2465 
/*
 * Bump the set's mod_counter and mark every live component (and every
 * in-use spare) dirty on disk.  Called so an unclean shutdown can be
 * detected: components are marked clean again only at proper shutdown.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			   scol keeps its previous value (initially -1) --
			   presumably that cannot happen for used spares */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2525 
   2526 
/*
 * Push fresh component labels to every optimal component and every
 * in-use spare, bumping mod_counter first.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE (shutdown) and parity is known good, the
 * components are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2604 
   2605 void
   2606 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2607 {
   2608 
   2609 	if (vp != NULL) {
   2610 		if (auto_configured == 1) {
   2611 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2612 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2613 			vput(vp);
   2614 
   2615 		} else {
   2616 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2617 		}
   2618 	}
   2619 }
   2620 
   2621 
   2622 void
   2623 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2624 {
   2625 	int r,c;
   2626 	struct vnode *vp;
   2627 	int acd;
   2628 
   2629 
   2630 	/* We take this opportunity to close the vnodes like we should.. */
   2631 
   2632 	for (c = 0; c < raidPtr->numCol; c++) {
   2633 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2634 		acd = raidPtr->Disks[c].auto_configured;
   2635 		rf_close_component(raidPtr, vp, acd);
   2636 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2637 		raidPtr->Disks[c].auto_configured = 0;
   2638 	}
   2639 
   2640 	for (r = 0; r < raidPtr->numSpare; r++) {
   2641 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2642 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2643 		rf_close_component(raidPtr, vp, acd);
   2644 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2645 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2646 	}
   2647 }
   2648 
   2649 
/*
 * Kernel-thread body for failing a disk and (optionally) starting
 * reconstruction onto a spare.  Frees `req' when done and exits the
 * thread; recon_in_progress brackets the work for onlookers.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON selects fail-with-reconstruction */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2671 
/*
 * Kernel-thread body for rewriting all parity.  On success the set's
 * parity_good flag is set so labels can be marked clean at shutdown.
 * Wakes anyone blocked in shutdown waiting for the rewrite to finish,
 * then exits the thread.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2704 
   2705 
/*
 * Kernel-thread body for copying reconstructed data from a spare back
 * to a replaced component; copyback_in_progress brackets the work.
 * Exits the thread when done.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2720 
   2721 
/*
 * Kernel-thread body for reconstructing a single component in place
 * (onto the same column, e.g. after a disk replacement).  Frees `req'
 * and exits the thread when done.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2739 
/*
 * Autoconfiguration helper: try to read a component label from the
 * device behind `vp'.  If the label is plausible (and the labeled
 * partition fits in `size' sectors) the component is prepended to
 * ac_list; otherwise the vnode is closed and released.  On allocation
 * failure the entire ac_list is torn down and NULL is returned.
 * Returns the (possibly extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: free every entry accumulated so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;	/* ownership moves to ac */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so drop the vnode too */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2797 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return them as a linked RF_AutoConfig_t list (NULL if none found).
 *
 * The scan is made in two passes: wedges (dk) first, then everything
 * else, so that a wedge covering a disk is preferred over that disk's
 * raw partition.  For each candidate the device is opened, probed
 * (wedge info or disklabel), and handed to rf_get_component(), which
 * takes over ownership of the vnode on success.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges are addressed by plain minor; whole disks
			   via the RAW_PART disklabel partition */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: only DKW_PTYPE_RAIDFRAME wedges
				   are candidates */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the (unlocked,
				   still-open) vnode from here */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3004 
   3005 
   3006 int
   3007 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3008 {
   3009 
   3010 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3011 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3012 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3013 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3014 	    clabel->row >=0 &&
   3015 	    clabel->column >= 0 &&
   3016 	    clabel->num_rows > 0 &&
   3017 	    clabel->num_columns > 0 &&
   3018 	    clabel->row < clabel->num_rows &&
   3019 	    clabel->column < clabel->num_columns &&
   3020 	    clabel->blockSize > 0 &&
   3021 	    /*
   3022 	     * numBlocksHi may contain garbage, but it is ok since
   3023 	     * the type is unsigned.  If it is really garbage,
   3024 	     * rf_fix_old_label_size() will fix it.
   3025 	     */
   3026 	    rf_component_label_numblocks(clabel) > 0) {
   3027 		/*
   3028 		 * label looks reasonable enough...
   3029 		 * let's make sure it has no old garbage.
   3030 		 */
   3031 		if (numsecs)
   3032 			rf_fix_old_label_size(clabel, numsecs);
   3033 		return(1);
   3034 	}
   3035 	return(0);
   3036 }
   3037 
   3038 
   3039 /*
   3040  * For reasons yet unknown, some old component labels have garbage in
   3041  * the newer numBlocksHi region, and this causes lossage.  Since those
   3042  * disks will also have numsecs set to less than 32 bits of sectors,
   3043  * we can determine when this corruption has occurred, and fix it.
   3044  *
   3045  * The exact same problem, with the same unknown reason, happens to
   3046  * the partitionSizeHi member as well.
   3047  */
   3048 static void
   3049 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3050 {
   3051 
   3052 	if (numsecs < ((uint64_t)1 << 32)) {
   3053 		if (clabel->numBlocksHi) {
   3054 			printf("WARNING: total sectors < 32 bits, yet "
   3055 			       "numBlocksHi set\n"
   3056 			       "WARNING: resetting numBlocksHi to zero.\n");
   3057 			clabel->numBlocksHi = 0;
   3058 		}
   3059 
   3060 		if (clabel->partitionSizeHi) {
   3061 			printf("WARNING: total sectors < 32 bits, yet "
   3062 			       "partitionSizeHi set\n"
   3063 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3064 			clabel->partitionSizeHi = 0;
   3065 		}
   3066 	}
   3067 }
   3068 
   3069 
#ifdef DEBUG
/* Dump a component label to the console in human-readable form
   (debug kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* decode of root_partition: 0 = No, 1 = Force, 2 = Soft;
	   index is masked with 3 below so out-of-range values print
	   "*invalid*" instead of overrunning the table */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3103 
   3104 RF_ConfigSet_t *
   3105 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3106 {
   3107 	RF_AutoConfig_t *ac;
   3108 	RF_ConfigSet_t *config_sets;
   3109 	RF_ConfigSet_t *cset;
   3110 	RF_AutoConfig_t *ac_next;
   3111 
   3112 
   3113 	config_sets = NULL;
   3114 
   3115 	/* Go through the AutoConfig list, and figure out which components
   3116 	   belong to what sets.  */
   3117 	ac = ac_list;
   3118 	while(ac!=NULL) {
   3119 		/* we're going to putz with ac->next, so save it here
   3120 		   for use at the end of the loop */
   3121 		ac_next = ac->next;
   3122 
   3123 		if (config_sets == NULL) {
   3124 			/* will need at least this one... */
   3125 			config_sets = (RF_ConfigSet_t *)
   3126 				malloc(sizeof(RF_ConfigSet_t),
   3127 				       M_RAIDFRAME, M_NOWAIT);
   3128 			if (config_sets == NULL) {
   3129 				panic("rf_create_auto_sets: No memory!");
   3130 			}
   3131 			/* this one is easy :) */
   3132 			config_sets->ac = ac;
   3133 			config_sets->next = NULL;
   3134 			config_sets->rootable = 0;
   3135 			ac->next = NULL;
   3136 		} else {
   3137 			/* which set does this component fit into? */
   3138 			cset = config_sets;
   3139 			while(cset!=NULL) {
   3140 				if (rf_does_it_fit(cset, ac)) {
   3141 					/* looks like it matches... */
   3142 					ac->next = cset->ac;
   3143 					cset->ac = ac;
   3144 					break;
   3145 				}
   3146 				cset = cset->next;
   3147 			}
   3148 			if (cset==NULL) {
   3149 				/* didn't find a match above... new set..*/
   3150 				cset = (RF_ConfigSet_t *)
   3151 					malloc(sizeof(RF_ConfigSet_t),
   3152 					       M_RAIDFRAME, M_NOWAIT);
   3153 				if (cset == NULL) {
   3154 					panic("rf_create_auto_sets: No memory!");
   3155 				}
   3156 				cset->ac = ac;
   3157 				ac->next = NULL;
   3158 				cset->next = config_sets;
   3159 				cset->rootable = 0;
   3160 				config_sets = cset;
   3161 			}
   3162 		}
   3163 		ac = ac_next;
   3164 	}
   3165 
   3166 
   3167 	return(config_sets);
   3168 }
   3169 
   3170 static int
   3171 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3172 {
   3173 	RF_ComponentLabel_t *clabel1, *clabel2;
   3174 
   3175 	/* If this one matches the *first* one in the set, that's good
   3176 	   enough, since the other members of the set would have been
   3177 	   through here too... */
   3178 	/* note that we are not checking partitionSize here..
   3179 
   3180 	   Note that we are also not checking the mod_counters here.
   3181 	   If everything else matches except the mod_counter, that's
   3182 	   good enough for this test.  We will deal with the mod_counters
   3183 	   a little later in the autoconfiguration process.
   3184 
   3185 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3186 
   3187 	   The reason we don't check for this is that failed disks
   3188 	   will have lower modification counts.  If those disks are
   3189 	   not added to the set they used to belong to, then they will
   3190 	   form their own set, which may result in 2 different sets,
   3191 	   for example, competing to be configured at raid0, and
   3192 	   perhaps competing to be the root filesystem set.  If the
   3193 	   wrong ones get configured, or both attempt to become /,
   3194 	   weird behaviour and or serious lossage will occur.  Thus we
   3195 	   need to bring them into the fold here, and kick them out at
   3196 	   a later point.
   3197 
   3198 	*/
   3199 
   3200 	clabel1 = cset->ac->clabel;
   3201 	clabel2 = ac->clabel;
   3202 	if ((clabel1->version == clabel2->version) &&
   3203 	    (clabel1->serial_number == clabel2->serial_number) &&
   3204 	    (clabel1->num_rows == clabel2->num_rows) &&
   3205 	    (clabel1->num_columns == clabel2->num_columns) &&
   3206 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3207 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3208 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3209 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3210 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3211 	    (clabel1->blockSize == clabel2->blockSize) &&
   3212 	    rf_component_label_numblocks(clabel1) ==
   3213 	    rf_component_label_numblocks(clabel2) &&
   3214 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3215 	    (clabel1->root_partition == clabel2->root_partition) &&
   3216 	    (clabel1->last_unit == clabel2->last_unit) &&
   3217 	    (clabel1->config_order == clabel2->config_order)) {
   3218 		/* if it get's here, it almost *has* to be a match */
   3219 	} else {
   3220 		/* it's not consistent with somebody in the set..
   3221 		   punt */
   3222 		return(0);
   3223 	}
   3224 	/* all was fine.. it must fit... */
   3225 	return(1);
   3226 }
   3227 
/*
 * Decide whether the config set has enough live components to be
 * configured.  Returns 1 if so, 0 if too many components are missing.
 *
 * A component counts as "present" for a column only if its
 * mod_counter equals the highest mod_counter seen in the set
 * (stale components from failed disks have lower counters).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* pass 1: the set's mod_counter is the maximum over all members */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* pass 2: for each column, look for an up-to-date component */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				/* RAID 1 mirrors columns pairwise
				   (even, odd); the set survives unless
				   BOTH halves of a pair are missing */
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3330 
   3331 void
   3332 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3333 			RF_Raid_t *raidPtr)
   3334 {
   3335 	RF_ComponentLabel_t *clabel;
   3336 	int i;
   3337 
   3338 	clabel = ac->clabel;
   3339 
   3340 	/* 1. Fill in the common stuff */
   3341 	config->numCol = clabel->num_columns;
   3342 	config->numSpare = 0; /* XXX should this be set here? */
   3343 	config->sectPerSU = clabel->sectPerSU;
   3344 	config->SUsPerPU = clabel->SUsPerPU;
   3345 	config->SUsPerRU = clabel->SUsPerRU;
   3346 	config->parityConfig = clabel->parityConfig;
   3347 	/* XXX... */
   3348 	strcpy(config->diskQueueType,"fifo");
   3349 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3350 	config->layoutSpecificSize = 0; /* XXX ?? */
   3351 
   3352 	while(ac!=NULL) {
   3353 		/* row/col values will be in range due to the checks
   3354 		   in reasonable_label() */
   3355 		strcpy(config->devnames[0][ac->clabel->column],
   3356 		       ac->devname);
   3357 		ac = ac->next;
   3358 	}
   3359 
   3360 	for(i=0;i<RF_MAXDBGV;i++) {
   3361 		config->debugVars[i][0] = 0;
   3362 	}
   3363 }
   3364 
   3365 int
   3366 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3367 {
   3368 	RF_ComponentLabel_t *clabel;
   3369 	int column;
   3370 	int sparecol;
   3371 
   3372 	raidPtr->autoconfigure = new_value;
   3373 
   3374 	for(column=0; column<raidPtr->numCol; column++) {
   3375 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3376 			clabel = raidget_component_label(raidPtr, column);
   3377 			clabel->autoconfigure = new_value;
   3378 			raidflush_component_label(raidPtr, column);
   3379 		}
   3380 	}
   3381 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3382 		sparecol = raidPtr->numCol + column;
   3383 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3384 			clabel = raidget_component_label(raidPtr, sparecol);
   3385 			clabel->autoconfigure = new_value;
   3386 			raidflush_component_label(raidPtr, sparecol);
   3387 		}
   3388 	}
   3389 	return(new_value);
   3390 }
   3391 
   3392 int
   3393 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3394 {
   3395 	RF_ComponentLabel_t *clabel;
   3396 	int column;
   3397 	int sparecol;
   3398 
   3399 	raidPtr->root_partition = new_value;
   3400 	for(column=0; column<raidPtr->numCol; column++) {
   3401 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3402 			clabel = raidget_component_label(raidPtr, column);
   3403 			clabel->root_partition = new_value;
   3404 			raidflush_component_label(raidPtr, column);
   3405 		}
   3406 	}
   3407 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3408 		sparecol = raidPtr->numCol + column;
   3409 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3410 			clabel = raidget_component_label(raidPtr, sparecol);
   3411 			clabel->root_partition = new_value;
   3412 			raidflush_component_label(raidPtr, sparecol);
   3413 		}
   3414 	}
   3415 	return(new_value);
   3416 }
   3417 
   3418 void
   3419 rf_release_all_vps(RF_ConfigSet_t *cset)
   3420 {
   3421 	RF_AutoConfig_t *ac;
   3422 
   3423 	ac = cset->ac;
   3424 	while(ac!=NULL) {
   3425 		/* Close the vp, and give it back */
   3426 		if (ac->vp) {
   3427 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3428 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3429 			vput(ac->vp);
   3430 			ac->vp = NULL;
   3431 		}
   3432 		ac = ac->next;
   3433 	}
   3434 }
   3435 
   3436 
   3437 void
   3438 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3439 {
   3440 	RF_AutoConfig_t *ac;
   3441 	RF_AutoConfig_t *next_ac;
   3442 
   3443 	ac = cset->ac;
   3444 	while(ac!=NULL) {
   3445 		next_ac = ac->next;
   3446 		/* nuke the label */
   3447 		free(ac->clabel, M_RAIDFRAME);
   3448 		/* cleanup the config structure */
   3449 		free(ac, M_RAIDFRAME);
   3450 		/* "next.." */
   3451 		ac = next_ac;
   3452 	}
   3453 	/* and, finally, nuke the config set */
   3454 	free(cset, M_RAIDFRAME);
   3455 }
   3456 
   3457 
/*
 * Initialize a component label from the current state of the array.
 * Per-component fields (row/column, partitionSize, ...) are NOT set
 * here; callers fill those in afterwards.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3490 
/*
 * Autoconfigure one config set: build an RF_Config_t from its labels,
 * find a free raid unit (preferring the unit it was last configured
 * as), and bring the array up.  Returns the configured softc, or NULL
 * on failure.  On success also records root-partition eligibility in
 * cset->rootable.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until we find an unused unit */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at that unit: create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3574 
   3575 void
   3576 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3577 	     size_t xmin, size_t xmax)
   3578 {
   3579 	int error;
   3580 
   3581 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3582 	pool_sethiwat(p, xmax);
   3583 	if ((error = pool_prime(p, xmin)) != 0)
   3584 		panic("%s: failed to prime pool: %d", __func__, error);
   3585 	pool_setlowat(p, xmin);
   3586 }
   3587 
   3588 /*
   3589  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3590  * to see if there is IO pending and if that IO could possibly be done
   3591  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3592  * otherwise.
   3593  *
   3594  */
   3595 int
   3596 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3597 {
   3598 	struct raid_softc *rs;
   3599 	struct dk_softc *dksc;
   3600 
   3601 	rs = raidPtr->softc;
   3602 	dksc = &rs->sc_dksc;
   3603 
   3604 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3605 		return 1;
   3606 
   3607 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3608 		/* there is work to do */
   3609 		return 0;
   3610 	}
   3611 	/* default is nothing to do */
   3612 	return 1;
   3613 }
   3614 
   3615 int
   3616 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3617 {
   3618 	uint64_t numsecs;
   3619 	unsigned secsize;
   3620 	int error;
   3621 
   3622 	error = getdisksize(vp, &numsecs, &secsize);
   3623 	if (error == 0) {
   3624 		diskPtr->blockSize = secsize;
   3625 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3626 		diskPtr->partitionSize = numsecs;
   3627 		return 0;
   3628 	}
   3629 	return error;
   3630 }
   3631 
/* autoconf match: the raid pseudo-device always matches */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3637 
/* autoconf attach: nothing to do here; real setup happens at
   configuration time (raidinit et al.) */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3642 
   3643 
   3644 static int
   3645 raid_detach(device_t self, int flags)
   3646 {
   3647 	int error;
   3648 	struct raid_softc *rs = raidsoftc(self);
   3649 
   3650 	if (rs == NULL)
   3651 		return ENXIO;
   3652 
   3653 	if ((error = raidlock(rs)) != 0)
   3654 		return (error);
   3655 
   3656 	error = raid_detach_unlocked(rs);
   3657 
   3658 	raidunlock(rs);
   3659 
   3660 	/* XXX raid can be referenced here */
   3661 
   3662 	if (error)
   3663 		return error;
   3664 
   3665 	/* Free the softc */
   3666 	raidput(rs);
   3667 
   3668 	return 0;
   3669 }
   3670 
/* Publish a disk geometry for the raid unit to the disk(9) layer. */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	/* NOTE(review): nsectors/ntracks below look like a synthetic
	   geometry derived from the stripe layout rather than physical
	   values; the "4 *" factor's rationale is not evident here. */
	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3686 
   3687 /*
   3688  * Get cache info for all the components (including spares).
   3689  * Returns intersection of all the cache flags of all disks, or first
   3690  * error if any encountered.
   3691  * XXXfua feature flags can change as spares are added - lock down somehow
   3692  */
   3693 static int
   3694 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3695 {
   3696 	int c;
   3697 	int error;
   3698 	int dkwhole = 0, dkpart;
   3699 
   3700 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3701 		/*
   3702 		 * Check any non-dead disk, even when currently being
   3703 		 * reconstructed.
   3704 		 */
   3705 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   3706 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3707 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3708 			if (error) {
   3709 				if (error != ENODEV) {
   3710 					printf("raid%d: get cache for component %s failed\n",
   3711 					    raidPtr->raidid,
   3712 					    raidPtr->Disks[c].devname);
   3713 				}
   3714 
   3715 				return error;
   3716 			}
   3717 
   3718 			if (c == 0)
   3719 				dkwhole = dkpart;
   3720 			else
   3721 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3722 		}
   3723 	}
   3724 
   3725 	*data = dkwhole;
   3726 
   3727 	return 0;
   3728 }
   3729 
   3730 /*
   3731  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3732  * We end up returning whatever error was returned by the first cache flush
   3733  * that fails.
   3734  */
   3735 
   3736 int
   3737 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3738 {
   3739 	int c, sparecol;
   3740 	int e,error;
   3741 	int force = 1;
   3742 
   3743 	error = 0;
   3744 	for (c = 0; c < raidPtr->numCol; c++) {
   3745 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3746 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3747 					  &force, FWRITE, NOCRED);
   3748 			if (e) {
   3749 				if (e != ENODEV)
   3750 					printf("raid%d: cache flush to component %s failed.\n",
   3751 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3752 				if (error == 0) {
   3753 					error = e;
   3754 				}
   3755 			}
   3756 		}
   3757 	}
   3758 
   3759 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3760 		sparecol = raidPtr->numCol + c;
   3761 		/* Need to ensure that the reconstruct actually completed! */
   3762 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3763 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3764 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3765 			if (e) {
   3766 				if (e != ENODEV)
   3767 					printf("raid%d: cache flush to component %s failed.\n",
   3768 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3769 				if (error == 0) {
   3770 					error = e;
   3771 				}
   3772 			}
   3773 		}
   3774 	}
   3775 	return error;
   3776 }
   3777 
   3778 /* Fill in info with the current status */
   3779 void
   3780 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3781 {
   3782 
   3783 	memset(info, 0, sizeof(*info));
   3784 
   3785 	if (raidPtr->status != rf_rs_reconstructing) {
   3786 		info->total = 100;
   3787 		info->completed = 100;
   3788 	} else {
   3789 		info->total = raidPtr->reconControl->numRUsTotal;
   3790 		info->completed = raidPtr->reconControl->numRUsComplete;
   3791 	}
   3792 	info->remaining = info->total - info->completed;
   3793 }
   3794 
   3795 /* Fill in info with the current status */
   3796 void
   3797 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3798 {
   3799 
   3800 	memset(info, 0, sizeof(*info));
   3801 
   3802 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3803 		info->total = raidPtr->Layout.numStripe;
   3804 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3805 	} else {
   3806 		info->completed = 100;
   3807 		info->total = 100;
   3808 	}
   3809 	info->remaining = info->total - info->completed;
   3810 }
   3811 
   3812 /* Fill in info with the current status */
   3813 void
   3814 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3815 {
   3816 
   3817 	memset(info, 0, sizeof(*info));
   3818 
   3819 	if (raidPtr->copyback_in_progress == 1) {
   3820 		info->total = raidPtr->Layout.numStripe;
   3821 		info->completed = raidPtr->copyback_stripes_done;
   3822 		info->remaining = info->total - info->completed;
   3823 	} else {
   3824 		info->remaining = 0;
   3825 		info->completed = 100;
   3826 		info->total = 100;
   3827 	}
   3828 }
   3829 
   3830 /* Fill in config with the current info */
   3831 int
   3832 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3833 {
   3834 	int	d, i, j;
   3835 
   3836 	if (!raidPtr->valid)
   3837 		return (ENODEV);
   3838 	config->cols = raidPtr->numCol;
   3839 	config->ndevs = raidPtr->numCol;
   3840 	if (config->ndevs >= RF_MAX_DISKS)
   3841 		return (ENOMEM);
   3842 	config->nspares = raidPtr->numSpare;
   3843 	if (config->nspares >= RF_MAX_DISKS)
   3844 		return (ENOMEM);
   3845 	config->maxqdepth = raidPtr->maxQueueDepth;
   3846 	d = 0;
   3847 	for (j = 0; j < config->cols; j++) {
   3848 		config->devs[d] = raidPtr->Disks[j];
   3849 		d++;
   3850 	}
   3851 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3852 		config->spares[i] = raidPtr->Disks[j];
   3853 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3854 			/* XXX: raidctl(8) expects to see this as a used spare */
   3855 			config->spares[i].status = rf_ds_used_spare;
   3856 		}
   3857 	}
   3858 	return 0;
   3859 }
   3860 
   3861 int
   3862 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3863 {
   3864 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3865 	RF_ComponentLabel_t *raid_clabel;
   3866 	int column = clabel->column;
   3867 
   3868 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3869 		return EINVAL;
   3870 	raid_clabel = raidget_component_label(raidPtr, column);
   3871 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3872 
   3873 	return 0;
   3874 }
   3875 
   3876 /*
   3877  * Module interface
   3878  */
   3879 
/* Module declaration; depends on the disk-subr and FCFS bufq modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a module we must supply the cfdriver ourselves. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3889 
   3890 static int
   3891 raid_modcmd(modcmd_t cmd, void *data)
   3892 {
   3893 	int error;
   3894 
   3895 	error = 0;
   3896 	switch (cmd) {
   3897 	case MODULE_CMD_INIT:
   3898 		error = raid_modcmd_init();
   3899 		break;
   3900 	case MODULE_CMD_FINI:
   3901 		error = raid_modcmd_fini();
   3902 		break;
   3903 	default:
   3904 		error = ENOTTY;
   3905 		break;
   3906 	}
   3907 	return error;
   3908 }
   3909 
/*
 * Module initialization: create the driver lock, attach the block and
 * character devsw entries, the cfdriver (module builds only) and the
 * cfattach, boot the RAIDframe core, and register a finalizer that
 * auto-configures RAID sets once device discovery is complete.  Each
 * attach step unwinds the previous ones on failure.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table fetch handshake state used by declustered parity. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick (or find) the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in case): OK. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver (module only) and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is necessarily 0 here -- every failure
	 * path above returned -- so this check is redundant but
	 * harmless.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Auto-config won't run, but the driver is still usable. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3980 
/*
 * Module teardown: refuse to unload while any raid device exists,
 * then detach the cfattach, cfdriver (module builds only) and devsw
 * in reverse order of attachment, shut down the RAIDframe core, and
 * destroy the driver lock.  A failed detach step re-attaches what was
 * already detached so the module is left in a consistent state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: restore cfdriver (module only) and cfattach. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (mirror of the init-time boot). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Destroy the spare-table handshake state created at init. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4030