/* rf_netbsdkintf.c, revision 1.376.4.3 (NetBSD raidframe) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.376.4.3 2022/08/12 15:18:13 martin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.376.4.3 2022/08/12 15:18:13 martin Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    179 
    180 /* prototypes */
    181 static void KernelWakeupFunc(struct buf *);
    182 static void InitBP(struct buf *, struct vnode *, unsigned,
    183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    184     void *, int, struct proc *);
    185 static void raidinit(struct raid_softc *);
    186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    188 
    189 static int raid_match(device_t, cfdata_t, void *);
    190 static void raid_attach(device_t, device_t, void *);
    191 static int raid_detach(device_t, int);
    192 
    193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    194     daddr_t, daddr_t);
    195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t, int);
    197 
    198 static int raidwrite_component_label(unsigned,
    199     dev_t, struct vnode *, RF_ComponentLabel_t *);
    200 static int raidread_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 
    203 static int raid_diskstart(device_t, struct buf *bp);
    204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    205 static int raid_lastclose(device_t);
    206 
    207 static dev_type_open(raidopen);
    208 static dev_type_close(raidclose);
    209 static dev_type_read(raidread);
    210 static dev_type_write(raidwrite);
    211 static dev_type_ioctl(raidioctl);
    212 static dev_type_strategy(raidstrategy);
    213 static dev_type_dump(raiddump);
    214 static dev_type_size(raidsize);
    215 
/* Block-device switch: entry points for the raid block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,		/* crash-dump support (RAID 1 only) */
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    226 
/* Character-device switch: raw (rraidN) device entry points. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    241 
/* dk(9) driver hooks used by the common disk framework for raid units. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,	/* hand queued bufs to RAIDframe */
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    251 
    252 #define	raidunit(x)	DISKUNIT(x)
    253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
/*
 * Internal representation of a rf_recon_req: the kernel-side copy of a
 * reconstruction request handed to the recon threads.
 */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column (component) to reconstruct */
	RF_ReconReqFlags_t flags;	/* request flags from userland */
	void   *raidPtr;		/* RF_Raid_t * of the affected set */
};
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
    284 #ifndef RAIDOUTSTANDING
    285 #define RAIDOUTSTANDING   6
    286 #endif
    287 
    288 #define RAIDLABELDEV(dev)	\
    289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req_internal *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	sc->sc_unit = unit;
    342 	cv_init(&sc->sc_cv, "raidunit");
    343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    344 	return sc;
    345 }
    346 
    347 static void
    348 raiddestroy(struct raid_softc *sc) {
    349 	cv_destroy(&sc->sc_cv);
    350 	mutex_destroy(&sc->sc_mutex);
    351 	kmem_free(sc, sizeof(*sc));
    352 }
    353 
    354 static struct raid_softc *
    355 raidget(int unit, bool create) {
    356 	struct raid_softc *sc;
    357 	if (unit < 0) {
    358 #ifdef DIAGNOSTIC
    359 		panic("%s: unit %d!", __func__, unit);
    360 #endif
    361 		return NULL;
    362 	}
    363 	mutex_enter(&raid_lock);
    364 	LIST_FOREACH(sc, &raids, sc_link) {
    365 		if (sc->sc_unit == unit) {
    366 			mutex_exit(&raid_lock);
    367 			return sc;
    368 		}
    369 	}
    370 	mutex_exit(&raid_lock);
    371 	if (!create)
    372 		return NULL;
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
/* Unlink sc from the global raid list, then destroy it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    388 
/*
 * Legacy pseudo-device attach entry point; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
/*
 * Auto-configure RAID sets at boot: scan all disks for RAIDframe
 * component labels, group them into sets, and configure the valid
 * ones (via rf_buildroothack()).  Runs at most once; returns 1 if a
 * scan was performed, 0 if autoconfig is disabled or already done.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
/* Accessor: the RAIDframe state embedded in this softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    446 
/* Accessor: the unit number of this raid softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    451 
/*
 * Return 1 if any component of set "r" lives on the boot device "bdv"
 * (for dk(4) wedge components, the wedge's parent disk is compared
 * instead), 0 otherwise.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" of the component path */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge component: match against the parent disk */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		/*
		 * NOTE(review): prefix match -- boot device "wd1" would
		 * also match a component on "wd10"; presumably acceptable
		 * here, but verify where unit numbers can reach 10+.
		 */
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    482 
/*
 * Configure all eligible auto-config sets and, unless the user
 * hardwired a root spec, try to point booted_device at a RAID set
 * that should provide the root filesystem.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;			/* count of rootable sets configured */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Configure every set that is complete and marked autoconfigure. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	/*
	 * NOTE(review): rsc may still be NULL here when no rootable set
	 * was found.  This only forms an address (no load through rsc),
	 * and dksc is not used unless num_root > 0 below -- but the code
	 * depends on that invariant.
	 */
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    625 
    626 static int
    627 raidsize(dev_t dev)
    628 {
    629 	struct raid_softc *rs;
    630 	struct dk_softc *dksc;
    631 	unsigned int unit;
    632 
    633 	unit = raidunit(dev);
    634 	if ((rs = raidget(unit, false)) == NULL)
    635 		return -1;
    636 	dksc = &rs->sc_dksc;
    637 
    638 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    639 		return -1;
    640 
    641 	return dk_size(dksc, dev);
    642 }
    643 
/*
 * d_dump entry point for crash dumps: translate the partition-relative
 * block number and hand off to dk_dump(), which will call back into
 * raid_dumpblocks().
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
    668 
/*
 * dk(9) dumpblocks hook: during a crash dump, write "nblk" blocks at
 * "blkno" directly to one live component of a RAID 1 set.  Returns
 * EINVAL for non-RAID1 layouts or when no live component exists.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight through the component's own block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    774 
/* ARGSUSED */
/*
 * Open entry point: creates the softc on first reference, marks the
 * set's components dirty on the first busy open of a configured set,
 * then defers to dk_open().  Fails with EBUSY while shutting down.
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    824 
    825 static int
    826 raid_lastclose(device_t self)
    827 {
    828 	struct raid_softc *rs = raidsoftc(self);
    829 
    830 	/* Last one... device is not unconfigured yet.
    831 	   Device shutdown has taken care of setting the
    832 	   clean bits if RAIDF_INITED is not set
    833 	   mark things as clean... */
    834 
    835 	rf_update_component_labels(&rs->sc_r,
    836 	    RF_FINAL_COMPONENT_UPDATE);
    837 
    838 	/* pass to unlocked code */
    839 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    840 		rs->sc_flags |= RAIDF_DETACH;
    841 
    842 	return 0;
    843 }
    844 
/* ARGSUSED */
/*
 * Close entry point: routes through dk_close() for configured sets and,
 * after dropping the raid lock, performs any deferred detach (set by
 * raid_lastclose()) or frees a never-configured softc on shutdown.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* detach/put must happen with the raid lock released */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    884 
/* Signal iodone_cv so whoever waits on it picks up queued I/O work. */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    892 
    893 static void
    894 raidstrategy(struct buf *bp)
    895 {
    896 	unsigned int unit;
    897 	struct raid_softc *rs;
    898 	struct dk_softc *dksc;
    899 	RF_Raid_t *raidPtr;
    900 
    901 	unit = raidunit(bp->b_dev);
    902 	if ((rs = raidget(unit, false)) == NULL) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    907 		bp->b_error = ENXIO;
    908 		goto fail;
    909 	}
    910 	dksc = &rs->sc_dksc;
    911 	raidPtr = &rs->sc_r;
    912 
    913 	/* Queue IO only */
    914 	if (dk_strategy_defer(dksc, bp))
    915 		goto done;
    916 
    917 	/* schedule the IO to happen at the next convenient time */
    918 	raid_wakeup(raidPtr);
    919 
    920 done:
    921 	return;
    922 
    923 fail:
    924 	bp->b_resid = bp->b_bcount;
    925 	biodone(bp);
    926 }
    927 
    928 static int
    929 raid_diskstart(device_t dev, struct buf *bp)
    930 {
    931 	struct raid_softc *rs = raidsoftc(dev);
    932 	RF_Raid_t *raidPtr;
    933 
    934 	raidPtr = &rs->sc_r;
    935 	if (!raidPtr->valid) {
    936 		db1_printf(("raid is not valid..\n"));
    937 		return ENODEV;
    938 	}
    939 
    940 	/* XXX */
    941 	bp->b_resid = 0;
    942 
    943 	return raiddoaccess(raidPtr, bp);
    944 }
    945 
    946 void
    947 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    948 {
    949 	struct raid_softc *rs;
    950 	struct dk_softc *dksc;
    951 
    952 	rs = raidPtr->softc;
    953 	dksc = &rs->sc_dksc;
    954 
    955 	dk_done(dksc, bp);
    956 
    957 	rf_lock_mutex2(raidPtr->mutex);
    958 	raidPtr->openings++;
    959 	rf_unlock_mutex2(raidPtr->mutex);
    960 
    961 	/* schedule more IO */
    962 	raid_wakeup(raidPtr);
    963 }
    964 
    965 /* ARGSUSED */
    966 static int
    967 raidread(dev_t dev, struct uio *uio, int flags)
    968 {
    969 	int     unit = raidunit(dev);
    970 	struct raid_softc *rs;
    971 
    972 	if ((rs = raidget(unit, false)) == NULL)
    973 		return ENXIO;
    974 
    975 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    976 		return (ENXIO);
    977 
    978 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    979 
    980 }
    981 
    982 /* ARGSUSED */
    983 static int
    984 raidwrite(dev_t dev, struct uio *uio, int flags)
    985 {
    986 	int     unit = raidunit(dev);
    987 	struct raid_softc *rs;
    988 
    989 	if ((rs = raidget(unit, false)) == NULL)
    990 		return ENXIO;
    991 
    992 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    993 		return (ENXIO);
    994 
    995 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    996 
    997 }
    998 
/*
 * Tear down a configured RAID set: shut down RAIDframe, drain and free
 * the buffer queue, and detach the dk/disk layers.  Fails with EBUSY
 * while the device is open or a recon/parity-rewrite/copyback is
 * running.  NOTE(review): despite the name, callers appear to hold the
 * softc lock around this — confirm against call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations run. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Not configured: nothing to shut down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1036 
   1037 static bool
   1038 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1039 {
   1040 	switch (cmd) {
   1041 	case RAIDFRAME_ADD_HOT_SPARE:
   1042 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1043 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1044 	case RAIDFRAME_CHECK_PARITY:
   1045 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1046 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1047 	case RAIDFRAME_CHECK_RECON_STATUS:
   1048 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1049 	case RAIDFRAME_COPYBACK:
   1050 	case RAIDFRAME_DELETE_COMPONENT:
   1051 	case RAIDFRAME_FAIL_DISK:
   1052 	case RAIDFRAME_GET_ACCTOTALS:
   1053 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1054 	case RAIDFRAME_GET_INFO:
   1055 	case RAIDFRAME_GET_SIZE:
   1056 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1057 	case RAIDFRAME_INIT_LABELS:
   1058 	case RAIDFRAME_KEEP_ACCTOTALS:
   1059 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1060 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1061 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1062 	case RAIDFRAME_PARITYMAP_STATUS:
   1063 	case RAIDFRAME_REBUILD_IN_PLACE:
   1064 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1065 	case RAIDFRAME_RESET_ACCTOTALS:
   1066 	case RAIDFRAME_REWRITEPARITY:
   1067 	case RAIDFRAME_SET_AUTOCONFIG:
   1068 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1069 	case RAIDFRAME_SET_ROOT:
   1070 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1071 	}
   1072 	return false;
   1073 }
   1074 
/*
 * Mark the component in column rr->col as failed and spawn a
 * reconstruction thread.  Returns EINVAL when the request cannot be
 * honoured (RAID 0, bad column, recon running, component spared or
 * another component already failed), ENOMEM on allocation failure.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* Status checks must be made with the raid mutex held. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): ownership of rrint appears to pass to the recon
	   thread — confirm it is freed in rf_ReconThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1123 
   1124 static int
   1125 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1126 {
   1127 	/* allocate a buffer for the layout-specific data, and copy it in */
   1128 	if (k_cfg->layoutSpecificSize == 0)
   1129 		return 0;
   1130 
   1131 	if (k_cfg->layoutSpecificSize > 10000) {
   1132 	    /* sanity check */
   1133 	    return EINVAL;
   1134 	}
   1135 
   1136 	u_char *specific_buf;
   1137 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1138 	if (specific_buf == NULL)
   1139 		return ENOMEM;
   1140 
   1141 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1142 	    k_cfg->layoutSpecificSize);
   1143 	if (retcode) {
   1144 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1145 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1146 		return retcode;
   1147 	}
   1148 
   1149 	k_cfg->layoutSpecific = specific_buf;
   1150 	return 0;
   1151 }
   1152 
   1153 static int
   1154 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1155 {
   1156 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1157 
   1158 	if (rs->sc_r.valid) {
   1159 		/* There is a valid RAID set running on this unit! */
   1160 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1161 		return EINVAL;
   1162 	}
   1163 
   1164 	/* copy-in the configuration information */
   1165 	/* data points to a pointer to the configuration structure */
   1166 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1167 	if (*k_cfg == NULL) {
   1168 		return ENOMEM;
   1169 	}
   1170 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1171 	if (retcode == 0)
   1172 		return 0;
   1173 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1174 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1175 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1176 	return retcode;
   1177 }
   1178 
   1179 int
   1180 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1181 {
   1182 	int retcode, i;
   1183 	RF_Raid_t *raidPtr = &rs->sc_r;
   1184 
   1185 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1186 
   1187 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1188 		goto out;
   1189 
   1190 	/* should do some kind of sanity check on the configuration.
   1191 	 * Store the sum of all the bytes in the last byte? */
   1192 
   1193 	/* Force nul-termination on all strings. */
   1194 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
   1195 	for (i = 0; i < RF_MAXCOL; i++) {
   1196 		ZERO_FINAL(k_cfg->devnames[0][i]);
   1197 	}
   1198 	for (i = 0; i < RF_MAXSPARE; i++) {
   1199 		ZERO_FINAL(k_cfg->spare_names[i]);
   1200 	}
   1201 	for (i = 0; i < RF_MAXDBGV; i++) {
   1202 		ZERO_FINAL(k_cfg->debugVars[i]);
   1203 	}
   1204 #undef ZERO_FINAL
   1205 
   1206 	/* Check some basic limits. */
   1207 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
   1208 		retcode = EINVAL;
   1209 		goto out;
   1210 	}
   1211 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
   1212 		retcode = EINVAL;
   1213 		goto out;
   1214 	}
   1215 
   1216 	/* configure the system */
   1217 
   1218 	/*
   1219 	 * Clear the entire RAID descriptor, just to make sure
   1220 	 *  there is no stale data left in the case of a
   1221 	 *  reconfiguration
   1222 	 */
   1223 	memset(raidPtr, 0, sizeof(*raidPtr));
   1224 	raidPtr->softc = rs;
   1225 	raidPtr->raidid = rs->sc_unit;
   1226 
   1227 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1228 
   1229 	if (retcode == 0) {
   1230 		/* allow this many simultaneous IO's to
   1231 		   this RAID device */
   1232 		raidPtr->openings = RAIDOUTSTANDING;
   1233 
   1234 		raidinit(rs);
   1235 		raid_wakeup(raidPtr);
   1236 		rf_markalldirty(raidPtr);
   1237 	}
   1238 
   1239 	/* free the buffers.  No return code here. */
   1240 	if (k_cfg->layoutSpecificSize) {
   1241 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1242 	}
   1243 out:
   1244 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1245 	if (retcode) {
   1246 		/*
   1247 		 * If configuration failed, set sc_flags so that we
   1248 		 * will detach the device when we close it.
   1249 		 */
   1250 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1251 	}
   1252 	return retcode;
   1253 }
   1254 
#if RF_DISABLED
/*
 * Overwrite one component label with user-supplied contents.
 * Compiled out (RF_DISABLED): labels should be re-initialized via
 * RAIDFRAME_INIT_LABELS rather than patched in place.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	/* Reject out-of-range columns. */
	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1293 
   1294 static int
   1295 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1296 {
   1297 	/*
   1298 	   we only want the serial number from
   1299 	   the above.  We get all the rest of the information
   1300 	   from the config that was used to create this RAID
   1301 	   set.
   1302 	   */
   1303 
   1304 	raidPtr->serial_number = clabel->serial_number;
   1305 
   1306 	for (int column = 0; column < raidPtr->numCol; column++) {
   1307 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1308 		if (RF_DEAD_DISK(diskPtr->status))
   1309 			continue;
   1310 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1311 		    raidPtr, column);
   1312 		/* Zeroing this is important. */
   1313 		memset(ci_label, 0, sizeof(*ci_label));
   1314 		raid_init_component_label(raidPtr, ci_label);
   1315 		ci_label->serial_number = raidPtr->serial_number;
   1316 		ci_label->row = 0; /* we dont' pretend to support more */
   1317 		rf_component_label_set_partitionsize(ci_label,
   1318 		    diskPtr->partitionSize);
   1319 		ci_label->column = column;
   1320 		raidflush_component_label(raidPtr, column);
   1321 		/* XXXjld what about the spares? */
   1322 	}
   1323 
   1324 	return 0;
   1325 }
   1326 
/*
 * Rebuild a failed component "in place" (onto the same column it
 * occupied) by spawning rf_ReconstructInPlaceThread.  Returns EINVAL
 * when the rebuild is not possible, ENOMEM on allocation failure.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy; don't trust the caller's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Disk-status checks are made with the raid mutex held. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Can't rebuild a spared component in place. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): ownership of rrint appears to pass to the
	   thread — confirm it is freed in rf_ReconstructInPlaceThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1394 
   1395 static int
   1396 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1397 {
   1398 	/*
   1399 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1400 	 * so tell the user it's done.
   1401 	 */
   1402 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1403 	    raidPtr->status != rf_rs_reconstructing) {
   1404 		*data = 100;
   1405 		return 0;
   1406 	}
   1407 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1408 		*data = 0;
   1409 		return 0;
   1410 	}
   1411 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1412 	    / raidPtr->reconControl->numRUsTotal);
   1413 	return 0;
   1414 }
   1415 
   1416 /*
   1417  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1418  * on the component_name[] array.
   1419  */
   1420 static void
   1421 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1422 {
   1423 
   1424 	memcpy(component, data, sizeof *component);
   1425 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1426 }
   1427 
   1428 static int
   1429 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1430 {
   1431 	int     unit = raidunit(dev);
   1432 	int     part, pmask;
   1433 	struct raid_softc *rs;
   1434 	struct dk_softc *dksc;
   1435 	RF_Config_t *k_cfg;
   1436 	RF_Raid_t *raidPtr;
   1437 	RF_AccTotals_t *totals;
   1438 	RF_SingleComponent_t component;
   1439 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1440 	int retcode = 0;
   1441 	int column;
   1442 	RF_ComponentLabel_t *clabel;
   1443 	int d;
   1444 
   1445 	if ((rs = raidget(unit, false)) == NULL)
   1446 		return ENXIO;
   1447 
   1448 	dksc = &rs->sc_dksc;
   1449 	raidPtr = &rs->sc_r;
   1450 
   1451 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1452 	    (int) DISKPART(dev), (int) unit, cmd));
   1453 
   1454 	/* Must be initialized for these... */
   1455 	if (rf_must_be_initialized(rs, cmd))
   1456 		return ENXIO;
   1457 
   1458 	switch (cmd) {
   1459 		/* configure the system */
   1460 	case RAIDFRAME_CONFIGURE:
   1461 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1462 			return retcode;
   1463 		return rf_construct(rs, k_cfg);
   1464 
   1465 		/* shutdown the system */
   1466 	case RAIDFRAME_SHUTDOWN:
   1467 
   1468 		part = DISKPART(dev);
   1469 		pmask = (1 << part);
   1470 
   1471 		if ((retcode = raidlock(rs)) != 0)
   1472 			return retcode;
   1473 
   1474 		if (DK_BUSY(dksc, pmask) ||
   1475 		    raidPtr->recon_in_progress != 0 ||
   1476 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1477 		    raidPtr->copyback_in_progress != 0)
   1478 			retcode = EBUSY;
   1479 		else {
   1480 			/* detach and free on close */
   1481 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1482 			retcode = 0;
   1483 		}
   1484 
   1485 		raidunlock(rs);
   1486 
   1487 		return retcode;
   1488 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1489 		return rf_get_component_label(raidPtr, data);
   1490 
   1491 #if RF_DISABLED
   1492 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1493 		return rf_set_component_label(raidPtr, data);
   1494 #endif
   1495 
   1496 	case RAIDFRAME_INIT_LABELS:
   1497 		return rf_init_component_label(raidPtr, data);
   1498 
   1499 	case RAIDFRAME_SET_AUTOCONFIG:
   1500 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1501 		printf("raid%d: New autoconfig value is: %d\n",
   1502 		       raidPtr->raidid, d);
   1503 		*(int *) data = d;
   1504 		return retcode;
   1505 
   1506 	case RAIDFRAME_SET_ROOT:
   1507 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1508 		printf("raid%d: New rootpartition value is: %d\n",
   1509 		       raidPtr->raidid, d);
   1510 		*(int *) data = d;
   1511 		return retcode;
   1512 
   1513 		/* initialize all parity */
   1514 	case RAIDFRAME_REWRITEPARITY:
   1515 
   1516 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1517 			/* Parity for RAID 0 is trivially correct */
   1518 			raidPtr->parity_good = RF_RAID_CLEAN;
   1519 			return 0;
   1520 		}
   1521 
   1522 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1523 			/* Re-write is already in progress! */
   1524 			return EINVAL;
   1525 		}
   1526 
   1527 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1528 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1529 
   1530 	case RAIDFRAME_ADD_HOT_SPARE:
   1531 		rf_copy_single_component(&component, data);
   1532 		return rf_add_hot_spare(raidPtr, &component);
   1533 
   1534 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1535 		return retcode;
   1536 
   1537 	case RAIDFRAME_DELETE_COMPONENT:
   1538 		rf_copy_single_component(&component, data);
   1539 		return rf_delete_component(raidPtr, &component);
   1540 
   1541 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1542 		rf_copy_single_component(&component, data);
   1543 		return rf_incorporate_hot_spare(raidPtr, &component);
   1544 
   1545 	case RAIDFRAME_REBUILD_IN_PLACE:
   1546 		return rf_rebuild_in_place(raidPtr, data);
   1547 
   1548 	case RAIDFRAME_GET_INFO:
   1549 		ucfgp = *(RF_DeviceConfig_t **)data;
   1550 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1551 		if (d_cfg == NULL)
   1552 			return ENOMEM;
   1553 		retcode = rf_get_info(raidPtr, d_cfg);
   1554 		if (retcode == 0) {
   1555 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1556 		}
   1557 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1558 		return retcode;
   1559 
   1560 	case RAIDFRAME_CHECK_PARITY:
   1561 		*(int *) data = raidPtr->parity_good;
   1562 		return 0;
   1563 
   1564 	case RAIDFRAME_PARITYMAP_STATUS:
   1565 		if (rf_paritymap_ineligible(raidPtr))
   1566 			return EINVAL;
   1567 		rf_paritymap_status(raidPtr->parity_map, data);
   1568 		return 0;
   1569 
   1570 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1571 		if (rf_paritymap_ineligible(raidPtr))
   1572 			return EINVAL;
   1573 		if (raidPtr->parity_map == NULL)
   1574 			return ENOENT; /* ??? */
   1575 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1576 			return EINVAL;
   1577 		return 0;
   1578 
   1579 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1580 		if (rf_paritymap_ineligible(raidPtr))
   1581 			return EINVAL;
   1582 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1583 		return 0;
   1584 
   1585 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1586 		if (rf_paritymap_ineligible(raidPtr))
   1587 			return EINVAL;
   1588 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1589 		/* XXX should errors be passed up? */
   1590 		return 0;
   1591 
   1592 	case RAIDFRAME_RESET_ACCTOTALS:
   1593 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1594 		return 0;
   1595 
   1596 	case RAIDFRAME_GET_ACCTOTALS:
   1597 		totals = (RF_AccTotals_t *) data;
   1598 		*totals = raidPtr->acc_totals;
   1599 		return 0;
   1600 
   1601 	case RAIDFRAME_KEEP_ACCTOTALS:
   1602 		raidPtr->keep_acc_totals = *(int *)data;
   1603 		return 0;
   1604 
   1605 	case RAIDFRAME_GET_SIZE:
   1606 		*(int *) data = raidPtr->totalSectors;
   1607 		return 0;
   1608 
   1609 	case RAIDFRAME_FAIL_DISK:
   1610 		return rf_fail_disk(raidPtr, data);
   1611 
   1612 		/* invoke a copyback operation after recon on whatever disk
   1613 		 * needs it, if any */
   1614 	case RAIDFRAME_COPYBACK:
   1615 
   1616 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1617 			/* This makes no sense on a RAID 0!! */
   1618 			return EINVAL;
   1619 		}
   1620 
   1621 		if (raidPtr->copyback_in_progress == 1) {
   1622 			/* Copyback is already in progress! */
   1623 			return EINVAL;
   1624 		}
   1625 
   1626 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1627 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1628 
   1629 		/* return the percentage completion of reconstruction */
   1630 	case RAIDFRAME_CHECK_RECON_STATUS:
   1631 		return rf_check_recon_status(raidPtr, data);
   1632 
   1633 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1634 		rf_check_recon_status_ext(raidPtr, data);
   1635 		return 0;
   1636 
   1637 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1638 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1639 			/* This makes no sense on a RAID 0, so tell the
   1640 			   user it's done. */
   1641 			*(int *) data = 100;
   1642 			return 0;
   1643 		}
   1644 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1645 			*(int *) data = 100 *
   1646 				raidPtr->parity_rewrite_stripes_done /
   1647 				raidPtr->Layout.numStripe;
   1648 		} else {
   1649 			*(int *) data = 100;
   1650 		}
   1651 		return 0;
   1652 
   1653 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1654 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1655 		return 0;
   1656 
   1657 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1658 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1659 			/* This makes no sense on a RAID 0 */
   1660 			*(int *) data = 100;
   1661 			return 0;
   1662 		}
   1663 		if (raidPtr->copyback_in_progress == 1) {
   1664 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1665 				raidPtr->Layout.numStripe;
   1666 		} else {
   1667 			*(int *) data = 100;
   1668 		}
   1669 		return 0;
   1670 
   1671 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1672 		rf_check_copyback_status_ext(raidPtr, data);
   1673 		return 0;
   1674 
   1675 	case RAIDFRAME_SET_LAST_UNIT:
   1676 		for (column = 0; column < raidPtr->numCol; column++)
   1677 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1678 				return EBUSY;
   1679 
   1680 		for (column = 0; column < raidPtr->numCol; column++) {
   1681 			clabel = raidget_component_label(raidPtr, column);
   1682 			clabel->last_unit = *(int *)data;
   1683 			raidflush_component_label(raidPtr, column);
   1684 		}
   1685 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1686 		return 0;
   1687 
   1688 		/* the sparetable daemon calls this to wait for the kernel to
   1689 		 * need a spare table. this ioctl does not return until a
   1690 		 * spare table is needed. XXX -- calling mpsleep here in the
   1691 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1692 		 * -- I should either compute the spare table in the kernel,
   1693 		 * or have a different -- XXX XXX -- interface (a different
   1694 		 * character device) for delivering the table     -- XXX */
   1695 #if RF_DISABLED
   1696 	case RAIDFRAME_SPARET_WAIT:
   1697 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1698 		while (!rf_sparet_wait_queue)
   1699 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1700 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1701 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1702 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1703 
   1704 		/* structure assignment */
   1705 		*((RF_SparetWait_t *) data) = *waitreq;
   1706 
   1707 		RF_Free(waitreq, sizeof(*waitreq));
   1708 		return 0;
   1709 
   1710 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1711 		 * code in it that will cause the dameon to exit */
   1712 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1713 		waitreq = RF_Malloc(sizeof(*waitreq));
   1714 		waitreq->fcol = -1;
   1715 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1716 		waitreq->next = rf_sparet_wait_queue;
   1717 		rf_sparet_wait_queue = waitreq;
   1718 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1719 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1720 		return 0;
   1721 
   1722 		/* used by the spare table daemon to deliver a spare table
   1723 		 * into the kernel */
   1724 	case RAIDFRAME_SEND_SPARET:
   1725 
   1726 		/* install the spare table */
   1727 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1728 
   1729 		/* respond to the requestor.  the return status of the spare
   1730 		 * table installation is passed in the "fcol" field */
   1731 		waitred = RF_Malloc(sizeof(*waitreq));
   1732 		waitreq->fcol = retcode;
   1733 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1734 		waitreq->next = rf_sparet_resp_queue;
   1735 		rf_sparet_resp_queue = waitreq;
   1736 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1737 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1738 
   1739 		return retcode;
   1740 #endif
   1741 	default:
   1742 		/*
   1743 		 * Don't bother trying to load compat modules
   1744 		 * if it is not our ioctl. This is more efficient
   1745 		 * and makes rump tests not depend on compat code
   1746 		 */
   1747 		if (IOCGROUP(cmd) != 'r')
   1748 			break;
   1749 #ifdef _LP64
   1750 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1751 			module_autoload("compat_netbsd32_raid",
   1752 			    MODULE_CLASS_EXEC);
   1753 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1754 			    (rs, cmd, data), enosys(), retcode);
   1755 			if (retcode != EPASSTHROUGH)
   1756 				return retcode;
   1757 		}
   1758 #endif
   1759 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1760 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1761 		    (rs, cmd, data), enosys(), retcode);
   1762 		if (retcode != EPASSTHROUGH)
   1763 			return retcode;
   1764 
   1765 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1766 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1767 		    (rs, cmd, data), enosys(), retcode);
   1768 		if (retcode != EPASSTHROUGH)
   1769 			return retcode;
   1770 		break; /* fall through to the os-specific code below */
   1771 
   1772 	}
   1773 
   1774 	if (!raidPtr->valid)
   1775 		return (EINVAL);
   1776 
   1777 	/*
   1778 	 * Add support for "regular" device ioctls here.
   1779 	 */
   1780 
   1781 	switch (cmd) {
   1782 	case DIOCGCACHE:
   1783 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1784 		break;
   1785 
   1786 	case DIOCCACHESYNC:
   1787 		retcode = rf_sync_component_caches(raidPtr);
   1788 		break;
   1789 
   1790 	default:
   1791 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1792 		break;
   1793 	}
   1794 
   1795 	return (retcode);
   1796 
   1797 }
   1798 
   1799 
   1800 /* raidinit -- complete the rest of the initialization for the
   1801    RAIDframe device.  */
   1802 
   1803 
/*
 * Attach the pseudo-device for a freshly configured RAID set, wire up
 * the dk/disk layers, allocate the buffer queue, mark the unit
 * RAIDF_INITED, and kick off wedge discovery.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		/* Attach failed: release the cfdata; unit stays
		   uninitialized (RAIDF_INITED not set). */
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1859 
   1860 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1861 /* wake up the daemon & tell it to get us a spare table
   1862  * XXX
   1863  * the entries in the queues should be tagged with the raidPtr
   1864  * so that in the extremely rare case that two recons happen at once,
   1865  * we know for which device were requesting a spare table
   1866  * XXX
   1867  *
   1868  * XXX This code is not currently used. GO
   1869  */
/*
 * Hand a spare-table request to the daemon and sleep until a response
 * arrives.  Returns the fcol field of the response, which carries the
 * daemon's result.  The req passed in is consumed by the daemon; the
 * req freed here is the response object, a different allocation.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* queue the request and wake anyone waiting to service it */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1893 #endif
   1894 
   1895 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1896  * bp & passes it down.
   1897  * any calls originating in the kernel must use non-blocking I/O
   1898  * do some extra sanity checking to return "appropriate" error values for
   1899  * certain conditions (to make some standard utilities work)
   1900  *
   1901  * Formerly known as: rf_DoAccessKernel
   1902  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the lock across the label update -- it takes
		 * the raid mutex itself -- then retake and account for
		 * the failure we just handled. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't push I/O at a unit that isn't fully configured */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* let the dk layer pull queued bufs through raiddoaccess() */
	dk_start(dksc, NULL);
}
   1929 
/*
 * Issue one buf's worth of I/O against the RAID set.
 * Returns EAGAIN when no openings are available (the dk layer will
 * retry later), ENOSPC when the request falls outside the array or is
 * not sector-aligned, otherwise the result of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* back-pressure: only a limited number of accesses may be
	 * outstanding at once */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* the (sum < x) comparisons catch unsigned wraparound of the
	 * end-address computation as well as plain overrun */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening; it is given back when the access
	 * completes */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2002 
   2003 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2004 
   2005 int
   2006 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2007 {
   2008 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2009 	struct buf *bp;
   2010 
   2011 	req->queue = queue;
   2012 	bp = req->bp;
   2013 
   2014 	switch (req->type) {
   2015 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2016 		/* XXX need to do something extra here.. */
   2017 		/* I'm leaving this in, as I've never actually seen it used,
   2018 		 * and I'd like folks to report it... GO */
   2019 		printf(("WAKEUP CALLED\n"));
   2020 		queue->numOutstanding++;
   2021 
   2022 		bp->b_flags = 0;
   2023 		bp->b_private = req;
   2024 
   2025 		KernelWakeupFunc(bp);
   2026 		break;
   2027 
   2028 	case RF_IO_TYPE_READ:
   2029 	case RF_IO_TYPE_WRITE:
   2030 #if RF_ACC_TRACE > 0
   2031 		if (req->tracerec) {
   2032 			RF_ETIMER_START(req->tracerec->timer);
   2033 		}
   2034 #endif
   2035 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2036 		    op, queue->rf_cinfo->ci_dev,
   2037 		    req->sectorOffset, req->numSector,
   2038 		    req->buf, KernelWakeupFunc, (void *) req,
   2039 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2040 
   2041 		if (rf_debugKernelAccess) {
   2042 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2043 				(long) bp->b_blkno));
   2044 		}
   2045 		queue->numOutstanding++;
   2046 		queue->last_deq_sector = req->sectorOffset;
   2047 		/* acc wouldn't have been let in if there were any pending
   2048 		 * reqs at any other priority */
   2049 		queue->curPriority = req->priority;
   2050 
   2051 		db1_printf(("Going for %c to unit %d col %d\n",
   2052 			    req->type, queue->raidPtr->raidid,
   2053 			    queue->col));
   2054 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2055 			(int) req->sectorOffset, (int) req->numSector,
   2056 			(int) (req->numSector <<
   2057 			    queue->raidPtr->logBytesPerSector),
   2058 			(int) queue->raidPtr->logBytesPerSector));
   2059 
   2060 		/*
   2061 		 * XXX: drop lock here since this can block at
   2062 		 * least with backing SCSI devices.  Retake it
   2063 		 * to minimize fuss with calling interfaces.
   2064 		 */
   2065 
   2066 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2067 		bdev_strategy(bp);
   2068 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2069 		break;
   2070 
   2071 	default:
   2072 		panic("bad req->type in rf_DispatchKernelIO");
   2073 	}
   2074 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2075 
   2076 	return (0);
   2077 }
   2078 /* this is the callback function associated with a I/O invoked from
   2079    kernel code.
   2080  */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO().
 * Records the error (possibly failing the component), moves the request
 * to the raid set's iodone queue and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* b_private was set to the request in InitBP()/dispatch */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update from
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2147 
   2148 
   2149 /*
   2150  * initialize a buf structure for doing an I/O in the kernel.
   2151  */
/*
 * initialize a buf structure for doing an I/O in the kernel.
 * cbFunc/cbArg become the b_iodone callback and its b_private argument.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* sectors -> bytes -> DEV_BSIZE units */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2176 
   2177 /*
   2178  * Wait interruptibly for an exclusive lock.
   2179  *
   2180  * XXX
   2181  * Several drivers do this; it should be abstracted and made MP-safe.
   2182  * (Hmm... where have we seen this warning before :->  GO )
   2183  */
   2184 static int
   2185 raidlock(struct raid_softc *rs)
   2186 {
   2187 	int     error;
   2188 
   2189 	error = 0;
   2190 	mutex_enter(&rs->sc_mutex);
   2191 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2192 		rs->sc_flags |= RAIDF_WANTED;
   2193 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2194 		if (error != 0)
   2195 			goto done;
   2196 	}
   2197 	rs->sc_flags |= RAIDF_LOCKED;
   2198 done:
   2199 	mutex_exit(&rs->sc_mutex);
   2200 	return (error);
   2201 }
   2202 /*
   2203  * Unlock and wake up any waiters.
   2204  */
   2205 static void
   2206 raidunlock(struct raid_softc *rs)
   2207 {
   2208 
   2209 	mutex_enter(&rs->sc_mutex);
   2210 	rs->sc_flags &= ~RAIDF_LOCKED;
   2211 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2212 		rs->sc_flags &= ~RAIDF_WANTED;
   2213 		cv_broadcast(&rs->sc_cv);
   2214 	}
   2215 	mutex_exit(&rs->sc_mutex);
   2216 }
   2217 
   2218 
   2219 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2220 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2221 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2222 
   2223 static daddr_t
   2224 rf_component_info_offset(void)
   2225 {
   2226 
   2227 	return RF_COMPONENT_INFO_OFFSET;
   2228 }
   2229 
   2230 static daddr_t
   2231 rf_component_info_size(unsigned secsize)
   2232 {
   2233 	daddr_t info_size;
   2234 
   2235 	KASSERT(secsize);
   2236 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2237 		info_size = secsize;
   2238 	else
   2239 		info_size = RF_COMPONENT_INFO_SIZE;
   2240 
   2241 	return info_size;
   2242 }
   2243 
   2244 static daddr_t
   2245 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2246 {
   2247 	daddr_t map_offset;
   2248 
   2249 	KASSERT(raidPtr->bytesPerSector);
   2250 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2251 		map_offset = raidPtr->bytesPerSector;
   2252 	else
   2253 		map_offset = RF_COMPONENT_INFO_SIZE;
   2254 	map_offset += rf_component_info_offset();
   2255 
   2256 	return map_offset;
   2257 }
   2258 
   2259 static daddr_t
   2260 rf_parity_map_size(RF_Raid_t *raidPtr)
   2261 {
   2262 	daddr_t map_size;
   2263 
   2264 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2265 		map_size = raidPtr->bytesPerSector;
   2266 	else
   2267 		map_size = RF_PARITY_MAP_SIZE;
   2268 
   2269 	return map_size;
   2270 }
   2271 
   2272 int
   2273 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2274 {
   2275 	RF_ComponentLabel_t *clabel;
   2276 
   2277 	clabel = raidget_component_label(raidPtr, col);
   2278 	clabel->clean = RF_RAID_CLEAN;
   2279 	raidflush_component_label(raidPtr, col);
   2280 	return(0);
   2281 }
   2282 
   2283 
   2284 int
   2285 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2286 {
   2287 	RF_ComponentLabel_t *clabel;
   2288 
   2289 	clabel = raidget_component_label(raidPtr, col);
   2290 	clabel->clean = RF_RAID_DIRTY;
   2291 	raidflush_component_label(raidPtr, col);
   2292 	return(0);
   2293 }
   2294 
/*
 * Read the on-disk component label for the given column into the
 * in-core copy.  Returns the error from raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2304 
   2305 RF_ComponentLabel_t *
   2306 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2307 {
   2308 	return &raidPtr->raid_cinfo[col].ci_label;
   2309 }
   2310 
/*
 * Write the in-core component label for the given column out to disk,
 * stamping it with the current mod_counter first.  Returns the error
 * from raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's idea of the label generation in sync */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2325 
   2326 
   2327 static int
   2328 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2329     RF_ComponentLabel_t *clabel)
   2330 {
   2331 	return raidread_component_area(dev, b_vp, clabel,
   2332 	    sizeof(RF_ComponentLabel_t),
   2333 	    rf_component_info_offset(),
   2334 	    rf_component_info_size(secsize));
   2335 }
   2336 
   2337 /* ARGSUSED */
/* ARGSUSED */
/*
 * Read dsize bytes at byte offset 'offset' from the raw component and
 * copy the first msize bytes into 'data'.  Returns 0 or an error from
 * biowait().  EINVAL if the component has no vnode.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read straight from the block device */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2374 
   2375 
   2376 static int
   2377 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2378     RF_ComponentLabel_t *clabel)
   2379 {
   2380 	return raidwrite_component_area(dev, b_vp, clabel,
   2381 	    sizeof(RF_ComponentLabel_t),
   2382 	    rf_component_info_offset(),
   2383 	    rf_component_info_size(secsize), 0);
   2384 }
   2385 
   2386 /* ARGSUSED */
/* ARGSUSED */
/*
 * Write msize bytes from 'data' (zero-padded to dsize) at byte offset
 * 'offset' on the raw component.  If asyncp is set, the write is fired
 * off B_ASYNC and 0 is returned immediately; the buffer is then
 * reclaimed by the async completion path, not here -- NOTE(review):
 * verify against biodone()/brelse() semantics.  Otherwise waits for
 * completion and returns the biowait() error.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* pad the area past msize with zeroes */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2420 
   2421 void
   2422 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2423 {
   2424 	int c;
   2425 
   2426 	for (c = 0; c < raidPtr->numCol; c++) {
   2427 		/* Skip dead disks. */
   2428 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2429 			continue;
   2430 		/* XXXjld: what if an error occurs here? */
   2431 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2432 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2433 		    RF_PARITYMAP_NBYTE,
   2434 		    rf_parity_map_offset(raidPtr),
   2435 		    rf_parity_map_size(raidPtr), 0);
   2436 	}
   2437 }
   2438 
   2439 void
   2440 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2441 {
   2442 	struct rf_paritymap_ondisk tmp;
   2443 	int c,first;
   2444 
   2445 	first=1;
   2446 	for (c = 0; c < raidPtr->numCol; c++) {
   2447 		/* Skip dead disks. */
   2448 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2449 			continue;
   2450 		raidread_component_area(raidPtr->Disks[c].dev,
   2451 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2452 		    RF_PARITYMAP_NBYTE,
   2453 		    rf_parity_map_offset(raidPtr),
   2454 		    rf_parity_map_size(raidPtr));
   2455 		if (first) {
   2456 			memcpy(map, &tmp, sizeof(*map));
   2457 			first = 0;
   2458 		} else {
   2459 			rf_paritymap_merge(map, &tmp);
   2460 		}
   2461 	}
   2462 }
   2463 
/*
 * Bump the label generation and mark every live component (and every
 * in-use spare) dirty on disk.  Called when the set goes into use.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for;
			 * NOTE(review): scol stays -1 if no column
			 * references this spare -- confirm that cannot
			 * happen for rf_ds_used_spare. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2523 
   2524 
/*
 * Push fresh component labels to all optimal components and in-use
 * spares.  'final' is RF_FINAL_COMPONENT_UPDATE at shutdown, in which
 * case components are also marked clean if parity is known good.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced;
			 * NOTE(review): scol stays -1 (or a stale value)
			 * if no column references this spare -- confirm
			 * that cannot happen here. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2602 
   2603 void
   2604 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2605 {
   2606 
   2607 	if (vp != NULL) {
   2608 		if (auto_configured == 1) {
   2609 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2610 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2611 			vput(vp);
   2612 
   2613 		} else {
   2614 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2615 		}
   2616 	}
   2617 }
   2618 
   2619 
   2620 void
   2621 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2622 {
   2623 	int r,c;
   2624 	struct vnode *vp;
   2625 	int acd;
   2626 
   2627 
   2628 	/* We take this opportunity to close the vnodes like we should.. */
   2629 
   2630 	for (c = 0; c < raidPtr->numCol; c++) {
   2631 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2632 		acd = raidPtr->Disks[c].auto_configured;
   2633 		rf_close_component(raidPtr, vp, acd);
   2634 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2635 		raidPtr->Disks[c].auto_configured = 0;
   2636 	}
   2637 
   2638 	for (r = 0; r < raidPtr->numSpare; r++) {
   2639 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2640 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2641 		rf_close_component(raidPtr, vp, acd);
   2642 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2643 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2644 	}
   2645 }
   2646 
   2647 
/*
 * Kernel thread body: fail the requested component and (optionally)
 * reconstruct onto a spare, then exit.  Owns and frees 'req'.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON requests reconstruction to a spare as well
	 * as marking the component failed */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2669 
/*
 * Kernel thread body: rewrite all parity on the set, mark parity good
 * on success, notify any shutdown waiter, then exit.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2702 
   2703 
/*
 * Kernel thread body: copy reconstructed data back from a spare to a
 * replaced component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2718 
   2719 
/*
 * Kernel thread body: rebuild a component in place (onto the same
 * device slot), then exit.  Owns and frees 'req'.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2737 
/*
 * Probe one candidate device for a RAIDframe component label.  On a
 * reasonable label the device is prepended to ac_list (keeping the
 * vnode open); otherwise the vnode is closed and released.  Returns
 * the (possibly extended) list, or NULL after freeing the whole list
 * on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
		/* out of memory: tear down everything collected so far */
oomem:
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: drop the label and release the vnode opened
		 * by the caller */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2795 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return them as a linked list of RF_AutoConfig_t records (NULL
 * if none were found).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges are addressed by unit directly; plain disks
			   via their RAW_PART minor */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept only wedges whose
				   partition type is RAIDframe */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* hand the (unlocked) vnode to rf_get_component() */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3002 
   3003 
   3004 int
   3005 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3006 {
   3007 
   3008 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3009 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3010 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3011 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3012 	    clabel->row >=0 &&
   3013 	    clabel->column >= 0 &&
   3014 	    clabel->num_rows > 0 &&
   3015 	    clabel->num_columns > 0 &&
   3016 	    clabel->row < clabel->num_rows &&
   3017 	    clabel->column < clabel->num_columns &&
   3018 	    clabel->blockSize > 0 &&
   3019 	    /*
   3020 	     * numBlocksHi may contain garbage, but it is ok since
   3021 	     * the type is unsigned.  If it is really garbage,
   3022 	     * rf_fix_old_label_size() will fix it.
   3023 	     */
   3024 	    rf_component_label_numblocks(clabel) > 0) {
   3025 		/*
   3026 		 * label looks reasonable enough...
   3027 		 * let's make sure it has no old garbage.
   3028 		 */
   3029 		if (numsecs)
   3030 			rf_fix_old_label_size(clabel, numsecs);
   3031 		return(1);
   3032 	}
   3033 	return(0);
   3034 }
   3035 
   3036 
   3037 /*
   3038  * For reasons yet unknown, some old component labels have garbage in
   3039  * the newer numBlocksHi region, and this causes lossage.  Since those
   3040  * disks will also have numsecs set to less than 32 bits of sectors,
   3041  * we can determine when this corruption has occurred, and fix it.
   3042  *
   3043  * The exact same problem, with the same unknown reason, happens to
   3044  * the partitionSizeHi member as well.
   3045  */
   3046 static void
   3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3048 {
   3049 
   3050 	if (numsecs < ((uint64_t)1 << 32)) {
   3051 		if (clabel->numBlocksHi) {
   3052 			printf("WARNING: total sectors < 32 bits, yet "
   3053 			       "numBlocksHi set\n"
   3054 			       "WARNING: resetting numBlocksHi to zero.\n");
   3055 			clabel->numBlocksHi = 0;
   3056 		}
   3057 
   3058 		if (clabel->partitionSizeHi) {
   3059 			printf("WARNING: total sectors < 32 bits, yet "
   3060 			       "partitionSizeHi set\n"
   3061 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3062 			clabel->partitionSizeHi = 0;
   3063 		}
   3064 	}
   3065 }
   3066 
   3067 
#ifdef DEBUG
/* Dump a component label to the console in human-readable form. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* index clamped with "& 3" below, so the table cannot overrun */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3101 
   3102 RF_ConfigSet_t *
   3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3104 {
   3105 	RF_AutoConfig_t *ac;
   3106 	RF_ConfigSet_t *config_sets;
   3107 	RF_ConfigSet_t *cset;
   3108 	RF_AutoConfig_t *ac_next;
   3109 
   3110 
   3111 	config_sets = NULL;
   3112 
   3113 	/* Go through the AutoConfig list, and figure out which components
   3114 	   belong to what sets.  */
   3115 	ac = ac_list;
   3116 	while(ac!=NULL) {
   3117 		/* we're going to putz with ac->next, so save it here
   3118 		   for use at the end of the loop */
   3119 		ac_next = ac->next;
   3120 
   3121 		if (config_sets == NULL) {
   3122 			/* will need at least this one... */
   3123 			config_sets = (RF_ConfigSet_t *)
   3124 				malloc(sizeof(RF_ConfigSet_t),
   3125 				       M_RAIDFRAME, M_NOWAIT);
   3126 			if (config_sets == NULL) {
   3127 				panic("rf_create_auto_sets: No memory!");
   3128 			}
   3129 			/* this one is easy :) */
   3130 			config_sets->ac = ac;
   3131 			config_sets->next = NULL;
   3132 			config_sets->rootable = 0;
   3133 			ac->next = NULL;
   3134 		} else {
   3135 			/* which set does this component fit into? */
   3136 			cset = config_sets;
   3137 			while(cset!=NULL) {
   3138 				if (rf_does_it_fit(cset, ac)) {
   3139 					/* looks like it matches... */
   3140 					ac->next = cset->ac;
   3141 					cset->ac = ac;
   3142 					break;
   3143 				}
   3144 				cset = cset->next;
   3145 			}
   3146 			if (cset==NULL) {
   3147 				/* didn't find a match above... new set..*/
   3148 				cset = (RF_ConfigSet_t *)
   3149 					malloc(sizeof(RF_ConfigSet_t),
   3150 					       M_RAIDFRAME, M_NOWAIT);
   3151 				if (cset == NULL) {
   3152 					panic("rf_create_auto_sets: No memory!");
   3153 				}
   3154 				cset->ac = ac;
   3155 				ac->next = NULL;
   3156 				cset->next = config_sets;
   3157 				cset->rootable = 0;
   3158 				config_sets = cset;
   3159 			}
   3160 		}
   3161 		ac = ac_next;
   3162 	}
   3163 
   3164 
   3165 	return(config_sets);
   3166 }
   3167 
   3168 static int
   3169 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3170 {
   3171 	RF_ComponentLabel_t *clabel1, *clabel2;
   3172 
   3173 	/* If this one matches the *first* one in the set, that's good
   3174 	   enough, since the other members of the set would have been
   3175 	   through here too... */
   3176 	/* note that we are not checking partitionSize here..
   3177 
   3178 	   Note that we are also not checking the mod_counters here.
   3179 	   If everything else matches except the mod_counter, that's
   3180 	   good enough for this test.  We will deal with the mod_counters
   3181 	   a little later in the autoconfiguration process.
   3182 
   3183 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3184 
   3185 	   The reason we don't check for this is that failed disks
   3186 	   will have lower modification counts.  If those disks are
   3187 	   not added to the set they used to belong to, then they will
   3188 	   form their own set, which may result in 2 different sets,
   3189 	   for example, competing to be configured at raid0, and
   3190 	   perhaps competing to be the root filesystem set.  If the
   3191 	   wrong ones get configured, or both attempt to become /,
   3192 	   weird behaviour and or serious lossage will occur.  Thus we
   3193 	   need to bring them into the fold here, and kick them out at
   3194 	   a later point.
   3195 
   3196 	*/
   3197 
   3198 	clabel1 = cset->ac->clabel;
   3199 	clabel2 = ac->clabel;
   3200 	if ((clabel1->version == clabel2->version) &&
   3201 	    (clabel1->serial_number == clabel2->serial_number) &&
   3202 	    (clabel1->num_rows == clabel2->num_rows) &&
   3203 	    (clabel1->num_columns == clabel2->num_columns) &&
   3204 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3205 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3206 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3207 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3208 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3209 	    (clabel1->blockSize == clabel2->blockSize) &&
   3210 	    rf_component_label_numblocks(clabel1) ==
   3211 	    rf_component_label_numblocks(clabel2) &&
   3212 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3213 	    (clabel1->root_partition == clabel2->root_partition) &&
   3214 	    (clabel1->last_unit == clabel2->last_unit) &&
   3215 	    (clabel1->config_order == clabel2->config_order)) {
   3216 		/* if it get's here, it almost *has* to be a match */
   3217 	} else {
   3218 		/* it's not consistent with somebody in the set..
   3219 		   punt */
   3220 		return(0);
   3221 	}
   3222 	/* all was fine.. it must fit... */
   3223 	return(1);
   3224 }
   3225 
/*
 * Decide whether a configuration set has enough live components (i.e.
 * components carrying the set's newest mod_counter) to be worth
 * configuring.  Returns 1 if so, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum over all members; stale members have lower counts) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (newest mod_counter) component
		   occupying column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair, and we didn't bail..
				   reset the even_pair_failed flag,
				   and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3328 
   3329 void
   3330 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3331 			RF_Raid_t *raidPtr)
   3332 {
   3333 	RF_ComponentLabel_t *clabel;
   3334 	int i;
   3335 
   3336 	clabel = ac->clabel;
   3337 
   3338 	/* 1. Fill in the common stuff */
   3339 	config->numCol = clabel->num_columns;
   3340 	config->numSpare = 0; /* XXX should this be set here? */
   3341 	config->sectPerSU = clabel->sectPerSU;
   3342 	config->SUsPerPU = clabel->SUsPerPU;
   3343 	config->SUsPerRU = clabel->SUsPerRU;
   3344 	config->parityConfig = clabel->parityConfig;
   3345 	/* XXX... */
   3346 	strcpy(config->diskQueueType,"fifo");
   3347 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3348 	config->layoutSpecificSize = 0; /* XXX ?? */
   3349 
   3350 	while(ac!=NULL) {
   3351 		/* row/col values will be in range due to the checks
   3352 		   in reasonable_label() */
   3353 		strcpy(config->devnames[0][ac->clabel->column],
   3354 		       ac->devname);
   3355 		ac = ac->next;
   3356 	}
   3357 
   3358 	for(i=0;i<RF_MAXDBGV;i++) {
   3359 		config->debugVars[i][0] = 0;
   3360 	}
   3361 }
   3362 
   3363 int
   3364 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3365 {
   3366 	RF_ComponentLabel_t *clabel;
   3367 	int column;
   3368 	int sparecol;
   3369 
   3370 	raidPtr->autoconfigure = new_value;
   3371 
   3372 	for(column=0; column<raidPtr->numCol; column++) {
   3373 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3374 			clabel = raidget_component_label(raidPtr, column);
   3375 			clabel->autoconfigure = new_value;
   3376 			raidflush_component_label(raidPtr, column);
   3377 		}
   3378 	}
   3379 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3380 		sparecol = raidPtr->numCol + column;
   3381 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3382 			clabel = raidget_component_label(raidPtr, sparecol);
   3383 			clabel->autoconfigure = new_value;
   3384 			raidflush_component_label(raidPtr, sparecol);
   3385 		}
   3386 	}
   3387 	return(new_value);
   3388 }
   3389 
   3390 int
   3391 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3392 {
   3393 	RF_ComponentLabel_t *clabel;
   3394 	int column;
   3395 	int sparecol;
   3396 
   3397 	raidPtr->root_partition = new_value;
   3398 	for(column=0; column<raidPtr->numCol; column++) {
   3399 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3400 			clabel = raidget_component_label(raidPtr, column);
   3401 			clabel->root_partition = new_value;
   3402 			raidflush_component_label(raidPtr, column);
   3403 		}
   3404 	}
   3405 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3406 		sparecol = raidPtr->numCol + column;
   3407 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3408 			clabel = raidget_component_label(raidPtr, sparecol);
   3409 			clabel->root_partition = new_value;
   3410 			raidflush_component_label(raidPtr, sparecol);
   3411 		}
   3412 	}
   3413 	return(new_value);
   3414 }
   3415 
   3416 void
   3417 rf_release_all_vps(RF_ConfigSet_t *cset)
   3418 {
   3419 	RF_AutoConfig_t *ac;
   3420 
   3421 	ac = cset->ac;
   3422 	while(ac!=NULL) {
   3423 		/* Close the vp, and give it back */
   3424 		if (ac->vp) {
   3425 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3426 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3427 			vput(ac->vp);
   3428 			ac->vp = NULL;
   3429 		}
   3430 		ac = ac->next;
   3431 	}
   3432 }
   3433 
   3434 
   3435 void
   3436 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3437 {
   3438 	RF_AutoConfig_t *ac;
   3439 	RF_AutoConfig_t *next_ac;
   3440 
   3441 	ac = cset->ac;
   3442 	while(ac!=NULL) {
   3443 		next_ac = ac->next;
   3444 		/* nuke the label */
   3445 		free(ac->clabel, M_RAIDFRAME);
   3446 		/* cleanup the config structure */
   3447 		free(ac, M_RAIDFRAME);
   3448 		/* "next.." */
   3449 		ac = next_ac;
   3450 	}
   3451 	/* and, finally, nuke the config set */
   3452 	free(cset, M_RAIDFRAME);
   3453 }
   3454 
   3455 
/*
 * Populate a component label from the array's current configuration
 * and layout parameters.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* multi-row arrays are not supported; always one row */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit size across numBlocks/numBlocksHi */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3488 
/*
 * Bring up one autoconfigured set: build an RF_Config_t from the
 * component labels, pick a raid unit (preferring the unit the set was
 * last configured at), and configure the array.  Returns the softc on
 * success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until a free (or new) unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at that unit: create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3572 
/*
 * Initialize a pool at IPL_BIO, pre-allocate xmin items (panicking if
 * that fails) and cap the free-item high water mark at xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3585 
   3586 /*
   3587  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3588  * to see if there is IO pending and if that IO could possibly be done
   3589  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3590  * otherwise.
   3591  *
   3592  */
   3593 int
   3594 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3595 {
   3596 	struct raid_softc *rs;
   3597 	struct dk_softc *dksc;
   3598 
   3599 	rs = raidPtr->softc;
   3600 	dksc = &rs->sc_dksc;
   3601 
   3602 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3603 		return 1;
   3604 
   3605 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3606 		/* there is work to do */
   3607 		return 0;
   3608 	}
   3609 	/* default is nothing to do */
   3610 	return 1;
   3611 }
   3612 
   3613 int
   3614 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3615 {
   3616 	uint64_t numsecs;
   3617 	unsigned secsize;
   3618 	int error;
   3619 
   3620 	error = getdisksize(vp, &numsecs, &secsize);
   3621 	if (error == 0) {
   3622 		diskPtr->blockSize = secsize;
   3623 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3624 		diskPtr->partitionSize = numsecs;
   3625 		return 0;
   3626 	}
   3627 	return error;
   3628 }
   3629 
/*
 * Autoconfiguration match hook: always succeeds, raid units are
 * created unconditionally.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3635 
/*
 * Autoconfiguration attach hook: nothing to do here; unit setup
 * happens when a RAID set is actually configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3640 
   3641 
/*
 * Autoconfiguration detach hook: tear the unit down under the softc
 * lock and release the softc on success.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	/* no softc: nothing to detach */
	if (rs == NULL)
		return ENXIO;

	/* serialize against other users of this unit */
	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3668 
/*
 * Publish a synthetic disk geometry for the RAID unit derived from
 * the array parameters, then register it with the disk subsystem.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count; presumably chosen to keep the derived
	   cylinder count reasonable -- confirm before relying on it */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3684 
   3685 /*
   3686  * Get cache info for all the components (including spares).
   3687  * Returns intersection of all the cache flags of all disks, or first
   3688  * error if any encountered.
   3689  * XXXfua feature flags can change as spares are added - lock down somehow
   3690  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* walk data columns and spare slots alike */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl support) is not
				   worth complaining about */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* accumulate the intersection of all flags */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3728 
   3729 /*
   3730  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3731  * We end up returning whatever error was returned by the first cache flush
   3732  * that fails.
   3733  */
   3734 
   3735 int
   3736 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3737 {
   3738 	int c, sparecol;
   3739 	int e,error;
   3740 	int force = 1;
   3741 
   3742 	error = 0;
   3743 	for (c = 0; c < raidPtr->numCol; c++) {
   3744 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3745 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3746 					  &force, FWRITE, NOCRED);
   3747 			if (e) {
   3748 				if (e != ENODEV)
   3749 					printf("raid%d: cache flush to component %s failed.\n",
   3750 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3751 				if (error == 0) {
   3752 					error = e;
   3753 				}
   3754 			}
   3755 		}
   3756 	}
   3757 
   3758 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3759 		sparecol = raidPtr->numCol + c;
   3760 		/* Need to ensure that the reconstruct actually completed! */
   3761 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3762 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3763 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3764 			if (e) {
   3765 				if (e != ENODEV)
   3766 					printf("raid%d: cache flush to component %s failed.\n",
   3767 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3768 				if (error == 0) {
   3769 					error = e;
   3770 				}
   3771 			}
   3772 		}
   3773 	}
   3774 	return error;
   3775 }
   3776 
   3777 /* Fill in info with the current status */
   3778 void
   3779 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3780 {
   3781 
   3782 	memset(info, 0, sizeof(*info));
   3783 
   3784 	if (raidPtr->status != rf_rs_reconstructing) {
   3785 		info->total = 100;
   3786 		info->completed = 100;
   3787 	} else {
   3788 		info->total = raidPtr->reconControl->numRUsTotal;
   3789 		info->completed = raidPtr->reconControl->numRUsComplete;
   3790 	}
   3791 	info->remaining = info->total - info->completed;
   3792 }
   3793 
   3794 /* Fill in info with the current status */
   3795 void
   3796 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3797 {
   3798 
   3799 	memset(info, 0, sizeof(*info));
   3800 
   3801 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3802 		info->total = raidPtr->Layout.numStripe;
   3803 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3804 	} else {
   3805 		info->completed = 100;
   3806 		info->total = 100;
   3807 	}
   3808 	info->remaining = info->total - info->completed;
   3809 }
   3810 
   3811 /* Fill in info with the current status */
   3812 void
   3813 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3814 {
   3815 
   3816 	memset(info, 0, sizeof(*info));
   3817 
   3818 	if (raidPtr->copyback_in_progress == 1) {
   3819 		info->total = raidPtr->Layout.numStripe;
   3820 		info->completed = raidPtr->copyback_stripes_done;
   3821 		info->remaining = info->total - info->completed;
   3822 	} else {
   3823 		info->remaining = 0;
   3824 		info->completed = 100;
   3825 		info->total = 100;
   3826 	}
   3827 }
   3828 
   3829 /* Fill in config with the current info */
   3830 int
   3831 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3832 {
   3833 	int	d, i, j;
   3834 
   3835 	if (!raidPtr->valid)
   3836 		return (ENODEV);
   3837 	config->cols = raidPtr->numCol;
   3838 	config->ndevs = raidPtr->numCol;
   3839 	if (config->ndevs >= RF_MAX_DISKS)
   3840 		return (ENOMEM);
   3841 	config->nspares = raidPtr->numSpare;
   3842 	if (config->nspares >= RF_MAX_DISKS)
   3843 		return (ENOMEM);
   3844 	config->maxqdepth = raidPtr->maxQueueDepth;
   3845 	d = 0;
   3846 	for (j = 0; j < config->cols; j++) {
   3847 		config->devs[d] = raidPtr->Disks[j];
   3848 		d++;
   3849 	}
   3850 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3851 		config->spares[i] = raidPtr->Disks[j];
   3852 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3853 			/* XXX: raidctl(8) expects to see this as a used spare */
   3854 			config->spares[i].status = rf_ds_used_spare;
   3855 		}
   3856 	}
   3857 	return 0;
   3858 }
   3859 
   3860 int
   3861 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3862 {
   3863 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3864 	RF_ComponentLabel_t *raid_clabel;
   3865 	int column = clabel->column;
   3866 
   3867 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3868 		return EINVAL;
   3869 	raid_clabel = raidget_component_label(raidPtr, column);
   3870 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3871 
   3872 	return 0;
   3873 }
   3874 
/*
 * Module interface
 */

/* raid(4) depends on the disk subroutines and the FCFS buffer queue. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* cfdriver for the modular case; a built-in kernel supplies its own. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module control entry point and its load/unload helpers. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3888 
   3889 static int
   3890 raid_modcmd(modcmd_t cmd, void *data)
   3891 {
   3892 	int error;
   3893 
   3894 	error = 0;
   3895 	switch (cmd) {
   3896 	case MODULE_CMD_INIT:
   3897 		error = raid_modcmd_init();
   3898 		break;
   3899 	case MODULE_CMD_FINI:
   3900 		error = raid_modcmd_fini();
   3901 		break;
   3902 	default:
   3903 		error = ENOTTY;
   3904 		break;
   3905 	}
   3906 	return error;
   3907 }
   3908 
/*
 * One-time module initialization: set up the raid(4) lock, attach
 * the device switch and the autoconfiguration glue, boot the
 * RAIDframe core, and register the finalizer that auto-configures
 * RAID sets.  Each failure path rolls back the steps already taken
 * before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in case). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw attaches above. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the sets just will not auto-configure. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3979 
/*
 * Module teardown: refuse to unload while any raid unit exists, then
 * detach the autoconfiguration glue and the device switch in reverse
 * order of attachment.  If a later detach fails, the earlier ones
 * are re-attached so the module remains usable.  On success the
 * RAIDframe core is shut down and the module lock destroyed.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Undo the cfdriver and cfattach detaches above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4029