Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.357
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.357 2019/01/08 07:18:18 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.357 2019/01/08 07:18:18 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #ifdef COMPAT_80
    157 #include "rf_compat80.h"
    158 #endif
    159 
    160 #ifdef COMPAT_NETBSD32
    161 #include "rf_compat32.h"
    162 #endif
    163 
    164 #include "ioconf.h"
    165 
    166 #ifdef DEBUG
    167 int     rf_kdebug_level = 0;
    168 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    169 #else				/* DEBUG */
    170 #define db1_printf(a) { }
    171 #endif				/* DEBUG */
    172 
    173 #ifdef DEBUG_ROOT
    174 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    175 #else
    176 #define DPRINTF(a, ...)
    177 #endif
    178 
    179 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    180 static rf_declare_mutex2(rf_sparet_wait_mutex);
    181 static rf_declare_cond2(rf_sparet_wait_cv);
    182 static rf_declare_cond2(rf_sparet_resp_cv);
    183 
    184 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    185 						 * spare table */
    186 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    187 						 * installation process */
    188 #endif
    189 
    190 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    191 
    192 /* prototypes */
    193 static void KernelWakeupFunc(struct buf *);
    194 static void InitBP(struct buf *, struct vnode *, unsigned,
    195     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    196     void *, int, struct proc *);
    197 struct raid_softc;
    198 static void raidinit(struct raid_softc *);
    199 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    200 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    201 
    202 static int raid_match(device_t, cfdata_t, void *);
    203 static void raid_attach(device_t, device_t, void *);
    204 static int raid_detach(device_t, int);
    205 
    206 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    207     daddr_t, daddr_t);
    208 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    209     daddr_t, daddr_t, int);
    210 
    211 static int raidwrite_component_label(unsigned,
    212     dev_t, struct vnode *, RF_ComponentLabel_t *);
    213 static int raidread_component_label(unsigned,
    214     dev_t, struct vnode *, RF_ComponentLabel_t *);
    215 
    216 static int raid_diskstart(device_t, struct buf *bp);
    217 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    218 static int raid_lastclose(device_t);
    219 
    220 static dev_type_open(raidopen);
    221 static dev_type_close(raidclose);
    222 static dev_type_read(raidread);
    223 static dev_type_write(raidwrite);
    224 static dev_type_ioctl(raidioctl);
    225 static dev_type_strategy(raidstrategy);
    226 static dev_type_dump(raiddump);
    227 static dev_type_size(raidsize);
    228 
/* Block-device switch: entry points for the raid(4) block device. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    239 
/* Character-device switch: entry points for the raw raid(4) device. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    254 
/* Callbacks handed to the common disk (dk) framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    264 
/* Per-unit software state for a raid(4) pseudo-device. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* common disk (dk) framework state */
	int	sc_unit;	/* raid unit number (see raidcreate()) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global `raids' list */
};
    277 /* sc_flags */
    278 #define RAIDF_INITED		0x01	/* unit has been initialized */
    279 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    280 #define RAIDF_DETACH  		0x04	/* detach after final close */
    281 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    282 #define RAIDF_LOCKED		0x10	/* unit is locked */
    283 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    284 
    285 #define	raidunit(x)	DISKUNIT(x)
    286 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    287 
    288 extern struct cfdriver raid_cd;
    289 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    290     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    291     DVF_DETACH_SHUTDOWN);
    292 
    293 /* Internal representation of a rf_recon_req */
    294 struct rf_recon_req_internal {
    295 	RF_RowCol_t col;
    296 	RF_ReconReqFlags_t flags;
    297 	void   *raidPtr;
    298 };
    299 
    300 /*
    301  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    302  * Be aware that large numbers can allow the driver to consume a lot of
    303  * kernel memory, especially on writes, and in degraded mode reads.
    304  *
    305  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    306  * a single 64K write will typically require 64K for the old data,
    307  * 64K for the old parity, and 64K for the new parity, for a total
    308  * of 192K (if the parity buffer is not re-used immediately).
    309  * Even it if is used immediately, that's still 128K, which when multiplied
    310  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    311  *
    312  * Now in degraded mode, for example, a 64K read on the above setup may
    313  * require data reconstruction, which will require *all* of the 4 remaining
    314  * disks to participate -- 4 * 32K/disk == 128K again.
    315  */
    316 
    317 #ifndef RAIDOUTSTANDING
    318 #define RAIDOUTSTANDING   6
    319 #endif
    320 
    321 #define RAIDLABELDEV(dev)	\
    322 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    323 
    324 /* declared here, and made public, for the benefit of KVM stuff.. */
    325 
    326 static int raidlock(struct raid_softc *);
    327 static void raidunlock(struct raid_softc *);
    328 
    329 static int raid_detach_unlocked(struct raid_softc *);
    330 
    331 static void rf_markalldirty(RF_Raid_t *);
    332 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    333 
    334 void rf_ReconThread(struct rf_recon_req_internal *);
    335 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    336 void rf_CopybackThread(RF_Raid_t *raidPtr);
    337 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    338 int rf_autoconfig(device_t);
    339 void rf_buildroothack(RF_ConfigSet_t *);
    340 
    341 RF_AutoConfig_t *rf_find_raid_components(void);
    342 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    343 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    344 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    345 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    346 int rf_set_autoconfig(RF_Raid_t *, int);
    347 int rf_set_rootpartition(RF_Raid_t *, int);
    348 void rf_release_all_vps(RF_ConfigSet_t *);
    349 void rf_cleanup_config_set(RF_ConfigSet_t *);
    350 int rf_have_enough_components(RF_ConfigSet_t *);
    351 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    352 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    353 
    354 /*
    355  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    356  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    357  * in the kernel config file.
    358  */
    359 #ifdef RAID_AUTOCONFIG
    360 int raidautoconfig = 1;
    361 #else
    362 int raidautoconfig = 0;
    363 #endif
    364 static bool raidautoconfigdone = false;
    365 
    366 struct RF_Pools_s rf_pools;
    367 
    368 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    369 static kmutex_t raid_lock;
    370 
    371 static struct raid_softc *
    372 raidcreate(int unit) {
    373 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    374 	sc->sc_unit = unit;
    375 	cv_init(&sc->sc_cv, "raidunit");
    376 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    377 	return sc;
    378 }
    379 
/*
 * Tear down a softc created by raidcreate(): destroy the condvar and
 * mutex, then free the structure.  The caller must already have
 * removed `sc' from the global list (see raidput()).
 */
static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}
    386 
    387 static struct raid_softc *
    388 raidget(int unit, bool create) {
    389 	struct raid_softc *sc;
    390 	if (unit < 0) {
    391 #ifdef DIAGNOSTIC
    392 		panic("%s: unit %d!", __func__, unit);
    393 #endif
    394 		return NULL;
    395 	}
    396 	mutex_enter(&raid_lock);
    397 	LIST_FOREACH(sc, &raids, sc_link) {
    398 		if (sc->sc_unit == unit) {
    399 			mutex_exit(&raid_lock);
    400 			return sc;
    401 		}
    402 	}
    403 	mutex_exit(&raid_lock);
    404 	if (!create)
    405 		return NULL;
    406 	if ((sc = raidcreate(unit)) == NULL)
    407 		return NULL;
    408 	mutex_enter(&raid_lock);
    409 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    410 	mutex_exit(&raid_lock);
    411 	return sc;
    412 }
    413 
/*
 * Unlink `sc' from the global raids list (under raid_lock) and
 * release it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    421 
/*
 * Legacy pseudo-device attach hook; `num' is the count requested in
 * the kernel configuration.  Intentionally a no-op here.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    431 
    432 int
    433 rf_autoconfig(device_t self)
    434 {
    435 	RF_AutoConfig_t *ac_list;
    436 	RF_ConfigSet_t *config_sets;
    437 
    438 	if (!raidautoconfig || raidautoconfigdone == true)
    439 		return (0);
    440 
    441 	/* XXX This code can only be run once. */
    442 	raidautoconfigdone = true;
    443 
    444 #ifdef __HAVE_CPU_BOOTCONF
    445 	/*
    446 	 * 0. find the boot device if needed first so we can use it later
    447 	 * this needs to be done before we autoconfigure any raid sets,
    448 	 * because if we use wedges we are not going to be able to open
    449 	 * the boot device later
    450 	 */
    451 	if (booted_device == NULL)
    452 		cpu_bootconf();
    453 #endif
    454 	/* 1. locate all RAID components on the system */
    455 	aprint_debug("Searching for RAID components...\n");
    456 	ac_list = rf_find_raid_components();
    457 
    458 	/* 2. Sort them into their respective sets. */
    459 	config_sets = rf_create_auto_sets(ac_list);
    460 
    461 	/*
    462 	 * 3. Evaluate each set and configure the valid ones.
    463 	 * This gets done in rf_buildroothack().
    464 	 */
    465 	rf_buildroothack(config_sets);
    466 
    467 	return 1;
    468 }
    469 
    470 static int
    471 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    472 	const char *bootname = device_xname(bdv);
    473 	size_t len = strlen(bootname);
    474 
    475 	for (int col = 0; col < r->numCol; col++) {
    476 		const char *devname = r->Disks[col].devname;
    477 		devname += sizeof("/dev/") - 1;
    478 		if (strncmp(devname, "dk", 2) == 0) {
    479 			const char *parent =
    480 			    dkwedge_get_parent_name(r->Disks[col].dev);
    481 			if (parent != NULL)
    482 				devname = parent;
    483 		}
    484 		if (strncmp(devname, bootname, len) == 0) {
    485 			struct raid_softc *sc = r->softc;
    486 			aprint_debug("raid%d includes boot device %s\n",
    487 			    sc->sc_unit, devname);
    488 			return 1;
    489 		}
    490 	}
    491 	return 0;
    492 }
    493 
    494 void
    495 rf_buildroothack(RF_ConfigSet_t *config_sets)
    496 {
    497 	RF_ConfigSet_t *cset;
    498 	RF_ConfigSet_t *next_cset;
    499 	int num_root;
    500 	struct raid_softc *sc, *rsc;
    501 	struct dk_softc *dksc;
    502 
    503 	sc = rsc = NULL;
    504 	num_root = 0;
    505 	cset = config_sets;
    506 	while (cset != NULL) {
    507 		next_cset = cset->next;
    508 		if (rf_have_enough_components(cset) &&
    509 		    cset->ac->clabel->autoconfigure == 1) {
    510 			sc = rf_auto_config_set(cset);
    511 			if (sc != NULL) {
    512 				aprint_debug("raid%d: configured ok\n",
    513 				    sc->sc_unit);
    514 				if (cset->rootable) {
    515 					rsc = sc;
    516 					num_root++;
    517 				}
    518 			} else {
    519 				/* The autoconfig didn't work :( */
    520 				aprint_debug("Autoconfig failed\n");
    521 				rf_release_all_vps(cset);
    522 			}
    523 		} else {
    524 			/* we're not autoconfiguring this set...
    525 			   release the associated resources */
    526 			rf_release_all_vps(cset);
    527 		}
    528 		/* cleanup */
    529 		rf_cleanup_config_set(cset);
    530 		cset = next_cset;
    531 	}
    532 	dksc = &rsc->sc_dksc;
    533 
    534 	/* if the user has specified what the root device should be
    535 	   then we don't touch booted_device or boothowto... */
    536 
    537 	if (rootspec != NULL)
    538 		return;
    539 
    540 	/* we found something bootable... */
    541 
    542 	/*
    543 	 * XXX: The following code assumes that the root raid
    544 	 * is the first ('a') partition. This is about the best
    545 	 * we can do with a BSD disklabel, but we might be able
    546 	 * to do better with a GPT label, by setting a specified
    547 	 * attribute to indicate the root partition. We can then
    548 	 * stash the partition number in the r->root_partition
    549 	 * high bits (the bottom 2 bits are already used). For
    550 	 * now we just set booted_partition to 0 when we override
    551 	 * root.
    552 	 */
    553 	if (num_root == 1) {
    554 		device_t candidate_root;
    555 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    556 			char cname[sizeof(cset->ac->devname)];
    557 			/* XXX: assume partition 'a' first */
    558 			snprintf(cname, sizeof(cname), "%s%c",
    559 			    device_xname(dksc->sc_dev), 'a');
    560 			candidate_root = dkwedge_find_by_wname(cname);
    561 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    562 			    cname);
    563 			if (candidate_root == NULL) {
    564 				/*
    565 				 * If that is not found, because we don't use
    566 				 * disklabel, return the first dk child
    567 				 * XXX: we can skip the 'a' check above
    568 				 * and always do this...
    569 				 */
    570 				size_t i = 0;
    571 				candidate_root = dkwedge_find_by_parent(
    572 				    device_xname(dksc->sc_dev), &i);
    573 			}
    574 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    575 			    candidate_root);
    576 		} else
    577 			candidate_root = dksc->sc_dev;
    578 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    579 		DPRINTF("%s: booted_device=%p root_partition=%d "
    580 		   "contains_boot=%d\n", __func__, booted_device,
    581 		   rsc->sc_r.root_partition,
    582 		   rf_containsboot(&rsc->sc_r, booted_device));
    583 		if (booted_device == NULL ||
    584 		    rsc->sc_r.root_partition == 1 ||
    585 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    586 			booted_device = candidate_root;
    587 			booted_method = "raidframe/single";
    588 			booted_partition = 0;	/* XXX assume 'a' */
    589 		}
    590 	} else if (num_root > 1) {
    591 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    592 		    booted_device);
    593 
    594 		/*
    595 		 * Maybe the MD code can help. If it cannot, then
    596 		 * setroot() will discover that we have no
    597 		 * booted_device and will ask the user if nothing was
    598 		 * hardwired in the kernel config file
    599 		 */
    600 		if (booted_device == NULL)
    601 			return;
    602 
    603 		num_root = 0;
    604 		mutex_enter(&raid_lock);
    605 		LIST_FOREACH(sc, &raids, sc_link) {
    606 			RF_Raid_t *r = &sc->sc_r;
    607 			if (r->valid == 0)
    608 				continue;
    609 
    610 			if (r->root_partition == 0)
    611 				continue;
    612 
    613 			if (rf_containsboot(r, booted_device)) {
    614 				num_root++;
    615 				rsc = sc;
    616 				dksc = &rsc->sc_dksc;
    617 			}
    618 		}
    619 		mutex_exit(&raid_lock);
    620 
    621 		if (num_root == 1) {
    622 			booted_device = dksc->sc_dev;
    623 			booted_method = "raidframe/multi";
    624 			booted_partition = 0;	/* XXX assume 'a' */
    625 		} else {
    626 			/* we can't guess.. require the user to answer... */
    627 			boothowto |= RB_ASKNAME;
    628 		}
    629 	}
    630 }
    631 
    632 static int
    633 raidsize(dev_t dev)
    634 {
    635 	struct raid_softc *rs;
    636 	struct dk_softc *dksc;
    637 	unsigned int unit;
    638 
    639 	unit = raidunit(dev);
    640 	if ((rs = raidget(unit, false)) == NULL)
    641 		return -1;
    642 	dksc = &rs->sc_dksc;
    643 
    644 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    645 		return -1;
    646 
    647 	return dk_size(dksc, dev);
    648 }
    649 
    650 static int
    651 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    652 {
    653 	unsigned int unit;
    654 	struct raid_softc *rs;
    655 	struct dk_softc *dksc;
    656 
    657 	unit = raidunit(dev);
    658 	if ((rs = raidget(unit, false)) == NULL)
    659 		return ENXIO;
    660 	dksc = &rs->sc_dksc;
    661 
    662 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    663 		return ENODEV;
    664 
    665         /*
    666            Note that blkno is relative to this particular partition.
    667            By adding adding RF_PROTECTED_SECTORS, we get a value that
    668 	   is relative to the partition used for the underlying component.
    669         */
    670 	blkno += RF_PROTECTED_SECTORS;
    671 
    672 	return dk_dump(dksc, dev, blkno, va, size);
    673 }
    674 
/*
 * d_dumpblocks hook for the dk framework: write `nblk' blocks from
 * `va', starting at `blkno', directly to one live component of the
 * set.  Only RAID 1 layouts (one data column, one parity column) are
 * supported, since only then does a single component hold a complete
 * copy of the data.  Returns 0 on success or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if any)
			   this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump off to the chosen component's block driver. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    780 
    781 /* ARGSUSED */
    782 static int
    783 raidopen(dev_t dev, int flags, int fmt,
    784     struct lwp *l)
    785 {
    786 	int     unit = raidunit(dev);
    787 	struct raid_softc *rs;
    788 	struct dk_softc *dksc;
    789 	int     error = 0;
    790 	int     part, pmask;
    791 
    792 	if ((rs = raidget(unit, true)) == NULL)
    793 		return ENXIO;
    794 	if ((error = raidlock(rs)) != 0)
    795 		return (error);
    796 
    797 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    798 		error = EBUSY;
    799 		goto bad;
    800 	}
    801 
    802 	dksc = &rs->sc_dksc;
    803 
    804 	part = DISKPART(dev);
    805 	pmask = (1 << part);
    806 
    807 	if (!DK_BUSY(dksc, pmask) &&
    808 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    809 		/* First one... mark things as dirty... Note that we *MUST*
    810 		 have done a configure before this.  I DO NOT WANT TO BE
    811 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    812 		 THAT THEY BELONG TOGETHER!!!!! */
    813 		/* XXX should check to see if we're only open for reading
    814 		   here... If so, we needn't do this, but then need some
    815 		   other way of keeping track of what's happened.. */
    816 
    817 		rf_markalldirty(&rs->sc_r);
    818 	}
    819 
    820 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    821 		error = dk_open(dksc, dev, flags, fmt, l);
    822 
    823 bad:
    824 	raidunlock(rs);
    825 
    826 	return (error);
    827 
    828 
    829 }
    830 
    831 static int
    832 raid_lastclose(device_t self)
    833 {
    834 	struct raid_softc *rs = raidsoftc(self);
    835 
    836 	/* Last one... device is not unconfigured yet.
    837 	   Device shutdown has taken care of setting the
    838 	   clean bits if RAIDF_INITED is not set
    839 	   mark things as clean... */
    840 
    841 	rf_update_component_labels(&rs->sc_r,
    842 	    RF_FINAL_COMPONENT_UPDATE);
    843 
    844 	/* pass to unlocked code */
    845 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    846 		rs->sc_flags |= RAIDF_DETACH;
    847 
    848 	return 0;
    849 }
    850 
    851 /* ARGSUSED */
    852 static int
    853 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    854 {
    855 	int     unit = raidunit(dev);
    856 	struct raid_softc *rs;
    857 	struct dk_softc *dksc;
    858 	cfdata_t cf;
    859 	int     error = 0, do_detach = 0, do_put = 0;
    860 
    861 	if ((rs = raidget(unit, false)) == NULL)
    862 		return ENXIO;
    863 	dksc = &rs->sc_dksc;
    864 
    865 	if ((error = raidlock(rs)) != 0)
    866 		return (error);
    867 
    868 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    869 		error = dk_close(dksc, dev, flags, fmt, l);
    870 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    871 			do_detach = 1;
    872 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    873 		do_put = 1;
    874 
    875 	raidunlock(rs);
    876 
    877 	if (do_detach) {
    878 		/* free the pseudo device attach bits */
    879 		cf = device_cfdata(dksc->sc_dev);
    880 		error = config_detach(dksc->sc_dev, 0);
    881 		if (error == 0)
    882 			free(cf, M_RAIDFRAME);
    883 	} else if (do_put) {
    884 		raidput(rs);
    885 	}
    886 
    887 	return (error);
    888 
    889 }
    890 
/*
 * Wake the per-array I/O thread sleeping on iodone_cv so it will pick
 * up newly queued work; the lock/signal/unlock sequence guarantees the
 * wakeup is not lost.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    898 
    899 static void
    900 raidstrategy(struct buf *bp)
    901 {
    902 	unsigned int unit;
    903 	struct raid_softc *rs;
    904 	struct dk_softc *dksc;
    905 	RF_Raid_t *raidPtr;
    906 
    907 	unit = raidunit(bp->b_dev);
    908 	if ((rs = raidget(unit, false)) == NULL) {
    909 		bp->b_error = ENXIO;
    910 		goto fail;
    911 	}
    912 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    913 		bp->b_error = ENXIO;
    914 		goto fail;
    915 	}
    916 	dksc = &rs->sc_dksc;
    917 	raidPtr = &rs->sc_r;
    918 
    919 	/* Queue IO only */
    920 	if (dk_strategy_defer(dksc, bp))
    921 		goto done;
    922 
    923 	/* schedule the IO to happen at the next convenient time */
    924 	raid_wakeup(raidPtr);
    925 
    926 done:
    927 	return;
    928 
    929 fail:
    930 	bp->b_resid = bp->b_bcount;
    931 	biodone(bp);
    932 }
    933 
    934 static int
    935 raid_diskstart(device_t dev, struct buf *bp)
    936 {
    937 	struct raid_softc *rs = raidsoftc(dev);
    938 	RF_Raid_t *raidPtr;
    939 
    940 	raidPtr = &rs->sc_r;
    941 	if (!raidPtr->valid) {
    942 		db1_printf(("raid is not valid..\n"));
    943 		return ENODEV;
    944 	}
    945 
    946 	/* XXX */
    947 	bp->b_resid = 0;
    948 
    949 	return raiddoaccess(raidPtr, bp);
    950 }
    951 
/*
 * Completion callback for a RAIDframe access: report the finished
 * buffer to the dk layer, return the I/O slot to the pool of
 * "openings", and wake the I/O thread so it can start more work.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* openings is protected by the array mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    970 
    971 /* ARGSUSED */
    972 static int
    973 raidread(dev_t dev, struct uio *uio, int flags)
    974 {
    975 	int     unit = raidunit(dev);
    976 	struct raid_softc *rs;
    977 
    978 	if ((rs = raidget(unit, false)) == NULL)
    979 		return ENXIO;
    980 
    981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    982 		return (ENXIO);
    983 
    984 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    985 
    986 }
    987 
    988 /* ARGSUSED */
    989 static int
    990 raidwrite(dev_t dev, struct uio *uio, int flags)
    991 {
    992 	int     unit = raidunit(dev);
    993 	struct raid_softc *rs;
    994 
    995 	if ((rs = raidget(unit, false)) == NULL)
    996 		return ENXIO;
    997 
    998 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    999 		return (ENXIO);
   1000 
   1001 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1002 
   1003 }
   1004 
/*
 * Tear down a configured array.  Caller must NOT hold the softc lock
 * (rf_Shutdown and the disk-detach path may sleep).  Refuses (EBUSY)
 * while any partition is open or a recon/parity-rewrite/copyback is
 * running.  Returns 0 if the unit was already unconfigured.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* The shutdown request is being honored now; clear it. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1042 
   1043 static int
   1044 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1045 {
   1046 	int     unit = raidunit(dev);
   1047 	int     error = 0;
   1048 	int     part, pmask;
   1049 	struct raid_softc *rs;
   1050 	struct dk_softc *dksc;
   1051 	RF_Config_t *k_cfg, *u_cfg;
   1052 	RF_Raid_t *raidPtr;
   1053 	RF_RaidDisk_t *diskPtr;
   1054 	RF_AccTotals_t *totals;
   1055 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1056 	u_char *specific_buf;
   1057 	int retcode = 0;
   1058 	int column;
   1059 /*	int raidid; */
   1060 	struct rf_recon_req *rr;
   1061 	struct rf_recon_req_internal *rrint;
   1062 	RF_ComponentLabel_t *clabel;
   1063 	RF_ComponentLabel_t *ci_label;
   1064 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1065 	RF_SingleComponent_t component;
   1066 	int d;
   1067 
   1068 	if ((rs = raidget(unit, false)) == NULL)
   1069 		return ENXIO;
   1070 	dksc = &rs->sc_dksc;
   1071 	raidPtr = &rs->sc_r;
   1072 
   1073 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1074 		(int) DISKPART(dev), (int) unit, cmd));
   1075 
   1076 	/* Must be initialized for these... */
   1077 	switch (cmd) {
   1078 	case RAIDFRAME_REWRITEPARITY:
   1079 	case RAIDFRAME_GET_INFO:
   1080 	case RAIDFRAME_RESET_ACCTOTALS:
   1081 	case RAIDFRAME_GET_ACCTOTALS:
   1082 	case RAIDFRAME_KEEP_ACCTOTALS:
   1083 	case RAIDFRAME_GET_SIZE:
   1084 	case RAIDFRAME_FAIL_DISK:
   1085 	case RAIDFRAME_COPYBACK:
   1086 	case RAIDFRAME_CHECK_RECON_STATUS:
   1087 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1088 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1089 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1090 	case RAIDFRAME_ADD_HOT_SPARE:
   1091 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1092 	case RAIDFRAME_INIT_LABELS:
   1093 	case RAIDFRAME_REBUILD_IN_PLACE:
   1094 	case RAIDFRAME_CHECK_PARITY:
   1095 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1096 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1097 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1098 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1099 	case RAIDFRAME_SET_AUTOCONFIG:
   1100 	case RAIDFRAME_SET_ROOT:
   1101 	case RAIDFRAME_DELETE_COMPONENT:
   1102 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1103 	case RAIDFRAME_PARITYMAP_STATUS:
   1104 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1105 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1106 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1107 #ifdef COMPAT_50
   1108 	case RAIDFRAME_GET_INFO50:
   1109 #endif
   1110 #ifdef COMPAT_80
   1111 	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
   1112 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
   1113 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
   1114 	case RAIDFRAME_GET_INFO80:
   1115 	case RAIDFRAME_GET_COMPONENT_LABEL80:
   1116 #endif
   1117 #ifdef COMPAT_NETBSD32
   1118 #ifdef _LP64
   1119 	case RAIDFRAME_GET_INFO32:
   1120 #endif
   1121 #endif
   1122 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1123 			return (ENXIO);
   1124 	}
   1125 
   1126 	switch (cmd) {
   1127 #ifdef COMPAT_50
   1128 	case RAIDFRAME_GET_INFO50:
   1129 		return rf_get_info50(raidPtr, data);
   1130 
   1131 	case RAIDFRAME_CONFIGURE50:
   1132 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1133 			return retcode;
   1134 		goto config;
   1135 #endif
   1136 
   1137 #ifdef COMPAT_80
   1138 	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
   1139 		return rf_check_recon_status_ext80(raidPtr, data);
   1140 
   1141 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
   1142 		return rf_check_parityrewrite_status_ext80(raidPtr, data);
   1143 
   1144 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
   1145 		return rf_check_copyback_status_ext80(raidPtr, data);
   1146 
   1147 	case RAIDFRAME_GET_INFO80:
   1148 		return rf_get_info80(raidPtr, data);
   1149 
   1150 	case RAIDFRAME_GET_COMPONENT_LABEL80:
   1151 		return rf_get_component_label80(raidPtr, data);
   1152 
   1153 	case RAIDFRAME_CONFIGURE80:
   1154 		if ((retcode = rf_config80(raidPtr, unit, data, &k_cfg)) != 0)
   1155 			return retcode;
   1156 		goto config;
   1157 #endif
   1158 
   1159 		/* configure the system */
   1160 	case RAIDFRAME_CONFIGURE:
   1161 #ifdef COMPAT_NETBSD32
   1162 #ifdef _LP64
   1163 	case RAIDFRAME_CONFIGURE32:
   1164 #endif
   1165 #endif
   1166 
   1167 		if (raidPtr->valid) {
   1168 			/* There is a valid RAID set running on this unit! */
   1169 			printf("raid%d: Device already configured!\n",unit);
   1170 			return(EINVAL);
   1171 		}
   1172 
   1173 		/* copy-in the configuration information */
   1174 		/* data points to a pointer to the configuration structure */
   1175 
   1176 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1177 		if (k_cfg == NULL) {
   1178 			return (ENOMEM);
   1179 		}
   1180 #ifdef COMPAT_NETBSD32
   1181 #ifdef _LP64
   1182 		if (cmd == RAIDFRAME_CONFIGURE32 &&
   1183 		    (l->l_proc->p_flag & PK_32) != 0)
   1184 			retcode = rf_config_netbsd32(data, k_cfg);
   1185 		else
   1186 #endif
   1187 #endif
   1188 		{
   1189 			u_cfg = *((RF_Config_t **) data);
   1190 			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1191 		}
   1192 		if (retcode) {
   1193 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1194 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1195 				retcode));
   1196 			goto no_config;
   1197 		}
   1198 		goto config;
   1199 	config:
   1200 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1201 
   1202 		/* allocate a buffer for the layout-specific data, and copy it
   1203 		 * in */
   1204 		if (k_cfg->layoutSpecificSize) {
   1205 			if (k_cfg->layoutSpecificSize > 10000) {
   1206 				/* sanity check */
   1207 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1208 				retcode = EINVAL;
   1209 				goto no_config;
   1210 			}
   1211 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1212 			    (u_char *));
   1213 			if (specific_buf == NULL) {
   1214 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1215 				retcode = ENOMEM;
   1216 				goto no_config;
   1217 			}
   1218 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1219 			    k_cfg->layoutSpecificSize);
   1220 			if (retcode) {
   1221 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1222 				RF_Free(specific_buf,
   1223 					k_cfg->layoutSpecificSize);
   1224 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1225 					retcode));
   1226 				goto no_config;
   1227 			}
   1228 		} else
   1229 			specific_buf = NULL;
   1230 		k_cfg->layoutSpecific = specific_buf;
   1231 
   1232 		/* should do some kind of sanity check on the configuration.
   1233 		 * Store the sum of all the bytes in the last byte? */
   1234 
   1235 		/* configure the system */
   1236 
   1237 		/*
   1238 		 * Clear the entire RAID descriptor, just to make sure
   1239 		 *  there is no stale data left in the case of a
   1240 		 *  reconfiguration
   1241 		 */
   1242 		memset(raidPtr, 0, sizeof(*raidPtr));
   1243 		raidPtr->softc = rs;
   1244 		raidPtr->raidid = unit;
   1245 
   1246 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1247 
   1248 		if (retcode == 0) {
   1249 
   1250 			/* allow this many simultaneous IO's to
   1251 			   this RAID device */
   1252 			raidPtr->openings = RAIDOUTSTANDING;
   1253 
   1254 			raidinit(rs);
   1255 			raid_wakeup(raidPtr);
   1256 			rf_markalldirty(raidPtr);
   1257 		}
   1258 		/* free the buffers.  No return code here. */
   1259 		if (k_cfg->layoutSpecificSize) {
   1260 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1261 		}
   1262 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1263 
   1264 	no_config:
   1265 		/*
   1266 		 * If configuration failed, set sc_flags so that we
   1267 		 * will detach the device when we close it.
   1268 		 */
   1269 		if (retcode != 0)
   1270 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1271 		return (retcode);
   1272 
   1273 		/* shutdown the system */
   1274 	case RAIDFRAME_SHUTDOWN:
   1275 
   1276 		part = DISKPART(dev);
   1277 		pmask = (1 << part);
   1278 
   1279 		if ((error = raidlock(rs)) != 0)
   1280 			return (error);
   1281 
   1282 		if (DK_BUSY(dksc, pmask) ||
   1283 		    raidPtr->recon_in_progress != 0 ||
   1284 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1285 		    raidPtr->copyback_in_progress != 0)
   1286 			retcode = EBUSY;
   1287 		else {
   1288 			/* detach and free on close */
   1289 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1290 			retcode = 0;
   1291 		}
   1292 
   1293 		raidunlock(rs);
   1294 
   1295 		return (retcode);
   1296 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1297 		return rf_get_component_label(raidPtr, data);
   1298 
   1299 #if 0
   1300 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1301 		clabel = (RF_ComponentLabel_t *) data;
   1302 
   1303 		/* XXX check the label for valid stuff... */
   1304 		/* Note that some things *should not* get modified --
   1305 		   the user should be re-initing the labels instead of
   1306 		   trying to patch things.
   1307 		   */
   1308 
   1309 		raidid = raidPtr->raidid;
   1310 #ifdef DEBUG
   1311 		printf("raid%d: Got component label:\n", raidid);
   1312 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1313 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1314 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1315 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1316 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1317 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1318 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1319 #endif
   1320 		clabel->row = 0;
   1321 		column = clabel->column;
   1322 
   1323 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1324 			return(EINVAL);
   1325 		}
   1326 
   1327 		/* XXX this isn't allowed to do anything for now :-) */
   1328 
   1329 		/* XXX and before it is, we need to fill in the rest
   1330 		   of the fields!?!?!?! */
   1331 		memcpy(raidget_component_label(raidPtr, column),
   1332 		    clabel, sizeof(*clabel));
   1333 		raidflush_component_label(raidPtr, column);
   1334 		return (0);
   1335 #endif
   1336 
   1337 	case RAIDFRAME_INIT_LABELS:
   1338 		clabel = (RF_ComponentLabel_t *) data;
   1339 		/*
   1340 		   we only want the serial number from
   1341 		   the above.  We get all the rest of the information
   1342 		   from the config that was used to create this RAID
   1343 		   set.
   1344 		   */
   1345 
   1346 		raidPtr->serial_number = clabel->serial_number;
   1347 
   1348 		for(column=0;column<raidPtr->numCol;column++) {
   1349 			diskPtr = &raidPtr->Disks[column];
   1350 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1351 				ci_label = raidget_component_label(raidPtr,
   1352 				    column);
   1353 				/* Zeroing this is important. */
   1354 				memset(ci_label, 0, sizeof(*ci_label));
   1355 				raid_init_component_label(raidPtr, ci_label);
   1356 				ci_label->serial_number =
   1357 				    raidPtr->serial_number;
   1358 				ci_label->row = 0; /* we dont' pretend to support more */
   1359 				rf_component_label_set_partitionsize(ci_label,
   1360 				    diskPtr->partitionSize);
   1361 				ci_label->column = column;
   1362 				raidflush_component_label(raidPtr, column);
   1363 			}
   1364 			/* XXXjld what about the spares? */
   1365 		}
   1366 
   1367 		return (retcode);
   1368 	case RAIDFRAME_SET_AUTOCONFIG:
   1369 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1370 		printf("raid%d: New autoconfig value is: %d\n",
   1371 		       raidPtr->raidid, d);
   1372 		*(int *) data = d;
   1373 		return (retcode);
   1374 
   1375 	case RAIDFRAME_SET_ROOT:
   1376 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1377 		printf("raid%d: New rootpartition value is: %d\n",
   1378 		       raidPtr->raidid, d);
   1379 		*(int *) data = d;
   1380 		return (retcode);
   1381 
   1382 		/* initialize all parity */
   1383 	case RAIDFRAME_REWRITEPARITY:
   1384 
   1385 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1386 			/* Parity for RAID 0 is trivially correct */
   1387 			raidPtr->parity_good = RF_RAID_CLEAN;
   1388 			return(0);
   1389 		}
   1390 
   1391 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1392 			/* Re-write is already in progress! */
   1393 			return(EINVAL);
   1394 		}
   1395 
   1396 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1397 					   rf_RewriteParityThread,
   1398 					   raidPtr,"raid_parity");
   1399 		return (retcode);
   1400 
   1401 
   1402 	case RAIDFRAME_ADD_HOT_SPARE:
   1403 		sparePtr = (RF_SingleComponent_t *) data;
   1404 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1405 		retcode = rf_add_hot_spare(raidPtr, &component);
   1406 		return(retcode);
   1407 
   1408 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1409 		return(retcode);
   1410 
   1411 	case RAIDFRAME_DELETE_COMPONENT:
   1412 		componentPtr = (RF_SingleComponent_t *)data;
   1413 		memcpy( &component, componentPtr,
   1414 			sizeof(RF_SingleComponent_t));
   1415 		retcode = rf_delete_component(raidPtr, &component);
   1416 		return(retcode);
   1417 
   1418 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1419 		componentPtr = (RF_SingleComponent_t *)data;
   1420 		memcpy( &component, componentPtr,
   1421 			sizeof(RF_SingleComponent_t));
   1422 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1423 		return(retcode);
   1424 
   1425 	case RAIDFRAME_REBUILD_IN_PLACE:
   1426 
   1427 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1428 			/* Can't do this on a RAID 0!! */
   1429 			return(EINVAL);
   1430 		}
   1431 
   1432 		if (raidPtr->recon_in_progress == 1) {
   1433 			/* a reconstruct is already in progress! */
   1434 			return(EINVAL);
   1435 		}
   1436 
   1437 		componentPtr = (RF_SingleComponent_t *) data;
   1438 		memcpy( &component, componentPtr,
   1439 			sizeof(RF_SingleComponent_t));
   1440 		component.row = 0; /* we don't support any more */
   1441 		column = component.column;
   1442 
   1443 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1444 			return(EINVAL);
   1445 		}
   1446 
   1447 		rf_lock_mutex2(raidPtr->mutex);
   1448 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1449 		    (raidPtr->numFailures > 0)) {
   1450 			/* XXX 0 above shouldn't be constant!!! */
   1451 			/* some component other than this has failed.
   1452 			   Let's not make things worse than they already
   1453 			   are... */
   1454 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1455 			       raidPtr->raidid);
   1456 			printf("raid%d:     Col: %d   Too many failures.\n",
   1457 			       raidPtr->raidid, column);
   1458 			rf_unlock_mutex2(raidPtr->mutex);
   1459 			return (EINVAL);
   1460 		}
   1461 		if (raidPtr->Disks[column].status ==
   1462 		    rf_ds_reconstructing) {
   1463 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1464 			       raidPtr->raidid);
   1465 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1466 
   1467 			rf_unlock_mutex2(raidPtr->mutex);
   1468 			return (EINVAL);
   1469 		}
   1470 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1471 			rf_unlock_mutex2(raidPtr->mutex);
   1472 			return (EINVAL);
   1473 		}
   1474 		rf_unlock_mutex2(raidPtr->mutex);
   1475 
   1476 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1477 		if (rrint == NULL)
   1478 			return(ENOMEM);
   1479 
   1480 		rrint->col = column;
   1481 		rrint->raidPtr = raidPtr;
   1482 
   1483 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1484 					   rf_ReconstructInPlaceThread,
   1485 					   rrint, "raid_reconip");
   1486 		return(retcode);
   1487 
   1488 	case RAIDFRAME_GET_INFO:
   1489 #ifdef COMPAT_NETBSD32
   1490 #ifdef _LP64
   1491 	case RAIDFRAME_GET_INFO32:
   1492 #endif
   1493 #endif
   1494 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1495 			  (RF_DeviceConfig_t *));
   1496 		if (d_cfg == NULL)
   1497 			return (ENOMEM);
   1498 		retcode = rf_get_info(raidPtr, d_cfg);
   1499 		if (retcode == 0) {
   1500 #ifdef COMPAT_NETBSD32
   1501 #ifdef _LP64
   1502 			if (cmd == RAIDFRAME_GET_INFO32)
   1503 				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
   1504 			else
   1505 #endif
   1506 #endif
   1507 				ucfgp = *(RF_DeviceConfig_t **)data;
   1508 			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
   1509 		}
   1510 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1511 
   1512 		return (retcode);
   1513 
   1514 	case RAIDFRAME_CHECK_PARITY:
   1515 		*(int *) data = raidPtr->parity_good;
   1516 		return (0);
   1517 
   1518 	case RAIDFRAME_PARITYMAP_STATUS:
   1519 		if (rf_paritymap_ineligible(raidPtr))
   1520 			return EINVAL;
   1521 		rf_paritymap_status(raidPtr->parity_map,
   1522 		    (struct rf_pmstat *)data);
   1523 		return 0;
   1524 
   1525 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1526 		if (rf_paritymap_ineligible(raidPtr))
   1527 			return EINVAL;
   1528 		if (raidPtr->parity_map == NULL)
   1529 			return ENOENT; /* ??? */
   1530 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1531 			(struct rf_pmparams *)data, 1))
   1532 			return EINVAL;
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1539 		return 0;
   1540 
   1541 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1542 		if (rf_paritymap_ineligible(raidPtr))
   1543 			return EINVAL;
   1544 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1545 		/* XXX should errors be passed up? */
   1546 		return 0;
   1547 
   1548 	case RAIDFRAME_RESET_ACCTOTALS:
   1549 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1550 		return (0);
   1551 
   1552 	case RAIDFRAME_GET_ACCTOTALS:
   1553 		totals = (RF_AccTotals_t *) data;
   1554 		*totals = raidPtr->acc_totals;
   1555 		return (0);
   1556 
   1557 	case RAIDFRAME_KEEP_ACCTOTALS:
   1558 		raidPtr->keep_acc_totals = *(int *)data;
   1559 		return (0);
   1560 
   1561 	case RAIDFRAME_GET_SIZE:
   1562 		*(int *) data = raidPtr->totalSectors;
   1563 		return (0);
   1564 
   1565 		/* fail a disk & optionally start reconstruction */
   1566 	case RAIDFRAME_FAIL_DISK:
   1567 #ifdef COMPAT_80
   1568 	case RAIDFRAME_FAIL_DISK80:
   1569 #endif
   1570 
   1571 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1572 			/* Can't do this on a RAID 0!! */
   1573 			return(EINVAL);
   1574 		}
   1575 
   1576 		rr = (struct rf_recon_req *) data;
   1577 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1578 			return (EINVAL);
   1579 
   1580 		rf_lock_mutex2(raidPtr->mutex);
   1581 		if (raidPtr->status == rf_rs_reconstructing) {
   1582 			/* you can't fail a disk while we're reconstructing! */
   1583 			/* XXX wrong for RAID6 */
   1584 			rf_unlock_mutex2(raidPtr->mutex);
   1585 			return (EINVAL);
   1586 		}
   1587 		if ((raidPtr->Disks[rr->col].status ==
   1588 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1589 			/* some other component has failed.  Let's not make
   1590 			   things worse. XXX wrong for RAID6 */
   1591 			rf_unlock_mutex2(raidPtr->mutex);
   1592 			return (EINVAL);
   1593 		}
   1594 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1595 			/* Can't fail a spared disk! */
   1596 			rf_unlock_mutex2(raidPtr->mutex);
   1597 			return (EINVAL);
   1598 		}
   1599 		rf_unlock_mutex2(raidPtr->mutex);
   1600 
   1601 		/* make a copy of the recon request so that we don't rely on
   1602 		 * the user's buffer */
   1603 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1604 		if (rrint == NULL)
   1605 			return(ENOMEM);
   1606 		rrint->col = rr->col;
   1607 		rrint->flags = rr->flags;
   1608 		rrint->raidPtr = raidPtr;
   1609 
   1610 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1611 					   rf_ReconThread,
   1612 					   rrint, "raid_recon");
   1613 		return (0);
   1614 
   1615 		/* invoke a copyback operation after recon on whatever disk
   1616 		 * needs it, if any */
   1617 	case RAIDFRAME_COPYBACK:
   1618 
   1619 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1620 			/* This makes no sense on a RAID 0!! */
   1621 			return(EINVAL);
   1622 		}
   1623 
   1624 		if (raidPtr->copyback_in_progress == 1) {
   1625 			/* Copyback is already in progress! */
   1626 			return(EINVAL);
   1627 		}
   1628 
   1629 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1630 					   rf_CopybackThread,
   1631 					   raidPtr,"raid_copyback");
   1632 		return (retcode);
   1633 
   1634 		/* return the percentage completion of reconstruction */
   1635 	case RAIDFRAME_CHECK_RECON_STATUS:
   1636 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1637 			/* This makes no sense on a RAID 0, so tell the
   1638 			   user it's done. */
   1639 			*(int *) data = 100;
   1640 			return(0);
   1641 		}
   1642 		if (raidPtr->status != rf_rs_reconstructing)
   1643 			*(int *) data = 100;
   1644 		else {
   1645 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1646 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1647 			} else {
   1648 				*(int *) data = 0;
   1649 			}
   1650 		}
   1651 		return (0);
   1652 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1653 		rf_check_recon_status_ext(raidPtr, data);
   1654 		return (0);
   1655 
   1656 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1657 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1658 			/* This makes no sense on a RAID 0, so tell the
   1659 			   user it's done. */
   1660 			*(int *) data = 100;
   1661 			return(0);
   1662 		}
   1663 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1664 			*(int *) data = 100 *
   1665 				raidPtr->parity_rewrite_stripes_done /
   1666 				raidPtr->Layout.numStripe;
   1667 		} else {
   1668 			*(int *) data = 100;
   1669 		}
   1670 		return (0);
   1671 
   1672 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1673 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1674 		return (0);
   1675 
   1676 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1677 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1678 			/* This makes no sense on a RAID 0 */
   1679 			*(int *) data = 100;
   1680 			return(0);
   1681 		}
   1682 		if (raidPtr->copyback_in_progress == 1) {
   1683 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1684 				raidPtr->Layout.numStripe;
   1685 		} else {
   1686 			*(int *) data = 100;
   1687 		}
   1688 		return (0);
   1689 
   1690 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1691 		rf_check_copyback_status_ext(raidPtr, data);
   1692 		return 0;
   1693 
   1694 	case RAIDFRAME_SET_LAST_UNIT:
   1695 		for (column = 0; column < raidPtr->numCol; column++)
   1696 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1697 				return EBUSY;
   1698 
   1699 		for (column = 0; column < raidPtr->numCol; column++) {
   1700 			clabel = raidget_component_label(raidPtr, column);
   1701 			clabel->last_unit = *(int *)data;
   1702 			raidflush_component_label(raidPtr, column);
   1703 		}
   1704 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1705 		return 0;
   1706 
   1707 		/* the sparetable daemon calls this to wait for the kernel to
   1708 		 * need a spare table. this ioctl does not return until a
   1709 		 * spare table is needed. XXX -- calling mpsleep here in the
   1710 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1711 		 * -- I should either compute the spare table in the kernel,
   1712 		 * or have a different -- XXX XXX -- interface (a different
   1713 		 * character device) for delivering the table     -- XXX */
   1714 #if 0
   1715 	case RAIDFRAME_SPARET_WAIT:
   1716 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1717 		while (!rf_sparet_wait_queue)
   1718 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1719 		waitreq = rf_sparet_wait_queue;
   1720 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1721 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1722 
   1723 		/* structure assignment */
   1724 		*((RF_SparetWait_t *) data) = *waitreq;
   1725 
   1726 		RF_Free(waitreq, sizeof(*waitreq));
   1727 		return (0);
   1728 
   1729 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1730 		 * code in it that will cause the dameon to exit */
   1731 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1732 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1733 		waitreq->fcol = -1;
   1734 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1735 		waitreq->next = rf_sparet_wait_queue;
   1736 		rf_sparet_wait_queue = waitreq;
   1737 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1738 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1739 		return (0);
   1740 
   1741 		/* used by the spare table daemon to deliver a spare table
   1742 		 * into the kernel */
   1743 	case RAIDFRAME_SEND_SPARET:
   1744 
   1745 		/* install the spare table */
   1746 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1747 
   1748 		/* respond to the requestor.  the return status of the spare
   1749 		 * table installation is passed in the "fcol" field */
   1750 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1751 		waitreq->fcol = retcode;
   1752 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1753 		waitreq->next = rf_sparet_resp_queue;
   1754 		rf_sparet_resp_queue = waitreq;
   1755 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1756 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1757 
   1758 		return (retcode);
   1759 #endif
   1760 
   1761 	default:
   1762 		break; /* fall through to the os-specific code below */
   1763 
   1764 	}
   1765 
   1766 	if (!raidPtr->valid)
   1767 		return (EINVAL);
   1768 
   1769 	/*
   1770 	 * Add support for "regular" device ioctls here.
   1771 	 */
   1772 
   1773 	switch (cmd) {
   1774 	case DIOCGCACHE:
   1775 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1776 		break;
   1777 
   1778 	case DIOCCACHESYNC:
   1779 		retcode = rf_sync_component_caches(raidPtr);
   1780 		break;
   1781 
   1782 	default:
   1783 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1784 		break;
   1785 	}
   1786 
   1787 	return (retcode);
   1788 
   1789 }
   1790 
   1791 
   1792 /* raidinit -- complete the rest of the initialization for the
   1793    RAIDframe device.  */
   1794 
   1795 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* snprintf bounds the write; truncation only if unit is huge. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: unit stays unusable (RAIDF_INITED not set). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Probe for wedges (GPT partitions etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1851 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 *
 * Returns the fcol value from the daemon's response entry and frees
 * that response entry (which is a different allocation than the
 * request passed in).
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	/* push the request onto the wait queue and wake the daemon */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* block until a response shows up on the response queue */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* dequeue the response; note `req' is reused to point at it */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
#endif
   1886 
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the raid mutex around the label update and
		 * retake it before decrementing the counter */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to issue I/O until raidinit() has completed */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* push any queued buffers through the dk layer */
	dk_start(dksc, NULL);
}
   1921 
/*
 * Validate a buf against the array geometry and hand it to rf_DoAccess
 * as a non-blocking (async) request.  Returns EAGAIN when no openings
 * are available, ENOSPC for out-of-range or non-sector-multiple
 * requests, otherwise the return value of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* no openings left: tell the caller to retry later */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this debug block on regardless
	 * of rf_debugKernelAccess -- looks like leftover debugging. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject requests past the end of the array; the "sum <" checks
	 * also catch wrap-around in the addition above */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* partial-sector transfers are not supported */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening for this access */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1994 
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
/*
 * Dispatch one RF_DiskQueueData_t request to the underlying component
 * device.  NOP requests complete immediately via KernelWakeupFunc();
 * READ/WRITE requests are turned into a buf and sent down with
 * bdev_strategy().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* (the doubled parens around the printf argument are
		 * harmless -- it's just a parenthesized expression) */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately through the normal
		 * completion path */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* build the buf for the component I/O; completion goes
		 * through KernelWakeupFunc() */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.

   Runs as the buf's b_iodone handler: records timing stats, marks the
   component failed on I/O error (unless doing so would exceed the
   fault tolerance of the set), then queues the request on the raid's
   iodone list and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP()/dispatch */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2139 
   2140 
   2141 /*
   2142  * initialize a buf structure for doing an I/O in the kernel.
   2143  */
   2144 static void
   2145 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2146        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2147        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2148        struct proc *b_proc)
   2149 {
   2150 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2151 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2152 	bp->b_oflags = 0;
   2153 	bp->b_cflags = 0;
   2154 	bp->b_bcount = numSect << logBytesPerSector;
   2155 	bp->b_bufsize = bp->b_bcount;
   2156 	bp->b_error = 0;
   2157 	bp->b_dev = dev;
   2158 	bp->b_data = bf;
   2159 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2160 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2161 	if (bp->b_bcount == 0) {
   2162 		panic("bp->b_bcount is zero in InitBP!!");
   2163 	}
   2164 	bp->b_proc = b_proc;
   2165 	bp->b_iodone = cbFunc;
   2166 	bp->b_private = cbArg;
   2167 }
   2168 
   2169 /*
   2170  * Wait interruptibly for an exclusive lock.
   2171  *
   2172  * XXX
   2173  * Several drivers do this; it should be abstracted and made MP-safe.
   2174  * (Hmm... where have we seen this warning before :->  GO )
   2175  */
   2176 static int
   2177 raidlock(struct raid_softc *rs)
   2178 {
   2179 	int     error;
   2180 
   2181 	error = 0;
   2182 	mutex_enter(&rs->sc_mutex);
   2183 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2184 		rs->sc_flags |= RAIDF_WANTED;
   2185 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2186 		if (error != 0)
   2187 			goto done;
   2188 	}
   2189 	rs->sc_flags |= RAIDF_LOCKED;
   2190 done:
   2191 	mutex_exit(&rs->sc_mutex);
   2192 	return (error);
   2193 }
   2194 /*
   2195  * Unlock and wake up any waiters.
   2196  */
   2197 static void
   2198 raidunlock(struct raid_softc *rs)
   2199 {
   2200 
   2201 	mutex_enter(&rs->sc_mutex);
   2202 	rs->sc_flags &= ~RAIDF_LOCKED;
   2203 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2204 		rs->sc_flags &= ~RAIDF_WANTED;
   2205 		cv_broadcast(&rs->sc_cv);
   2206 	}
   2207 	mutex_exit(&rs->sc_mutex);
   2208 }
   2209 
   2210 
/* On-disk layout of per-component metadata: the component label lives
   RF_COMPONENT_INFO_OFFSET bytes into each component, in an area of at
   least RF_COMPONENT_INFO_SIZE bytes; the parity map follows it (see
   rf_parity_map_offset()/rf_parity_map_size()). */
#define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
#define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
#define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2214 
/* Byte offset of the component label area from the start of a component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2221 
   2222 static daddr_t
   2223 rf_component_info_size(unsigned secsize)
   2224 {
   2225 	daddr_t info_size;
   2226 
   2227 	KASSERT(secsize);
   2228 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2229 		info_size = secsize;
   2230 	else
   2231 		info_size = RF_COMPONENT_INFO_SIZE;
   2232 
   2233 	return info_size;
   2234 }
   2235 
   2236 static daddr_t
   2237 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2238 {
   2239 	daddr_t map_offset;
   2240 
   2241 	KASSERT(raidPtr->bytesPerSector);
   2242 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2243 		map_offset = raidPtr->bytesPerSector;
   2244 	else
   2245 		map_offset = RF_COMPONENT_INFO_SIZE;
   2246 	map_offset += rf_component_info_offset();
   2247 
   2248 	return map_offset;
   2249 }
   2250 
   2251 static daddr_t
   2252 rf_parity_map_size(RF_Raid_t *raidPtr)
   2253 {
   2254 	daddr_t map_size;
   2255 
   2256 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2257 		map_size = raidPtr->bytesPerSector;
   2258 	else
   2259 		map_size = RF_PARITY_MAP_SIZE;
   2260 
   2261 	return map_size;
   2262 }
   2263 
   2264 int
   2265 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2266 {
   2267 	RF_ComponentLabel_t *clabel;
   2268 
   2269 	clabel = raidget_component_label(raidPtr, col);
   2270 	clabel->clean = RF_RAID_CLEAN;
   2271 	raidflush_component_label(raidPtr, col);
   2272 	return(0);
   2273 }
   2274 
   2275 
   2276 int
   2277 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2278 {
   2279 	RF_ComponentLabel_t *clabel;
   2280 
   2281 	clabel = raidget_component_label(raidPtr, col);
   2282 	clabel->clean = RF_RAID_DIRTY;
   2283 	raidflush_component_label(raidPtr, col);
   2284 	return(0);
   2285 }
   2286 
/* Read the on-disk component label for `col' into the in-core copy
   (raid_cinfo[col].ci_label).  Returns the read error, if any. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2296 
/* Return a pointer to the in-core component label for `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2302 
/* Write the in-core component label for `col' to disk, stamping it
   with the raid's current mod_counter first.  Returns the write
   error, if any. */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod count in sync with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2317 
   2318 
/* Read a component label from the fixed label area on a component.
   Thin wrapper around raidread_component_area() with the label's
   offset and (sector-size-dependent) area size. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2328 
/* ARGSUSED */
/*
 * Read `dsize' bytes of metadata from byte offset `offset' on the
 * component and copy the first `msize' bytes into `data'.  Reads
 * synchronously via a scratch buffer; returns the biowait() error.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read of the metadata area */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		/* only the caller's structure size is copied out */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2366 
   2367 
/* Write a component label to the fixed label area on a component.
   Thin wrapper around raidwrite_component_area(); always synchronous
   (asyncp == 0). */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2377 
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' (zero-padded out to `dsize') at
 * byte offset `offset' on the component.  When `asyncp' is set the
 * write is issued B_ASYNC and 0 is returned immediately without
 * waiting (NOTE(review): the buffer is presumably released by the
 * async completion path rather than here -- confirm); otherwise the
 * biowait() error is returned.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* pad the area with zeros beyond the caller's structure */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2412 
   2413 void
   2414 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2415 {
   2416 	int c;
   2417 
   2418 	for (c = 0; c < raidPtr->numCol; c++) {
   2419 		/* Skip dead disks. */
   2420 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2421 			continue;
   2422 		/* XXXjld: what if an error occurs here? */
   2423 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2424 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2425 		    RF_PARITYMAP_NBYTE,
   2426 		    rf_parity_map_offset(raidPtr),
   2427 		    rf_parity_map_size(raidPtr), 0);
   2428 	}
   2429 }
   2430 
   2431 void
   2432 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2433 {
   2434 	struct rf_paritymap_ondisk tmp;
   2435 	int c,first;
   2436 
   2437 	first=1;
   2438 	for (c = 0; c < raidPtr->numCol; c++) {
   2439 		/* Skip dead disks. */
   2440 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2441 			continue;
   2442 		raidread_component_area(raidPtr->Disks[c].dev,
   2443 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2444 		    RF_PARITYMAP_NBYTE,
   2445 		    rf_parity_map_offset(raidPtr),
   2446 		    rf_parity_map_size(raidPtr));
   2447 		if (first) {
   2448 			memcpy(map, &tmp, sizeof(*map));
   2449 			first = 0;
   2450 		} else {
   2451 			rf_paritymap_merge(map, &tmp);
   2452 		}
   2453 	}
   2454 }
   2455 
/*
 * Bump the raid's mod_counter and mark the component label of every
 * live component (and every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* spares live at columns numCol .. numCol+numSpare-1 */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			/* NOTE(review): if no column claims this spare,
			   scol keeps its previous value (initially -1)
			   and is written to clabel->column below --
			   confirm that case cannot happen here */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2515 
   2516 
/*
 * Push fresh component labels to all optimal components and in-use
 * spares, bumping mod_counter first.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known clean, also mark the
 * labels clean (shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* spares live at columns numCol .. numCol+numSpare-1 */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced */
			/* NOTE(review): if no column claims this spare,
			   scol stays -1 and ends up in clabel->column
			   below -- confirm that case cannot happen */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2594 
   2595 void
   2596 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2597 {
   2598 
   2599 	if (vp != NULL) {
   2600 		if (auto_configured == 1) {
   2601 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2602 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2603 			vput(vp);
   2604 
   2605 		} else {
   2606 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2607 		}
   2608 	}
   2609 }
   2610 
   2611 
   2612 void
   2613 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2614 {
   2615 	int r,c;
   2616 	struct vnode *vp;
   2617 	int acd;
   2618 
   2619 
   2620 	/* We take this opportunity to close the vnodes like we should.. */
   2621 
   2622 	for (c = 0; c < raidPtr->numCol; c++) {
   2623 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2624 		acd = raidPtr->Disks[c].auto_configured;
   2625 		rf_close_component(raidPtr, vp, acd);
   2626 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2627 		raidPtr->Disks[c].auto_configured = 0;
   2628 	}
   2629 
   2630 	for (r = 0; r < raidPtr->numSpare; r++) {
   2631 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2632 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2633 		rf_close_component(raidPtr, vp, acd);
   2634 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2635 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2636 	}
   2637 }
   2638 
   2639 
/*
 * Kernel thread body: fail a disk (optionally starting reconstruction
 * to a spare, per RF_FDFLAGS_RECON), free the request, and exit.
 * recon_in_progress brackets the operation.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* the request was allocated by our creator; we own it now */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2661 
/*
 * Kernel thread body: rewrite all parity, set parity_good on success,
 * and wake anyone waiting in shutdown for the rewrite to finish.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2694 
   2695 
/*
 * Kernel thread body: copy reconstructed data from spares back to
 * replaced components, then exit.  copyback_in_progress brackets the
 * operation.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2710 
   2711 
/*
 * Kernel thread body: reconstruct a component in place (onto the
 * same column), free the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* the request was allocated by our creator; we own it now */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2729 
/*
 * Try to read a RAIDframe component label from the device behind vp.
 * If a reasonable label is found, prepend a new RF_AutoConfig_t to
 * ac_list and keep the vnode open; otherwise close the vnode and
 * free the label.  Returns the (possibly extended) list, or NULL
 * after freeing the entire list on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/* M_NOWAIT: autoconfig runs where sleeping for memory is not
	   an option; on failure we tear down everything collected */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;	/* ownership moves to ac */
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a component; close the vnode we were given */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2787 
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a linked list of RF_AutoConfig_t entries (one per component
 * found).  Two passes are made over the device tree: wedges ("dk")
 * first, then everything else, so that a wedge covering a whole disk
 * is preferred over that disk's raw partition.
 *
 * Returns the (possibly empty) list; entries hold an open, referenced
 * vnode for the component that must later be released (see
 * rf_release_all_vps()).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: probe quietly; many devices won't open */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				/*
				 * NOTE(review): opened with FREAD|FSILENT but
				 * closed with FREAD|FWRITE here and below --
				 * presumably intentional; confirm against
				 * vnode open/close pairing rules.
				 */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept only RAID-typed wedges */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* vp ownership passes to the ac_list entry */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* component name, e.g. "wd0e" */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2991 
   2992 
   2993 int
   2994 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2995 {
   2996 
   2997 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2998 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2999 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3000 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3001 	    clabel->row >=0 &&
   3002 	    clabel->column >= 0 &&
   3003 	    clabel->num_rows > 0 &&
   3004 	    clabel->num_columns > 0 &&
   3005 	    clabel->row < clabel->num_rows &&
   3006 	    clabel->column < clabel->num_columns &&
   3007 	    clabel->blockSize > 0 &&
   3008 	    /*
   3009 	     * numBlocksHi may contain garbage, but it is ok since
   3010 	     * the type is unsigned.  If it is really garbage,
   3011 	     * rf_fix_old_label_size() will fix it.
   3012 	     */
   3013 	    rf_component_label_numblocks(clabel) > 0) {
   3014 		/*
   3015 		 * label looks reasonable enough...
   3016 		 * let's make sure it has no old garbage.
   3017 		 */
   3018 		if (numsecs)
   3019 			rf_fix_old_label_size(clabel, numsecs);
   3020 		return(1);
   3021 	}
   3022 	return(0);
   3023 }
   3024 
   3025 
   3026 /*
   3027  * For reasons yet unknown, some old component labels have garbage in
   3028  * the newer numBlocksHi region, and this causes lossage.  Since those
   3029  * disks will also have numsecs set to less than 32 bits of sectors,
   3030  * we can determine when this corruption has occurred, and fix it.
   3031  *
   3032  * The exact same problem, with the same unknown reason, happens to
   3033  * the partitionSizeHi member as well.
   3034  */
static void
rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	/*
	 * If the device holds fewer than 2^32 sectors, the "Hi" halves
	 * of the 64-bit fields cannot legitimately be non-zero, so any
	 * non-zero value there is stale garbage: warn and clear it.
	 */
	if (numsecs < ((uint64_t)1 << 32)) {
		if (clabel->numBlocksHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "numBlocksHi set\n"
			       "WARNING: resetting numBlocksHi to zero.\n");
			clabel->numBlocksHi = 0;
		}

		if (clabel->partitionSizeHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "partitionSizeHi set\n"
			       "WARNING: resetting partitionSizeHi to zero.\n");
			clabel->partitionSizeHi = 0;
		}
	}
}
   3055 
   3056 
   3057 #ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition display strings, indexed by (value & 3) */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3089 #endif
   3090 
   3091 RF_ConfigSet_t *
   3092 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3093 {
   3094 	RF_AutoConfig_t *ac;
   3095 	RF_ConfigSet_t *config_sets;
   3096 	RF_ConfigSet_t *cset;
   3097 	RF_AutoConfig_t *ac_next;
   3098 
   3099 
   3100 	config_sets = NULL;
   3101 
   3102 	/* Go through the AutoConfig list, and figure out which components
   3103 	   belong to what sets.  */
   3104 	ac = ac_list;
   3105 	while(ac!=NULL) {
   3106 		/* we're going to putz with ac->next, so save it here
   3107 		   for use at the end of the loop */
   3108 		ac_next = ac->next;
   3109 
   3110 		if (config_sets == NULL) {
   3111 			/* will need at least this one... */
   3112 			config_sets = (RF_ConfigSet_t *)
   3113 				malloc(sizeof(RF_ConfigSet_t),
   3114 				       M_RAIDFRAME, M_NOWAIT);
   3115 			if (config_sets == NULL) {
   3116 				panic("rf_create_auto_sets: No memory!");
   3117 			}
   3118 			/* this one is easy :) */
   3119 			config_sets->ac = ac;
   3120 			config_sets->next = NULL;
   3121 			config_sets->rootable = 0;
   3122 			ac->next = NULL;
   3123 		} else {
   3124 			/* which set does this component fit into? */
   3125 			cset = config_sets;
   3126 			while(cset!=NULL) {
   3127 				if (rf_does_it_fit(cset, ac)) {
   3128 					/* looks like it matches... */
   3129 					ac->next = cset->ac;
   3130 					cset->ac = ac;
   3131 					break;
   3132 				}
   3133 				cset = cset->next;
   3134 			}
   3135 			if (cset==NULL) {
   3136 				/* didn't find a match above... new set..*/
   3137 				cset = (RF_ConfigSet_t *)
   3138 					malloc(sizeof(RF_ConfigSet_t),
   3139 					       M_RAIDFRAME, M_NOWAIT);
   3140 				if (cset == NULL) {
   3141 					panic("rf_create_auto_sets: No memory!");
   3142 				}
   3143 				cset->ac = ac;
   3144 				ac->next = NULL;
   3145 				cset->next = config_sets;
   3146 				cset->rootable = 0;
   3147 				config_sets = cset;
   3148 			}
   3149 		}
   3150 		ac = ac_next;
   3151 	}
   3152 
   3153 
   3154 	return(config_sets);
   3155 }
   3156 
   3157 static int
   3158 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3159 {
   3160 	RF_ComponentLabel_t *clabel1, *clabel2;
   3161 
   3162 	/* If this one matches the *first* one in the set, that's good
   3163 	   enough, since the other members of the set would have been
   3164 	   through here too... */
   3165 	/* note that we are not checking partitionSize here..
   3166 
   3167 	   Note that we are also not checking the mod_counters here.
   3168 	   If everything else matches except the mod_counter, that's
   3169 	   good enough for this test.  We will deal with the mod_counters
   3170 	   a little later in the autoconfiguration process.
   3171 
   3172 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3173 
   3174 	   The reason we don't check for this is that failed disks
   3175 	   will have lower modification counts.  If those disks are
   3176 	   not added to the set they used to belong to, then they will
   3177 	   form their own set, which may result in 2 different sets,
   3178 	   for example, competing to be configured at raid0, and
   3179 	   perhaps competing to be the root filesystem set.  If the
   3180 	   wrong ones get configured, or both attempt to become /,
   3181 	   weird behaviour and or serious lossage will occur.  Thus we
   3182 	   need to bring them into the fold here, and kick them out at
   3183 	   a later point.
   3184 
   3185 	*/
   3186 
   3187 	clabel1 = cset->ac->clabel;
   3188 	clabel2 = ac->clabel;
   3189 	if ((clabel1->version == clabel2->version) &&
   3190 	    (clabel1->serial_number == clabel2->serial_number) &&
   3191 	    (clabel1->num_rows == clabel2->num_rows) &&
   3192 	    (clabel1->num_columns == clabel2->num_columns) &&
   3193 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3194 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3195 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3196 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3197 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3198 	    (clabel1->blockSize == clabel2->blockSize) &&
   3199 	    rf_component_label_numblocks(clabel1) ==
   3200 	    rf_component_label_numblocks(clabel2) &&
   3201 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3202 	    (clabel1->root_partition == clabel2->root_partition) &&
   3203 	    (clabel1->last_unit == clabel2->last_unit) &&
   3204 	    (clabel1->config_order == clabel2->config_order)) {
   3205 		/* if it get's here, it almost *has* to be a match */
   3206 	} else {
   3207 		/* it's not consistent with somebody in the set..
   3208 		   punt */
   3209 		return(0);
   3210 	}
   3211 	/* all was fine.. it must fit... */
   3212 	return(1);
   3213 }
   3214 
/*
 * Decide whether the configuration set has enough live components to
 * be (re)configured.  Returns 1 if configuration may be attempted,
 * 0 if too many components are missing or stale.
 *
 * A component counts as present for column c only if its mod_counter
 * equals the highest mod_counter seen in the set (stale components
 * are treated as missing).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The correct mod_counter is the maximum over all members. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for an up-to-date component. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* both halves of a mirror
						   pair are gone: unusable */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3317 
/*
 * Build an RF_Config_t from the component labels of an autoconfig
 * chain: common geometry/queueing parameters come from the first
 * component's label, and each component's device name is placed in
 * the devnames slot for its column.
 *
 * NOTE(review): the raidPtr parameter is not referenced in this body;
 * presumably kept for interface symmetry with callers -- confirm
 * before relying on it here.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Record each component's device name at its column slot. */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. No debug variables for autoconfigured sets. */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3351 
   3352 int
   3353 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3354 {
   3355 	RF_ComponentLabel_t *clabel;
   3356 	int column;
   3357 	int sparecol;
   3358 
   3359 	raidPtr->autoconfigure = new_value;
   3360 
   3361 	for(column=0; column<raidPtr->numCol; column++) {
   3362 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3363 			clabel = raidget_component_label(raidPtr, column);
   3364 			clabel->autoconfigure = new_value;
   3365 			raidflush_component_label(raidPtr, column);
   3366 		}
   3367 	}
   3368 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3369 		sparecol = raidPtr->numCol + column;
   3370 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3371 			clabel = raidget_component_label(raidPtr, sparecol);
   3372 			clabel->autoconfigure = new_value;
   3373 			raidflush_component_label(raidPtr, sparecol);
   3374 		}
   3375 	}
   3376 	return(new_value);
   3377 }
   3378 
   3379 int
   3380 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3381 {
   3382 	RF_ComponentLabel_t *clabel;
   3383 	int column;
   3384 	int sparecol;
   3385 
   3386 	raidPtr->root_partition = new_value;
   3387 	for(column=0; column<raidPtr->numCol; column++) {
   3388 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3389 			clabel = raidget_component_label(raidPtr, column);
   3390 			clabel->root_partition = new_value;
   3391 			raidflush_component_label(raidPtr, column);
   3392 		}
   3393 	}
   3394 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3395 		sparecol = raidPtr->numCol + column;
   3396 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3397 			clabel = raidget_component_label(raidPtr, sparecol);
   3398 			clabel->root_partition = new_value;
   3399 			raidflush_component_label(raidPtr, sparecol);
   3400 		}
   3401 	}
   3402 	return(new_value);
   3403 }
   3404 
   3405 void
   3406 rf_release_all_vps(RF_ConfigSet_t *cset)
   3407 {
   3408 	RF_AutoConfig_t *ac;
   3409 
   3410 	ac = cset->ac;
   3411 	while(ac!=NULL) {
   3412 		/* Close the vp, and give it back */
   3413 		if (ac->vp) {
   3414 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3415 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3416 			vput(ac->vp);
   3417 			ac->vp = NULL;
   3418 		}
   3419 		ac = ac->next;
   3420 	}
   3421 }
   3422 
   3423 
   3424 void
   3425 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3426 {
   3427 	RF_AutoConfig_t *ac;
   3428 	RF_AutoConfig_t *next_ac;
   3429 
   3430 	ac = cset->ac;
   3431 	while(ac!=NULL) {
   3432 		next_ac = ac->next;
   3433 		/* nuke the label */
   3434 		free(ac->clabel, M_RAIDFRAME);
   3435 		/* cleanup the config structure */
   3436 		free(ac, M_RAIDFRAME);
   3437 		/* "next.." */
   3438 		ac = next_ac;
   3439 	}
   3440 	/* and, finally, nuke the config set */
   3441 	free(cset, M_RAIDFRAME);
   3442 }
   3443 
   3444 
/*
 * Initialize a component label from the current in-core state of the
 * RAID set: version, identity, geometry, layout parameters and the
 * autoconfiguration/root settings.  clean/status are set to the
 * "dirty but optimal" defaults.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3477 
/*
 * Autoconfigure one configuration set: pick a raid unit (preferring
 * the unit the set was last configured on), build an RF_Config_t from
 * the component labels, and run rf_Configure().  On success the set's
 * root eligibility is recorded; on failure the softc is released.
 *
 * Returns the configured softc, or NULL on any failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from last_unit until a free (invalid) unit is found. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* No existing softc at that unit: create one now. */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3561 
/*
 * Initialize a RAIDframe memory pool at IPL_BIO: prime it with xmin
 * preallocated items (panic on failure, since this runs at setup
 * time) and set the low/high watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3574 
   3575 /*
   3576  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3577  * to see if there is IO pending and if that IO could possibly be done
   3578  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3579  * otherwise.
   3580  *
   3581  */
   3582 int
   3583 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3584 {
   3585 	struct raid_softc *rs;
   3586 	struct dk_softc *dksc;
   3587 
   3588 	rs = raidPtr->softc;
   3589 	dksc = &rs->sc_dksc;
   3590 
   3591 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3592 		return 1;
   3593 
   3594 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3595 		/* there is work to do */
   3596 		return 0;
   3597 	}
   3598 	/* default is nothing to do */
   3599 	return 1;
   3600 }
   3601 
   3602 int
   3603 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3604 {
   3605 	uint64_t numsecs;
   3606 	unsigned secsize;
   3607 	int error;
   3608 
   3609 	error = getdisksize(vp, &numsecs, &secsize);
   3610 	if (error == 0) {
   3611 		diskPtr->blockSize = secsize;
   3612 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3613 		diskPtr->partitionSize = numsecs;
   3614 		return 0;
   3615 	}
   3616 	return error;
   3617 }
   3618 
/* Autoconf match: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3624 
/* Autoconf attach: nothing to do here; real setup happens elsewhere. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3629 
   3630 
/*
 * Autoconf detach: take the unit lock, run the unlocked detach work,
 * and release the softc on success.  Returns 0 or an errno.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3657 
   3658 static void
   3659 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3660 {
   3661 	struct dk_softc *dksc = &rs->sc_dksc;
   3662 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3663 
   3664 	memset(dg, 0, sizeof(*dg));
   3665 
   3666 	dg->dg_secperunit = raidPtr->totalSectors;
   3667 	dg->dg_secsize = raidPtr->bytesPerSector;
   3668 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3669 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3670 
   3671 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3672 }
   3673 
   3674 /*
   3675  * Get cache info for all the components (including spares).
   3676  * Returns intersection of all the cache flags of all disks, or first
   3677  * error if any encountered.
   3678  * XXXfua feature flags can change as spares are added - lock down somehow
   3679  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* data columns and spares share one Disks[] array */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl) is not worth noise,
				   but still aborts with the error */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* intersect flags across all components */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3717 
   3718 /*
   3719  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3720  * We end up returning whatever error was returned by the first cache flush
   3721  * that fails.
   3722  */
   3723 
   3724 int
   3725 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3726 {
   3727 	int c, sparecol;
   3728 	int e,error;
   3729 	int force = 1;
   3730 
   3731 	error = 0;
   3732 	for (c = 0; c < raidPtr->numCol; c++) {
   3733 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3734 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3735 					  &force, FWRITE, NOCRED);
   3736 			if (e) {
   3737 				if (e != ENODEV)
   3738 					printf("raid%d: cache flush to component %s failed.\n",
   3739 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3740 				if (error == 0) {
   3741 					error = e;
   3742 				}
   3743 			}
   3744 		}
   3745 	}
   3746 
   3747 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3748 		sparecol = raidPtr->numCol + c;
   3749 		/* Need to ensure that the reconstruct actually completed! */
   3750 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3751 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3752 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3753 			if (e) {
   3754 				if (e != ENODEV)
   3755 					printf("raid%d: cache flush to component %s failed.\n",
   3756 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3757 				if (error == 0) {
   3758 					error = e;
   3759 				}
   3760 			}
   3761 		}
   3762 	}
   3763 	return error;
   3764 }
   3765 
   3766 /* Fill in info with the current status */
   3767 void
   3768 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3769 {
   3770 
   3771 	if (raidPtr->status != rf_rs_reconstructing) {
   3772 		info->total = 100;
   3773 		info->completed = 100;
   3774 	} else {
   3775 		info->total = raidPtr->reconControl->numRUsTotal;
   3776 		info->completed = raidPtr->reconControl->numRUsComplete;
   3777 	}
   3778 	info->remaining = info->total - info->completed;
   3779 }
   3780 
   3781 /* Fill in info with the current status */
   3782 void
   3783 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3784 {
   3785 
   3786 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3787 		info->total = raidPtr->Layout.numStripe;
   3788 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3789 	} else {
   3790 		info->completed = 100;
   3791 		info->total = 100;
   3792 	}
   3793 	info->remaining = info->total - info->completed;
   3794 }
   3795 
   3796 /* Fill in info with the current status */
   3797 void
   3798 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3799 {
   3800 
   3801 	if (raidPtr->copyback_in_progress == 1) {
   3802 		info->total = raidPtr->Layout.numStripe;
   3803 		info->completed = raidPtr->copyback_stripes_done;
   3804 		info->remaining = info->total - info->completed;
   3805 	} else {
   3806 		info->remaining = 0;
   3807 		info->completed = 100;
   3808 		info->total = 100;
   3809 	}
   3810 }
   3811 
   3812 /* Fill in config with the current info */
   3813 int
   3814 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3815 {
   3816 	int	d, i, j;
   3817 
   3818 	if (!raidPtr->valid)
   3819 		return (ENODEV);
   3820 	config->cols = raidPtr->numCol;
   3821 	config->ndevs = raidPtr->numCol;
   3822 	if (config->ndevs >= RF_MAX_DISKS)
   3823 		return (ENOMEM);
   3824 	config->nspares = raidPtr->numSpare;
   3825 	if (config->nspares >= RF_MAX_DISKS)
   3826 		return (ENOMEM);
   3827 	config->maxqdepth = raidPtr->maxQueueDepth;
   3828 	d = 0;
   3829 	for (j = 0; j < config->cols; j++) {
   3830 		config->devs[d] = raidPtr->Disks[j];
   3831 		d++;
   3832 	}
   3833 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3834 		config->spares[i] = raidPtr->Disks[j];
   3835 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3836 			/* XXX: raidctl(8) expects to see this as a used spare */
   3837 			config->spares[i].status = rf_ds_used_spare;
   3838 		}
   3839 	}
   3840 	return 0;
   3841 }
   3842 
   3843 int
   3844 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3845 {
   3846 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3847 	RF_ComponentLabel_t *raid_clabel;
   3848 	int column = clabel->column;
   3849 
   3850 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3851 		return EINVAL;
   3852 	raid_clabel = raidget_component_label(raidPtr, column);
   3853 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3854 
   3855 	return 0;
   3856 }
   3857 
   3858 /*
   3859  * Module interface
   3860  */
   3861 
/* Driver-class module; depends on the dk_subr and bufq_fcfs modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a loadable module, the cfdriver is declared here. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module control entry point and its init/fini helpers (defined below). */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3871 
   3872 static int
   3873 raid_modcmd(modcmd_t cmd, void *data)
   3874 {
   3875 	int error;
   3876 
   3877 	error = 0;
   3878 	switch (cmd) {
   3879 	case MODULE_CMD_INIT:
   3880 		error = raid_modcmd_init();
   3881 		break;
   3882 	case MODULE_CMD_FINI:
   3883 		error = raid_modcmd_fini();
   3884 		break;
   3885 	default:
   3886 		error = ENOTTY;
   3887 		break;
   3888 	}
   3889 	return error;
   3890 }
   3891 
/*
 * Module initialization: set up global locks, attach the device switch
 * entries and autoconf glue, boot the RAIDframe core, and register a
 * finalizer that auto-configures RAID sets after device discovery.
 * On any attach failure, everything attached so far is rolled back in
 * reverse order before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach() pick the block/char major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/*
	 * EEXIST is tolerated (devsw already attached).  NOTE(review):
	 * in that case 'error' remains EEXIST, so the rf_BootRaidframe()
	 * call below is skipped -- presumably because the core was
	 * already booted by an earlier attach; confirm this is intended.
	 */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw attaches in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Failure to register the finalizer is non-fatal. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3962 
/*
 * Module teardown: refuse to unload while any raid device exists,
 * otherwise detach the autoconf glue and device switch entries in the
 * reverse of the order raid_modcmd_init() attached them, shut down the
 * RAIDframe core, and destroy the global locks.  If a detach step
 * fails, the previously-detached pieces are re-attached so the module
 * is left in a consistent, still-loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: re-attach cfdriver and cfattach. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4012