Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.353
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.353 2018/01/18 00:32:49 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.353 2018/01/18 00:32:49 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #ifdef COMPAT_80
    157 #include "rf_compat80.h"
    158 #endif
    159 
    160 #ifdef COMPAT_NETBSD32
    161 #include "rf_compat32.h"
    162 #endif
    163 
    164 #include "ioconf.h"
    165 
    166 #ifdef DEBUG
    167 int     rf_kdebug_level = 0;
    168 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    169 #else				/* DEBUG */
    170 #define db1_printf(a) { }
    171 #endif				/* DEBUG */
    172 
    173 #ifdef DEBUG_ROOT
    174 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    175 #else
    176 #define DPRINTF(a, ...)
    177 #endif
    178 
    179 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    180 static rf_declare_mutex2(rf_sparet_wait_mutex);
    181 static rf_declare_cond2(rf_sparet_wait_cv);
    182 static rf_declare_cond2(rf_sparet_resp_cv);
    183 
    184 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    185 						 * spare table */
    186 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    187 						 * installation process */
    188 #endif
    189 
    190 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    191 
    192 /* prototypes */
    193 static void KernelWakeupFunc(struct buf *);
    194 static void InitBP(struct buf *, struct vnode *, unsigned,
    195     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    196     void *, int, struct proc *);
    197 struct raid_softc;
    198 static void raidinit(struct raid_softc *);
    199 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    200 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    201 
    202 static int raid_match(device_t, cfdata_t, void *);
    203 static void raid_attach(device_t, device_t, void *);
    204 static int raid_detach(device_t, int);
    205 
    206 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    207     daddr_t, daddr_t);
    208 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    209     daddr_t, daddr_t, int);
    210 
    211 static int raidwrite_component_label(unsigned,
    212     dev_t, struct vnode *, RF_ComponentLabel_t *);
    213 static int raidread_component_label(unsigned,
    214     dev_t, struct vnode *, RF_ComponentLabel_t *);
    215 
    216 static int raid_diskstart(device_t, struct buf *bp);
    217 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    218 static int raid_lastclose(device_t);
    219 
    220 static dev_type_open(raidopen);
    221 static dev_type_close(raidclose);
    222 static dev_type_read(raidread);
    223 static dev_type_write(raidwrite);
    224 static dev_type_ioctl(raidioctl);
    225 static dev_type_strategy(raidstrategy);
    226 static dev_type_dump(raiddump);
    227 static dev_type_size(raidsize);
    228 
/* Block-device switch: entry points for /dev/raidN block nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    239 
/* Character-device switch: entry points for the raw /dev/rraidN nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    254 
/* dk(9) driver glue: lets the generic disk framework drive this device. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    264 
/*
 * Per-unit software state.  One of these exists for each configured
 * (or configuring) RAID unit; they are linked on the global "raids"
 * list and found by unit number via raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state; MUST be first */
	int	sc_unit;	/* raidN unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* the RAIDframe state proper */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    284 
    285 #define	raidunit(x)	DISKUNIT(x)
    286 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    287 
    288 extern struct cfdriver raid_cd;
    289 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    290     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    291     DVF_DETACH_SHUTDOWN);
    292 
    293 /* Internal representation of a rf_recon_req */
    294 struct rf_recon_req_internal {
    295 	RF_RowCol_t col;
    296 	RF_ReconReqFlags_t flags;
    297 	void   *raidPtr;
    298 };
    299 
    300 /*
    301  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    302  * Be aware that large numbers can allow the driver to consume a lot of
    303  * kernel memory, especially on writes, and in degraded mode reads.
    304  *
    305  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    306  * a single 64K write will typically require 64K for the old data,
    307  * 64K for the old parity, and 64K for the new parity, for a total
    308  * of 192K (if the parity buffer is not re-used immediately).
    309  * Even it if is used immediately, that's still 128K, which when multiplied
    310  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    311  *
    312  * Now in degraded mode, for example, a 64K read on the above setup may
    313  * require data reconstruction, which will require *all* of the 4 remaining
    314  * disks to participate -- 4 * 32K/disk == 128K again.
    315  */
    316 
    317 #ifndef RAIDOUTSTANDING
    318 #define RAIDOUTSTANDING   6
    319 #endif
    320 
    321 #define RAIDLABELDEV(dev)	\
    322 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    323 
    324 /* declared here, and made public, for the benefit of KVM stuff.. */
    325 
    326 static int raidlock(struct raid_softc *);
    327 static void raidunlock(struct raid_softc *);
    328 
    329 static int raid_detach_unlocked(struct raid_softc *);
    330 
    331 static void rf_markalldirty(RF_Raid_t *);
    332 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    333 
    334 void rf_ReconThread(struct rf_recon_req_internal *);
    335 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    336 void rf_CopybackThread(RF_Raid_t *raidPtr);
    337 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    338 int rf_autoconfig(device_t);
    339 void rf_buildroothack(RF_ConfigSet_t *);
    340 
    341 RF_AutoConfig_t *rf_find_raid_components(void);
    342 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    343 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    344 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    345 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    346 int rf_set_autoconfig(RF_Raid_t *, int);
    347 int rf_set_rootpartition(RF_Raid_t *, int);
    348 void rf_release_all_vps(RF_ConfigSet_t *);
    349 void rf_cleanup_config_set(RF_ConfigSet_t *);
    350 int rf_have_enough_components(RF_ConfigSet_t *);
    351 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    352 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    353 
    354 /*
    355  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    356  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    357  * in the kernel config file.
    358  */
    359 #ifdef RAID_AUTOCONFIG
    360 int raidautoconfig = 1;
    361 #else
    362 int raidautoconfig = 0;
    363 #endif
    364 static bool raidautoconfigdone = false;
    365 
    366 struct RF_Pools_s rf_pools;
    367 
    368 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    369 static kmutex_t raid_lock;
    370 
    371 static struct raid_softc *
    372 raidcreate(int unit) {
    373 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    374 	sc->sc_unit = unit;
    375 	cv_init(&sc->sc_cv, "raidunit");
    376 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    377 	return sc;
    378 }
    379 
    380 static void
    381 raiddestroy(struct raid_softc *sc) {
    382 	cv_destroy(&sc->sc_cv);
    383 	mutex_destroy(&sc->sc_mutex);
    384 	kmem_free(sc, sizeof(*sc));
    385 }
    386 
    387 static struct raid_softc *
    388 raidget(int unit, bool create) {
    389 	struct raid_softc *sc;
    390 	if (unit < 0) {
    391 #ifdef DIAGNOSTIC
    392 		panic("%s: unit %d!", __func__, unit);
    393 #endif
    394 		return NULL;
    395 	}
    396 	mutex_enter(&raid_lock);
    397 	LIST_FOREACH(sc, &raids, sc_link) {
    398 		if (sc->sc_unit == unit) {
    399 			mutex_exit(&raid_lock);
    400 			return sc;
    401 		}
    402 	}
    403 	mutex_exit(&raid_lock);
    404 	if (!create)
    405 		return NULL;
    406 	if ((sc = raidcreate(unit)) == NULL)
    407 		return NULL;
    408 	mutex_enter(&raid_lock);
    409 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    410 	mutex_exit(&raid_lock);
    411 	return sc;
    412 }
    413 
    414 static void
    415 raidput(struct raid_softc *sc) {
    416 	mutex_enter(&raid_lock);
    417 	LIST_REMOVE(sc, sc_link);
    418 	mutex_exit(&raid_lock);
    419 	raiddestroy(sc);
    420 }
    421 
/*
 * Historical pseudo-device attach hook; intentionally empty.
 * The "num" argument is ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    431 
    432 int
    433 rf_autoconfig(device_t self)
    434 {
    435 	RF_AutoConfig_t *ac_list;
    436 	RF_ConfigSet_t *config_sets;
    437 
    438 	if (!raidautoconfig || raidautoconfigdone == true)
    439 		return (0);
    440 
    441 	/* XXX This code can only be run once. */
    442 	raidautoconfigdone = true;
    443 
    444 #ifdef __HAVE_CPU_BOOTCONF
    445 	/*
    446 	 * 0. find the boot device if needed first so we can use it later
    447 	 * this needs to be done before we autoconfigure any raid sets,
    448 	 * because if we use wedges we are not going to be able to open
    449 	 * the boot device later
    450 	 */
    451 	if (booted_device == NULL)
    452 		cpu_bootconf();
    453 #endif
    454 	/* 1. locate all RAID components on the system */
    455 	aprint_debug("Searching for RAID components...\n");
    456 	ac_list = rf_find_raid_components();
    457 
    458 	/* 2. Sort them into their respective sets. */
    459 	config_sets = rf_create_auto_sets(ac_list);
    460 
    461 	/*
    462 	 * 3. Evaluate each set and configure the valid ones.
    463 	 * This gets done in rf_buildroothack().
    464 	 */
    465 	rf_buildroothack(config_sets);
    466 
    467 	return 1;
    468 }
    469 
    470 static int
    471 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    472 	const char *bootname = device_xname(bdv);
    473 	size_t len = strlen(bootname);
    474 
    475 	for (int col = 0; col < r->numCol; col++) {
    476 		const char *devname = r->Disks[col].devname;
    477 		devname += sizeof("/dev/") - 1;
    478 		if (strncmp(devname, "dk", 2) == 0) {
    479 			const char *parent =
    480 			    dkwedge_get_parent_name(r->Disks[col].dev);
    481 			if (parent != NULL)
    482 				devname = parent;
    483 		}
    484 		if (strncmp(devname, bootname, len) == 0) {
    485 			struct raid_softc *sc = r->softc;
    486 			aprint_debug("raid%d includes boot device %s\n",
    487 			    sc->sc_unit, devname);
    488 			return 1;
    489 		}
    490 	}
    491 	return 0;
    492 }
    493 
    494 void
    495 rf_buildroothack(RF_ConfigSet_t *config_sets)
    496 {
    497 	RF_ConfigSet_t *cset;
    498 	RF_ConfigSet_t *next_cset;
    499 	int num_root;
    500 	struct raid_softc *sc, *rsc;
    501 	struct dk_softc *dksc;
    502 
    503 	sc = rsc = NULL;
    504 	num_root = 0;
    505 	cset = config_sets;
    506 	while (cset != NULL) {
    507 		next_cset = cset->next;
    508 		if (rf_have_enough_components(cset) &&
    509 		    cset->ac->clabel->autoconfigure == 1) {
    510 			sc = rf_auto_config_set(cset);
    511 			if (sc != NULL) {
    512 				aprint_debug("raid%d: configured ok\n",
    513 				    sc->sc_unit);
    514 				if (cset->rootable) {
    515 					rsc = sc;
    516 					num_root++;
    517 				}
    518 			} else {
    519 				/* The autoconfig didn't work :( */
    520 				aprint_debug("Autoconfig failed\n");
    521 				rf_release_all_vps(cset);
    522 			}
    523 		} else {
    524 			/* we're not autoconfiguring this set...
    525 			   release the associated resources */
    526 			rf_release_all_vps(cset);
    527 		}
    528 		/* cleanup */
    529 		rf_cleanup_config_set(cset);
    530 		cset = next_cset;
    531 	}
    532 	dksc = &rsc->sc_dksc;
    533 
    534 	/* if the user has specified what the root device should be
    535 	   then we don't touch booted_device or boothowto... */
    536 
    537 	if (rootspec != NULL)
    538 		return;
    539 
    540 	/* we found something bootable... */
    541 
    542 	/*
    543 	 * XXX: The following code assumes that the root raid
    544 	 * is the first ('a') partition. This is about the best
    545 	 * we can do with a BSD disklabel, but we might be able
    546 	 * to do better with a GPT label, by setting a specified
    547 	 * attribute to indicate the root partition. We can then
    548 	 * stash the partition number in the r->root_partition
    549 	 * high bits (the bottom 2 bits are already used). For
    550 	 * now we just set booted_partition to 0 when we override
    551 	 * root.
    552 	 */
    553 	if (num_root == 1) {
    554 		device_t candidate_root;
    555 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    556 			char cname[sizeof(cset->ac->devname)];
    557 			/* XXX: assume partition 'a' first */
    558 			snprintf(cname, sizeof(cname), "%s%c",
    559 			    device_xname(dksc->sc_dev), 'a');
    560 			candidate_root = dkwedge_find_by_wname(cname);
    561 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    562 			    cname);
    563 			if (candidate_root == NULL) {
    564 				/*
    565 				 * If that is not found, because we don't use
    566 				 * disklabel, return the first dk child
    567 				 * XXX: we can skip the 'a' check above
    568 				 * and always do this...
    569 				 */
    570 				size_t i = 0;
    571 				candidate_root = dkwedge_find_by_parent(
    572 				    device_xname(dksc->sc_dev), &i);
    573 			}
    574 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    575 			    candidate_root);
    576 		} else
    577 			candidate_root = dksc->sc_dev;
    578 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    579 		DPRINTF("%s: booted_device=%p root_partition=%d "
    580 		   "contains_boot=%d\n", __func__, booted_device,
    581 		   rsc->sc_r.root_partition,
    582 		   rf_containsboot(&rsc->sc_r, booted_device));
    583 		if (booted_device == NULL ||
    584 		    rsc->sc_r.root_partition == 1 ||
    585 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    586 			booted_device = candidate_root;
    587 			booted_method = "raidframe/single";
    588 			booted_partition = 0;	/* XXX assume 'a' */
    589 		}
    590 	} else if (num_root > 1) {
    591 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    592 		    booted_device);
    593 
    594 		/*
    595 		 * Maybe the MD code can help. If it cannot, then
    596 		 * setroot() will discover that we have no
    597 		 * booted_device and will ask the user if nothing was
    598 		 * hardwired in the kernel config file
    599 		 */
    600 		if (booted_device == NULL)
    601 			return;
    602 
    603 		num_root = 0;
    604 		mutex_enter(&raid_lock);
    605 		LIST_FOREACH(sc, &raids, sc_link) {
    606 			RF_Raid_t *r = &sc->sc_r;
    607 			if (r->valid == 0)
    608 				continue;
    609 
    610 			if (r->root_partition == 0)
    611 				continue;
    612 
    613 			if (rf_containsboot(r, booted_device)) {
    614 				num_root++;
    615 				rsc = sc;
    616 				dksc = &rsc->sc_dksc;
    617 			}
    618 		}
    619 		mutex_exit(&raid_lock);
    620 
    621 		if (num_root == 1) {
    622 			booted_device = dksc->sc_dev;
    623 			booted_method = "raidframe/multi";
    624 			booted_partition = 0;	/* XXX assume 'a' */
    625 		} else {
    626 			/* we can't guess.. require the user to answer... */
    627 			boothowto |= RB_ASKNAME;
    628 		}
    629 	}
    630 }
    631 
    632 static int
    633 raidsize(dev_t dev)
    634 {
    635 	struct raid_softc *rs;
    636 	struct dk_softc *dksc;
    637 	unsigned int unit;
    638 
    639 	unit = raidunit(dev);
    640 	if ((rs = raidget(unit, false)) == NULL)
    641 		return -1;
    642 	dksc = &rs->sc_dksc;
    643 
    644 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    645 		return -1;
    646 
    647 	return dk_size(dksc, dev);
    648 }
    649 
    650 static int
    651 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    652 {
    653 	unsigned int unit;
    654 	struct raid_softc *rs;
    655 	struct dk_softc *dksc;
    656 
    657 	unit = raidunit(dev);
    658 	if ((rs = raidget(unit, false)) == NULL)
    659 		return ENXIO;
    660 	dksc = &rs->sc_dksc;
    661 
    662 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    663 		return ENODEV;
    664 
    665         /*
    666            Note that blkno is relative to this particular partition.
    667            By adding adding RF_PROTECTED_SECTORS, we get a value that
    668 	   is relative to the partition used for the underlying component.
    669         */
    670 	blkno += RF_PROTECTED_SECTORS;
    671 
    672 	return dk_dump(dksc, dev, blkno, va, size);
    673 }
    674 
/*
 * Write nblk blocks at blkno from va directly to one live component
 * of a RAID 1 set during a crash dump.  Only 1 data + 1 parity column
 * layouts are supported; returns EINVAL otherwise, ENXIO if the chosen
 * component's block device vanished, or the component's d_dump error.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* -1 means "no candidate found yet" */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight through to the selected component */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    780 
    781 /* ARGSUSED */
    782 static int
    783 raidopen(dev_t dev, int flags, int fmt,
    784     struct lwp *l)
    785 {
    786 	int     unit = raidunit(dev);
    787 	struct raid_softc *rs;
    788 	struct dk_softc *dksc;
    789 	int     error = 0;
    790 	int     part, pmask;
    791 
    792 	if ((rs = raidget(unit, true)) == NULL)
    793 		return ENXIO;
    794 	if ((error = raidlock(rs)) != 0)
    795 		return (error);
    796 
    797 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    798 		error = EBUSY;
    799 		goto bad;
    800 	}
    801 
    802 	dksc = &rs->sc_dksc;
    803 
    804 	part = DISKPART(dev);
    805 	pmask = (1 << part);
    806 
    807 	if (!DK_BUSY(dksc, pmask) &&
    808 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    809 		/* First one... mark things as dirty... Note that we *MUST*
    810 		 have done a configure before this.  I DO NOT WANT TO BE
    811 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    812 		 THAT THEY BELONG TOGETHER!!!!! */
    813 		/* XXX should check to see if we're only open for reading
    814 		   here... If so, we needn't do this, but then need some
    815 		   other way of keeping track of what's happened.. */
    816 
    817 		rf_markalldirty(&rs->sc_r);
    818 	}
    819 
    820 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    821 		error = dk_open(dksc, dev, flags, fmt, l);
    822 
    823 bad:
    824 	raidunlock(rs);
    825 
    826 	return (error);
    827 
    828 
    829 }
    830 
    831 static int
    832 raid_lastclose(device_t self)
    833 {
    834 	struct raid_softc *rs = raidsoftc(self);
    835 
    836 	/* Last one... device is not unconfigured yet.
    837 	   Device shutdown has taken care of setting the
    838 	   clean bits if RAIDF_INITED is not set
    839 	   mark things as clean... */
    840 
    841 	rf_update_component_labels(&rs->sc_r,
    842 	    RF_FINAL_COMPONENT_UPDATE);
    843 
    844 	/* pass to unlocked code */
    845 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    846 		rs->sc_flags |= RAIDF_DETACH;
    847 
    848 	return 0;
    849 }
    850 
    851 /* ARGSUSED */
    852 static int
    853 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    854 {
    855 	int     unit = raidunit(dev);
    856 	struct raid_softc *rs;
    857 	struct dk_softc *dksc;
    858 	cfdata_t cf;
    859 	int     error = 0, do_detach = 0, do_put = 0;
    860 
    861 	if ((rs = raidget(unit, false)) == NULL)
    862 		return ENXIO;
    863 	dksc = &rs->sc_dksc;
    864 
    865 	if ((error = raidlock(rs)) != 0)
    866 		return (error);
    867 
    868 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    869 		error = dk_close(dksc, dev, flags, fmt, l);
    870 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    871 			do_detach = 1;
    872 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    873 		do_put = 1;
    874 
    875 	raidunlock(rs);
    876 
    877 	if (do_detach) {
    878 		/* free the pseudo device attach bits */
    879 		cf = device_cfdata(dksc->sc_dev);
    880 		error = config_detach(dksc->sc_dev, 0);
    881 		if (error == 0)
    882 			free(cf, M_RAIDFRAME);
    883 	} else if (do_put) {
    884 		raidput(rs);
    885 	}
    886 
    887 	return (error);
    888 
    889 }
    890 
/*
 * Signal the RAIDframe iodone condition variable so that queued I/O
 * is picked up; takes/drops iodone_lock around the signal.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    898 
    899 static void
    900 raidstrategy(struct buf *bp)
    901 {
    902 	unsigned int unit;
    903 	struct raid_softc *rs;
    904 	struct dk_softc *dksc;
    905 	RF_Raid_t *raidPtr;
    906 
    907 	unit = raidunit(bp->b_dev);
    908 	if ((rs = raidget(unit, false)) == NULL) {
    909 		bp->b_error = ENXIO;
    910 		goto fail;
    911 	}
    912 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    913 		bp->b_error = ENXIO;
    914 		goto fail;
    915 	}
    916 	dksc = &rs->sc_dksc;
    917 	raidPtr = &rs->sc_r;
    918 
    919 	/* Queue IO only */
    920 	if (dk_strategy_defer(dksc, bp))
    921 		goto done;
    922 
    923 	/* schedule the IO to happen at the next convenient time */
    924 	raid_wakeup(raidPtr);
    925 
    926 done:
    927 	return;
    928 
    929 fail:
    930 	bp->b_resid = bp->b_bcount;
    931 	biodone(bp);
    932 }
    933 
    934 static int
    935 raid_diskstart(device_t dev, struct buf *bp)
    936 {
    937 	struct raid_softc *rs = raidsoftc(dev);
    938 	RF_Raid_t *raidPtr;
    939 
    940 	raidPtr = &rs->sc_r;
    941 	if (!raidPtr->valid) {
    942 		db1_printf(("raid is not valid..\n"));
    943 		return ENODEV;
    944 	}
    945 
    946 	/* XXX */
    947 	bp->b_resid = 0;
    948 
    949 	return raiddoaccess(raidPtr, bp);
    950 }
    951 
    952 void
    953 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    954 {
    955 	struct raid_softc *rs;
    956 	struct dk_softc *dksc;
    957 
    958 	rs = raidPtr->softc;
    959 	dksc = &rs->sc_dksc;
    960 
    961 	dk_done(dksc, bp);
    962 
    963 	rf_lock_mutex2(raidPtr->mutex);
    964 	raidPtr->openings++;
    965 	rf_unlock_mutex2(raidPtr->mutex);
    966 
    967 	/* schedule more IO */
    968 	raid_wakeup(raidPtr);
    969 }
    970 
    971 /* ARGSUSED */
    972 static int
    973 raidread(dev_t dev, struct uio *uio, int flags)
    974 {
    975 	int     unit = raidunit(dev);
    976 	struct raid_softc *rs;
    977 
    978 	if ((rs = raidget(unit, false)) == NULL)
    979 		return ENXIO;
    980 
    981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    982 		return (ENXIO);
    983 
    984 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    985 
    986 }
    987 
    988 /* ARGSUSED */
    989 static int
    990 raidwrite(dev_t dev, struct uio *uio, int flags)
    991 {
    992 	int     unit = raidunit(dev);
    993 	struct raid_softc *rs;
    994 
    995 	if ((rs = raidget(unit, false)) == NULL)
    996 		return ENXIO;
    997 
    998 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    999 		return (ENXIO);
   1000 
   1001 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1002 
   1003 }
   1004 
/*
 * Tear down a configured RAID set.  Returns EBUSY while the device
 * is still open or a reconstruction, parity rewrite, or copyback is
 * in progress; returns 0 immediately if the set was never
 * initialized.  On success the RAIDframe internals are shut down
 * and the dk/disk attachment is destroyed.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or any background operation is running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1042 
/*
 * raidioctl: ioctl entry point for RAIDframe devices.
 *
 * Handles the RAIDFRAME_* management commands (configure, shutdown,
 * rebuild, labels, status queries, ...) and passes anything not
 * recognized to the generic dk_ioctl() handler at the bottom.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rr;
	struct rf_recon_req_internal *rrint;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
#endif
#ifdef COMPAT_80
	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
	case RAIDFRAME_GET_INFO80:
	case RAIDFRAME_GET_COMPONENT_LABEL80:
#endif
#ifdef COMPAT_NETBSD32
	case RAIDFRAME_GET_INFO32:
#endif
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif

#ifdef COMPAT_80
	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
		return rf_check_recon_status_ext80(raidPtr, data);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
		return rf_check_parityrewrite_status_ext80(raidPtr, data);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
		return rf_check_copyback_status_ext80(raidPtr, data);

	case RAIDFRAME_GET_INFO80:
		return rf_get_info80(raidPtr, data);

	case RAIDFRAME_GET_COMPONENT_LABEL80:
		return rf_get_component_label80(raidPtr, data);
#endif

		/* configure the system */
	case RAIDFRAME_CONFIGURE:
#ifdef COMPAT_NETBSD32
	case RAIDFRAME_CONFIGURE32:
#endif

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
#ifdef COMPAT_NETBSD32
		if (cmd == RAIDFRAME_CONFIGURE32 &&
		    (l->l_proc->p_flag & PK_32) != 0)
			retcode = rf_config_netbsd32(data, k_cfg);
		else
#endif
		{
			u_cfg = *((RF_Config_t **) data);
			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		}
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
		/* Shared by RAIDFRAME_CONFIGURE{,32} above and
		   RAIDFRAME_CONFIGURE50; k_cfg is kernel memory here. */
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Only mark for shutdown when idle; the actual teardown
		   happens on the final close. */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* The rewrite proceeds asynchronously in its own thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* NOTE(review): not implemented; always returns 0. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Sanity-check the disk state under the mutex before
		   kicking off a reconstruction. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The recon thread frees rrint when it is done with it. */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);

		rrint->col = column;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrint, "raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
#ifdef COMPAT_NETBSD32
	case RAIDFRAME_GET_INFO32:
#endif
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
#ifdef COMPAT_NETBSD32
			if (cmd == RAIDFRAME_GET_INFO32)
				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
			else
#endif
				ucfgp = *(RF_DeviceConfig_t **)data;
			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:
#ifdef COMPAT_80
	case RAIDFRAME_FAIL_DISK80:
#endif

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);

		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);
		rrint->col = rr->col;
		rrint->flags = rr->flags;
		rrint->raidPtr = raidPtr;

		/* NOTE(review): the RF_CREATE_THREAD status in retcode is
		   discarded here (0 is always returned), unlike the
		   REBUILD_IN_PLACE case above -- confirm intended. */
		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrint, "raid_recon");
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is optimal. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): rf_broadcast_conf2 looks like a typo for
		   rf_broadcast_cond2; harmless while this code is under
		   #if 0, but would not compile if enabled. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1775 
   1776 
   1777 /* raidinit -- complete the rest of the initialization for the
   1778    RAIDframe device.  */
   1779 
   1780 
/*
 * Complete initialization of a freshly-configured RAID set: attach
 * the pseudo-device, hook up the dk(4)/disk(9) machinery, allocate
 * the buffer queue, mark the unit usable, and kick off wedge
 * discovery.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: RAIDF_INITED stays clear, so the unit
		   remains unusable. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1836 
   1837 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1838 /* wake up the daemon & tell it to get us a spare table
   1839  * XXX
   1840  * the entries in the queues should be tagged with the raidPtr
   1841  * so that in the extremely rare case that two recons happen at once,
   1842  * we know for which device were requesting a spare table
   1843  * XXX
   1844  *
   1845  * XXX This code is not currently used. GO
   1846  */
/*
 * Queue a spare-table request for the daemon, block until a response
 * appears on rf_sparet_resp_queue, and return the fcol field from the
 * response.  The response object is freed here; it is not the same
 * object the caller allocated (see trailing comment).
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 releases the mutex while asleep and reacquires
	 * it before returning (this used to be done via mpsleep) */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1870 #endif
   1871 
   1872 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1873  * bp & passes it down.
   1874  * any calls originating in the kernel must use non-blocking I/O
   1875  * do some extra sanity checking to return "appropriate" error values for
   1876  * certain conditions (to make some standard utilities work)
   1877  *
   1878  * Formerly known as: rf_DoAccessKernel
   1879  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* must drop raidPtr->mutex here:
		 * rf_update_component_labels takes it itself and does
		 * component I/O */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to start I/O on a unit that never finished raidinit */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand queued buffers to the dk(9) layer */
	dk_start(dksc, NULL);
}
   1906 
/*
 * Translate a struct buf into a RAIDframe access and dispatch it via
 * rf_DoAccess.  Returns EAGAIN when no openings are available (caller
 * retries), ENOSPC for out-of-range or non-sector-multiple requests,
 * otherwise the rf_DoAccess return value.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* can't start anything if all openings are in use */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this branch unconditionally;
	 * looks like debugging residue -- confirm before removing */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject requests past the end of the set; the sum < ...
	 * comparisons also catch arithmetic wraparound */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject requests that are not a whole number of sectors */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening; returned elsewhere when the I/O completes */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1979 
   1980 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1981 
   1982 int
   1983 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1984 {
   1985 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   1986 	struct buf *bp;
   1987 
   1988 	req->queue = queue;
   1989 	bp = req->bp;
   1990 
   1991 	switch (req->type) {
   1992 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   1993 		/* XXX need to do something extra here.. */
   1994 		/* I'm leaving this in, as I've never actually seen it used,
   1995 		 * and I'd like folks to report it... GO */
   1996 		printf(("WAKEUP CALLED\n"));
   1997 		queue->numOutstanding++;
   1998 
   1999 		bp->b_flags = 0;
   2000 		bp->b_private = req;
   2001 
   2002 		KernelWakeupFunc(bp);
   2003 		break;
   2004 
   2005 	case RF_IO_TYPE_READ:
   2006 	case RF_IO_TYPE_WRITE:
   2007 #if RF_ACC_TRACE > 0
   2008 		if (req->tracerec) {
   2009 			RF_ETIMER_START(req->tracerec->timer);
   2010 		}
   2011 #endif
   2012 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2013 		    op, queue->rf_cinfo->ci_dev,
   2014 		    req->sectorOffset, req->numSector,
   2015 		    req->buf, KernelWakeupFunc, (void *) req,
   2016 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2017 
   2018 		if (rf_debugKernelAccess) {
   2019 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2020 				(long) bp->b_blkno));
   2021 		}
   2022 		queue->numOutstanding++;
   2023 		queue->last_deq_sector = req->sectorOffset;
   2024 		/* acc wouldn't have been let in if there were any pending
   2025 		 * reqs at any other priority */
   2026 		queue->curPriority = req->priority;
   2027 
   2028 		db1_printf(("Going for %c to unit %d col %d\n",
   2029 			    req->type, queue->raidPtr->raidid,
   2030 			    queue->col));
   2031 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2032 			(int) req->sectorOffset, (int) req->numSector,
   2033 			(int) (req->numSector <<
   2034 			    queue->raidPtr->logBytesPerSector),
   2035 			(int) queue->raidPtr->logBytesPerSector));
   2036 
   2037 		/*
   2038 		 * XXX: drop lock here since this can block at
   2039 		 * least with backing SCSI devices.  Retake it
   2040 		 * to minimize fuss with calling interfaces.
   2041 		 */
   2042 
   2043 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2044 		bdev_strategy(bp);
   2045 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2046 		break;
   2047 
   2048 	default:
   2049 		panic("bad req->type in rf_DispatchKernelIO");
   2050 	}
   2051 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2052 
   2053 	return (0);
   2054 }
   2055 /* this is the callback function associated with a I/O invoked from
   2056    kernel code.
   2057  */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO.
 * On error, marks the component failed (once, and only if the set can
 * tolerate another failure), records the error in the request, queues
 * the request on raidPtr->iodone, and signals the raidio thread.
 * Runs with raidPtr->iodone_lock held for the whole body.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP / the NOP path */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is drained by raidstart, which
			 * triggers a component-label update */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2124 
   2125 
   2126 /*
   2127  * initialize a buf structure for doing an I/O in the kernel.
   2128  */
   2129 static void
   2130 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2131        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2132        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2133        struct proc *b_proc)
   2134 {
   2135 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2136 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2137 	bp->b_oflags = 0;
   2138 	bp->b_cflags = 0;
   2139 	bp->b_bcount = numSect << logBytesPerSector;
   2140 	bp->b_bufsize = bp->b_bcount;
   2141 	bp->b_error = 0;
   2142 	bp->b_dev = dev;
   2143 	bp->b_data = bf;
   2144 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2145 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2146 	if (bp->b_bcount == 0) {
   2147 		panic("bp->b_bcount is zero in InitBP!!");
   2148 	}
   2149 	bp->b_proc = b_proc;
   2150 	bp->b_iodone = cbFunc;
   2151 	bp->b_private = cbArg;
   2152 }
   2153 
   2154 /*
   2155  * Wait interruptibly for an exclusive lock.
   2156  *
   2157  * XXX
   2158  * Several drivers do this; it should be abstracted and made MP-safe.
   2159  * (Hmm... where have we seen this warning before :->  GO )
   2160  */
   2161 static int
   2162 raidlock(struct raid_softc *rs)
   2163 {
   2164 	int     error;
   2165 
   2166 	error = 0;
   2167 	mutex_enter(&rs->sc_mutex);
   2168 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2169 		rs->sc_flags |= RAIDF_WANTED;
   2170 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2171 		if (error != 0)
   2172 			goto done;
   2173 	}
   2174 	rs->sc_flags |= RAIDF_LOCKED;
   2175 done:
   2176 	mutex_exit(&rs->sc_mutex);
   2177 	return (error);
   2178 }
   2179 /*
   2180  * Unlock and wake up any waiters.
   2181  */
   2182 static void
   2183 raidunlock(struct raid_softc *rs)
   2184 {
   2185 
   2186 	mutex_enter(&rs->sc_mutex);
   2187 	rs->sc_flags &= ~RAIDF_LOCKED;
   2188 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2189 		rs->sc_flags &= ~RAIDF_WANTED;
   2190 		cv_broadcast(&rs->sc_cv);
   2191 	}
   2192 	mutex_exit(&rs->sc_mutex);
   2193 }
   2194 
   2195 
   2196 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2197 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2198 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2199 
   2200 static daddr_t
   2201 rf_component_info_offset(void)
   2202 {
   2203 
   2204 	return RF_COMPONENT_INFO_OFFSET;
   2205 }
   2206 
   2207 static daddr_t
   2208 rf_component_info_size(unsigned secsize)
   2209 {
   2210 	daddr_t info_size;
   2211 
   2212 	KASSERT(secsize);
   2213 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2214 		info_size = secsize;
   2215 	else
   2216 		info_size = RF_COMPONENT_INFO_SIZE;
   2217 
   2218 	return info_size;
   2219 }
   2220 
   2221 static daddr_t
   2222 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2223 {
   2224 	daddr_t map_offset;
   2225 
   2226 	KASSERT(raidPtr->bytesPerSector);
   2227 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2228 		map_offset = raidPtr->bytesPerSector;
   2229 	else
   2230 		map_offset = RF_COMPONENT_INFO_SIZE;
   2231 	map_offset += rf_component_info_offset();
   2232 
   2233 	return map_offset;
   2234 }
   2235 
   2236 static daddr_t
   2237 rf_parity_map_size(RF_Raid_t *raidPtr)
   2238 {
   2239 	daddr_t map_size;
   2240 
   2241 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2242 		map_size = raidPtr->bytesPerSector;
   2243 	else
   2244 		map_size = RF_PARITY_MAP_SIZE;
   2245 
   2246 	return map_size;
   2247 }
   2248 
   2249 int
   2250 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2251 {
   2252 	RF_ComponentLabel_t *clabel;
   2253 
   2254 	clabel = raidget_component_label(raidPtr, col);
   2255 	clabel->clean = RF_RAID_CLEAN;
   2256 	raidflush_component_label(raidPtr, col);
   2257 	return(0);
   2258 }
   2259 
   2260 
   2261 int
   2262 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2263 {
   2264 	RF_ComponentLabel_t *clabel;
   2265 
   2266 	clabel = raidget_component_label(raidPtr, col);
   2267 	clabel->clean = RF_RAID_DIRTY;
   2268 	raidflush_component_label(raidPtr, col);
   2269 	return(0);
   2270 }
   2271 
   2272 int
   2273 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2274 {
   2275 	KASSERT(raidPtr->bytesPerSector);
   2276 	return raidread_component_label(raidPtr->bytesPerSector,
   2277 	    raidPtr->Disks[col].dev,
   2278 	    raidPtr->raid_cinfo[col].ci_vp,
   2279 	    &raidPtr->raid_cinfo[col].ci_label);
   2280 }
   2281 
   2282 RF_ComponentLabel_t *
   2283 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2284 {
   2285 	return &raidPtr->raid_cinfo[col].ci_label;
   2286 }
   2287 
   2288 int
   2289 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2290 {
   2291 	RF_ComponentLabel_t *label;
   2292 
   2293 	label = &raidPtr->raid_cinfo[col].ci_label;
   2294 	label->mod_counter = raidPtr->mod_counter;
   2295 #ifndef RF_NO_PARITY_MAP
   2296 	label->parity_map_modcount = label->mod_counter;
   2297 #endif
   2298 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2299 	    raidPtr->Disks[col].dev,
   2300 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2301 }
   2302 
   2303 
   2304 static int
   2305 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2306     RF_ComponentLabel_t *clabel)
   2307 {
   2308 	return raidread_component_area(dev, b_vp, clabel,
   2309 	    sizeof(RF_ComponentLabel_t),
   2310 	    rf_component_info_offset(),
   2311 	    rf_component_info_size(secsize));
   2312 }
   2313 
   2314 /* ARGSUSED */
/*
 * Synchronously read dsize bytes at byte offset `offset' from the raw
 * component device and copy the first msize bytes into `data'.
 * Returns 0 on success or an errno (EINVAL for a component with no
 * vnode).  Uses a throwaway buffer from geteblk(), released before
 * return.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read and wait for it synchronously */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		/* only the first msize bytes are meaningful to the caller */
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2351 
   2352 
   2353 static int
   2354 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2355     RF_ComponentLabel_t *clabel)
   2356 {
   2357 	return raidwrite_component_area(dev, b_vp, clabel,
   2358 	    sizeof(RF_ComponentLabel_t),
   2359 	    rf_component_info_offset(),
   2360 	    rf_component_info_size(secsize), 0);
   2361 }
   2362 
   2363 /* ARGSUSED */
/*
 * Write msize bytes from `data' (zero-padded to dsize) at byte offset
 * `offset' on the raw component device.  If asyncp is set the write is
 * issued B_ASYNC and 0 is returned immediately; otherwise the result
 * of biowait() is returned.  (b_vp is unused -- see ARGSUSED.)
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-fill so the tail of the area is deterministic on disk */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): no biowait/brelse on this path --
		 * presumably the B_ASYNC buffer is released at biodone
		 * time; verify against the buffer-cache rules */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2397 
   2398 void
   2399 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2400 {
   2401 	int c;
   2402 
   2403 	for (c = 0; c < raidPtr->numCol; c++) {
   2404 		/* Skip dead disks. */
   2405 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2406 			continue;
   2407 		/* XXXjld: what if an error occurs here? */
   2408 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2409 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2410 		    RF_PARITYMAP_NBYTE,
   2411 		    rf_parity_map_offset(raidPtr),
   2412 		    rf_parity_map_size(raidPtr), 0);
   2413 	}
   2414 }
   2415 
   2416 void
   2417 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2418 {
   2419 	struct rf_paritymap_ondisk tmp;
   2420 	int c,first;
   2421 
   2422 	first=1;
   2423 	for (c = 0; c < raidPtr->numCol; c++) {
   2424 		/* Skip dead disks. */
   2425 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2426 			continue;
   2427 		raidread_component_area(raidPtr->Disks[c].dev,
   2428 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2429 		    RF_PARITYMAP_NBYTE,
   2430 		    rf_parity_map_offset(raidPtr),
   2431 		    rf_parity_map_size(raidPtr));
   2432 		if (first) {
   2433 			memcpy(map, &tmp, sizeof(*map));
   2434 			first = 0;
   2435 		} else {
   2436 			rf_paritymap_merge(map, &tmp);
   2437 		}
   2438 	}
   2439 }
   2440 
/*
 * Bump the mod counter and mark the component labels of all live
 * components -- and of in-use spares -- dirty on disk, so that an
 * unclean shutdown can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			   scol retains its previous value (initially -1)
			   and is stored in clabel->column below -- confirm
			   a used spare always has a matching spareCol */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2500 
   2501 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares with a bumped mod counter.  When final is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark each
 * label clean (i.e. a proper shutdown).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2579 
   2580 void
   2581 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2582 {
   2583 
   2584 	if (vp != NULL) {
   2585 		if (auto_configured == 1) {
   2586 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2587 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2588 			vput(vp);
   2589 
   2590 		} else {
   2591 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2592 		}
   2593 	}
   2594 }
   2595 
   2596 
   2597 void
   2598 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2599 {
   2600 	int r,c;
   2601 	struct vnode *vp;
   2602 	int acd;
   2603 
   2604 
   2605 	/* We take this opportunity to close the vnodes like we should.. */
   2606 
   2607 	for (c = 0; c < raidPtr->numCol; c++) {
   2608 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2609 		acd = raidPtr->Disks[c].auto_configured;
   2610 		rf_close_component(raidPtr, vp, acd);
   2611 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2612 		raidPtr->Disks[c].auto_configured = 0;
   2613 	}
   2614 
   2615 	for (r = 0; r < raidPtr->numSpare; r++) {
   2616 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2617 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2618 		rf_close_component(raidPtr, vp, acd);
   2619 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2620 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2621 	}
   2622 }
   2623 
   2624 
   2625 void
   2626 rf_ReconThread(struct rf_recon_req_internal *req)
   2627 {
   2628 	int     s;
   2629 	RF_Raid_t *raidPtr;
   2630 
   2631 	s = splbio();
   2632 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2633 	raidPtr->recon_in_progress = 1;
   2634 
   2635 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2636 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2637 
   2638 	RF_Free(req, sizeof(*req));
   2639 
   2640 	raidPtr->recon_in_progress = 0;
   2641 	splx(s);
   2642 
   2643 	/* That's all... */
   2644 	kthread_exit(0);	/* does not return */
   2645 }
   2646 
   2647 void
   2648 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2649 {
   2650 	int retcode;
   2651 	int s;
   2652 
   2653 	raidPtr->parity_rewrite_stripes_done = 0;
   2654 	raidPtr->parity_rewrite_in_progress = 1;
   2655 	s = splbio();
   2656 	retcode = rf_RewriteParity(raidPtr);
   2657 	splx(s);
   2658 	if (retcode) {
   2659 		printf("raid%d: Error re-writing parity (%d)!\n",
   2660 		    raidPtr->raidid, retcode);
   2661 	} else {
   2662 		/* set the clean bit!  If we shutdown correctly,
   2663 		   the clean bit on each component label will get
   2664 		   set */
   2665 		raidPtr->parity_good = RF_RAID_CLEAN;
   2666 	}
   2667 	raidPtr->parity_rewrite_in_progress = 0;
   2668 
   2669 	/* Anyone waiting for us to stop?  If so, inform them... */
   2670 	if (raidPtr->waitShutdown) {
   2671 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2672 	}
   2673 
   2674 	/* That's all... */
   2675 	kthread_exit(0);	/* does not return */
   2676 }
   2677 
   2678 
   2679 void
   2680 rf_CopybackThread(RF_Raid_t *raidPtr)
   2681 {
   2682 	int s;
   2683 
   2684 	raidPtr->copyback_in_progress = 1;
   2685 	s = splbio();
   2686 	rf_CopybackReconstructedData(raidPtr);
   2687 	splx(s);
   2688 	raidPtr->copyback_in_progress = 0;
   2689 
   2690 	/* That's all... */
   2691 	kthread_exit(0);	/* does not return */
   2692 }
   2693 
   2694 
   2695 void
   2696 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2697 {
   2698 	int s;
   2699 	RF_Raid_t *raidPtr;
   2700 
   2701 	s = splbio();
   2702 	raidPtr = req->raidPtr;
   2703 	raidPtr->recon_in_progress = 1;
   2704 	rf_ReconstructInPlace(raidPtr, req->col);
   2705 	RF_Free(req, sizeof(*req));
   2706 	raidPtr->recon_in_progress = 0;
   2707 	splx(s);
   2708 
   2709 	/* That's all... */
   2710 	kthread_exit(0);	/* does not return */
   2711 }
   2712 
/*
 * Probe one candidate component during autoconfiguration: read its
 * component label and, if the label looks reasonable and fits the
 * partition, prepend a new RF_AutoConfig_t to ac_list.  On rejection
 * the vnode is closed and released.  On out-of-memory the entire
 * ac_list is freed and NULL is returned.  Returns the (possibly
 * extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* tear down everything collected so far; the caller
		     * gets NULL and autoconfig gives up */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				/* free the label we just allocated before
				 * tearing down the whole list */
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: rejected components don't keep their label or
		 * their (still-open) vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2770 
/*
 * Scan every disk-class device in the system for RAID components and
 * return them as a linked RF_AutoConfig_t list (built by
 * rf_get_component()).
 *
 * Two passes are made over the device tree: wedges ("dk") first, then
 * all other disks, so that a wedge covering a whole disk is preferred
 * over that disk's raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: don't log open failures for absent units */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: the wedge's partition type
				   decides whether it is a RAID component. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2974 
   2975 
   2976 int
   2977 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2978 {
   2979 
   2980 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2981 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2982 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2983 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2984 	    clabel->row >=0 &&
   2985 	    clabel->column >= 0 &&
   2986 	    clabel->num_rows > 0 &&
   2987 	    clabel->num_columns > 0 &&
   2988 	    clabel->row < clabel->num_rows &&
   2989 	    clabel->column < clabel->num_columns &&
   2990 	    clabel->blockSize > 0 &&
   2991 	    /*
   2992 	     * numBlocksHi may contain garbage, but it is ok since
   2993 	     * the type is unsigned.  If it is really garbage,
   2994 	     * rf_fix_old_label_size() will fix it.
   2995 	     */
   2996 	    rf_component_label_numblocks(clabel) > 0) {
   2997 		/*
   2998 		 * label looks reasonable enough...
   2999 		 * let's make sure it has no old garbage.
   3000 		 */
   3001 		if (numsecs)
   3002 			rf_fix_old_label_size(clabel, numsecs);
   3003 		return(1);
   3004 	}
   3005 	return(0);
   3006 }
   3007 
   3008 
   3009 /*
   3010  * For reasons yet unknown, some old component labels have garbage in
   3011  * the newer numBlocksHi region, and this causes lossage.  Since those
   3012  * disks will also have numsecs set to less than 32 bits of sectors,
   3013  * we can determine when this corruption has occurred, and fix it.
   3014  *
   3015  * The exact same problem, with the same unknown reason, happens to
   3016  * the partitionSizeHi member as well.
   3017  */
   3018 static void
   3019 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3020 {
   3021 
   3022 	if (numsecs < ((uint64_t)1 << 32)) {
   3023 		if (clabel->numBlocksHi) {
   3024 			printf("WARNING: total sectors < 32 bits, yet "
   3025 			       "numBlocksHi set\n"
   3026 			       "WARNING: resetting numBlocksHi to zero.\n");
   3027 			clabel->numBlocksHi = 0;
   3028 		}
   3029 
   3030 		if (clabel->partitionSizeHi) {
   3031 			printf("WARNING: total sectors < 32 bits, yet "
   3032 			       "partitionSizeHi set\n"
   3033 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3034 			clabel->partitionSizeHi = 0;
   3035 		}
   3036 	}
   3037 }
   3038 
   3039 
#ifdef DEBUG
/*
 * Dump a component label to the console in human-readable form.
 * Debug aid only; compiled in under the DEBUG option.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition values 0..2 map to these names; & 3 below keeps
	   any out-of-range value from indexing past the table. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3073 
   3074 RF_ConfigSet_t *
   3075 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3076 {
   3077 	RF_AutoConfig_t *ac;
   3078 	RF_ConfigSet_t *config_sets;
   3079 	RF_ConfigSet_t *cset;
   3080 	RF_AutoConfig_t *ac_next;
   3081 
   3082 
   3083 	config_sets = NULL;
   3084 
   3085 	/* Go through the AutoConfig list, and figure out which components
   3086 	   belong to what sets.  */
   3087 	ac = ac_list;
   3088 	while(ac!=NULL) {
   3089 		/* we're going to putz with ac->next, so save it here
   3090 		   for use at the end of the loop */
   3091 		ac_next = ac->next;
   3092 
   3093 		if (config_sets == NULL) {
   3094 			/* will need at least this one... */
   3095 			config_sets = (RF_ConfigSet_t *)
   3096 				malloc(sizeof(RF_ConfigSet_t),
   3097 				       M_RAIDFRAME, M_NOWAIT);
   3098 			if (config_sets == NULL) {
   3099 				panic("rf_create_auto_sets: No memory!");
   3100 			}
   3101 			/* this one is easy :) */
   3102 			config_sets->ac = ac;
   3103 			config_sets->next = NULL;
   3104 			config_sets->rootable = 0;
   3105 			ac->next = NULL;
   3106 		} else {
   3107 			/* which set does this component fit into? */
   3108 			cset = config_sets;
   3109 			while(cset!=NULL) {
   3110 				if (rf_does_it_fit(cset, ac)) {
   3111 					/* looks like it matches... */
   3112 					ac->next = cset->ac;
   3113 					cset->ac = ac;
   3114 					break;
   3115 				}
   3116 				cset = cset->next;
   3117 			}
   3118 			if (cset==NULL) {
   3119 				/* didn't find a match above... new set..*/
   3120 				cset = (RF_ConfigSet_t *)
   3121 					malloc(sizeof(RF_ConfigSet_t),
   3122 					       M_RAIDFRAME, M_NOWAIT);
   3123 				if (cset == NULL) {
   3124 					panic("rf_create_auto_sets: No memory!");
   3125 				}
   3126 				cset->ac = ac;
   3127 				ac->next = NULL;
   3128 				cset->next = config_sets;
   3129 				cset->rootable = 0;
   3130 				config_sets = cset;
   3131 			}
   3132 		}
   3133 		ac = ac_next;
   3134 	}
   3135 
   3136 
   3137 	return(config_sets);
   3138 }
   3139 
   3140 static int
   3141 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3142 {
   3143 	RF_ComponentLabel_t *clabel1, *clabel2;
   3144 
   3145 	/* If this one matches the *first* one in the set, that's good
   3146 	   enough, since the other members of the set would have been
   3147 	   through here too... */
   3148 	/* note that we are not checking partitionSize here..
   3149 
   3150 	   Note that we are also not checking the mod_counters here.
   3151 	   If everything else matches except the mod_counter, that's
   3152 	   good enough for this test.  We will deal with the mod_counters
   3153 	   a little later in the autoconfiguration process.
   3154 
   3155 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3156 
   3157 	   The reason we don't check for this is that failed disks
   3158 	   will have lower modification counts.  If those disks are
   3159 	   not added to the set they used to belong to, then they will
   3160 	   form their own set, which may result in 2 different sets,
   3161 	   for example, competing to be configured at raid0, and
   3162 	   perhaps competing to be the root filesystem set.  If the
   3163 	   wrong ones get configured, or both attempt to become /,
   3164 	   weird behaviour and or serious lossage will occur.  Thus we
   3165 	   need to bring them into the fold here, and kick them out at
   3166 	   a later point.
   3167 
   3168 	*/
   3169 
   3170 	clabel1 = cset->ac->clabel;
   3171 	clabel2 = ac->clabel;
   3172 	if ((clabel1->version == clabel2->version) &&
   3173 	    (clabel1->serial_number == clabel2->serial_number) &&
   3174 	    (clabel1->num_rows == clabel2->num_rows) &&
   3175 	    (clabel1->num_columns == clabel2->num_columns) &&
   3176 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3177 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3178 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3179 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3180 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3181 	    (clabel1->blockSize == clabel2->blockSize) &&
   3182 	    rf_component_label_numblocks(clabel1) ==
   3183 	    rf_component_label_numblocks(clabel2) &&
   3184 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3185 	    (clabel1->root_partition == clabel2->root_partition) &&
   3186 	    (clabel1->last_unit == clabel2->last_unit) &&
   3187 	    (clabel1->config_order == clabel2->config_order)) {
   3188 		/* if it get's here, it almost *has* to be a match */
   3189 	} else {
   3190 		/* it's not consistent with somebody in the set..
   3191 		   punt */
   3192 		return(0);
   3193 	}
   3194 	/* all was fine.. it must fit... */
   3195 	return(1);
   3196 }
   3197 
/*
 * Decide whether the config set has enough live components to be
 * worth configuring.  Returns 1 if so, 0 if too many components are
 * missing for the set's RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The set's authoritative mod_counter is the maximum over all
	   members; components with a lower count are treated as failed. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a component that is both in that
	   column and up to date (matches the set's mod_counter). */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3300 
   3301 void
   3302 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3303 			RF_Raid_t *raidPtr)
   3304 {
   3305 	RF_ComponentLabel_t *clabel;
   3306 	int i;
   3307 
   3308 	clabel = ac->clabel;
   3309 
   3310 	/* 1. Fill in the common stuff */
   3311 	config->numCol = clabel->num_columns;
   3312 	config->numSpare = 0; /* XXX should this be set here? */
   3313 	config->sectPerSU = clabel->sectPerSU;
   3314 	config->SUsPerPU = clabel->SUsPerPU;
   3315 	config->SUsPerRU = clabel->SUsPerRU;
   3316 	config->parityConfig = clabel->parityConfig;
   3317 	/* XXX... */
   3318 	strcpy(config->diskQueueType,"fifo");
   3319 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3320 	config->layoutSpecificSize = 0; /* XXX ?? */
   3321 
   3322 	while(ac!=NULL) {
   3323 		/* row/col values will be in range due to the checks
   3324 		   in reasonable_label() */
   3325 		strcpy(config->devnames[0][ac->clabel->column],
   3326 		       ac->devname);
   3327 		ac = ac->next;
   3328 	}
   3329 
   3330 	for(i=0;i<RF_MAXDBGV;i++) {
   3331 		config->debugVars[i][0] = 0;
   3332 	}
   3333 }
   3334 
   3335 int
   3336 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3337 {
   3338 	RF_ComponentLabel_t *clabel;
   3339 	int column;
   3340 	int sparecol;
   3341 
   3342 	raidPtr->autoconfigure = new_value;
   3343 
   3344 	for(column=0; column<raidPtr->numCol; column++) {
   3345 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3346 			clabel = raidget_component_label(raidPtr, column);
   3347 			clabel->autoconfigure = new_value;
   3348 			raidflush_component_label(raidPtr, column);
   3349 		}
   3350 	}
   3351 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3352 		sparecol = raidPtr->numCol + column;
   3353 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3354 			clabel = raidget_component_label(raidPtr, sparecol);
   3355 			clabel->autoconfigure = new_value;
   3356 			raidflush_component_label(raidPtr, sparecol);
   3357 		}
   3358 	}
   3359 	return(new_value);
   3360 }
   3361 
   3362 int
   3363 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3364 {
   3365 	RF_ComponentLabel_t *clabel;
   3366 	int column;
   3367 	int sparecol;
   3368 
   3369 	raidPtr->root_partition = new_value;
   3370 	for(column=0; column<raidPtr->numCol; column++) {
   3371 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3372 			clabel = raidget_component_label(raidPtr, column);
   3373 			clabel->root_partition = new_value;
   3374 			raidflush_component_label(raidPtr, column);
   3375 		}
   3376 	}
   3377 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3378 		sparecol = raidPtr->numCol + column;
   3379 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3380 			clabel = raidget_component_label(raidPtr, sparecol);
   3381 			clabel->root_partition = new_value;
   3382 			raidflush_component_label(raidPtr, sparecol);
   3383 		}
   3384 	}
   3385 	return(new_value);
   3386 }
   3387 
   3388 void
   3389 rf_release_all_vps(RF_ConfigSet_t *cset)
   3390 {
   3391 	RF_AutoConfig_t *ac;
   3392 
   3393 	ac = cset->ac;
   3394 	while(ac!=NULL) {
   3395 		/* Close the vp, and give it back */
   3396 		if (ac->vp) {
   3397 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3398 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3399 			vput(ac->vp);
   3400 			ac->vp = NULL;
   3401 		}
   3402 		ac = ac->next;
   3403 	}
   3404 }
   3405 
   3406 
   3407 void
   3408 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3409 {
   3410 	RF_AutoConfig_t *ac;
   3411 	RF_AutoConfig_t *next_ac;
   3412 
   3413 	ac = cset->ac;
   3414 	while(ac!=NULL) {
   3415 		next_ac = ac->next;
   3416 		/* nuke the label */
   3417 		free(ac->clabel, M_RAIDFRAME);
   3418 		/* cleanup the config structure */
   3419 		free(ac, M_RAIDFRAME);
   3420 		/* "next.." */
   3421 		ac = next_ac;
   3422 	}
   3423 	/* and, finally, nuke the config set */
   3424 	free(cset, M_RAIDFRAME);
   3425 }
   3426 
   3427 
/*
 * Populate a component label from the current state of the RAID set.
 * The caller is responsible for writing the label to disk.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* only single-row sets are produced these days */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3460 
/*
 * Autoconfigure one config set: allocate a softc at (preferably) the
 * unit the set was last configured on, build an RF_Config_t from the
 * component labels, and configure the RAID set.  Returns the softc on
 * success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until a free (invalid) unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3544 
/*
 * Initialize a pool for RAIDframe use: high-water mark of xmax items,
 * pre-primed (and low-water marked) with xmin items so forward
 * progress is possible under memory pressure.  Panics if the initial
 * priming fails.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3557 
   3558 /*
   3559  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3560  * to see if there is IO pending and if that IO could possibly be done
   3561  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3562  * otherwise.
   3563  *
   3564  */
   3565 int
   3566 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3567 {
   3568 	struct raid_softc *rs;
   3569 	struct dk_softc *dksc;
   3570 
   3571 	rs = raidPtr->softc;
   3572 	dksc = &rs->sc_dksc;
   3573 
   3574 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3575 		return 1;
   3576 
   3577 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3578 		/* there is work to do */
   3579 		return 0;
   3580 	}
   3581 	/* default is nothing to do */
   3582 	return 1;
   3583 }
   3584 
   3585 int
   3586 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3587 {
   3588 	uint64_t numsecs;
   3589 	unsigned secsize;
   3590 	int error;
   3591 
   3592 	error = getdisksize(vp, &numsecs, &secsize);
   3593 	if (error == 0) {
   3594 		diskPtr->blockSize = secsize;
   3595 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3596 		diskPtr->partitionSize = numsecs;
   3597 		return 0;
   3598 	}
   3599 	return error;
   3600 }
   3601 
/* Autoconf match: raid is a pseudo-device, so always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3607 
/* Autoconf attach: nothing to do at attach time; the unit is set up
 * later (via configuration ioctls or autoconfiguration). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3612 
   3613 
/*
 * Autoconf detach: tear down the RAID unit bound to this device and,
 * on success, release its softc.  Returns 0 on success or an errno.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	/* serialize against other users of this unit */
	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3640 
/*
 * Fill in and publish a synthetic disk geometry for the logical RAID
 * disk presented by this softc.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* track count is an arbitrary choice; the geometry is synthetic */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3656 
   3657 /*
   3658  * Get cache info for all the components (including spares).
   3659  * Returns intersection of all the cache flags of all disks, or first
   3660  * error if any encountered.
   3661  * XXXfua feature flags can change as spares are added - lock down somehow
   3662  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* iterate over data components and spares alike */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl support) is
				   returned without console noise */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* first component seeds the intersection */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3700 
   3701 /*
   3702  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3703  * We end up returning whatever error was returned by the first cache flush
   3704  * that fails.
   3705  */
   3706 
   3707 int
   3708 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3709 {
   3710 	int c, sparecol;
   3711 	int e,error;
   3712 	int force = 1;
   3713 
   3714 	error = 0;
   3715 	for (c = 0; c < raidPtr->numCol; c++) {
   3716 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3717 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3718 					  &force, FWRITE, NOCRED);
   3719 			if (e) {
   3720 				if (e != ENODEV)
   3721 					printf("raid%d: cache flush to component %s failed.\n",
   3722 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3723 				if (error == 0) {
   3724 					error = e;
   3725 				}
   3726 			}
   3727 		}
   3728 	}
   3729 
   3730 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3731 		sparecol = raidPtr->numCol + c;
   3732 		/* Need to ensure that the reconstruct actually completed! */
   3733 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3734 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3735 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3736 			if (e) {
   3737 				if (e != ENODEV)
   3738 					printf("raid%d: cache flush to component %s failed.\n",
   3739 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3740 				if (error == 0) {
   3741 					error = e;
   3742 				}
   3743 			}
   3744 		}
   3745 	}
   3746 	return error;
   3747 }
   3748 
   3749 /* Fill in info with the current status */
   3750 void
   3751 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3752 {
   3753 
   3754 	if (raidPtr->status != rf_rs_reconstructing) {
   3755 		info->total = 100;
   3756 		info->completed = 100;
   3757 	} else {
   3758 		info->total = raidPtr->reconControl->numRUsTotal;
   3759 		info->completed = raidPtr->reconControl->numRUsComplete;
   3760 	}
   3761 	info->remaining = info->total - info->completed;
   3762 }
   3763 
   3764 /* Fill in info with the current status */
   3765 void
   3766 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3767 {
   3768 
   3769 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3770 		info->total = raidPtr->Layout.numStripe;
   3771 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3772 	} else {
   3773 		info->completed = 100;
   3774 		info->total = 100;
   3775 	}
   3776 	info->remaining = info->total - info->completed;
   3777 }
   3778 
   3779 /* Fill in info with the current status */
   3780 void
   3781 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3782 {
   3783 
   3784 	if (raidPtr->copyback_in_progress == 1) {
   3785 		info->total = raidPtr->Layout.numStripe;
   3786 		info->completed = raidPtr->copyback_stripes_done;
   3787 		info->remaining = info->total - info->completed;
   3788 	} else {
   3789 		info->remaining = 0;
   3790 		info->completed = 100;
   3791 		info->total = 100;
   3792 	}
   3793 }
   3794 
   3795 /* Fill in config with the current info */
   3796 int
   3797 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3798 {
   3799 	int	d, i, j;
   3800 
   3801 	if (!raidPtr->valid)
   3802 		return (ENODEV);
   3803 	config->cols = raidPtr->numCol;
   3804 	config->ndevs = raidPtr->numCol;
   3805 	if (config->ndevs >= RF_MAX_DISKS)
   3806 		return (ENOMEM);
   3807 	config->nspares = raidPtr->numSpare;
   3808 	if (config->nspares >= RF_MAX_DISKS)
   3809 		return (ENOMEM);
   3810 	config->maxqdepth = raidPtr->maxQueueDepth;
   3811 	d = 0;
   3812 	for (j = 0; j < config->cols; j++) {
   3813 		config->devs[d] = raidPtr->Disks[j];
   3814 		d++;
   3815 	}
   3816 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3817 		config->spares[i] = raidPtr->Disks[j];
   3818 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3819 			/* XXX: raidctl(8) expects to see this as a used spare */
   3820 			config->spares[i].status = rf_ds_used_spare;
   3821 		}
   3822 	}
   3823 	return 0;
   3824 }
   3825 
   3826 int
   3827 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3828 {
   3829 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3830 	RF_ComponentLabel_t *raid_clabel;
   3831 	int column = clabel->column;
   3832 
   3833 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3834 		return EINVAL;
   3835 	raid_clabel = raidget_component_label(raidPtr, column);
   3836 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3837 
   3838 	return 0;
   3839 }
   3840 
   3841 /*
   3842  * Module interface
   3843  */
   3844 
   3845 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3846 
   3847 #ifdef _MODULE
   3848 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3849 #endif
   3850 
   3851 static int raid_modcmd(modcmd_t, void *);
   3852 static int raid_modcmd_init(void);
   3853 static int raid_modcmd_fini(void);
   3854 
   3855 static int
   3856 raid_modcmd(modcmd_t cmd, void *data)
   3857 {
   3858 	int error;
   3859 
   3860 	error = 0;
   3861 	switch (cmd) {
   3862 	case MODULE_CMD_INIT:
   3863 		error = raid_modcmd_init();
   3864 		break;
   3865 	case MODULE_CMD_FINI:
   3866 		error = raid_modcmd_fini();
   3867 		break;
   3868 	default:
   3869 		error = ENOTTY;
   3870 		break;
   3871 	}
   3872 	return error;
   3873 }
   3874 
/*
 * Module initialization: set up global locks, attach the device
 * switch and autoconf glue, boot the RAIDframe core, and register
 * a finalizer for auto-configuration of RAID sets.  Any failure
 * along the way unwinds the steps already taken and returns the
 * error; a finalizer-registration failure is only warned about.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be present
	   (e.g. driver also compiled into the kernel). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind both earlier attach steps. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is necessarily 0 here (all failures returned above). */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: auto-config just won't happen. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3945 
/*
 * Module teardown: refuse to unload while any raid device exists,
 * then detach the autoconf glue and device switch in reverse order
 * of attachment and shut down the RAIDframe core.  If a detach step
 * fails, the steps already undone are re-attached so the module is
 * left in a consistent, still-loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detach steps. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core and global state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3995