/*
 * rf_netbsdkintf.c revision 1.358
 * (source-browser navigation header removed)
 */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.358 2019/01/27 02:08:42 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.358 2019/01/27 02:08:42 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 #include <sys/compat_stub.h>
    132 
    133 #include <prop/proplib.h>
    134 
    135 #include <dev/raidframe/raidframevar.h>
    136 #include <dev/raidframe/raidframeio.h>
    137 #include <dev/raidframe/rf_paritymap.h>
    138 
    139 #include "rf_raid.h"
    140 #include "rf_copyback.h"
    141 #include "rf_dag.h"
    142 #include "rf_dagflags.h"
    143 #include "rf_desc.h"
    144 #include "rf_diskqueue.h"
    145 #include "rf_etimer.h"
    146 #include "rf_general.h"
    147 #include "rf_kintf.h"
    148 #include "rf_options.h"
    149 #include "rf_driver.h"
    150 #include "rf_parityscan.h"
    151 #include "rf_threadstuff.h"
    152 
    153 #include "rf_compat50.h"
    154 
    155 #include "rf_compat80.h"
    156 
    157 #ifdef COMPAT_NETBSD32
    158 #include "rf_compat32.h"
    159 #endif
    160 
    161 #include "ioconf.h"
    162 
    163 #ifdef DEBUG
    164 int     rf_kdebug_level = 0;
    165 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    166 #else				/* DEBUG */
    167 #define db1_printf(a) { }
    168 #endif				/* DEBUG */
    169 
    170 #ifdef DEBUG_ROOT
    171 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    172 #else
    173 #define DPRINTF(a, ...)
    174 #endif
    175 
    176 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    177 static rf_declare_mutex2(rf_sparet_wait_mutex);
    178 static rf_declare_cond2(rf_sparet_wait_cv);
    179 static rf_declare_cond2(rf_sparet_resp_cv);
    180 
    181 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    182 						 * spare table */
    183 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    184 						 * installation process */
    185 #endif
    186 
    187 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    188 
    189 /* prototypes */
    190 static void KernelWakeupFunc(struct buf *);
    191 static void InitBP(struct buf *, struct vnode *, unsigned,
    192     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    193     void *, int, struct proc *);
    194 struct raid_softc;
    195 static void raidinit(struct raid_softc *);
    196 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    197 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    198 
    199 static int raid_match(device_t, cfdata_t, void *);
    200 static void raid_attach(device_t, device_t, void *);
    201 static int raid_detach(device_t, int);
    202 
    203 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    204     daddr_t, daddr_t);
    205 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    206     daddr_t, daddr_t, int);
    207 
    208 static int raidwrite_component_label(unsigned,
    209     dev_t, struct vnode *, RF_ComponentLabel_t *);
    210 static int raidread_component_label(unsigned,
    211     dev_t, struct vnode *, RF_ComponentLabel_t *);
    212 
    213 static int raid_diskstart(device_t, struct buf *bp);
    214 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    215 static int raid_lastclose(device_t);
    216 
    217 static dev_type_open(raidopen);
    218 static dev_type_close(raidclose);
    219 static dev_type_read(raidread);
    220 static dev_type_write(raidwrite);
    221 static dev_type_ioctl(raidioctl);
    222 static dev_type_strategy(raidstrategy);
    223 static dev_type_dump(raiddump);
    224 static dev_type_size(raidsize);
    225 
    226 const struct bdevsw raid_bdevsw = {
    227 	.d_open = raidopen,
    228 	.d_close = raidclose,
    229 	.d_strategy = raidstrategy,
    230 	.d_ioctl = raidioctl,
    231 	.d_dump = raiddump,
    232 	.d_psize = raidsize,
    233 	.d_discard = nodiscard,
    234 	.d_flag = D_DISK
    235 };
    236 
    237 const struct cdevsw raid_cdevsw = {
    238 	.d_open = raidopen,
    239 	.d_close = raidclose,
    240 	.d_read = raidread,
    241 	.d_write = raidwrite,
    242 	.d_ioctl = raidioctl,
    243 	.d_stop = nostop,
    244 	.d_tty = notty,
    245 	.d_poll = nopoll,
    246 	.d_mmap = nommap,
    247 	.d_kqfilter = nokqfilter,
    248 	.d_discard = nodiscard,
    249 	.d_flag = D_DISK
    250 };
    251 
    252 static struct dkdriver rf_dkdriver = {
    253 	.d_open = raidopen,
    254 	.d_close = raidclose,
    255 	.d_strategy = raidstrategy,
    256 	.d_diskstart = raid_diskstart,
    257 	.d_dumpblocks = raid_dumpblocks,
    258 	.d_lastclose = raid_lastclose,
    259 	.d_minphys = minphys
    260 };
    261 
/* Per-unit software state for a raid(4) pseudo-device. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state; MUST be first
					 * (raidsoftc() relies on layout via
					 * device_private) */
	int	sc_unit;		/* raid unit number (raidN) */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe state for this set */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

#define	raidunit(x)	DISKUNIT(x)	/* dev_t -> unit number */
/* device_t -> raid_softc back-pointer stored in the RF_Raid_t */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    284 
    285 extern struct cfdriver raid_cd;
    286 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    287     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    288     DVF_DETACH_SHUTDOWN);
    289 
    290 /* Internal representation of a rf_recon_req */
    291 struct rf_recon_req_internal {
    292 	RF_RowCol_t col;
    293 	RF_ReconReqFlags_t flags;
    294 	void   *raidPtr;
    295 };
    296 
    297 /*
    298  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    299  * Be aware that large numbers can allow the driver to consume a lot of
    300  * kernel memory, especially on writes, and in degraded mode reads.
    301  *
    302  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    303  * a single 64K write will typically require 64K for the old data,
    304  * 64K for the old parity, and 64K for the new parity, for a total
    305  * of 192K (if the parity buffer is not re-used immediately).
    306  * Even it if is used immediately, that's still 128K, which when multiplied
    307  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    308  *
    309  * Now in degraded mode, for example, a 64K read on the above setup may
    310  * require data reconstruction, which will require *all* of the 4 remaining
    311  * disks to participate -- 4 * 32K/disk == 128K again.
    312  */
    313 
    314 #ifndef RAIDOUTSTANDING
    315 #define RAIDOUTSTANDING   6
    316 #endif
    317 
    318 #define RAIDLABELDEV(dev)	\
    319 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    320 
    321 /* declared here, and made public, for the benefit of KVM stuff.. */
    322 
    323 static int raidlock(struct raid_softc *);
    324 static void raidunlock(struct raid_softc *);
    325 
    326 static int raid_detach_unlocked(struct raid_softc *);
    327 
    328 static void rf_markalldirty(RF_Raid_t *);
    329 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    330 
    331 void rf_ReconThread(struct rf_recon_req_internal *);
    332 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    333 void rf_CopybackThread(RF_Raid_t *raidPtr);
    334 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    335 int rf_autoconfig(device_t);
    336 void rf_buildroothack(RF_ConfigSet_t *);
    337 
    338 RF_AutoConfig_t *rf_find_raid_components(void);
    339 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    340 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    341 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    342 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    343 int rf_set_autoconfig(RF_Raid_t *, int);
    344 int rf_set_rootpartition(RF_Raid_t *, int);
    345 void rf_release_all_vps(RF_ConfigSet_t *);
    346 void rf_cleanup_config_set(RF_ConfigSet_t *);
    347 int rf_have_enough_components(RF_ConfigSet_t *);
    348 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    349 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    350 
    351 /*
    352  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    353  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    354  * in the kernel config file.
    355  */
    356 #ifdef RAID_AUTOCONFIG
    357 int raidautoconfig = 1;
    358 #else
    359 int raidautoconfig = 0;
    360 #endif
    361 static bool raidautoconfigdone = false;
    362 
    363 struct RF_Pools_s rf_pools;
    364 
    365 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    366 static kmutex_t raid_lock;
    367 
    368 static struct raid_softc *
    369 raidcreate(int unit) {
    370 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    371 	sc->sc_unit = unit;
    372 	cv_init(&sc->sc_cv, "raidunit");
    373 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    374 	return sc;
    375 }
    376 
    377 static void
    378 raiddestroy(struct raid_softc *sc) {
    379 	cv_destroy(&sc->sc_cv);
    380 	mutex_destroy(&sc->sc_mutex);
    381 	kmem_free(sc, sizeof(*sc));
    382 }
    383 
    384 static struct raid_softc *
    385 raidget(int unit, bool create) {
    386 	struct raid_softc *sc;
    387 	if (unit < 0) {
    388 #ifdef DIAGNOSTIC
    389 		panic("%s: unit %d!", __func__, unit);
    390 #endif
    391 		return NULL;
    392 	}
    393 	mutex_enter(&raid_lock);
    394 	LIST_FOREACH(sc, &raids, sc_link) {
    395 		if (sc->sc_unit == unit) {
    396 			mutex_exit(&raid_lock);
    397 			return sc;
    398 		}
    399 	}
    400 	mutex_exit(&raid_lock);
    401 	if (!create)
    402 		return NULL;
    403 	if ((sc = raidcreate(unit)) == NULL)
    404 		return NULL;
    405 	mutex_enter(&raid_lock);
    406 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    407 	mutex_exit(&raid_lock);
    408 	return sc;
    409 }
    410 
    411 static void
    412 raidput(struct raid_softc *sc) {
    413 	mutex_enter(&raid_lock);
    414 	LIST_REMOVE(sc, sc_link);
    415 	mutex_exit(&raid_lock);
    416 	raiddestroy(sc);
    417 }
    418 
/*
 * raidattach: legacy pseudo-device attach hook.  Intentionally empty;
 * "num" is unused.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    428 
    429 int
    430 rf_autoconfig(device_t self)
    431 {
    432 	RF_AutoConfig_t *ac_list;
    433 	RF_ConfigSet_t *config_sets;
    434 
    435 	if (!raidautoconfig || raidautoconfigdone == true)
    436 		return (0);
    437 
    438 	/* XXX This code can only be run once. */
    439 	raidautoconfigdone = true;
    440 
    441 #ifdef __HAVE_CPU_BOOTCONF
    442 	/*
    443 	 * 0. find the boot device if needed first so we can use it later
    444 	 * this needs to be done before we autoconfigure any raid sets,
    445 	 * because if we use wedges we are not going to be able to open
    446 	 * the boot device later
    447 	 */
    448 	if (booted_device == NULL)
    449 		cpu_bootconf();
    450 #endif
    451 	/* 1. locate all RAID components on the system */
    452 	aprint_debug("Searching for RAID components...\n");
    453 	ac_list = rf_find_raid_components();
    454 
    455 	/* 2. Sort them into their respective sets. */
    456 	config_sets = rf_create_auto_sets(ac_list);
    457 
    458 	/*
    459 	 * 3. Evaluate each set and configure the valid ones.
    460 	 * This gets done in rf_buildroothack().
    461 	 */
    462 	rf_buildroothack(config_sets);
    463 
    464 	return 1;
    465 }
    466 
/*
 * rf_containsboot: does the RAID set "r" contain the boot device "bdv"?
 *
 * Compares the autoconf name of bdv (e.g. "wd0") against each
 * component's device name with the "/dev/" prefix stripped.  For a
 * wedge component ("dk..."), the wedge's parent disk name is compared
 * instead, so a set built on wedges still matches its physical disk.
 *
 * NOTE(review): the comparison is a prefix match over
 * strlen(bootname) characters.  That tolerates a trailing partition
 * letter ("wd0a" matches boot device "wd0"), but it would also let
 * "wd1" match "wd10" -- confirm callers cannot hit that ambiguity.
 *
 * Returns 1 if some component matches, 0 otherwise.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix stored in the component label */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge component: compare the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    490 
/*
 * rf_buildroothack: second phase of RAID autoconfiguration.
 *
 * Walks the candidate config sets: each set with enough components
 * and the autoconfigure label flag is configured via
 * rf_auto_config_set(); the resources of all other sets are released.
 * If exactly one configured set is rootable, booted_device may be
 * overridden so the system roots on the RAID.  With several rootable
 * sets, the boot device is used to disambiguate; failing that, the
 * user is asked (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				/* remember the last rootable set seen */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	/*
	 * NOTE(review): rsc may still be NULL here (no rootable set).
	 * dksc is only dereferenced below when num_root >= 1, which
	 * implies rsc != NULL -- but the address computation itself
	 * leans on sc_dksc being the first member; confirm.
	 */
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		   "contains_boot=%d\n", __func__, booted_device,
		   rsc->sc_r.root_partition,
		   rf_containsboot(&rsc->sc_r, booted_device));
		/*
		 * Override the boot device when none was identified,
		 * when root_partition is 1, or when this set contains
		 * the device we actually booted from.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* re-count, keeping only sets that contain the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    628 
    629 static int
    630 raidsize(dev_t dev)
    631 {
    632 	struct raid_softc *rs;
    633 	struct dk_softc *dksc;
    634 	unsigned int unit;
    635 
    636 	unit = raidunit(dev);
    637 	if ((rs = raidget(unit, false)) == NULL)
    638 		return -1;
    639 	dksc = &rs->sc_dksc;
    640 
    641 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    642 		return -1;
    643 
    644 	return dk_size(dksc, dev);
    645 }
    646 
    647 static int
    648 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    649 {
    650 	unsigned int unit;
    651 	struct raid_softc *rs;
    652 	struct dk_softc *dksc;
    653 
    654 	unit = raidunit(dev);
    655 	if ((rs = raidget(unit, false)) == NULL)
    656 		return ENXIO;
    657 	dksc = &rs->sc_dksc;
    658 
    659 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    660 		return ENODEV;
    661 
    662         /*
    663            Note that blkno is relative to this particular partition.
    664            By adding adding RF_PROTECTED_SECTORS, we get a value that
    665 	   is relative to the partition used for the underlying component.
    666         */
    667 	blkno += RF_PROTECTED_SECTORS;
    668 
    669 	return dk_dump(dksc, dev, blkno, va, size);
    670 }
    671 
/*
 * raid_dumpblocks: dk d_dumpblocks backend.
 *
 * Pick a single live component of a RAID 1 set (1 data + 1 parity
 * column) and hand the dump straight to that component's block
 * device.  Returns EINVAL for non-RAID-1 layouts or when no live
 * component exists, ENXIO when the chosen component has no bdevsw.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* map this spare back to the column it replaces */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand nblk sectors directly to the component's dump routine */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    777 
/* ARGSUSED */
/*
 * raidopen: open entry point (block and character devices).
 *
 * Creates the softc on first reference (raidget(..., true)), refuses
 * units being shut down, and on the first open of a configured set
 * marks all components dirty before handing off to dk_open().
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		/* unit is going away; refuse new opens */
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	/* no partition open yet: this is the first open of the unit */
	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    827 
    828 static int
    829 raid_lastclose(device_t self)
    830 {
    831 	struct raid_softc *rs = raidsoftc(self);
    832 
    833 	/* Last one... device is not unconfigured yet.
    834 	   Device shutdown has taken care of setting the
    835 	   clean bits if RAIDF_INITED is not set
    836 	   mark things as clean... */
    837 
    838 	rf_update_component_labels(&rs->sc_r,
    839 	    RF_FINAL_COMPONENT_UPDATE);
    840 
    841 	/* pass to unlocked code */
    842 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    843 		rs->sc_flags |= RAIDF_DETACH;
    844 
    845 	return 0;
    846 }
    847 
/* ARGSUSED */
/*
 * raidclose: close entry point.
 *
 * For a configured unit, hands off to dk_close() (the dkdriver's
 * d_lastclose hook is raid_lastclose(), which may set RAIDF_DETACH).
 * Actual teardown is deferred until after the unit lock is dropped:
 * config_detach() when a detach was requested, raidput() when only a
 * shutdown was requested on an unconfigured unit.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* dk_close() may have triggered raid_lastclose() */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    887 
/*
 * Wake the RAIDframe I/O-done thread by signalling iodone_cv under
 * iodone_lock, so pending/queued work gets (re)examined.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    895 
    896 static void
    897 raidstrategy(struct buf *bp)
    898 {
    899 	unsigned int unit;
    900 	struct raid_softc *rs;
    901 	struct dk_softc *dksc;
    902 	RF_Raid_t *raidPtr;
    903 
    904 	unit = raidunit(bp->b_dev);
    905 	if ((rs = raidget(unit, false)) == NULL) {
    906 		bp->b_error = ENXIO;
    907 		goto fail;
    908 	}
    909 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    910 		bp->b_error = ENXIO;
    911 		goto fail;
    912 	}
    913 	dksc = &rs->sc_dksc;
    914 	raidPtr = &rs->sc_r;
    915 
    916 	/* Queue IO only */
    917 	if (dk_strategy_defer(dksc, bp))
    918 		goto done;
    919 
    920 	/* schedule the IO to happen at the next convenient time */
    921 	raid_wakeup(raidPtr);
    922 
    923 done:
    924 	return;
    925 
    926 fail:
    927 	bp->b_resid = bp->b_bcount;
    928 	biodone(bp);
    929 }
    930 
    931 static int
    932 raid_diskstart(device_t dev, struct buf *bp)
    933 {
    934 	struct raid_softc *rs = raidsoftc(dev);
    935 	RF_Raid_t *raidPtr;
    936 
    937 	raidPtr = &rs->sc_r;
    938 	if (!raidPtr->valid) {
    939 		db1_printf(("raid is not valid..\n"));
    940 		return ENODEV;
    941 	}
    942 
    943 	/* XXX */
    944 	bp->b_resid = 0;
    945 
    946 	return raiddoaccess(raidPtr, bp);
    947 }
    948 
    949 void
    950 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    951 {
    952 	struct raid_softc *rs;
    953 	struct dk_softc *dksc;
    954 
    955 	rs = raidPtr->softc;
    956 	dksc = &rs->sc_dksc;
    957 
    958 	dk_done(dksc, bp);
    959 
    960 	rf_lock_mutex2(raidPtr->mutex);
    961 	raidPtr->openings++;
    962 	rf_unlock_mutex2(raidPtr->mutex);
    963 
    964 	/* schedule more IO */
    965 	raid_wakeup(raidPtr);
    966 }
    967 
    968 /* ARGSUSED */
    969 static int
    970 raidread(dev_t dev, struct uio *uio, int flags)
    971 {
    972 	int     unit = raidunit(dev);
    973 	struct raid_softc *rs;
    974 
    975 	if ((rs = raidget(unit, false)) == NULL)
    976 		return ENXIO;
    977 
    978 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    979 		return (ENXIO);
    980 
    981 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    982 
    983 }
    984 
    985 /* ARGSUSED */
    986 static int
    987 raidwrite(dev_t dev, struct uio *uio, int flags)
    988 {
    989 	int     unit = raidunit(dev);
    990 	struct raid_softc *rs;
    991 
    992 	if ((rs = raidget(unit, false)) == NULL)
    993 		return ENXIO;
    994 
    995 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    996 		return (ENXIO);
    997 
    998 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    999 
   1000 }
   1001 
/*
 * Tear down a configured RAID set.  Caller must hold the softc lock
 * (hence "unlocked" refers to the code called, not the caller).
 * Returns EBUSY if the disk is open or a recon/parity-rewrite/copyback
 * is running, 0 if nothing was configured or the teardown succeeded,
 * or the error from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while the device is busy or background work is active. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to do if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk.  Order matters: wedges first, then the disk
	   structures, then the dk layer. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1039 
/*
 * RAIDframe ioctl handler: set configuration/shutdown, component-label
 * management, failure and reconstruction control, status queries, and
 * parity-map control.  Anything unrecognized falls through to dk_ioctl()
 * for the generic disk ioctls.  Returns 0 on success or an errno.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rr;
	struct rf_recon_req_internal *rrint;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_GET_INFO32:
#endif
#endif
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	/*
	 * Handle compat ioctl calls
	 *
	 * * If compat code is not loaded, stub returns ENOSYS and we just
	 *   check the "native" cmd's
	 * * If compat code is loaded but does not recognize the cmd, it
	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
	 * * If compat code returns EAGAIN, we need to finish via config
	 * * Otherwise the cmd has been handled and we just return
	 */
	MODULE_CALL_HOOK(raidframe50_ioctl_hook,
	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
	    enosys(), retcode);
	if (retcode == ENOSYS)
		retcode = 0;
	else if (retcode == EAGAIN)
		goto config;
	else if (retcode != EPASSTHROUGH)
		return retcode;

	MODULE_CALL_HOOK(raidframe80_ioctl_hook,
	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
	    enosys(), retcode);
	if (retcode == ENOSYS)
		retcode = 0;
	else if (retcode == EAGAIN)
		goto config;
	else if (retcode != EPASSTHROUGH)
		return retcode;

	/*
	 * XXX
	 * Handling of FAIL_DISK80 command requires us to retain retcode's
	 * value of EPASSTHROUGH.  If you add more compat code later, make
	 * sure you don't overwrite retcode and break this!
	 */

	switch (cmd) {

		/* configure the system */
	case RAIDFRAME_CONFIGURE:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_CONFIGURE32:
#endif
#endif

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
#ifdef COMPAT_NETBSD32
#ifdef _LP64
		if (cmd == RAIDFRAME_CONFIGURE32 &&
		    (l->l_proc->p_flag & PK_32) != 0)
			retcode = rf_config_netbsd32(data, k_cfg);
		else
#endif
#endif
		{
			u_cfg = *((RF_Config_t **) data);
			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		}
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		/* Also reached via EAGAIN from the compat hooks above,
		   with k_cfg already filled in by the compat code. */
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* NOTE(review): stub -- no removal is performed; just
		   returns the current retcode. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);

		rrint->col = column;
		rrint->raidPtr = raidPtr;

		/* NOTE(review): if thread creation fails, rrint appears
		   to be leaked here -- confirm against RF_CREATE_THREAD
		   semantics. */
		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrint, "raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_GET_INFO32:
#endif
#endif
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
#ifdef COMPAT_NETBSD32
#ifdef _LP64
			if (cmd == RAIDFRAME_GET_INFO32)
				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
			else
#endif
#endif
				ucfgp = *(RF_DeviceConfig_t **)data;
			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK80:
		/* Check if we called compat code for this cmd */
		if (retcode != EPASSTHROUGH)
			return EINVAL;
		/* FALLTHRU */
	case RAIDFRAME_FAIL_DISK:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);

		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);
		rrint->col = rr->col;
		rrint->flags = rr->flags;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrint, "raid_recon");
		/* XXX(review): thread-creation status in retcode is
		   discarded here; REBUILD_IN_PLACE returns it instead --
		   confirm which is intended. */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* NOTE(review): dead code (#if 0), but this looks like a
		   typo for rf_broadcast_cond2 -- fix before re-enabling. */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1784 
   1785 
   1786 /* raidinit -- complete the rest of the initialization for the
   1787    RAIDframe device.  */
   1788 
   1789 
/*
 * Complete device initialization after a successful rf_Configure():
 * attach the pseudo-device, hook up the dk(9)/disk(9) layers, set the
 * geometry, allocate the buffer queue, mark the unit usable and start
 * wedge discovery.  Attachment order below is significant.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: release the cfdata and leave the unit
		   unconfigured (RAIDF_INITED stays clear). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Discover wedges last, once the unit can take I/O. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1845 
   1846 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1847 /* wake up the daemon & tell it to get us a spare table
   1848  * XXX
   1849  * the entries in the queues should be tagged with the raidPtr
   1850  * so that in the extremely rare case that two recons happen at once,
   1851  * we know for which device were requesting a spare table
   1852  * XXX
   1853  *
   1854  * XXX This code is not currently used. GO
   1855  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Put the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response entry. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response entry off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* NOTE(review): the return code travels back in fcol of the
	 * response entry -- confirm against the daemon side. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1879 #endif
   1880 
   1881 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1882  * bp & passes it down.
   1883  * any calls originating in the kernel must use non-blocking I/O
   1884  * do some extra sanity checking to return "appropriate" error values for
   1885  * certain conditions (to make some standard utilities work)
   1886  *
   1887  * Formerly known as: rf_DoAccessKernel
   1888  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Must drop the raid mutex around the label update; it
		 * takes its own locks and may sleep. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Don't feed I/O to a unit that never finished initialization. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk layer; it will call back into raiddoaccess(). */
	dk_start(dksc, NULL);
}
   1915 
/*
 * Translate one struct buf into a RAIDframe access and fire it off
 * asynchronously.  Returns EAGAIN when no openings are available (the
 * dk layer will retry later), ENOSPC for out-of-range or misaligned
 * requests, else the return of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* No openings means the array is saturated; back off. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector in the request. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" makes this condition always true;
	 * db1_printf itself is still gated by the debug level. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Range check; the "sum < x" comparisons catch wraparound. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject requests that aren't a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1988 
   1989 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1990 
   1991 int
   1992 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1993 {
   1994 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   1995 	struct buf *bp;
   1996 
   1997 	req->queue = queue;
   1998 	bp = req->bp;
   1999 
   2000 	switch (req->type) {
   2001 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2002 		/* XXX need to do something extra here.. */
   2003 		/* I'm leaving this in, as I've never actually seen it used,
   2004 		 * and I'd like folks to report it... GO */
   2005 		printf(("WAKEUP CALLED\n"));
   2006 		queue->numOutstanding++;
   2007 
   2008 		bp->b_flags = 0;
   2009 		bp->b_private = req;
   2010 
   2011 		KernelWakeupFunc(bp);
   2012 		break;
   2013 
   2014 	case RF_IO_TYPE_READ:
   2015 	case RF_IO_TYPE_WRITE:
   2016 #if RF_ACC_TRACE > 0
   2017 		if (req->tracerec) {
   2018 			RF_ETIMER_START(req->tracerec->timer);
   2019 		}
   2020 #endif
   2021 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2022 		    op, queue->rf_cinfo->ci_dev,
   2023 		    req->sectorOffset, req->numSector,
   2024 		    req->buf, KernelWakeupFunc, (void *) req,
   2025 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2026 
   2027 		if (rf_debugKernelAccess) {
   2028 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2029 				(long) bp->b_blkno));
   2030 		}
   2031 		queue->numOutstanding++;
   2032 		queue->last_deq_sector = req->sectorOffset;
   2033 		/* acc wouldn't have been let in if there were any pending
   2034 		 * reqs at any other priority */
   2035 		queue->curPriority = req->priority;
   2036 
   2037 		db1_printf(("Going for %c to unit %d col %d\n",
   2038 			    req->type, queue->raidPtr->raidid,
   2039 			    queue->col));
   2040 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2041 			(int) req->sectorOffset, (int) req->numSector,
   2042 			(int) (req->numSector <<
   2043 			    queue->raidPtr->logBytesPerSector),
   2044 			(int) queue->raidPtr->logBytesPerSector));
   2045 
   2046 		/*
   2047 		 * XXX: drop lock here since this can block at
   2048 		 * least with backing SCSI devices.  Retake it
   2049 		 * to minimize fuss with calling interfaces.
   2050 		 */
   2051 
   2052 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2053 		bdev_strategy(bp);
   2054 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2055 		break;
   2056 
   2057 	default:
   2058 		panic("bad req->type in rf_DispatchKernelIO");
   2059 	}
   2060 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2061 
   2062 	return (0);
   2063 }
   2064 /* this is the callback function associated with a I/O invoked from
   2065    kernel code.
   2066  */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO().
 * On error, may mark the component failed; always records the error in
 * the request, queues it on the iodone list, and pokes the raidio
 * thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2133 
   2134 
   2135 /*
   2136  * initialize a buf structure for doing an I/O in the kernel.
   2137  */
   2138 static void
   2139 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2140        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2141        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2142        struct proc *b_proc)
   2143 {
   2144 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2145 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2146 	bp->b_oflags = 0;
   2147 	bp->b_cflags = 0;
   2148 	bp->b_bcount = numSect << logBytesPerSector;
   2149 	bp->b_bufsize = bp->b_bcount;
   2150 	bp->b_error = 0;
   2151 	bp->b_dev = dev;
   2152 	bp->b_data = bf;
   2153 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2154 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2155 	if (bp->b_bcount == 0) {
   2156 		panic("bp->b_bcount is zero in InitBP!!");
   2157 	}
   2158 	bp->b_proc = b_proc;
   2159 	bp->b_iodone = cbFunc;
   2160 	bp->b_private = cbArg;
   2161 }
   2162 
   2163 /*
   2164  * Wait interruptibly for an exclusive lock.
   2165  *
   2166  * XXX
   2167  * Several drivers do this; it should be abstracted and made MP-safe.
   2168  * (Hmm... where have we seen this warning before :->  GO )
   2169  */
   2170 static int
   2171 raidlock(struct raid_softc *rs)
   2172 {
   2173 	int     error;
   2174 
   2175 	error = 0;
   2176 	mutex_enter(&rs->sc_mutex);
   2177 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2178 		rs->sc_flags |= RAIDF_WANTED;
   2179 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2180 		if (error != 0)
   2181 			goto done;
   2182 	}
   2183 	rs->sc_flags |= RAIDF_LOCKED;
   2184 done:
   2185 	mutex_exit(&rs->sc_mutex);
   2186 	return (error);
   2187 }
   2188 /*
   2189  * Unlock and wake up any waiters.
   2190  */
   2191 static void
   2192 raidunlock(struct raid_softc *rs)
   2193 {
   2194 
   2195 	mutex_enter(&rs->sc_mutex);
   2196 	rs->sc_flags &= ~RAIDF_LOCKED;
   2197 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2198 		rs->sc_flags &= ~RAIDF_WANTED;
   2199 		cv_broadcast(&rs->sc_cv);
   2200 	}
   2201 	mutex_exit(&rs->sc_mutex);
   2202 }
   2203 
   2204 
   2205 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2206 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2207 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2208 
/*
 * Byte offset of the component info area on each component
 * (fixed at RF_COMPONENT_INFO_OFFSET).
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2215 
   2216 static daddr_t
   2217 rf_component_info_size(unsigned secsize)
   2218 {
   2219 	daddr_t info_size;
   2220 
   2221 	KASSERT(secsize);
   2222 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2223 		info_size = secsize;
   2224 	else
   2225 		info_size = RF_COMPONENT_INFO_SIZE;
   2226 
   2227 	return info_size;
   2228 }
   2229 
   2230 static daddr_t
   2231 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2232 {
   2233 	daddr_t map_offset;
   2234 
   2235 	KASSERT(raidPtr->bytesPerSector);
   2236 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2237 		map_offset = raidPtr->bytesPerSector;
   2238 	else
   2239 		map_offset = RF_COMPONENT_INFO_SIZE;
   2240 	map_offset += rf_component_info_offset();
   2241 
   2242 	return map_offset;
   2243 }
   2244 
   2245 static daddr_t
   2246 rf_parity_map_size(RF_Raid_t *raidPtr)
   2247 {
   2248 	daddr_t map_size;
   2249 
   2250 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2251 		map_size = raidPtr->bytesPerSector;
   2252 	else
   2253 		map_size = RF_PARITY_MAP_SIZE;
   2254 
   2255 	return map_size;
   2256 }
   2257 
   2258 int
   2259 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2260 {
   2261 	RF_ComponentLabel_t *clabel;
   2262 
   2263 	clabel = raidget_component_label(raidPtr, col);
   2264 	clabel->clean = RF_RAID_CLEAN;
   2265 	raidflush_component_label(raidPtr, col);
   2266 	return(0);
   2267 }
   2268 
   2269 
   2270 int
   2271 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2272 {
   2273 	RF_ComponentLabel_t *clabel;
   2274 
   2275 	clabel = raidget_component_label(raidPtr, col);
   2276 	clabel->clean = RF_RAID_DIRTY;
   2277 	raidflush_component_label(raidPtr, col);
   2278 	return(0);
   2279 }
   2280 
/*
 * Read component `col's label from disk into its in-core copy.
 * Returns the error from raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2290 
/* Return a pointer to component `col's in-core label (no disk I/O). */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2296 
/*
 * Write component `col's in-core label out to disk, first stamping it
 * with the array's current mod counter.  Returns the error from
 * raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's idea of the mod counter in sync */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2311 
   2312 
/*
 * Read a component label from the component-info area of `dev'.
 * Thin wrapper around raidread_component_area() with the standard
 * label offset/size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2322 
   2323 /* ARGSUSED */
   2324 static int
   2325 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2326     size_t msize, daddr_t offset, daddr_t dsize)
   2327 {
   2328 	struct buf *bp;
   2329 	int error;
   2330 
   2331 	/* XXX should probably ensure that we don't try to do this if
   2332 	   someone has changed rf_protected_sectors. */
   2333 
   2334 	if (b_vp == NULL) {
   2335 		/* For whatever reason, this component is not valid.
   2336 		   Don't try to read a component label from it. */
   2337 		return(EINVAL);
   2338 	}
   2339 
   2340 	/* get a block of the appropriate size... */
   2341 	bp = geteblk((int)dsize);
   2342 	bp->b_dev = dev;
   2343 
   2344 	/* get our ducks in a row for the read */
   2345 	bp->b_blkno = offset / DEV_BSIZE;
   2346 	bp->b_bcount = dsize;
   2347 	bp->b_flags |= B_READ;
   2348  	bp->b_resid = dsize;
   2349 
   2350 	bdev_strategy(bp);
   2351 	error = biowait(bp);
   2352 
   2353 	if (!error) {
   2354 		memcpy(data, bp->b_data, msize);
   2355 	}
   2356 
   2357 	brelse(bp, 0);
   2358 	return(error);
   2359 }
   2360 
   2361 
/*
 * Write a component label to the component-info area of `dev'.
 * Thin wrapper around raidwrite_component_area() (synchronous).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2371 
   2372 /* ARGSUSED */
   2373 static int
   2374 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2375     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2376 {
   2377 	struct buf *bp;
   2378 	int error;
   2379 
   2380 	/* get a block of the appropriate size... */
   2381 	bp = geteblk((int)dsize);
   2382 	bp->b_dev = dev;
   2383 
   2384 	/* get our ducks in a row for the write */
   2385 	bp->b_blkno = offset / DEV_BSIZE;
   2386 	bp->b_bcount = dsize;
   2387 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2388  	bp->b_resid = dsize;
   2389 
   2390 	memset(bp->b_data, 0, dsize);
   2391 	memcpy(bp->b_data, data, msize);
   2392 
   2393 	bdev_strategy(bp);
   2394 	if (asyncp)
   2395 		return 0;
   2396 	error = biowait(bp);
   2397 	brelse(bp, 0);
   2398 	if (error) {
   2399 #if 1
   2400 		printf("Failed to write RAID component info!\n");
   2401 #endif
   2402 	}
   2403 
   2404 	return(error);
   2405 }
   2406 
/*
 * Write the in-core parity map `map' to every live component,
 * synchronously.
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2424 
   2425 void
   2426 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2427 {
   2428 	struct rf_paritymap_ondisk tmp;
   2429 	int c,first;
   2430 
   2431 	first=1;
   2432 	for (c = 0; c < raidPtr->numCol; c++) {
   2433 		/* Skip dead disks. */
   2434 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2435 			continue;
   2436 		raidread_component_area(raidPtr->Disks[c].dev,
   2437 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2438 		    RF_PARITYMAP_NBYTE,
   2439 		    rf_parity_map_offset(raidPtr),
   2440 		    rf_parity_map_size(raidPtr));
   2441 		if (first) {
   2442 			memcpy(map, &tmp, sizeof(*map));
   2443 			first = 0;
   2444 		} else {
   2445 			rf_paritymap_merge(map, &tmp);
   2446 		}
   2447 	}
   2448 }
   2449 
/*
 * Mark every live component (and every in-use spare) dirty, bumping
 * the mod counter so the new labels supersede older on-disk copies.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2509 
   2510 
/*
 * Refresh the component labels of all optimal components and in-use
 * spares.  With final == RF_FINAL_COMPONENT_UPDATE and clean parity,
 * also marks each such component clean (e.g. at shutdown/unconfigure).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2588 
   2589 void
   2590 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2591 {
   2592 
   2593 	if (vp != NULL) {
   2594 		if (auto_configured == 1) {
   2595 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2596 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2597 			vput(vp);
   2598 
   2599 		} else {
   2600 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2601 		}
   2602 	}
   2603 }
   2604 
   2605 
   2606 void
   2607 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2608 {
   2609 	int r,c;
   2610 	struct vnode *vp;
   2611 	int acd;
   2612 
   2613 
   2614 	/* We take this opportunity to close the vnodes like we should.. */
   2615 
   2616 	for (c = 0; c < raidPtr->numCol; c++) {
   2617 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2618 		acd = raidPtr->Disks[c].auto_configured;
   2619 		rf_close_component(raidPtr, vp, acd);
   2620 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2621 		raidPtr->Disks[c].auto_configured = 0;
   2622 	}
   2623 
   2624 	for (r = 0; r < raidPtr->numSpare; r++) {
   2625 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2626 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2627 		rf_close_component(raidPtr, vp, acd);
   2628 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2629 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2630 	}
   2631 }
   2632 
   2633 
/*
 * Kernel thread body: fail the requested component and (optionally,
 * per RF_FDFLAGS_RECON) reconstruct it to a spare.  Frees the request
 * and exits; never returns.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2655 
/*
 * Kernel thread body: rewrite all parity on the array.  On success the
 * array is marked parity-clean; wakes anyone waiting in shutdown for
 * the rewrite to finish.  Never returns.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2688 
   2689 
/*
 * Kernel thread body: copy reconstructed data back from spares to
 * replaced components.  Never returns.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2704 
   2705 
/*
 * Kernel thread body: reconstruct the given column in place (onto the
 * same component, e.g. after a disk replacement).  Frees the request
 * and exits; never returns.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2723 
/*
 * Try to read a RAIDframe component label from `vp'/`dev'.  If the
 * label looks reasonable, prepend a new RF_AutoConfig_t to `ac_list'
 * and keep the vnode open (ownership moves to the list); otherwise
 * close the vnode.  Returns the (possibly extended) list, or NULL when
 * out of memory (the whole list is torn down in that case).
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Tear down everything collected so far. */
		    /* NOTE(review): vp is not closed on this path --
		     * confirm whether that leak is intentional. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2781 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * Returns a linked list of RF_AutoConfig_t entries (possibly NULL), one
 * per component found; each entry holds an open, referenced vnode that
 * the caller must eventually release (see rf_release_all_vps()).
 *
 * The scan is made in two passes: wedges (dk(4)) first, then everything
 * else, so that a wedge covering a whole disk is preferred over that
 * disk's raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges have no partitions, so no RAW_PART there */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept only wedges whose
				   partition type says RAIDframe */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of vp */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2985 
   2986 
   2987 int
   2988 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2989 {
   2990 
   2991 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2992 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2993 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2994 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2995 	    clabel->row >=0 &&
   2996 	    clabel->column >= 0 &&
   2997 	    clabel->num_rows > 0 &&
   2998 	    clabel->num_columns > 0 &&
   2999 	    clabel->row < clabel->num_rows &&
   3000 	    clabel->column < clabel->num_columns &&
   3001 	    clabel->blockSize > 0 &&
   3002 	    /*
   3003 	     * numBlocksHi may contain garbage, but it is ok since
   3004 	     * the type is unsigned.  If it is really garbage,
   3005 	     * rf_fix_old_label_size() will fix it.
   3006 	     */
   3007 	    rf_component_label_numblocks(clabel) > 0) {
   3008 		/*
   3009 		 * label looks reasonable enough...
   3010 		 * let's make sure it has no old garbage.
   3011 		 */
   3012 		if (numsecs)
   3013 			rf_fix_old_label_size(clabel, numsecs);
   3014 		return(1);
   3015 	}
   3016 	return(0);
   3017 }
   3018 
   3019 
   3020 /*
   3021  * For reasons yet unknown, some old component labels have garbage in
   3022  * the newer numBlocksHi region, and this causes lossage.  Since those
   3023  * disks will also have numsecs set to less than 32 bits of sectors,
   3024  * we can determine when this corruption has occurred, and fix it.
   3025  *
   3026  * The exact same problem, with the same unknown reason, happens to
   3027  * the partitionSizeHi member as well.
   3028  */
   3029 static void
   3030 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3031 {
   3032 
   3033 	if (numsecs < ((uint64_t)1 << 32)) {
   3034 		if (clabel->numBlocksHi) {
   3035 			printf("WARNING: total sectors < 32 bits, yet "
   3036 			       "numBlocksHi set\n"
   3037 			       "WARNING: resetting numBlocksHi to zero.\n");
   3038 			clabel->numBlocksHi = 0;
   3039 		}
   3040 
   3041 		if (clabel->partitionSizeHi) {
   3042 			printf("WARNING: total sectors < 32 bits, yet "
   3043 			       "partitionSizeHi set\n"
   3044 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3045 			clabel->partitionSizeHi = 0;
   3046 		}
   3047 	}
   3048 }
   3049 
   3050 
#ifdef DEBUG
/*
 * Dump a component label to the console in human-readable form.
 * Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked to two bits below, so four entries. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3084 
   3085 RF_ConfigSet_t *
   3086 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3087 {
   3088 	RF_AutoConfig_t *ac;
   3089 	RF_ConfigSet_t *config_sets;
   3090 	RF_ConfigSet_t *cset;
   3091 	RF_AutoConfig_t *ac_next;
   3092 
   3093 
   3094 	config_sets = NULL;
   3095 
   3096 	/* Go through the AutoConfig list, and figure out which components
   3097 	   belong to what sets.  */
   3098 	ac = ac_list;
   3099 	while(ac!=NULL) {
   3100 		/* we're going to putz with ac->next, so save it here
   3101 		   for use at the end of the loop */
   3102 		ac_next = ac->next;
   3103 
   3104 		if (config_sets == NULL) {
   3105 			/* will need at least this one... */
   3106 			config_sets = (RF_ConfigSet_t *)
   3107 				malloc(sizeof(RF_ConfigSet_t),
   3108 				       M_RAIDFRAME, M_NOWAIT);
   3109 			if (config_sets == NULL) {
   3110 				panic("rf_create_auto_sets: No memory!");
   3111 			}
   3112 			/* this one is easy :) */
   3113 			config_sets->ac = ac;
   3114 			config_sets->next = NULL;
   3115 			config_sets->rootable = 0;
   3116 			ac->next = NULL;
   3117 		} else {
   3118 			/* which set does this component fit into? */
   3119 			cset = config_sets;
   3120 			while(cset!=NULL) {
   3121 				if (rf_does_it_fit(cset, ac)) {
   3122 					/* looks like it matches... */
   3123 					ac->next = cset->ac;
   3124 					cset->ac = ac;
   3125 					break;
   3126 				}
   3127 				cset = cset->next;
   3128 			}
   3129 			if (cset==NULL) {
   3130 				/* didn't find a match above... new set..*/
   3131 				cset = (RF_ConfigSet_t *)
   3132 					malloc(sizeof(RF_ConfigSet_t),
   3133 					       M_RAIDFRAME, M_NOWAIT);
   3134 				if (cset == NULL) {
   3135 					panic("rf_create_auto_sets: No memory!");
   3136 				}
   3137 				cset->ac = ac;
   3138 				ac->next = NULL;
   3139 				cset->next = config_sets;
   3140 				cset->rootable = 0;
   3141 				config_sets = cset;
   3142 			}
   3143 		}
   3144 		ac = ac_next;
   3145 	}
   3146 
   3147 
   3148 	return(config_sets);
   3149 }
   3150 
   3151 static int
   3152 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3153 {
   3154 	RF_ComponentLabel_t *clabel1, *clabel2;
   3155 
   3156 	/* If this one matches the *first* one in the set, that's good
   3157 	   enough, since the other members of the set would have been
   3158 	   through here too... */
   3159 	/* note that we are not checking partitionSize here..
   3160 
   3161 	   Note that we are also not checking the mod_counters here.
   3162 	   If everything else matches except the mod_counter, that's
   3163 	   good enough for this test.  We will deal with the mod_counters
   3164 	   a little later in the autoconfiguration process.
   3165 
   3166 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3167 
   3168 	   The reason we don't check for this is that failed disks
   3169 	   will have lower modification counts.  If those disks are
   3170 	   not added to the set they used to belong to, then they will
   3171 	   form their own set, which may result in 2 different sets,
   3172 	   for example, competing to be configured at raid0, and
   3173 	   perhaps competing to be the root filesystem set.  If the
   3174 	   wrong ones get configured, or both attempt to become /,
   3175 	   weird behaviour and or serious lossage will occur.  Thus we
   3176 	   need to bring them into the fold here, and kick them out at
   3177 	   a later point.
   3178 
   3179 	*/
   3180 
   3181 	clabel1 = cset->ac->clabel;
   3182 	clabel2 = ac->clabel;
   3183 	if ((clabel1->version == clabel2->version) &&
   3184 	    (clabel1->serial_number == clabel2->serial_number) &&
   3185 	    (clabel1->num_rows == clabel2->num_rows) &&
   3186 	    (clabel1->num_columns == clabel2->num_columns) &&
   3187 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3188 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3189 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3190 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3191 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3192 	    (clabel1->blockSize == clabel2->blockSize) &&
   3193 	    rf_component_label_numblocks(clabel1) ==
   3194 	    rf_component_label_numblocks(clabel2) &&
   3195 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3196 	    (clabel1->root_partition == clabel2->root_partition) &&
   3197 	    (clabel1->last_unit == clabel2->last_unit) &&
   3198 	    (clabel1->config_order == clabel2->config_order)) {
   3199 		/* if it get's here, it almost *has* to be a match */
   3200 	} else {
   3201 		/* it's not consistent with somebody in the set..
   3202 		   punt */
   3203 		return(0);
   3204 	}
   3205 	/* all was fine.. it must fit... */
   3206 	return(1);
   3207 }
   3208 
/*
 * Decide whether the configuration set has enough live components to
 * be brought up.  Returns 1 if configuration may proceed, 0 if too
 * many components are missing or stale.
 *
 * A component counts as "present" only if its mod_counter equals the
 * highest mod_counter seen in the set (stale components were admitted
 * by rf_does_it_fit() and are filtered out here).
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the authoritative mod_counter is the maximum over all members */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for an up-to-date component filling it */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3311 
   3312 void
   3313 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3314 			RF_Raid_t *raidPtr)
   3315 {
   3316 	RF_ComponentLabel_t *clabel;
   3317 	int i;
   3318 
   3319 	clabel = ac->clabel;
   3320 
   3321 	/* 1. Fill in the common stuff */
   3322 	config->numCol = clabel->num_columns;
   3323 	config->numSpare = 0; /* XXX should this be set here? */
   3324 	config->sectPerSU = clabel->sectPerSU;
   3325 	config->SUsPerPU = clabel->SUsPerPU;
   3326 	config->SUsPerRU = clabel->SUsPerRU;
   3327 	config->parityConfig = clabel->parityConfig;
   3328 	/* XXX... */
   3329 	strcpy(config->diskQueueType,"fifo");
   3330 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3331 	config->layoutSpecificSize = 0; /* XXX ?? */
   3332 
   3333 	while(ac!=NULL) {
   3334 		/* row/col values will be in range due to the checks
   3335 		   in reasonable_label() */
   3336 		strcpy(config->devnames[0][ac->clabel->column],
   3337 		       ac->devname);
   3338 		ac = ac->next;
   3339 	}
   3340 
   3341 	for(i=0;i<RF_MAXDBGV;i++) {
   3342 		config->debugVars[i][0] = 0;
   3343 	}
   3344 }
   3345 
   3346 int
   3347 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3348 {
   3349 	RF_ComponentLabel_t *clabel;
   3350 	int column;
   3351 	int sparecol;
   3352 
   3353 	raidPtr->autoconfigure = new_value;
   3354 
   3355 	for(column=0; column<raidPtr->numCol; column++) {
   3356 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3357 			clabel = raidget_component_label(raidPtr, column);
   3358 			clabel->autoconfigure = new_value;
   3359 			raidflush_component_label(raidPtr, column);
   3360 		}
   3361 	}
   3362 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3363 		sparecol = raidPtr->numCol + column;
   3364 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3365 			clabel = raidget_component_label(raidPtr, sparecol);
   3366 			clabel->autoconfigure = new_value;
   3367 			raidflush_component_label(raidPtr, sparecol);
   3368 		}
   3369 	}
   3370 	return(new_value);
   3371 }
   3372 
   3373 int
   3374 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3375 {
   3376 	RF_ComponentLabel_t *clabel;
   3377 	int column;
   3378 	int sparecol;
   3379 
   3380 	raidPtr->root_partition = new_value;
   3381 	for(column=0; column<raidPtr->numCol; column++) {
   3382 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3383 			clabel = raidget_component_label(raidPtr, column);
   3384 			clabel->root_partition = new_value;
   3385 			raidflush_component_label(raidPtr, column);
   3386 		}
   3387 	}
   3388 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3389 		sparecol = raidPtr->numCol + column;
   3390 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3391 			clabel = raidget_component_label(raidPtr, sparecol);
   3392 			clabel->root_partition = new_value;
   3393 			raidflush_component_label(raidPtr, sparecol);
   3394 		}
   3395 	}
   3396 	return(new_value);
   3397 }
   3398 
   3399 void
   3400 rf_release_all_vps(RF_ConfigSet_t *cset)
   3401 {
   3402 	RF_AutoConfig_t *ac;
   3403 
   3404 	ac = cset->ac;
   3405 	while(ac!=NULL) {
   3406 		/* Close the vp, and give it back */
   3407 		if (ac->vp) {
   3408 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3409 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3410 			vput(ac->vp);
   3411 			ac->vp = NULL;
   3412 		}
   3413 		ac = ac->next;
   3414 	}
   3415 }
   3416 
   3417 
   3418 void
   3419 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3420 {
   3421 	RF_AutoConfig_t *ac;
   3422 	RF_AutoConfig_t *next_ac;
   3423 
   3424 	ac = cset->ac;
   3425 	while(ac!=NULL) {
   3426 		next_ac = ac->next;
   3427 		/* nuke the label */
   3428 		free(ac->clabel, M_RAIDFRAME);
   3429 		/* cleanup the config structure */
   3430 		free(ac, M_RAIDFRAME);
   3431 		/* "next.." */
   3432 		ac = next_ac;
   3433 	}
   3434 	/* and, finally, nuke the config set */
   3435 	free(cset, M_RAIDFRAME);
   3436 }
   3437 
   3438 
/*
 * Populate *clabel from the current state of the RAID set: version,
 * serial/mod counters, geometry, layout parameters and autoconfig
 * settings.  The label is marked RF_RAID_DIRTY here; it is flushed to
 * the components elsewhere.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are a legacy concept; modern sets always have one */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3471 
/*
 * Configure one autodetected RAID set.  Builds an RF_Config_t from the
 * component labels, finds a free raid unit (preferring the unit the set
 * was last configured on), and runs rf_Configure().  Returns the
 * attached softc on success, NULL on failure (the softc is released on
 * the failure path).
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until we find an unconfigured unit */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at that unit -- create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3555 
   3556 void
   3557 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3558 	     size_t xmin, size_t xmax)
   3559 {
   3560 	int error;
   3561 
   3562 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3563 	pool_sethiwat(p, xmax);
   3564 	if ((error = pool_prime(p, xmin)) != 0)
   3565 		panic("%s: failed to prime pool: %d", __func__, error);
   3566 	pool_setlowat(p, xmin);
   3567 }
   3568 
   3569 /*
   3570  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3571  * to see if there is IO pending and if that IO could possibly be done
   3572  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3573  * otherwise.
   3574  *
   3575  */
   3576 int
   3577 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3578 {
   3579 	struct raid_softc *rs;
   3580 	struct dk_softc *dksc;
   3581 
   3582 	rs = raidPtr->softc;
   3583 	dksc = &rs->sc_dksc;
   3584 
   3585 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3586 		return 1;
   3587 
   3588 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3589 		/* there is work to do */
   3590 		return 0;
   3591 	}
   3592 	/* default is nothing to do */
   3593 	return 1;
   3594 }
   3595 
   3596 int
   3597 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3598 {
   3599 	uint64_t numsecs;
   3600 	unsigned secsize;
   3601 	int error;
   3602 
   3603 	error = getdisksize(vp, &numsecs, &secsize);
   3604 	if (error == 0) {
   3605 		diskPtr->blockSize = secsize;
   3606 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3607 		diskPtr->partitionSize = numsecs;
   3608 		return 0;
   3609 	}
   3610 	return error;
   3611 }
   3612 
/*
 * Autoconf match function: the raid pseudo-device always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3618 
/*
 * Autoconf attach function: intentionally empty -- the softc is set up
 * later, when a RAID set is actually configured on this unit.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3623 
   3624 
   3625 static int
   3626 raid_detach(device_t self, int flags)
   3627 {
   3628 	int error;
   3629 	struct raid_softc *rs = raidsoftc(self);
   3630 
   3631 	if (rs == NULL)
   3632 		return ENXIO;
   3633 
   3634 	if ((error = raidlock(rs)) != 0)
   3635 		return (error);
   3636 
   3637 	error = raid_detach_unlocked(rs);
   3638 
   3639 	raidunlock(rs);
   3640 
   3641 	/* XXX raid can be referenced here */
   3642 
   3643 	if (error)
   3644 		return error;
   3645 
   3646 	/* Free the softc */
   3647 	raidput(rs);
   3648 
   3649 	return 0;
   3650 }
   3651 
/*
 * Publish a synthetic disk geometry for the RAID set to the disk(9)
 * layer, derived from the set's total size and stripe layout.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* track count is fabricated (4 per column); geometry is fictional */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3667 
   3668 /*
   3669  * Get cache info for all the components (including spares).
   3670  * Returns intersection of all the cache flags of all disks, or first
   3671  * error if any encountered.
   3672  * XXXfua feature flags can change as spares are added - lock down somehow
   3673  */
   3674 static int
   3675 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3676 {
   3677 	int c;
   3678 	int error;
   3679 	int dkwhole = 0, dkpart;
   3680 
   3681 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3682 		/*
   3683 		 * Check any non-dead disk, even when currently being
   3684 		 * reconstructed.
   3685 		 */
   3686 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3687 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3688 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3689 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3690 			if (error) {
   3691 				if (error != ENODEV) {
   3692 					printf("raid%d: get cache for component %s failed\n",
   3693 					    raidPtr->raidid,
   3694 					    raidPtr->Disks[c].devname);
   3695 				}
   3696 
   3697 				return error;
   3698 			}
   3699 
   3700 			if (c == 0)
   3701 				dkwhole = dkpart;
   3702 			else
   3703 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3704 		}
   3705 	}
   3706 
   3707 	*data = dkwhole;
   3708 
   3709 	return 0;
   3710 }
   3711 
   3712 /*
   3713  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3714  * We end up returning whatever error was returned by the first cache flush
   3715  * that fails.
   3716  */
   3717 
   3718 int
   3719 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3720 {
   3721 	int c, sparecol;
   3722 	int e,error;
   3723 	int force = 1;
   3724 
   3725 	error = 0;
   3726 	for (c = 0; c < raidPtr->numCol; c++) {
   3727 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3728 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3729 					  &force, FWRITE, NOCRED);
   3730 			if (e) {
   3731 				if (e != ENODEV)
   3732 					printf("raid%d: cache flush to component %s failed.\n",
   3733 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3734 				if (error == 0) {
   3735 					error = e;
   3736 				}
   3737 			}
   3738 		}
   3739 	}
   3740 
   3741 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3742 		sparecol = raidPtr->numCol + c;
   3743 		/* Need to ensure that the reconstruct actually completed! */
   3744 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3745 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3746 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3747 			if (e) {
   3748 				if (e != ENODEV)
   3749 					printf("raid%d: cache flush to component %s failed.\n",
   3750 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3751 				if (error == 0) {
   3752 					error = e;
   3753 				}
   3754 			}
   3755 		}
   3756 	}
   3757 	return error;
   3758 }
   3759 
   3760 /* Fill in info with the current status */
   3761 void
   3762 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3763 {
   3764 
   3765 	if (raidPtr->status != rf_rs_reconstructing) {
   3766 		info->total = 100;
   3767 		info->completed = 100;
   3768 	} else {
   3769 		info->total = raidPtr->reconControl->numRUsTotal;
   3770 		info->completed = raidPtr->reconControl->numRUsComplete;
   3771 	}
   3772 	info->remaining = info->total - info->completed;
   3773 }
   3774 
   3775 /* Fill in info with the current status */
   3776 void
   3777 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3778 {
   3779 
   3780 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3781 		info->total = raidPtr->Layout.numStripe;
   3782 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3783 	} else {
   3784 		info->completed = 100;
   3785 		info->total = 100;
   3786 	}
   3787 	info->remaining = info->total - info->completed;
   3788 }
   3789 
   3790 /* Fill in info with the current status */
   3791 void
   3792 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3793 {
   3794 
   3795 	if (raidPtr->copyback_in_progress == 1) {
   3796 		info->total = raidPtr->Layout.numStripe;
   3797 		info->completed = raidPtr->copyback_stripes_done;
   3798 		info->remaining = info->total - info->completed;
   3799 	} else {
   3800 		info->remaining = 0;
   3801 		info->completed = 100;
   3802 		info->total = 100;
   3803 	}
   3804 }
   3805 
   3806 /* Fill in config with the current info */
   3807 int
   3808 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3809 {
   3810 	int	d, i, j;
   3811 
   3812 	if (!raidPtr->valid)
   3813 		return (ENODEV);
   3814 	config->cols = raidPtr->numCol;
   3815 	config->ndevs = raidPtr->numCol;
   3816 	if (config->ndevs >= RF_MAX_DISKS)
   3817 		return (ENOMEM);
   3818 	config->nspares = raidPtr->numSpare;
   3819 	if (config->nspares >= RF_MAX_DISKS)
   3820 		return (ENOMEM);
   3821 	config->maxqdepth = raidPtr->maxQueueDepth;
   3822 	d = 0;
   3823 	for (j = 0; j < config->cols; j++) {
   3824 		config->devs[d] = raidPtr->Disks[j];
   3825 		d++;
   3826 	}
   3827 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3828 		config->spares[i] = raidPtr->Disks[j];
   3829 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3830 			/* XXX: raidctl(8) expects to see this as a used spare */
   3831 			config->spares[i].status = rf_ds_used_spare;
   3832 		}
   3833 	}
   3834 	return 0;
   3835 }
   3836 
   3837 int
   3838 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3839 {
   3840 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3841 	RF_ComponentLabel_t *raid_clabel;
   3842 	int column = clabel->column;
   3843 
   3844 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3845 		return EINVAL;
   3846 	raid_clabel = raidget_component_label(raidPtr, column);
   3847 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3848 
   3849 	return 0;
   3850 }
   3851 
   3852 /*
   3853  * Module interface
   3854  */
   3855 
   3856 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3857 
   3858 #ifdef _MODULE
   3859 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3860 #endif
   3861 
   3862 static int raid_modcmd(modcmd_t, void *);
   3863 static int raid_modcmd_init(void);
   3864 static int raid_modcmd_fini(void);
   3865 
   3866 static int
   3867 raid_modcmd(modcmd_t cmd, void *data)
   3868 {
   3869 	int error;
   3870 
   3871 	error = 0;
   3872 	switch (cmd) {
   3873 	case MODULE_CMD_INIT:
   3874 		error = raid_modcmd_init();
   3875 		break;
   3876 	case MODULE_CMD_FINI:
   3877 		error = raid_modcmd_fini();
   3878 		break;
   3879 	default:
   3880 		error = ENOTTY;
   3881 		break;
   3882 	}
   3883 	return error;
   3884 }
   3885 
/*
 * Module initialization: attach the block/character devsw entries, the
 * autoconf cfdriver (module builds only) and cfattach, boot the RAIDframe
 * core, and register a config finalizer that auto-configures RAID sets.
 * Each attach step that fails unwinds the earlier steps in reverse order
 * before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 requests dynamic major number allocation. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (e.g. built-in); OK. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Undo the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Undo the earlier attaches in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 here (all failures returned
	 * above), so this condition is effectively always true. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Non-fatal: the module still loads without autoconfig. */
		error = 0;
	}

	return error;
}
   3956 
/*
 * Module teardown: refuse to unload while any raid device exists, then
 * detach the cfattach, cfdriver (module builds only) and devsw entries —
 * the reverse of raid_modcmd_init() — and shut down the RAIDframe core.
 * If a later detach step fails, the earlier detaches are re-attached so
 * the module is left in a consistent, still-loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back the earlier detaches in reverse order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All detaches succeeded: shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4006