Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.356.2.2
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.356.2.2 2018/09/09 22:12:16 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.356.2.2 2018/09/09 22:12:16 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 #include <sys/compat_stub.h>
    132 
    133 #include <prop/proplib.h>
    134 
    135 #include <dev/raidframe/raidframevar.h>
    136 #include <dev/raidframe/raidframeio.h>
    137 #include <dev/raidframe/rf_paritymap.h>
    138 
    139 #include "rf_raid.h"
    140 #include "rf_copyback.h"
    141 #include "rf_dag.h"
    142 #include "rf_dagflags.h"
    143 #include "rf_desc.h"
    144 #include "rf_diskqueue.h"
    145 #include "rf_etimer.h"
    146 #include "rf_general.h"
    147 #include "rf_kintf.h"
    148 #include "rf_options.h"
    149 #include "rf_driver.h"
    150 #include "rf_parityscan.h"
    151 #include "rf_threadstuff.h"
    152 
    153 #include "rf_compat50.h"
    154 
    155 #include "rf_compat80.h"
    156 
    157 #ifdef COMPAT_NETBSD32
    158 #include "rf_compat32.h"
    159 #endif
    160 
    161 #include "ioconf.h"
    162 
#ifdef DEBUG
/* Runtime-tunable verbosity knob; db1_printf() only fires when > 0. */
int     rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

/* Root-device discovery tracing, compiled in only with DEBUG_ROOT. */
#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* Synchronization for handing spare-table install requests to the
 * userland installation process and collecting its responses. */
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif
    186 
MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
struct raid_softc;
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

/* autoconf(9) glue */
static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

/* raw component-area (label/parity-map) I/O helpers */
static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

/* dk(9) driver hooks */
static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

/* character/block device entry points */
static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);
    225 
/* Block-device switch entries for raid(4). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character-device (raw) switch entries for raid(4). */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Hooks handed to the generic dk(9) disk framework. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    261 
/*
 * Per-unit software state: wraps the generic disk softc plus the
 * RAIDframe controller state for one raid(4) unit.
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk state (must be first) */
	int	sc_unit;	/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe engine state */
	LIST_ENTRY(raid_softc) sc_link;	/* on the global `raids' list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    284 
extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;	/* component column to act on */
	RF_ReconReqFlags_t flags;
	void   *raidPtr;	/* RF_Raid_t of the set being rebuilt */
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even it if is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

/* dev_t of the raw partition of a given raid unit (for disklabels). */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    320 
/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

/* kernel-thread entry points for long-running maintenance operations */
void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

/* autoconfiguration helpers */
RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;	/* autoconfig runs only once */

struct RF_Pools_s rf_pools;

/* All configured raid units, protected by raid_lock. */
static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;
    367 
    368 static struct raid_softc *
    369 raidcreate(int unit) {
    370 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    371 	sc->sc_unit = unit;
    372 	cv_init(&sc->sc_cv, "raidunit");
    373 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    374 	return sc;
    375 }
    376 
    377 static void
    378 raiddestroy(struct raid_softc *sc) {
    379 	cv_destroy(&sc->sc_cv);
    380 	mutex_destroy(&sc->sc_mutex);
    381 	kmem_free(sc, sizeof(*sc));
    382 }
    383 
    384 static struct raid_softc *
    385 raidget(int unit, bool create) {
    386 	struct raid_softc *sc;
    387 	if (unit < 0) {
    388 #ifdef DIAGNOSTIC
    389 		panic("%s: unit %d!", __func__, unit);
    390 #endif
    391 		return NULL;
    392 	}
    393 	mutex_enter(&raid_lock);
    394 	LIST_FOREACH(sc, &raids, sc_link) {
    395 		if (sc->sc_unit == unit) {
    396 			mutex_exit(&raid_lock);
    397 			return sc;
    398 		}
    399 	}
    400 	mutex_exit(&raid_lock);
    401 	if (!create)
    402 		return NULL;
    403 	if ((sc = raidcreate(unit)) == NULL)
    404 		return NULL;
    405 	mutex_enter(&raid_lock);
    406 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    407 	mutex_exit(&raid_lock);
    408 	return sc;
    409 }
    410 
/*
 * Unlink a softc from the global raids list and free it.  The caller
 * must guarantee no other references to the softc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    418 
/*
 * Historic pseudo-device attach entry point; intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    428 
    429 int
    430 rf_autoconfig(device_t self)
    431 {
    432 	RF_AutoConfig_t *ac_list;
    433 	RF_ConfigSet_t *config_sets;
    434 
    435 	if (!raidautoconfig || raidautoconfigdone == true)
    436 		return (0);
    437 
    438 	/* XXX This code can only be run once. */
    439 	raidautoconfigdone = true;
    440 
    441 #ifdef __HAVE_CPU_BOOTCONF
    442 	/*
    443 	 * 0. find the boot device if needed first so we can use it later
    444 	 * this needs to be done before we autoconfigure any raid sets,
    445 	 * because if we use wedges we are not going to be able to open
    446 	 * the boot device later
    447 	 */
    448 	if (booted_device == NULL)
    449 		cpu_bootconf();
    450 #endif
    451 	/* 1. locate all RAID components on the system */
    452 	aprint_debug("Searching for RAID components...\n");
    453 	ac_list = rf_find_raid_components();
    454 
    455 	/* 2. Sort them into their respective sets. */
    456 	config_sets = rf_create_auto_sets(ac_list);
    457 
    458 	/*
    459 	 * 3. Evaluate each set and configure the valid ones.
    460 	 * This gets done in rf_buildroothack().
    461 	 */
    462 	rf_buildroothack(config_sets);
    463 
    464 	return 1;
    465 }
    466 
    467 static int
    468 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    469 	const char *bootname = device_xname(bdv);
    470 	size_t len = strlen(bootname);
    471 
    472 	for (int col = 0; col < r->numCol; col++) {
    473 		const char *devname = r->Disks[col].devname;
    474 		devname += sizeof("/dev/") - 1;
    475 		if (strncmp(devname, "dk", 2) == 0) {
    476 			const char *parent =
    477 			    dkwedge_get_parent_name(r->Disks[col].dev);
    478 			if (parent != NULL)
    479 				devname = parent;
    480 		}
    481 		if (strncmp(devname, bootname, len) == 0) {
    482 			struct raid_softc *sc = r->softc;
    483 			aprint_debug("raid%d includes boot device %s\n",
    484 			    sc->sc_unit, devname);
    485 			return 1;
    486 		}
    487 	}
    488 	return 0;
    489 }
    490 
    491 void
    492 rf_buildroothack(RF_ConfigSet_t *config_sets)
    493 {
    494 	RF_ConfigSet_t *cset;
    495 	RF_ConfigSet_t *next_cset;
    496 	int num_root;
    497 	struct raid_softc *sc, *rsc;
    498 	struct dk_softc *dksc;
    499 
    500 	sc = rsc = NULL;
    501 	num_root = 0;
    502 	cset = config_sets;
    503 	while (cset != NULL) {
    504 		next_cset = cset->next;
    505 		if (rf_have_enough_components(cset) &&
    506 		    cset->ac->clabel->autoconfigure == 1) {
    507 			sc = rf_auto_config_set(cset);
    508 			if (sc != NULL) {
    509 				aprint_debug("raid%d: configured ok\n",
    510 				    sc->sc_unit);
    511 				if (cset->rootable) {
    512 					rsc = sc;
    513 					num_root++;
    514 				}
    515 			} else {
    516 				/* The autoconfig didn't work :( */
    517 				aprint_debug("Autoconfig failed\n");
    518 				rf_release_all_vps(cset);
    519 			}
    520 		} else {
    521 			/* we're not autoconfiguring this set...
    522 			   release the associated resources */
    523 			rf_release_all_vps(cset);
    524 		}
    525 		/* cleanup */
    526 		rf_cleanup_config_set(cset);
    527 		cset = next_cset;
    528 	}
    529 	dksc = &rsc->sc_dksc;
    530 
    531 	/* if the user has specified what the root device should be
    532 	   then we don't touch booted_device or boothowto... */
    533 
    534 	if (rootspec != NULL)
    535 		return;
    536 
    537 	/* we found something bootable... */
    538 
    539 	/*
    540 	 * XXX: The following code assumes that the root raid
    541 	 * is the first ('a') partition. This is about the best
    542 	 * we can do with a BSD disklabel, but we might be able
    543 	 * to do better with a GPT label, by setting a specified
    544 	 * attribute to indicate the root partition. We can then
    545 	 * stash the partition number in the r->root_partition
    546 	 * high bits (the bottom 2 bits are already used). For
    547 	 * now we just set booted_partition to 0 when we override
    548 	 * root.
    549 	 */
    550 	if (num_root == 1) {
    551 		device_t candidate_root;
    552 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    553 			char cname[sizeof(cset->ac->devname)];
    554 			/* XXX: assume partition 'a' first */
    555 			snprintf(cname, sizeof(cname), "%s%c",
    556 			    device_xname(dksc->sc_dev), 'a');
    557 			candidate_root = dkwedge_find_by_wname(cname);
    558 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    559 			    cname);
    560 			if (candidate_root == NULL) {
    561 				/*
    562 				 * If that is not found, because we don't use
    563 				 * disklabel, return the first dk child
    564 				 * XXX: we can skip the 'a' check above
    565 				 * and always do this...
    566 				 */
    567 				size_t i = 0;
    568 				candidate_root = dkwedge_find_by_parent(
    569 				    device_xname(dksc->sc_dev), &i);
    570 			}
    571 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    572 			    candidate_root);
    573 		} else
    574 			candidate_root = dksc->sc_dev;
    575 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    576 		DPRINTF("%s: booted_device=%p root_partition=%d "
    577 		   "contains_boot=%d\n", __func__, booted_device,
    578 		   rsc->sc_r.root_partition,
    579 		   rf_containsboot(&rsc->sc_r, booted_device));
    580 		if (booted_device == NULL ||
    581 		    rsc->sc_r.root_partition == 1 ||
    582 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    583 			booted_device = candidate_root;
    584 			booted_method = "raidframe/single";
    585 			booted_partition = 0;	/* XXX assume 'a' */
    586 		}
    587 	} else if (num_root > 1) {
    588 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    589 		    booted_device);
    590 
    591 		/*
    592 		 * Maybe the MD code can help. If it cannot, then
    593 		 * setroot() will discover that we have no
    594 		 * booted_device and will ask the user if nothing was
    595 		 * hardwired in the kernel config file
    596 		 */
    597 		if (booted_device == NULL)
    598 			return;
    599 
    600 		num_root = 0;
    601 		mutex_enter(&raid_lock);
    602 		LIST_FOREACH(sc, &raids, sc_link) {
    603 			RF_Raid_t *r = &sc->sc_r;
    604 			if (r->valid == 0)
    605 				continue;
    606 
    607 			if (r->root_partition == 0)
    608 				continue;
    609 
    610 			if (rf_containsboot(r, booted_device)) {
    611 				num_root++;
    612 				rsc = sc;
    613 				dksc = &rsc->sc_dksc;
    614 			}
    615 		}
    616 		mutex_exit(&raid_lock);
    617 
    618 		if (num_root == 1) {
    619 			booted_device = dksc->sc_dev;
    620 			booted_method = "raidframe/multi";
    621 			booted_partition = 0;	/* XXX assume 'a' */
    622 		} else {
    623 			/* we can't guess.. require the user to answer... */
    624 			boothowto |= RB_ASKNAME;
    625 		}
    626 	}
    627 }
    628 
    629 static int
    630 raidsize(dev_t dev)
    631 {
    632 	struct raid_softc *rs;
    633 	struct dk_softc *dksc;
    634 	unsigned int unit;
    635 
    636 	unit = raidunit(dev);
    637 	if ((rs = raidget(unit, false)) == NULL)
    638 		return -1;
    639 	dksc = &rs->sc_dksc;
    640 
    641 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    642 		return -1;
    643 
    644 	return dk_size(dksc, dev);
    645 }
    646 
    647 static int
    648 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    649 {
    650 	unsigned int unit;
    651 	struct raid_softc *rs;
    652 	struct dk_softc *dksc;
    653 
    654 	unit = raidunit(dev);
    655 	if ((rs = raidget(unit, false)) == NULL)
    656 		return ENXIO;
    657 	dksc = &rs->sc_dksc;
    658 
    659 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    660 		return ENODEV;
    661 
    662         /*
    663            Note that blkno is relative to this particular partition.
    664            By adding adding RF_PROTECTED_SECTORS, we get a value that
    665 	   is relative to the partition used for the underlying component.
    666         */
    667 	blkno += RF_PROTECTED_SECTORS;
    668 
    669 	return dk_dump(dksc, dev, blkno, va, size);
    670 }
    671 
/*
 * dk(9) dumpblocks hook: write `nblk' blocks of a crash dump starting
 * at `blkno' directly to a live component, bypassing the RAIDframe
 * engine.  Only RAID 1 sets (1 data + 1 parity column) are supported,
 * since each component holds a complete copy of the data.
 * Returns 0 on success or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: prefer the lowest-numbered optimal component. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare stands in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Dump through the chosen component's block device directly. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    777 
    778 /* ARGSUSED */
    779 static int
    780 raidopen(dev_t dev, int flags, int fmt,
    781     struct lwp *l)
    782 {
    783 	int     unit = raidunit(dev);
    784 	struct raid_softc *rs;
    785 	struct dk_softc *dksc;
    786 	int     error = 0;
    787 	int     part, pmask;
    788 
    789 	if ((rs = raidget(unit, true)) == NULL)
    790 		return ENXIO;
    791 	if ((error = raidlock(rs)) != 0)
    792 		return (error);
    793 
    794 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    795 		error = EBUSY;
    796 		goto bad;
    797 	}
    798 
    799 	dksc = &rs->sc_dksc;
    800 
    801 	part = DISKPART(dev);
    802 	pmask = (1 << part);
    803 
    804 	if (!DK_BUSY(dksc, pmask) &&
    805 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    806 		/* First one... mark things as dirty... Note that we *MUST*
    807 		 have done a configure before this.  I DO NOT WANT TO BE
    808 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    809 		 THAT THEY BELONG TOGETHER!!!!! */
    810 		/* XXX should check to see if we're only open for reading
    811 		   here... If so, we needn't do this, but then need some
    812 		   other way of keeping track of what's happened.. */
    813 
    814 		rf_markalldirty(&rs->sc_r);
    815 	}
    816 
    817 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    818 		error = dk_open(dksc, dev, flags, fmt, l);
    819 
    820 bad:
    821 	raidunlock(rs);
    822 
    823 	return (error);
    824 
    825 
    826 }
    827 
    828 static int
    829 raid_lastclose(device_t self)
    830 {
    831 	struct raid_softc *rs = raidsoftc(self);
    832 
    833 	/* Last one... device is not unconfigured yet.
    834 	   Device shutdown has taken care of setting the
    835 	   clean bits if RAIDF_INITED is not set
    836 	   mark things as clean... */
    837 
    838 	rf_update_component_labels(&rs->sc_r,
    839 	    RF_FINAL_COMPONENT_UPDATE);
    840 
    841 	/* pass to unlocked code */
    842 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    843 		rs->sc_flags |= RAIDF_DETACH;
    844 
    845 	return 0;
    846 }
    847 
    848 /* ARGSUSED */
    849 static int
    850 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    851 {
    852 	int     unit = raidunit(dev);
    853 	struct raid_softc *rs;
    854 	struct dk_softc *dksc;
    855 	cfdata_t cf;
    856 	int     error = 0, do_detach = 0, do_put = 0;
    857 
    858 	if ((rs = raidget(unit, false)) == NULL)
    859 		return ENXIO;
    860 	dksc = &rs->sc_dksc;
    861 
    862 	if ((error = raidlock(rs)) != 0)
    863 		return (error);
    864 
    865 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    866 		error = dk_close(dksc, dev, flags, fmt, l);
    867 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    868 			do_detach = 1;
    869 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    870 		do_put = 1;
    871 
    872 	raidunlock(rs);
    873 
    874 	if (do_detach) {
    875 		/* free the pseudo device attach bits */
    876 		cf = device_cfdata(dksc->sc_dev);
    877 		error = config_detach(dksc->sc_dev, 0);
    878 		if (error == 0)
    879 			free(cf, M_RAIDFRAME);
    880 	} else if (do_put) {
    881 		raidput(rs);
    882 	}
    883 
    884 	return (error);
    885 
    886 }
    887 
/*
 * Wake anyone sleeping on the RAID set's iodone condition variable.
 * The signal is raised while holding iodone_lock so a waiter cannot
 * miss it between testing its predicate and sleeping.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    895 
    896 static void
    897 raidstrategy(struct buf *bp)
    898 {
    899 	unsigned int unit;
    900 	struct raid_softc *rs;
    901 	struct dk_softc *dksc;
    902 	RF_Raid_t *raidPtr;
    903 
    904 	unit = raidunit(bp->b_dev);
    905 	if ((rs = raidget(unit, false)) == NULL) {
    906 		bp->b_error = ENXIO;
    907 		goto fail;
    908 	}
    909 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    910 		bp->b_error = ENXIO;
    911 		goto fail;
    912 	}
    913 	dksc = &rs->sc_dksc;
    914 	raidPtr = &rs->sc_r;
    915 
    916 	/* Queue IO only */
    917 	if (dk_strategy_defer(dksc, bp))
    918 		goto done;
    919 
    920 	/* schedule the IO to happen at the next convenient time */
    921 	raid_wakeup(raidPtr);
    922 
    923 done:
    924 	return;
    925 
    926 fail:
    927 	bp->b_resid = bp->b_bcount;
    928 	biodone(bp);
    929 }
    930 
    931 static int
    932 raid_diskstart(device_t dev, struct buf *bp)
    933 {
    934 	struct raid_softc *rs = raidsoftc(dev);
    935 	RF_Raid_t *raidPtr;
    936 
    937 	raidPtr = &rs->sc_r;
    938 	if (!raidPtr->valid) {
    939 		db1_printf(("raid is not valid..\n"));
    940 		return ENODEV;
    941 	}
    942 
    943 	/* XXX */
    944 	bp->b_resid = 0;
    945 
    946 	return raiddoaccess(raidPtr, bp);
    947 }
    948 
/*
 * RAIDframe I/O completion callback: hand the finished buffer back to
 * the dk(4) layer, return one slot to the "openings" throttle, and
 * wake the I/O machinery so further queued requests can be issued.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* One more simultaneous access is allowed again. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    967 
    968 /* ARGSUSED */
    969 static int
    970 raidread(dev_t dev, struct uio *uio, int flags)
    971 {
    972 	int     unit = raidunit(dev);
    973 	struct raid_softc *rs;
    974 
    975 	if ((rs = raidget(unit, false)) == NULL)
    976 		return ENXIO;
    977 
    978 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    979 		return (ENXIO);
    980 
    981 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    982 
    983 }
    984 
    985 /* ARGSUSED */
    986 static int
    987 raidwrite(dev_t dev, struct uio *uio, int flags)
    988 {
    989 	int     unit = raidunit(dev);
    990 	struct raid_softc *rs;
    991 
    992 	if ((rs = raidget(unit, false)) == NULL)
    993 		return ENXIO;
    994 
    995 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    996 		return (ENXIO);
    997 
    998 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    999 
   1000 }
   1001 
/*
 * Tear down a configured RAID set: shut RAIDframe down, drain and free
 * the buffer queue, and detach the dk(4)/disk(9) machinery.
 * Returns EBUSY while the unit is open or a reconstruction, parity
 * rewrite, or copyback is in progress; returns 0 if nothing was
 * configured in the first place.
 * NOTE(review): the "_unlocked" suffix suggests the caller is expected
 * to manage the raidlock around this call — confirm at call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations are running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to undo. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1039 
   1040 static int
   1041 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1042 {
   1043 	int     unit = raidunit(dev);
   1044 	int     error = 0;
   1045 	int     part, pmask;
   1046 	struct raid_softc *rs;
   1047 	struct dk_softc *dksc;
   1048 	RF_Config_t *k_cfg, *u_cfg;
   1049 	RF_Raid_t *raidPtr;
   1050 	RF_RaidDisk_t *diskPtr;
   1051 	RF_AccTotals_t *totals;
   1052 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1053 	u_char *specific_buf;
   1054 	int retcode = 0;
   1055 	int column;
   1056 /*	int raidid; */
   1057 	struct rf_recon_req *rr;
   1058 	struct rf_recon_req_internal *rrint;
   1059 	RF_ComponentLabel_t *clabel;
   1060 	RF_ComponentLabel_t *ci_label;
   1061 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1062 	RF_SingleComponent_t component;
   1063 	int d;
   1064 
   1065 	if ((rs = raidget(unit, false)) == NULL)
   1066 		return ENXIO;
   1067 	dksc = &rs->sc_dksc;
   1068 	raidPtr = &rs->sc_r;
   1069 
   1070 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1071 		(int) DISKPART(dev), (int) unit, cmd));
   1072 
   1073 	/* Must be initialized for these... */
   1074 	switch (cmd) {
   1075 	case RAIDFRAME_REWRITEPARITY:
   1076 	case RAIDFRAME_GET_INFO:
   1077 	case RAIDFRAME_RESET_ACCTOTALS:
   1078 	case RAIDFRAME_GET_ACCTOTALS:
   1079 	case RAIDFRAME_KEEP_ACCTOTALS:
   1080 	case RAIDFRAME_GET_SIZE:
   1081 	case RAIDFRAME_FAIL_DISK:
   1082 	case RAIDFRAME_COPYBACK:
   1083 	case RAIDFRAME_CHECK_RECON_STATUS:
   1084 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1085 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1086 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1087 	case RAIDFRAME_ADD_HOT_SPARE:
   1088 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1089 	case RAIDFRAME_INIT_LABELS:
   1090 	case RAIDFRAME_REBUILD_IN_PLACE:
   1091 	case RAIDFRAME_CHECK_PARITY:
   1092 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1093 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1094 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1095 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1096 	case RAIDFRAME_SET_AUTOCONFIG:
   1097 	case RAIDFRAME_SET_ROOT:
   1098 	case RAIDFRAME_DELETE_COMPONENT:
   1099 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1100 	case RAIDFRAME_PARITYMAP_STATUS:
   1101 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1102 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1103 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1104 #ifdef COMPAT_NETBSD32
   1105 #ifdef _LP64
   1106 	case RAIDFRAME_GET_INFO32:
   1107 #endif
   1108 #endif
   1109 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1110 			return (ENXIO);
   1111 	}
   1112 
   1113 	/*
   1114 	 * Handle compat ioctl calls
   1115 	 *
   1116 	 * * If compat code is not loaded, stub returns ENOSYS and we just
   1117 	 *   check the "native" cmd's
   1118 	 * * If compat code is loaded but does not recognize the cmd, it
   1119 	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
   1120 	 * * If compat code returns EAGAIN, we need to finish via config
   1121 	 * * Otherwise the cmd has been handled and we just return
   1122 	 */
   1123 	retcode = (*raidframe50_ioctl)(cmd, (rs->sc_flags & RAIDF_INITED),
   1124 	    raidPtr, unit, data, &k_cfg);
   1125 	if (retcode == ENOSYS)
   1126 		retcode = 0;
   1127 	else if (retcode == EAGAIN)
   1128 		goto config;
   1129 	else if (retcode != EPASSTHROUGH)
   1130 		return retcode;
   1131 
   1132 	retcode = (*raidframe80_ioctl)(cmd, (rs->sc_flags & RAIDF_INITED),
   1133 	    raidPtr, unit, data, &k_cfg);
   1134 	if (retcode == ENOSYS)
   1135 		retcode = 0;
   1136 	else if (retcode == EAGAIN)
   1137 		goto config;
   1138 	else if (retcode != EPASSTHROUGH)
   1139 		return retcode;
   1140 
   1141 	/*
   1142 	 * XXX
   1143 	 * Handling of FAIL_DISK80 command requires us to retain retcode's
   1144 	 * value of EPASSTHROUGH.  If you add more compat code later, make
   1145 	 * sure you don't overwrite retcode and break this!
   1146 	 */
   1147 
   1148 	switch (cmd) {
   1149 
   1150 		/* configure the system */
   1151 	case RAIDFRAME_CONFIGURE:
   1152 #ifdef COMPAT_NETBSD32
   1153 #ifdef _LP64
   1154 	case RAIDFRAME_CONFIGURE32:
   1155 #endif
   1156 #endif
   1157 
   1158 		if (raidPtr->valid) {
   1159 			/* There is a valid RAID set running on this unit! */
   1160 			printf("raid%d: Device already configured!\n",unit);
   1161 			return(EINVAL);
   1162 		}
   1163 
   1164 		/* copy-in the configuration information */
   1165 		/* data points to a pointer to the configuration structure */
   1166 
   1167 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1168 		if (k_cfg == NULL) {
   1169 			return (ENOMEM);
   1170 		}
   1171 #ifdef COMPAT_NETBSD32
   1172 #ifdef _LP64
   1173 		if (cmd == RAIDFRAME_CONFIGURE32 &&
   1174 		    (l->l_proc->p_flag & PK_32) != 0)
   1175 			retcode = rf_config_netbsd32(data, k_cfg);
   1176 		else
   1177 #endif
   1178 #endif
   1179 		{
   1180 			u_cfg = *((RF_Config_t **) data);
   1181 			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1182 		}
   1183 		if (retcode) {
   1184 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1185 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1186 				retcode));
   1187 			goto no_config;
   1188 		}
   1189 		goto config;
   1190 	config:
   1191 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1192 
   1193 		/* allocate a buffer for the layout-specific data, and copy it
   1194 		 * in */
   1195 		if (k_cfg->layoutSpecificSize) {
   1196 			if (k_cfg->layoutSpecificSize > 10000) {
   1197 				/* sanity check */
   1198 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1199 				retcode = EINVAL;
   1200 				goto no_config;
   1201 			}
   1202 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1203 			    (u_char *));
   1204 			if (specific_buf == NULL) {
   1205 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1206 				retcode = ENOMEM;
   1207 				goto no_config;
   1208 			}
   1209 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1210 			    k_cfg->layoutSpecificSize);
   1211 			if (retcode) {
   1212 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1213 				RF_Free(specific_buf,
   1214 					k_cfg->layoutSpecificSize);
   1215 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1216 					retcode));
   1217 				goto no_config;
   1218 			}
   1219 		} else
   1220 			specific_buf = NULL;
   1221 		k_cfg->layoutSpecific = specific_buf;
   1222 
   1223 		/* should do some kind of sanity check on the configuration.
   1224 		 * Store the sum of all the bytes in the last byte? */
   1225 
   1226 		/* configure the system */
   1227 
   1228 		/*
   1229 		 * Clear the entire RAID descriptor, just to make sure
   1230 		 *  there is no stale data left in the case of a
   1231 		 *  reconfiguration
   1232 		 */
   1233 		memset(raidPtr, 0, sizeof(*raidPtr));
   1234 		raidPtr->softc = rs;
   1235 		raidPtr->raidid = unit;
   1236 
   1237 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1238 
   1239 		if (retcode == 0) {
   1240 
   1241 			/* allow this many simultaneous IO's to
   1242 			   this RAID device */
   1243 			raidPtr->openings = RAIDOUTSTANDING;
   1244 
   1245 			raidinit(rs);
   1246 			raid_wakeup(raidPtr);
   1247 			rf_markalldirty(raidPtr);
   1248 		}
   1249 		/* free the buffers.  No return code here. */
   1250 		if (k_cfg->layoutSpecificSize) {
   1251 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1252 		}
   1253 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1254 
   1255 	no_config:
   1256 		/*
   1257 		 * If configuration failed, set sc_flags so that we
   1258 		 * will detach the device when we close it.
   1259 		 */
   1260 		if (retcode != 0)
   1261 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1262 		return (retcode);
   1263 
   1264 		/* shutdown the system */
   1265 	case RAIDFRAME_SHUTDOWN:
   1266 
   1267 		part = DISKPART(dev);
   1268 		pmask = (1 << part);
   1269 
   1270 		if ((error = raidlock(rs)) != 0)
   1271 			return (error);
   1272 
   1273 		if (DK_BUSY(dksc, pmask) ||
   1274 		    raidPtr->recon_in_progress != 0 ||
   1275 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1276 		    raidPtr->copyback_in_progress != 0)
   1277 			retcode = EBUSY;
   1278 		else {
   1279 			/* detach and free on close */
   1280 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1281 			retcode = 0;
   1282 		}
   1283 
   1284 		raidunlock(rs);
   1285 
   1286 		return (retcode);
   1287 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1288 		return rf_get_component_label(raidPtr, data);
   1289 
   1290 #if 0
   1291 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1292 		clabel = (RF_ComponentLabel_t *) data;
   1293 
   1294 		/* XXX check the label for valid stuff... */
   1295 		/* Note that some things *should not* get modified --
   1296 		   the user should be re-initing the labels instead of
   1297 		   trying to patch things.
   1298 		   */
   1299 
   1300 		raidid = raidPtr->raidid;
   1301 #ifdef DEBUG
   1302 		printf("raid%d: Got component label:\n", raidid);
   1303 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1304 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1305 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1306 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1307 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1308 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1309 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1310 #endif
   1311 		clabel->row = 0;
   1312 		column = clabel->column;
   1313 
   1314 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1315 			return(EINVAL);
   1316 		}
   1317 
   1318 		/* XXX this isn't allowed to do anything for now :-) */
   1319 
   1320 		/* XXX and before it is, we need to fill in the rest
   1321 		   of the fields!?!?!?! */
   1322 		memcpy(raidget_component_label(raidPtr, column),
   1323 		    clabel, sizeof(*clabel));
   1324 		raidflush_component_label(raidPtr, column);
   1325 		return (0);
   1326 #endif
   1327 
   1328 	case RAIDFRAME_INIT_LABELS:
   1329 		clabel = (RF_ComponentLabel_t *) data;
   1330 		/*
   1331 		   we only want the serial number from
   1332 		   the above.  We get all the rest of the information
   1333 		   from the config that was used to create this RAID
   1334 		   set.
   1335 		   */
   1336 
   1337 		raidPtr->serial_number = clabel->serial_number;
   1338 
   1339 		for(column=0;column<raidPtr->numCol;column++) {
   1340 			diskPtr = &raidPtr->Disks[column];
   1341 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1342 				ci_label = raidget_component_label(raidPtr,
   1343 				    column);
   1344 				/* Zeroing this is important. */
   1345 				memset(ci_label, 0, sizeof(*ci_label));
   1346 				raid_init_component_label(raidPtr, ci_label);
   1347 				ci_label->serial_number =
   1348 				    raidPtr->serial_number;
   1349 				ci_label->row = 0; /* we dont' pretend to support more */
   1350 				rf_component_label_set_partitionsize(ci_label,
   1351 				    diskPtr->partitionSize);
   1352 				ci_label->column = column;
   1353 				raidflush_component_label(raidPtr, column);
   1354 			}
   1355 			/* XXXjld what about the spares? */
   1356 		}
   1357 
   1358 		return (retcode);
   1359 	case RAIDFRAME_SET_AUTOCONFIG:
   1360 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1361 		printf("raid%d: New autoconfig value is: %d\n",
   1362 		       raidPtr->raidid, d);
   1363 		*(int *) data = d;
   1364 		return (retcode);
   1365 
   1366 	case RAIDFRAME_SET_ROOT:
   1367 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1368 		printf("raid%d: New rootpartition value is: %d\n",
   1369 		       raidPtr->raidid, d);
   1370 		*(int *) data = d;
   1371 		return (retcode);
   1372 
   1373 		/* initialize all parity */
   1374 	case RAIDFRAME_REWRITEPARITY:
   1375 
   1376 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1377 			/* Parity for RAID 0 is trivially correct */
   1378 			raidPtr->parity_good = RF_RAID_CLEAN;
   1379 			return(0);
   1380 		}
   1381 
   1382 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1383 			/* Re-write is already in progress! */
   1384 			return(EINVAL);
   1385 		}
   1386 
   1387 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1388 					   rf_RewriteParityThread,
   1389 					   raidPtr,"raid_parity");
   1390 		return (retcode);
   1391 
   1392 
   1393 	case RAIDFRAME_ADD_HOT_SPARE:
   1394 		sparePtr = (RF_SingleComponent_t *) data;
   1395 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1396 		retcode = rf_add_hot_spare(raidPtr, &component);
   1397 		return(retcode);
   1398 
   1399 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1400 		return(retcode);
   1401 
   1402 	case RAIDFRAME_DELETE_COMPONENT:
   1403 		componentPtr = (RF_SingleComponent_t *)data;
   1404 		memcpy( &component, componentPtr,
   1405 			sizeof(RF_SingleComponent_t));
   1406 		retcode = rf_delete_component(raidPtr, &component);
   1407 		return(retcode);
   1408 
   1409 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1410 		componentPtr = (RF_SingleComponent_t *)data;
   1411 		memcpy( &component, componentPtr,
   1412 			sizeof(RF_SingleComponent_t));
   1413 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1414 		return(retcode);
   1415 
   1416 	case RAIDFRAME_REBUILD_IN_PLACE:
   1417 
   1418 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1419 			/* Can't do this on a RAID 0!! */
   1420 			return(EINVAL);
   1421 		}
   1422 
   1423 		if (raidPtr->recon_in_progress == 1) {
   1424 			/* a reconstruct is already in progress! */
   1425 			return(EINVAL);
   1426 		}
   1427 
   1428 		componentPtr = (RF_SingleComponent_t *) data;
   1429 		memcpy( &component, componentPtr,
   1430 			sizeof(RF_SingleComponent_t));
   1431 		component.row = 0; /* we don't support any more */
   1432 		column = component.column;
   1433 
   1434 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1435 			return(EINVAL);
   1436 		}
   1437 
   1438 		rf_lock_mutex2(raidPtr->mutex);
   1439 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1440 		    (raidPtr->numFailures > 0)) {
   1441 			/* XXX 0 above shouldn't be constant!!! */
   1442 			/* some component other than this has failed.
   1443 			   Let's not make things worse than they already
   1444 			   are... */
   1445 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1446 			       raidPtr->raidid);
   1447 			printf("raid%d:     Col: %d   Too many failures.\n",
   1448 			       raidPtr->raidid, column);
   1449 			rf_unlock_mutex2(raidPtr->mutex);
   1450 			return (EINVAL);
   1451 		}
   1452 		if (raidPtr->Disks[column].status ==
   1453 		    rf_ds_reconstructing) {
   1454 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1455 			       raidPtr->raidid);
   1456 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1457 
   1458 			rf_unlock_mutex2(raidPtr->mutex);
   1459 			return (EINVAL);
   1460 		}
   1461 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1462 			rf_unlock_mutex2(raidPtr->mutex);
   1463 			return (EINVAL);
   1464 		}
   1465 		rf_unlock_mutex2(raidPtr->mutex);
   1466 
   1467 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1468 		if (rrint == NULL)
   1469 			return(ENOMEM);
   1470 
   1471 		rrint->col = column;
   1472 		rrint->raidPtr = raidPtr;
   1473 
   1474 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1475 					   rf_ReconstructInPlaceThread,
   1476 					   rrint, "raid_reconip");
   1477 		return(retcode);
   1478 
   1479 	case RAIDFRAME_GET_INFO:
   1480 #ifdef COMPAT_NETBSD32
   1481 #ifdef _LP64
   1482 	case RAIDFRAME_GET_INFO32:
   1483 #endif
   1484 #endif
   1485 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1486 			  (RF_DeviceConfig_t *));
   1487 		if (d_cfg == NULL)
   1488 			return (ENOMEM);
   1489 		retcode = rf_get_info(raidPtr, d_cfg);
   1490 		if (retcode == 0) {
   1491 #ifdef COMPAT_NETBSD32
   1492 #ifdef _LP64
   1493 			if (cmd == RAIDFRAME_GET_INFO32)
   1494 				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
   1495 			else
   1496 #endif
   1497 #endif
   1498 				ucfgp = *(RF_DeviceConfig_t **)data;
   1499 			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
   1500 		}
   1501 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1502 
   1503 		return (retcode);
   1504 
   1505 	case RAIDFRAME_CHECK_PARITY:
   1506 		*(int *) data = raidPtr->parity_good;
   1507 		return (0);
   1508 
   1509 	case RAIDFRAME_PARITYMAP_STATUS:
   1510 		if (rf_paritymap_ineligible(raidPtr))
   1511 			return EINVAL;
   1512 		rf_paritymap_status(raidPtr->parity_map,
   1513 		    (struct rf_pmstat *)data);
   1514 		return 0;
   1515 
   1516 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1517 		if (rf_paritymap_ineligible(raidPtr))
   1518 			return EINVAL;
   1519 		if (raidPtr->parity_map == NULL)
   1520 			return ENOENT; /* ??? */
   1521 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1522 			(struct rf_pmparams *)data, 1))
   1523 			return EINVAL;
   1524 		return 0;
   1525 
   1526 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1527 		if (rf_paritymap_ineligible(raidPtr))
   1528 			return EINVAL;
   1529 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1530 		return 0;
   1531 
   1532 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1533 		if (rf_paritymap_ineligible(raidPtr))
   1534 			return EINVAL;
   1535 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1536 		/* XXX should errors be passed up? */
   1537 		return 0;
   1538 
   1539 	case RAIDFRAME_RESET_ACCTOTALS:
   1540 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1541 		return (0);
   1542 
   1543 	case RAIDFRAME_GET_ACCTOTALS:
   1544 		totals = (RF_AccTotals_t *) data;
   1545 		*totals = raidPtr->acc_totals;
   1546 		return (0);
   1547 
   1548 	case RAIDFRAME_KEEP_ACCTOTALS:
   1549 		raidPtr->keep_acc_totals = *(int *)data;
   1550 		return (0);
   1551 
   1552 	case RAIDFRAME_GET_SIZE:
   1553 		*(int *) data = raidPtr->totalSectors;
   1554 		return (0);
   1555 
   1556 		/* fail a disk & optionally start reconstruction */
   1557 	case RAIDFRAME_FAIL_DISK80:
   1558 		/* Check if we called compat code for this cmd */
   1559 		if (retcode != EPASSTHROUGH)
   1560 			return EINVAL;
   1561 		/* FALLTHRU */
   1562 	case RAIDFRAME_FAIL_DISK:
   1563 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1564 			/* Can't do this on a RAID 0!! */
   1565 			return(EINVAL);
   1566 		}
   1567 
   1568 		rr = (struct rf_recon_req *) data;
   1569 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1570 			return (EINVAL);
   1571 
   1572 		rf_lock_mutex2(raidPtr->mutex);
   1573 		if (raidPtr->status == rf_rs_reconstructing) {
   1574 			/* you can't fail a disk while we're reconstructing! */
   1575 			/* XXX wrong for RAID6 */
   1576 			rf_unlock_mutex2(raidPtr->mutex);
   1577 			return (EINVAL);
   1578 		}
   1579 		if ((raidPtr->Disks[rr->col].status ==
   1580 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1581 			/* some other component has failed.  Let's not make
   1582 			   things worse. XXX wrong for RAID6 */
   1583 			rf_unlock_mutex2(raidPtr->mutex);
   1584 			return (EINVAL);
   1585 		}
   1586 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1587 			/* Can't fail a spared disk! */
   1588 			rf_unlock_mutex2(raidPtr->mutex);
   1589 			return (EINVAL);
   1590 		}
   1591 		rf_unlock_mutex2(raidPtr->mutex);
   1592 
   1593 		/* make a copy of the recon request so that we don't rely on
   1594 		 * the user's buffer */
   1595 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1596 		if (rrint == NULL)
   1597 			return(ENOMEM);
   1598 		rrint->col = rr->col;
   1599 		rrint->flags = rr->flags;
   1600 		rrint->raidPtr = raidPtr;
   1601 
   1602 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1603 					   rf_ReconThread,
   1604 					   rrint, "raid_recon");
   1605 		return (0);
   1606 
   1607 		/* invoke a copyback operation after recon on whatever disk
   1608 		 * needs it, if any */
   1609 	case RAIDFRAME_COPYBACK:
   1610 
   1611 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1612 			/* This makes no sense on a RAID 0!! */
   1613 			return(EINVAL);
   1614 		}
   1615 
   1616 		if (raidPtr->copyback_in_progress == 1) {
   1617 			/* Copyback is already in progress! */
   1618 			return(EINVAL);
   1619 		}
   1620 
   1621 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1622 					   rf_CopybackThread,
   1623 					   raidPtr,"raid_copyback");
   1624 		return (retcode);
   1625 
   1626 		/* return the percentage completion of reconstruction */
   1627 	case RAIDFRAME_CHECK_RECON_STATUS:
   1628 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1629 			/* This makes no sense on a RAID 0, so tell the
   1630 			   user it's done. */
   1631 			*(int *) data = 100;
   1632 			return(0);
   1633 		}
   1634 		if (raidPtr->status != rf_rs_reconstructing)
   1635 			*(int *) data = 100;
   1636 		else {
   1637 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1638 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1639 			} else {
   1640 				*(int *) data = 0;
   1641 			}
   1642 		}
   1643 		return (0);
   1644 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1645 		rf_check_recon_status_ext(raidPtr, data);
   1646 		return (0);
   1647 
   1648 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1649 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1650 			/* This makes no sense on a RAID 0, so tell the
   1651 			   user it's done. */
   1652 			*(int *) data = 100;
   1653 			return(0);
   1654 		}
   1655 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1656 			*(int *) data = 100 *
   1657 				raidPtr->parity_rewrite_stripes_done /
   1658 				raidPtr->Layout.numStripe;
   1659 		} else {
   1660 			*(int *) data = 100;
   1661 		}
   1662 		return (0);
   1663 
   1664 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1665 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1666 		return (0);
   1667 
   1668 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1669 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1670 			/* This makes no sense on a RAID 0 */
   1671 			*(int *) data = 100;
   1672 			return(0);
   1673 		}
   1674 		if (raidPtr->copyback_in_progress == 1) {
   1675 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1676 				raidPtr->Layout.numStripe;
   1677 		} else {
   1678 			*(int *) data = 100;
   1679 		}
   1680 		return (0);
   1681 
   1682 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1683 		rf_check_copyback_status_ext(raidPtr, data);
   1684 		return 0;
   1685 
   1686 	case RAIDFRAME_SET_LAST_UNIT:
   1687 		for (column = 0; column < raidPtr->numCol; column++)
   1688 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1689 				return EBUSY;
   1690 
   1691 		for (column = 0; column < raidPtr->numCol; column++) {
   1692 			clabel = raidget_component_label(raidPtr, column);
   1693 			clabel->last_unit = *(int *)data;
   1694 			raidflush_component_label(raidPtr, column);
   1695 		}
   1696 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1697 		return 0;
   1698 
   1699 		/* the sparetable daemon calls this to wait for the kernel to
   1700 		 * need a spare table. this ioctl does not return until a
   1701 		 * spare table is needed. XXX -- calling mpsleep here in the
   1702 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1703 		 * -- I should either compute the spare table in the kernel,
   1704 		 * or have a different -- XXX XXX -- interface (a different
   1705 		 * character device) for delivering the table     -- XXX */
   1706 #if 0
   1707 	case RAIDFRAME_SPARET_WAIT:
   1708 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1709 		while (!rf_sparet_wait_queue)
   1710 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1711 		waitreq = rf_sparet_wait_queue;
   1712 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1713 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1714 
   1715 		/* structure assignment */
   1716 		*((RF_SparetWait_t *) data) = *waitreq;
   1717 
   1718 		RF_Free(waitreq, sizeof(*waitreq));
   1719 		return (0);
   1720 
   1721 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1722 		 * code in it that will cause the dameon to exit */
   1723 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1724 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1725 		waitreq->fcol = -1;
   1726 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1727 		waitreq->next = rf_sparet_wait_queue;
   1728 		rf_sparet_wait_queue = waitreq;
   1729 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1730 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1731 		return (0);
   1732 
   1733 		/* used by the spare table daemon to deliver a spare table
   1734 		 * into the kernel */
   1735 	case RAIDFRAME_SEND_SPARET:
   1736 
   1737 		/* install the spare table */
   1738 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1739 
   1740 		/* respond to the requestor.  the return status of the spare
   1741 		 * table installation is passed in the "fcol" field */
   1742 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1743 		waitreq->fcol = retcode;
   1744 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1745 		waitreq->next = rf_sparet_resp_queue;
   1746 		rf_sparet_resp_queue = waitreq;
   1747 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1748 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1749 
   1750 		return (retcode);
   1751 #endif
   1752 
   1753 	default:
   1754 		break; /* fall through to the os-specific code below */
   1755 
   1756 	}
   1757 
   1758 	if (!raidPtr->valid)
   1759 		return (EINVAL);
   1760 
   1761 	/*
   1762 	 * Add support for "regular" device ioctls here.
   1763 	 */
   1764 
   1765 	switch (cmd) {
   1766 	case DIOCGCACHE:
   1767 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1768 		break;
   1769 
   1770 	case DIOCCACHESYNC:
   1771 		retcode = rf_sync_component_caches(raidPtr);
   1772 		break;
   1773 
   1774 	default:
   1775 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1776 		break;
   1777 	}
   1778 
   1779 	return (retcode);
   1780 
   1781 }
   1782 
   1783 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, hook up the dk(4)/disk(9)
   layers, allocate the buffer queue, mark the unit RAIDF_INITED, and
   kick off wedge discovery.  Called after rf_Configure() succeeds. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device; cf is freed on detach in raidclose() */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* FCFS queue; raidstrategy() feeds it via dk_strategy_defer() */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* discover wedges (GPT partitions etc.) on the new disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1843 
   1844 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1845 /* wake up the daemon & tell it to get us a spare table
   1846  * XXX
   1847  * the entries in the queues should be tagged with the raidPtr
   1848  * so that in the extremely rare case that two recons happen at once,
   1849  * we know for which device were requesting a spare table
   1850  * XXX
   1851  *
   1852  * XXX This code is not currently used. GO
   1853  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Enqueue the request and wake the daemon that services the
	 * spare-table wait queue. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response on the response queue. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the response; note this reuses 'req' for the response
	 * entry, which is a different allocation than the request. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The failed-column field of the response carries the result. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1877 #endif
   1878 
   1879 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1880  * bp & passes it down.
   1881  * any calls originating in the kernel must use non-blocking I/O
   1882  * do some extra sanity checking to return "appropriate" error values for
   1883  * certain conditions (to make some standard utilities work)
   1884  *
   1885  * Formerly known as: rf_DoAccessKernel
   1886  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* rf_update_component_labels() must be called unlocked,
		 * so drop and retake the mutex around it. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O on a unit that never finished raidinit(). */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk layer to feed queued buffers to raiddoaccess(). */
	dk_start(dksc, NULL);
}
   1913 
/* Translate one struct buf into a RAIDframe access and dispatch it.
 * Returns 0 on successful dispatch, EAGAIN when no openings are free
 * (caller retries later), or ENOSPC for out-of-range/misaligned I/O. */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Whole sectors in the request, plus one extra if the byte count
	 * is not an exact multiple of the sector size. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" makes this branch unconditional;
	 * db1_printf is presumably a no-op unless debugging is enabled. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject access past the end of the array; the "sum <" comparisons
	 * also catch arithmetic wrap-around in the addition above. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Partial-sector transfers are not supported. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1986 
   1987 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1988 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* Fake a completed I/O: route straight to the completion
		 * callback without touching the hardware. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component I/O; KernelWakeupFunc
		 * will run at biodone time with 'req' as its argument. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2062 /* this is the callback function associated with a I/O invoked from
   2063    kernel code.
   2064  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP()/DispatchKernelIO. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the elapsed disk time for this physical I/O. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label update
			 * the next time raidstart() runs. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2131 
   2132 
   2133 /*
   2134  * initialize a buf structure for doing an I/O in the kernel.
   2135  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* Convert the RAIDframe sector number to bytes, then to
	 * DEV_BSIZE units as b_blkno expects. */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	/* cbFunc runs at biodone time with cbArg in b_private. */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2160 
   2161 /*
   2162  * Wait interruptibly for an exclusive lock.
   2163  *
   2164  * XXX
   2165  * Several drivers do this; it should be abstracted and made MP-safe.
   2166  * (Hmm... where have we seen this warning before :->  GO )
   2167  */
   2168 static int
   2169 raidlock(struct raid_softc *rs)
   2170 {
   2171 	int     error;
   2172 
   2173 	error = 0;
   2174 	mutex_enter(&rs->sc_mutex);
   2175 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2176 		rs->sc_flags |= RAIDF_WANTED;
   2177 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2178 		if (error != 0)
   2179 			goto done;
   2180 	}
   2181 	rs->sc_flags |= RAIDF_LOCKED;
   2182 done:
   2183 	mutex_exit(&rs->sc_mutex);
   2184 	return (error);
   2185 }
   2186 /*
   2187  * Unlock and wake up any waiters.
   2188  */
   2189 static void
   2190 raidunlock(struct raid_softc *rs)
   2191 {
   2192 
   2193 	mutex_enter(&rs->sc_mutex);
   2194 	rs->sc_flags &= ~RAIDF_LOCKED;
   2195 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2196 		rs->sc_flags &= ~RAIDF_WANTED;
   2197 		cv_broadcast(&rs->sc_cv);
   2198 	}
   2199 	mutex_exit(&rs->sc_mutex);
   2200 }
   2201 
   2202 
   2203 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2204 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2205 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2206 
   2207 static daddr_t
   2208 rf_component_info_offset(void)
   2209 {
   2210 
   2211 	return RF_COMPONENT_INFO_OFFSET;
   2212 }
   2213 
   2214 static daddr_t
   2215 rf_component_info_size(unsigned secsize)
   2216 {
   2217 	daddr_t info_size;
   2218 
   2219 	KASSERT(secsize);
   2220 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2221 		info_size = secsize;
   2222 	else
   2223 		info_size = RF_COMPONENT_INFO_SIZE;
   2224 
   2225 	return info_size;
   2226 }
   2227 
   2228 static daddr_t
   2229 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2230 {
   2231 	daddr_t map_offset;
   2232 
   2233 	KASSERT(raidPtr->bytesPerSector);
   2234 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2235 		map_offset = raidPtr->bytesPerSector;
   2236 	else
   2237 		map_offset = RF_COMPONENT_INFO_SIZE;
   2238 	map_offset += rf_component_info_offset();
   2239 
   2240 	return map_offset;
   2241 }
   2242 
   2243 static daddr_t
   2244 rf_parity_map_size(RF_Raid_t *raidPtr)
   2245 {
   2246 	daddr_t map_size;
   2247 
   2248 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2249 		map_size = raidPtr->bytesPerSector;
   2250 	else
   2251 		map_size = RF_PARITY_MAP_SIZE;
   2252 
   2253 	return map_size;
   2254 }
   2255 
   2256 int
   2257 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2258 {
   2259 	RF_ComponentLabel_t *clabel;
   2260 
   2261 	clabel = raidget_component_label(raidPtr, col);
   2262 	clabel->clean = RF_RAID_CLEAN;
   2263 	raidflush_component_label(raidPtr, col);
   2264 	return(0);
   2265 }
   2266 
   2267 
   2268 int
   2269 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2270 {
   2271 	RF_ComponentLabel_t *clabel;
   2272 
   2273 	clabel = raidget_component_label(raidPtr, col);
   2274 	clabel->clean = RF_RAID_DIRTY;
   2275 	raidflush_component_label(raidPtr, col);
   2276 	return(0);
   2277 }
   2278 
/* Read component col's on-disk label into its in-core copy. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2288 
/* Return a pointer to component col's in-core label (no I/O). */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2294 
/* Write component col's in-core label to disk, stamping it with the
 * current mod_counter first. */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2309 
   2310 
/* Read a component label from the label area of the given device. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2320 
   2321 /* ARGSUSED */
   2322 static int
   2323 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2324     size_t msize, daddr_t offset, daddr_t dsize)
   2325 {
   2326 	struct buf *bp;
   2327 	int error;
   2328 
   2329 	/* XXX should probably ensure that we don't try to do this if
   2330 	   someone has changed rf_protected_sectors. */
   2331 
   2332 	if (b_vp == NULL) {
   2333 		/* For whatever reason, this component is not valid.
   2334 		   Don't try to read a component label from it. */
   2335 		return(EINVAL);
   2336 	}
   2337 
   2338 	/* get a block of the appropriate size... */
   2339 	bp = geteblk((int)dsize);
   2340 	bp->b_dev = dev;
   2341 
   2342 	/* get our ducks in a row for the read */
   2343 	bp->b_blkno = offset / DEV_BSIZE;
   2344 	bp->b_bcount = dsize;
   2345 	bp->b_flags |= B_READ;
   2346  	bp->b_resid = dsize;
   2347 
   2348 	bdev_strategy(bp);
   2349 	error = biowait(bp);
   2350 
   2351 	if (!error) {
   2352 		memcpy(data, bp->b_data, msize);
   2353 	}
   2354 
   2355 	brelse(bp, 0);
   2356 	return(error);
   2357 }
   2358 
   2359 
/* Write a component label to the label area of the given device
 * (synchronously). */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2369 
   2370 /* ARGSUSED */
   2371 static int
   2372 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2373     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2374 {
   2375 	struct buf *bp;
   2376 	int error;
   2377 
   2378 	/* get a block of the appropriate size... */
   2379 	bp = geteblk((int)dsize);
   2380 	bp->b_dev = dev;
   2381 
   2382 	/* get our ducks in a row for the write */
   2383 	bp->b_blkno = offset / DEV_BSIZE;
   2384 	bp->b_bcount = dsize;
   2385 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2386  	bp->b_resid = dsize;
   2387 
   2388 	memset(bp->b_data, 0, dsize);
   2389 	memcpy(bp->b_data, data, msize);
   2390 
   2391 	bdev_strategy(bp);
   2392 	if (asyncp)
   2393 		return 0;
   2394 	error = biowait(bp);
   2395 	brelse(bp, 0);
   2396 	if (error) {
   2397 #if 1
   2398 		printf("Failed to write RAID component info!\n");
   2399 #endif
   2400 	}
   2401 
   2402 	return(error);
   2403 }
   2404 
/* Write the on-disk parity map to every live component of the set. */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2422 
/* Read the parity map from every live component and merge them into
 * *map.  NOTE(review): if every component is dead, *map is left
 * untouched. */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			/* First live component seeds the result... */
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			/* ...subsequent ones are merged into it. */
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2447 
/* Bump the mod counter and mark every usable component (and used
 * spare) of the set dirty on disk. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column maps to this spare,
			 * scol keeps its previous value (initially -1). */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2507 
   2508 
/* Refresh the component labels of all optimal components and used
 * spares.  When 'final' is RF_FINAL_COMPONENT_UPDATE and parity is
 * known good, the components are additionally marked clean. */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2586 
   2587 void
   2588 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2589 {
   2590 
   2591 	if (vp != NULL) {
   2592 		if (auto_configured == 1) {
   2593 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2594 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2595 			vput(vp);
   2596 
   2597 		} else {
   2598 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2599 		}
   2600 	}
   2601 }
   2602 
   2603 
   2604 void
   2605 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2606 {
   2607 	int r,c;
   2608 	struct vnode *vp;
   2609 	int acd;
   2610 
   2611 
   2612 	/* We take this opportunity to close the vnodes like we should.. */
   2613 
   2614 	for (c = 0; c < raidPtr->numCol; c++) {
   2615 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2616 		acd = raidPtr->Disks[c].auto_configured;
   2617 		rf_close_component(raidPtr, vp, acd);
   2618 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2619 		raidPtr->Disks[c].auto_configured = 0;
   2620 	}
   2621 
   2622 	for (r = 0; r < raidPtr->numSpare; r++) {
   2623 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2624 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2625 		rf_close_component(raidPtr, vp, acd);
   2626 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2627 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2628 	}
   2629 }
   2630 
   2631 
/* Kernel thread body: fail the requested component and (optionally)
 * reconstruct it onto a spare, then exit. */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON requests reconstruction to a spare as well
	 * as marking the component failed. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2653 
/* Kernel thread body: rewrite all parity for the set, update the
 * parity_good state, and exit. */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2684 
   2685 
/* Kernel thread body: copy reconstructed data from spares back onto
 * replaced components, then exit. */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2700 
   2701 
/* Kernel thread body: reconstruct a component in place (onto the same
 * disk slot), free the request, and exit. */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2719 
/* Probe one candidate device for a RAIDframe component label.  If a
 * plausible label is found, prepend a new RF_AutoConfig_t to ac_list
 * and keep the vnode open; otherwise the vnode is closed and released.
 * Returns the (possibly updated) list head, or NULL if allocation
 * failed (in which case the whole list has been freed). */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	/* M_NOWAIT: autoconfiguration must not sleep for memory here. */
	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down the entire list built so
		     * far (also reached from the inner allocation below). */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			/* ac takes ownership of clabel and the open vp. */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2777 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 * Builds and returns a linked list of RF_AutoConfig_t entries, one per
 * component found (the list is accumulated by rf_get_component()).
 * Devices are walked twice: wedges (dk) first, then everything else,
 * so a wedge covering a whole disk wins over that disk's raw partition.
 * Returns the (possibly empty) autoconfig list; never sleeps on vnodes
 * it failed to open.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* FSILENT: don't log open failures for absent units */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/*
				 * Wedge pass: accept only wedges whose
				 * partition type is RAIDframe, and hand the
				 * still-open vnode to rf_get_component().
				 */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* Open each FS_RAID partition separately and let
			   rf_get_component() inspect it. */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2981 
   2982 
   2983 int
   2984 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2985 {
   2986 
   2987 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2988 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2989 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2990 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2991 	    clabel->row >=0 &&
   2992 	    clabel->column >= 0 &&
   2993 	    clabel->num_rows > 0 &&
   2994 	    clabel->num_columns > 0 &&
   2995 	    clabel->row < clabel->num_rows &&
   2996 	    clabel->column < clabel->num_columns &&
   2997 	    clabel->blockSize > 0 &&
   2998 	    /*
   2999 	     * numBlocksHi may contain garbage, but it is ok since
   3000 	     * the type is unsigned.  If it is really garbage,
   3001 	     * rf_fix_old_label_size() will fix it.
   3002 	     */
   3003 	    rf_component_label_numblocks(clabel) > 0) {
   3004 		/*
   3005 		 * label looks reasonable enough...
   3006 		 * let's make sure it has no old garbage.
   3007 		 */
   3008 		if (numsecs)
   3009 			rf_fix_old_label_size(clabel, numsecs);
   3010 		return(1);
   3011 	}
   3012 	return(0);
   3013 }
   3014 
   3015 
   3016 /*
   3017  * For reasons yet unknown, some old component labels have garbage in
   3018  * the newer numBlocksHi region, and this causes lossage.  Since those
   3019  * disks will also have numsecs set to less than 32 bits of sectors,
   3020  * we can determine when this corruption has occurred, and fix it.
   3021  *
   3022  * The exact same problem, with the same unknown reason, happens to
   3023  * the partitionSizeHi member as well.
   3024  */
   3025 static void
   3026 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3027 {
   3028 
   3029 	if (numsecs < ((uint64_t)1 << 32)) {
   3030 		if (clabel->numBlocksHi) {
   3031 			printf("WARNING: total sectors < 32 bits, yet "
   3032 			       "numBlocksHi set\n"
   3033 			       "WARNING: resetting numBlocksHi to zero.\n");
   3034 			clabel->numBlocksHi = 0;
   3035 		}
   3036 
   3037 		if (clabel->partitionSizeHi) {
   3038 			printf("WARNING: total sectors < 32 bits, yet "
   3039 			       "partitionSizeHi set\n"
   3040 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3041 			clabel->partitionSizeHi = 0;
   3042 		}
   3043 	}
   3044 }
   3045 
   3046 
   3047 #ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Debug-only (compiled under #ifdef DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Printable names for root_partition values 0..2; 3 is reserved. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so out-of-range values print as "*invalid*" */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3079 #endif
   3080 
   3081 RF_ConfigSet_t *
   3082 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3083 {
   3084 	RF_AutoConfig_t *ac;
   3085 	RF_ConfigSet_t *config_sets;
   3086 	RF_ConfigSet_t *cset;
   3087 	RF_AutoConfig_t *ac_next;
   3088 
   3089 
   3090 	config_sets = NULL;
   3091 
   3092 	/* Go through the AutoConfig list, and figure out which components
   3093 	   belong to what sets.  */
   3094 	ac = ac_list;
   3095 	while(ac!=NULL) {
   3096 		/* we're going to putz with ac->next, so save it here
   3097 		   for use at the end of the loop */
   3098 		ac_next = ac->next;
   3099 
   3100 		if (config_sets == NULL) {
   3101 			/* will need at least this one... */
   3102 			config_sets = (RF_ConfigSet_t *)
   3103 				malloc(sizeof(RF_ConfigSet_t),
   3104 				       M_RAIDFRAME, M_NOWAIT);
   3105 			if (config_sets == NULL) {
   3106 				panic("rf_create_auto_sets: No memory!");
   3107 			}
   3108 			/* this one is easy :) */
   3109 			config_sets->ac = ac;
   3110 			config_sets->next = NULL;
   3111 			config_sets->rootable = 0;
   3112 			ac->next = NULL;
   3113 		} else {
   3114 			/* which set does this component fit into? */
   3115 			cset = config_sets;
   3116 			while(cset!=NULL) {
   3117 				if (rf_does_it_fit(cset, ac)) {
   3118 					/* looks like it matches... */
   3119 					ac->next = cset->ac;
   3120 					cset->ac = ac;
   3121 					break;
   3122 				}
   3123 				cset = cset->next;
   3124 			}
   3125 			if (cset==NULL) {
   3126 				/* didn't find a match above... new set..*/
   3127 				cset = (RF_ConfigSet_t *)
   3128 					malloc(sizeof(RF_ConfigSet_t),
   3129 					       M_RAIDFRAME, M_NOWAIT);
   3130 				if (cset == NULL) {
   3131 					panic("rf_create_auto_sets: No memory!");
   3132 				}
   3133 				cset->ac = ac;
   3134 				ac->next = NULL;
   3135 				cset->next = config_sets;
   3136 				cset->rootable = 0;
   3137 				config_sets = cset;
   3138 			}
   3139 		}
   3140 		ac = ac_next;
   3141 	}
   3142 
   3143 
   3144 	return(config_sets);
   3145 }
   3146 
   3147 static int
   3148 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3149 {
   3150 	RF_ComponentLabel_t *clabel1, *clabel2;
   3151 
   3152 	/* If this one matches the *first* one in the set, that's good
   3153 	   enough, since the other members of the set would have been
   3154 	   through here too... */
   3155 	/* note that we are not checking partitionSize here..
   3156 
   3157 	   Note that we are also not checking the mod_counters here.
   3158 	   If everything else matches except the mod_counter, that's
   3159 	   good enough for this test.  We will deal with the mod_counters
   3160 	   a little later in the autoconfiguration process.
   3161 
   3162 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3163 
   3164 	   The reason we don't check for this is that failed disks
   3165 	   will have lower modification counts.  If those disks are
   3166 	   not added to the set they used to belong to, then they will
   3167 	   form their own set, which may result in 2 different sets,
   3168 	   for example, competing to be configured at raid0, and
   3169 	   perhaps competing to be the root filesystem set.  If the
   3170 	   wrong ones get configured, or both attempt to become /,
   3171 	   weird behaviour and or serious lossage will occur.  Thus we
   3172 	   need to bring them into the fold here, and kick them out at
   3173 	   a later point.
   3174 
   3175 	*/
   3176 
   3177 	clabel1 = cset->ac->clabel;
   3178 	clabel2 = ac->clabel;
   3179 	if ((clabel1->version == clabel2->version) &&
   3180 	    (clabel1->serial_number == clabel2->serial_number) &&
   3181 	    (clabel1->num_rows == clabel2->num_rows) &&
   3182 	    (clabel1->num_columns == clabel2->num_columns) &&
   3183 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3184 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3185 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3186 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3187 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3188 	    (clabel1->blockSize == clabel2->blockSize) &&
   3189 	    rf_component_label_numblocks(clabel1) ==
   3190 	    rf_component_label_numblocks(clabel2) &&
   3191 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3192 	    (clabel1->root_partition == clabel2->root_partition) &&
   3193 	    (clabel1->last_unit == clabel2->last_unit) &&
   3194 	    (clabel1->config_order == clabel2->config_order)) {
   3195 		/* if it get's here, it almost *has* to be a match */
   3196 	} else {
   3197 		/* it's not consistent with somebody in the set..
   3198 		   punt */
   3199 		return(0);
   3200 	}
   3201 	/* all was fine.. it must fit... */
   3202 	return(1);
   3203 }
   3204 
/*
 * Decide whether the configuration set has enough live components to
 * be worth (auto)configuring.  A component counts as "live" when its
 * mod_counter equals the highest mod_counter in the set.  RAID 1 sets
 * are special-cased: a mirror pair may lose one member, but losing
 * both members of an even/odd pair is fatal.  For RAID 0/4/5 the
 * usual missing-component limits apply.  Returns 1 if configurable,
 * 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	/* the set's canonical mod_counter is the maximum over all members */
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a member with the canonical mod_counter */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an odd component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* apply the per-level failure limits */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3307 
   3308 void
   3309 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3310 			RF_Raid_t *raidPtr)
   3311 {
   3312 	RF_ComponentLabel_t *clabel;
   3313 	int i;
   3314 
   3315 	clabel = ac->clabel;
   3316 
   3317 	/* 1. Fill in the common stuff */
   3318 	config->numCol = clabel->num_columns;
   3319 	config->numSpare = 0; /* XXX should this be set here? */
   3320 	config->sectPerSU = clabel->sectPerSU;
   3321 	config->SUsPerPU = clabel->SUsPerPU;
   3322 	config->SUsPerRU = clabel->SUsPerRU;
   3323 	config->parityConfig = clabel->parityConfig;
   3324 	/* XXX... */
   3325 	strcpy(config->diskQueueType,"fifo");
   3326 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3327 	config->layoutSpecificSize = 0; /* XXX ?? */
   3328 
   3329 	while(ac!=NULL) {
   3330 		/* row/col values will be in range due to the checks
   3331 		   in reasonable_label() */
   3332 		strcpy(config->devnames[0][ac->clabel->column],
   3333 		       ac->devname);
   3334 		ac = ac->next;
   3335 	}
   3336 
   3337 	for(i=0;i<RF_MAXDBGV;i++) {
   3338 		config->debugVars[i][0] = 0;
   3339 	}
   3340 }
   3341 
   3342 int
   3343 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3344 {
   3345 	RF_ComponentLabel_t *clabel;
   3346 	int column;
   3347 	int sparecol;
   3348 
   3349 	raidPtr->autoconfigure = new_value;
   3350 
   3351 	for(column=0; column<raidPtr->numCol; column++) {
   3352 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3353 			clabel = raidget_component_label(raidPtr, column);
   3354 			clabel->autoconfigure = new_value;
   3355 			raidflush_component_label(raidPtr, column);
   3356 		}
   3357 	}
   3358 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3359 		sparecol = raidPtr->numCol + column;
   3360 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3361 			clabel = raidget_component_label(raidPtr, sparecol);
   3362 			clabel->autoconfigure = new_value;
   3363 			raidflush_component_label(raidPtr, sparecol);
   3364 		}
   3365 	}
   3366 	return(new_value);
   3367 }
   3368 
   3369 int
   3370 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3371 {
   3372 	RF_ComponentLabel_t *clabel;
   3373 	int column;
   3374 	int sparecol;
   3375 
   3376 	raidPtr->root_partition = new_value;
   3377 	for(column=0; column<raidPtr->numCol; column++) {
   3378 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3379 			clabel = raidget_component_label(raidPtr, column);
   3380 			clabel->root_partition = new_value;
   3381 			raidflush_component_label(raidPtr, column);
   3382 		}
   3383 	}
   3384 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3385 		sparecol = raidPtr->numCol + column;
   3386 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3387 			clabel = raidget_component_label(raidPtr, sparecol);
   3388 			clabel->root_partition = new_value;
   3389 			raidflush_component_label(raidPtr, sparecol);
   3390 		}
   3391 	}
   3392 	return(new_value);
   3393 }
   3394 
   3395 void
   3396 rf_release_all_vps(RF_ConfigSet_t *cset)
   3397 {
   3398 	RF_AutoConfig_t *ac;
   3399 
   3400 	ac = cset->ac;
   3401 	while(ac!=NULL) {
   3402 		/* Close the vp, and give it back */
   3403 		if (ac->vp) {
   3404 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3405 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3406 			vput(ac->vp);
   3407 			ac->vp = NULL;
   3408 		}
   3409 		ac = ac->next;
   3410 	}
   3411 }
   3412 
   3413 
   3414 void
   3415 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3416 {
   3417 	RF_AutoConfig_t *ac;
   3418 	RF_AutoConfig_t *next_ac;
   3419 
   3420 	ac = cset->ac;
   3421 	while(ac!=NULL) {
   3422 		next_ac = ac->next;
   3423 		/* nuke the label */
   3424 		free(ac->clabel, M_RAIDFRAME);
   3425 		/* cleanup the config structure */
   3426 		free(ac, M_RAIDFRAME);
   3427 		/* "next.." */
   3428 		ac = next_ac;
   3429 	}
   3430 	/* and, finally, nuke the config set */
   3431 	free(cset, M_RAIDFRAME);
   3432 }
   3433 
   3434 
/*
 * Populate a component label from the current state of the RAID set.
 * The caller is responsible for per-component fields (row/column) and
 * for writing the label back to disk.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* modern RAIDframe sets always have a single row */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count across numBlocks/numBlocksHi */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3467 
/*
 * Autoconfigure one configuration set: build an RF_Config_t from the
 * set's labels, find a free raid unit (preferring the unit the set was
 * last configured on), and configure it.  On success returns the
 * softc of the configured set (with rootable/root_partition noted);
 * on failure returns NULL and releases whatever was allocated.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until a unit is free or unknown */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no softc for that unit yet: create one now */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3551 
   3552 void
   3553 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3554 	     size_t xmin, size_t xmax)
   3555 {
   3556 	int error;
   3557 
   3558 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3559 	pool_sethiwat(p, xmax);
   3560 	if ((error = pool_prime(p, xmin)) != 0)
   3561 		panic("%s: failed to prime pool: %d", __func__, error);
   3562 	pool_setlowat(p, xmin);
   3563 }
   3564 
   3565 /*
   3566  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3567  * to see if there is IO pending and if that IO could possibly be done
   3568  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3569  * otherwise.
   3570  *
   3571  */
   3572 int
   3573 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3574 {
   3575 	struct raid_softc *rs;
   3576 	struct dk_softc *dksc;
   3577 
   3578 	rs = raidPtr->softc;
   3579 	dksc = &rs->sc_dksc;
   3580 
   3581 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3582 		return 1;
   3583 
   3584 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3585 		/* there is work to do */
   3586 		return 0;
   3587 	}
   3588 	/* default is nothing to do */
   3589 	return 1;
   3590 }
   3591 
   3592 int
   3593 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3594 {
   3595 	uint64_t numsecs;
   3596 	unsigned secsize;
   3597 	int error;
   3598 
   3599 	error = getdisksize(vp, &numsecs, &secsize);
   3600 	if (error == 0) {
   3601 		diskPtr->blockSize = secsize;
   3602 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3603 		diskPtr->partitionSize = numsecs;
   3604 		return 0;
   3605 	}
   3606 	return error;
   3607 }
   3608 
/*
 * Autoconf match routine: raid(4) is a pseudo-device, so every
 * proposed instance matches unconditionally.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3614 
/*
 * Autoconf attach routine.  Intentionally empty: nothing is set up
 * until the RAID set itself is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3619 
   3620 
   3621 static int
   3622 raid_detach(device_t self, int flags)
   3623 {
   3624 	int error;
   3625 	struct raid_softc *rs = raidsoftc(self);
   3626 
   3627 	if (rs == NULL)
   3628 		return ENXIO;
   3629 
   3630 	if ((error = raidlock(rs)) != 0)
   3631 		return (error);
   3632 
   3633 	error = raid_detach_unlocked(rs);
   3634 
   3635 	raidunlock(rs);
   3636 
   3637 	/* XXX raid can be referenced here */
   3638 
   3639 	if (error)
   3640 		return error;
   3641 
   3642 	/* Free the softc */
   3643 	raidput(rs);
   3644 
   3645 	return 0;
   3646 }
   3647 
/*
 * Publish disk geometry for the RAID set to the disk(9) layer.
 * NOTE(review): nsectors/ntracks are synthetic values (a RAID set has
 * no physical geometry); only secperunit/secsize are real.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count — 4 tracks per column */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3663 
   3664 /*
   3665  * Get cache info for all the components (including spares).
   3666  * Returns intersection of all the cache flags of all disks, or first
   3667  * error if any encountered.
   3668  * XXXfua feature flags can change as spares are added - lock down somehow
   3669  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* walk data columns and spares in one pass */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl) is not worth logging */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				/* first error wins; abort the scan */
				return error;
			}

			/* seed from the first disk, then intersect the rest */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3707 
   3708 /*
   3709  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3710  * We end up returning whatever error was returned by the first cache flush
   3711  * that fails.
   3712  */
   3713 
   3714 int
   3715 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3716 {
   3717 	int c, sparecol;
   3718 	int e,error;
   3719 	int force = 1;
   3720 
   3721 	error = 0;
   3722 	for (c = 0; c < raidPtr->numCol; c++) {
   3723 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3724 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3725 					  &force, FWRITE, NOCRED);
   3726 			if (e) {
   3727 				if (e != ENODEV)
   3728 					printf("raid%d: cache flush to component %s failed.\n",
   3729 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3730 				if (error == 0) {
   3731 					error = e;
   3732 				}
   3733 			}
   3734 		}
   3735 	}
   3736 
   3737 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3738 		sparecol = raidPtr->numCol + c;
   3739 		/* Need to ensure that the reconstruct actually completed! */
   3740 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3741 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3742 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3743 			if (e) {
   3744 				if (e != ENODEV)
   3745 					printf("raid%d: cache flush to component %s failed.\n",
   3746 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3747 				if (error == 0) {
   3748 					error = e;
   3749 				}
   3750 			}
   3751 		}
   3752 	}
   3753 	return error;
   3754 }
   3755 
   3756 /* Fill in info with the current status */
   3757 void
   3758 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3759 {
   3760 
   3761 	if (raidPtr->status != rf_rs_reconstructing) {
   3762 		info->total = 100;
   3763 		info->completed = 100;
   3764 	} else {
   3765 		info->total = raidPtr->reconControl->numRUsTotal;
   3766 		info->completed = raidPtr->reconControl->numRUsComplete;
   3767 	}
   3768 	info->remaining = info->total - info->completed;
   3769 }
   3770 
   3771 /* Fill in info with the current status */
   3772 void
   3773 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3774 {
   3775 
   3776 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3777 		info->total = raidPtr->Layout.numStripe;
   3778 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3779 	} else {
   3780 		info->completed = 100;
   3781 		info->total = 100;
   3782 	}
   3783 	info->remaining = info->total - info->completed;
   3784 }
   3785 
   3786 /* Fill in info with the current status */
   3787 void
   3788 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3789 {
   3790 
   3791 	if (raidPtr->copyback_in_progress == 1) {
   3792 		info->total = raidPtr->Layout.numStripe;
   3793 		info->completed = raidPtr->copyback_stripes_done;
   3794 		info->remaining = info->total - info->completed;
   3795 	} else {
   3796 		info->remaining = 0;
   3797 		info->completed = 100;
   3798 		info->total = 100;
   3799 	}
   3800 }
   3801 
   3802 /* Fill in config with the current info */
   3803 int
   3804 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3805 {
   3806 	int	d, i, j;
   3807 
   3808 	if (!raidPtr->valid)
   3809 		return (ENODEV);
   3810 	config->cols = raidPtr->numCol;
   3811 	config->ndevs = raidPtr->numCol;
   3812 	if (config->ndevs >= RF_MAX_DISKS)
   3813 		return (ENOMEM);
   3814 	config->nspares = raidPtr->numSpare;
   3815 	if (config->nspares >= RF_MAX_DISKS)
   3816 		return (ENOMEM);
   3817 	config->maxqdepth = raidPtr->maxQueueDepth;
   3818 	d = 0;
   3819 	for (j = 0; j < config->cols; j++) {
   3820 		config->devs[d] = raidPtr->Disks[j];
   3821 		d++;
   3822 	}
   3823 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3824 		config->spares[i] = raidPtr->Disks[j];
   3825 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3826 			/* XXX: raidctl(8) expects to see this as a used spare */
   3827 			config->spares[i].status = rf_ds_used_spare;
   3828 		}
   3829 	}
   3830 	return 0;
   3831 }
   3832 
   3833 int
   3834 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3835 {
   3836 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3837 	RF_ComponentLabel_t *raid_clabel;
   3838 	int column = clabel->column;
   3839 
   3840 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3841 		return EINVAL;
   3842 	raid_clabel = raidget_component_label(raidPtr, column);
   3843 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3844 
   3845 	return 0;
   3846 }
   3847 
   3848 /*
   3849  * Module interface
   3850  */
   3851 
   3852 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3853 
   3854 #ifdef _MODULE
   3855 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3856 #endif
   3857 
   3858 static int raid_modcmd(modcmd_t, void *);
   3859 static int raid_modcmd_init(void);
   3860 static int raid_modcmd_fini(void);
   3861 
   3862 static int
   3863 raid_modcmd(modcmd_t cmd, void *data)
   3864 {
   3865 	int error;
   3866 
   3867 	error = 0;
   3868 	switch (cmd) {
   3869 	case MODULE_CMD_INIT:
   3870 		error = raid_modcmd_init();
   3871 		break;
   3872 	case MODULE_CMD_FINI:
   3873 		error = raid_modcmd_fini();
   3874 		break;
   3875 	default:
   3876 		error = ENOTTY;
   3877 		break;
   3878 	}
   3879 	return error;
   3880 }
   3881 
/*
 * MODULE_CMD_INIT handler: attach the raid block/character devsw, the
 * cfdriver (module builds only) and the cfattach, boot the RAIDframe
 * core, and register a finalizer that auto-configures RAID sets once
 * device discovery is done.  On an attach failure, everything attached
 * so far is rolled back in reverse order before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets the kernel choose the device major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be present. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver and devsw attaches, in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is necessarily 0 at this point -- every
	 * failing path above already returned, and the last assignment
	 * came from the successful config_cfattach_attach() call -- so
	 * this condition always takes the true branch.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/*
		 * Auto-configuration is a convenience; warn but do not
		 * fail the module load over it.
		 */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3952 
/*
 * MODULE_CMD_FINI handler: tear down what raid_modcmd_init() set up,
 * in reverse order.  Unloading is refused with EBUSY while any raid
 * unit still exists.  If a later detach step fails, the earlier
 * detaches are undone by re-attaching, so the module is left in a
 * consistent, still-loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach so the module stays usable. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach cfdriver and cfattach to back out cleanly. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All detaches succeeded: shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4002