Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.345.2.7
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.345.2.7 2016/11/04 14:49:15 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.345.2.7 2016/11/04 14:49:15 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/localcount.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #include "ioconf.h"
    157 
/*
 * Debug print helpers.
 *
 * db1_printf((fmt, ...)) prints when DEBUG is defined and rf_kdebug_level
 * is greater than 0; note the doubled parentheses at the call site.  Both
 * variants expand to a single do/while(0) statement so that the macro can
 * be used as the body of an unbraced if/else without capturing the
 * following "else" or leaving a stray empty statement behind.
 *
 * DPRINTF(fmt, ...) prints only when DEBUG_ROOT is defined and otherwise
 * expands to nothing.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
    170 
    171 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    172 static rf_declare_mutex2(rf_sparet_wait_mutex);
    173 static rf_declare_cond2(rf_sparet_wait_cv);
    174 static rf_declare_cond2(rf_sparet_resp_cv);
    175 
    176 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    177 						 * spare table */
    178 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    179 						 * installation process */
    180 #endif
    181 
    182 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    183 
    184 /* prototypes */
    185 static void KernelWakeupFunc(struct buf *);
    186 static void InitBP(struct buf *, struct vnode *, unsigned,
    187     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    188     void *, int, struct proc *);
    189 struct raid_softc;
    190 static void raidinit(struct raid_softc *);
    191 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
/*
 * Block-device switch table for the raid driver: routes open, close,
 * strategy, ioctl, dump and size requests to the raid* entry points
 * declared above.  Discard is not supported (nodiscard).
 */
const struct bdevsw raid_bdevsw = {
	DEVSW_MODULE_INIT
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    231 
/*
 * Character-device switch table for the raid driver.  It shares the
 * open/close/ioctl entry points with the block table and adds
 * raidread/raidwrite; the remaining slots are filled with the no*
 * placeholder handlers.
 */
const struct cdevsw raid_cdevsw = {
	DEVSW_MODULE_INIT
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    247 
/*
 * dkdriver callback table for raid units.  Besides the usual
 * open/close/strategy entry points it supplies raid_diskstart,
 * raid_dumpblocks and raid_lastclose, all defined in this file.
 * NOTE(review): presumably registered with the disk framework when a
 * unit is initialised -- the registration is not in this part of the
 * file.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    257 
/*
 * Per-unit driver state.  Instances are allocated by raidcreate(),
 * kept on the global "raids" list and looked up by unit number in
 * raidget().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* state passed to the dk_*() helpers */
	int	sc_unit;		/* unit number, set in raidcreate() */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* the RAIDframe array descriptor itself */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the "raids" list */
};
    270 /* sc_flags */
    271 #define RAIDF_INITED		0x01	/* unit has been initialized */
    272 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    273 #define RAIDF_DETACH  		0x04	/* detach after final close */
    274 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    275 #define RAIDF_LOCKED		0x10	/* unit is locked */
    276 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    277 
    278 #define	raidunit(x)	DISKUNIT(x)
    279 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    280 
    281 extern struct cfdriver raid_cd;
    282 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    283     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    284     DVF_DETACH_SHUTDOWN);
    285 
    286 /*
    287  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    288  * Be aware that large numbers can allow the driver to consume a lot of
    289  * kernel memory, especially on writes, and in degraded mode reads.
    290  *
    291  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    292  * a single 64K write will typically require 64K for the old data,
    293  * 64K for the old parity, and 64K for the new parity, for a total
    294  * of 192K (if the parity buffer is not re-used immediately).
    295  * Even if it is used immediately, that's still 128K, which when multiplied
    296  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    297  *
    298  * Now in degraded mode, for example, a 64K read on the above setup may
    299  * require data reconstruction, which will require *all* of the 4 remaining
    300  * disks to participate -- 4 * 32K/disk == 128K again.
    301  */
    302 
    303 #ifndef RAIDOUTSTANDING
    304 #define RAIDOUTSTANDING   6
    305 #endif
    306 
    307 #define RAIDLABELDEV(dev)	\
    308 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    309 
    310 /* declared here, and made public, for the benefit of KVM stuff.. */
    311 
    312 static int raidlock(struct raid_softc *);
    313 static void raidunlock(struct raid_softc *);
    314 
    315 static int raid_detach_unlocked(struct raid_softc *);
    316 
    317 static void rf_markalldirty(RF_Raid_t *);
    318 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    319 
    320 void rf_ReconThread(struct rf_recon_req *);
    321 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    322 void rf_CopybackThread(RF_Raid_t *raidPtr);
    323 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    324 int rf_autoconfig(device_t);
    325 void rf_buildroothack(RF_ConfigSet_t *);
    326 
    327 RF_AutoConfig_t *rf_find_raid_components(void);
    328 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    329 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    330 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    331 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    332 int rf_set_autoconfig(RF_Raid_t *, int);
    333 int rf_set_rootpartition(RF_Raid_t *, int);
    334 void rf_release_all_vps(RF_ConfigSet_t *);
    335 void rf_cleanup_config_set(RF_ConfigSet_t *);
    336 int rf_have_enough_components(RF_ConfigSet_t *);
    337 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    338 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    339 
    340 /*
    341  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    342  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    343  * in the kernel config file.
    344  */
    345 #ifdef RAID_AUTOCONFIG
    346 int raidautoconfig = 1;
    347 #else
    348 int raidautoconfig = 0;
    349 #endif
    350 static bool raidautoconfigdone = false;
    351 
/* Pool bookkeeping for the driver (type declared in the RAIDframe headers). */
struct RF_Pools_s rf_pools;

/*
 * List of every raid_softc created through raidget(), and the mutex
 * that is held around each traversal or modification of that list.
 */
static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;
    356 
    357 static struct raid_softc *
    358 raidcreate(int unit) {
    359 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    360 	if (sc == NULL) {
    361 #ifdef DIAGNOSTIC
    362 		printf("%s: out of memory\n", __func__);
    363 #endif
    364 		return NULL;
    365 	}
    366 	sc->sc_unit = unit;
    367 	cv_init(&sc->sc_cv, "raidunit");
    368 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    369 	return sc;
    370 }
    371 
    372 static void
    373 raiddestroy(struct raid_softc *sc) {
    374 	cv_destroy(&sc->sc_cv);
    375 	mutex_destroy(&sc->sc_mutex);
    376 	kmem_free(sc, sizeof(*sc));
    377 }
    378 
    379 static struct raid_softc *
    380 raidget(int unit, bool create) {
    381 	struct raid_softc *sc;
    382 	if (unit < 0) {
    383 #ifdef DIAGNOSTIC
    384 		panic("%s: unit %d!", __func__, unit);
    385 #endif
    386 		return NULL;
    387 	}
    388 	mutex_enter(&raid_lock);
    389 	LIST_FOREACH(sc, &raids, sc_link) {
    390 		if (sc->sc_unit == unit) {
    391 			mutex_exit(&raid_lock);
    392 			return sc;
    393 		}
    394 	}
    395 	mutex_exit(&raid_lock);
    396 	if (!create)
    397 		return NULL;
    398 	if ((sc = raidcreate(unit)) == NULL)
    399 		return NULL;
    400 	mutex_enter(&raid_lock);
    401 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    402 	mutex_exit(&raid_lock);
    403 	return sc;
    404 }
    405 
    406 static void
    407 raidput(struct raid_softc *sc) {
    408 	mutex_enter(&raid_lock);
    409 	LIST_REMOVE(sc, sc_link);
    410 	mutex_exit(&raid_lock);
    411 	raiddestroy(sc);
    412 }
    413 
/*
 * raidattach -- pseudo-device attach hook.  Intentionally empty: device
 * attachment and the associated initialization are performed as part of
 * module initialization instead, so `num' is ignored.
 */
void
raidattach(int num)
{
	/* nothing to do */
}
    423 
    424 int
    425 rf_autoconfig(device_t self)
    426 {
    427 	RF_AutoConfig_t *ac_list;
    428 	RF_ConfigSet_t *config_sets;
    429 
    430 	if (!raidautoconfig || raidautoconfigdone == true)
    431 		return (0);
    432 
    433 	/* XXX This code can only be run once. */
    434 	raidautoconfigdone = true;
    435 
    436 #ifdef __HAVE_CPU_BOOTCONF
    437 	/*
    438 	 * 0. find the boot device if needed first so we can use it later
    439 	 * this needs to be done before we autoconfigure any raid sets,
    440 	 * because if we use wedges we are not going to be able to open
    441 	 * the boot device later
    442 	 */
    443 	if (booted_device == NULL)
    444 		cpu_bootconf();
    445 #endif
    446 	/* 1. locate all RAID components on the system */
    447 	aprint_debug("Searching for RAID components...\n");
    448 	ac_list = rf_find_raid_components();
    449 
    450 	/* 2. Sort them into their respective sets. */
    451 	config_sets = rf_create_auto_sets(ac_list);
    452 
    453 	/*
    454 	 * 3. Evaluate each set and configure the valid ones.
    455 	 * This gets done in rf_buildroothack().
    456 	 */
    457 	rf_buildroothack(config_sets);
    458 
    459 	return 1;
    460 }
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
    486 void
    487 rf_buildroothack(RF_ConfigSet_t *config_sets)
    488 {
    489 	RF_ConfigSet_t *cset;
    490 	RF_ConfigSet_t *next_cset;
    491 	int num_root;
    492 	struct raid_softc *sc, *rsc;
    493 	struct dk_softc *dksc;
    494 
    495 	sc = rsc = NULL;
    496 	num_root = 0;
    497 	cset = config_sets;
    498 	while (cset != NULL) {
    499 		next_cset = cset->next;
    500 		if (rf_have_enough_components(cset) &&
    501 		    cset->ac->clabel->autoconfigure == 1) {
    502 			sc = rf_auto_config_set(cset);
    503 			if (sc != NULL) {
    504 				aprint_debug("raid%d: configured ok\n",
    505 				    sc->sc_unit);
    506 				if (cset->rootable) {
    507 					rsc = sc;
    508 					num_root++;
    509 				}
    510 			} else {
    511 				/* The autoconfig didn't work :( */
    512 				aprint_debug("Autoconfig failed\n");
    513 				rf_release_all_vps(cset);
    514 			}
    515 		} else {
    516 			/* we're not autoconfiguring this set...
    517 			   release the associated resources */
    518 			rf_release_all_vps(cset);
    519 		}
    520 		/* cleanup */
    521 		rf_cleanup_config_set(cset);
    522 		cset = next_cset;
    523 	}
    524 	dksc = &rsc->sc_dksc;
    525 
    526 	/* if the user has specified what the root device should be
    527 	   then we don't touch booted_device or boothowto... */
    528 
    529 	if (rootspec != NULL)
    530 		return;
    531 
    532 	/* we found something bootable... */
    533 
    534 	/*
    535 	 * XXX: The following code assumes that the root raid
    536 	 * is the first ('a') partition. This is about the best
    537 	 * we can do with a BSD disklabel, but we might be able
    538 	 * to do better with a GPT label, by setting a specified
    539 	 * attribute to indicate the root partition. We can then
    540 	 * stash the partition number in the r->root_partition
    541 	 * high bits (the bottom 2 bits are already used). For
    542 	 * now we just set booted_partition to 0 when we override
    543 	 * root.
    544 	 */
    545 	if (num_root == 1) {
    546 		device_t candidate_root;
    547 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    548 			char cname[sizeof(cset->ac->devname)];
    549 			/* XXX: assume partition 'a' first */
    550 			snprintf(cname, sizeof(cname), "%s%c",
    551 			    device_xname(dksc->sc_dev), 'a');
    552 			candidate_root = dkwedge_find_by_wname(cname);
    553 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    554 			    cname);
    555 			if (candidate_root == NULL) {
    556 				/*
    557 				 * If that is not found, because we don't use
    558 				 * disklabel, return the first dk child
    559 				 * XXX: we can skip the 'a' check above
    560 				 * and always do this...
    561 				 */
    562 				size_t i = 0;
    563 				candidate_root = dkwedge_find_by_parent(
    564 				    device_xname(dksc->sc_dev), &i);
    565 			}
    566 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    567 			    candidate_root);
    568 		} else
    569 			candidate_root = dksc->sc_dev;
    570 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    571 		DPRINTF("%s: booted_device=%p root_partition=%d "
    572 		   "contains_boot=%d\n", __func__, booted_device,
    573 		   rsc->sc_r.root_partition,
    574 		   rf_containsboot(&rsc->sc_r, booted_device));
    575 		if (booted_device == NULL ||
    576 		    rsc->sc_r.root_partition == 1 ||
    577 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    578 			booted_device = candidate_root;
    579 			booted_partition = 0;	/* XXX assume 'a' */
    580 		}
    581 	} else if (num_root > 1) {
    582 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    583 		    booted_device);
    584 
    585 		/*
    586 		 * Maybe the MD code can help. If it cannot, then
    587 		 * setroot() will discover that we have no
    588 		 * booted_device and will ask the user if nothing was
    589 		 * hardwired in the kernel config file
    590 		 */
    591 		if (booted_device == NULL)
    592 			return;
    593 
    594 		num_root = 0;
    595 		mutex_enter(&raid_lock);
    596 		LIST_FOREACH(sc, &raids, sc_link) {
    597 			RF_Raid_t *r = &sc->sc_r;
    598 			if (r->valid == 0)
    599 				continue;
    600 
    601 			if (r->root_partition == 0)
    602 				continue;
    603 
    604 			if (rf_containsboot(r, booted_device)) {
    605 				num_root++;
    606 				rsc = sc;
    607 				dksc = &rsc->sc_dksc;
    608 			}
    609 		}
    610 		mutex_exit(&raid_lock);
    611 
    612 		if (num_root == 1) {
    613 			booted_device = dksc->sc_dev;
    614 			booted_partition = 0;	/* XXX assume 'a' */
    615 		} else {
    616 			/* we can't guess.. require the user to answer... */
    617 			boothowto |= RB_ASKNAME;
    618 		}
    619 	}
    620 }
    621 
    622 static int
    623 raidsize(dev_t dev)
    624 {
    625 	struct raid_softc *rs;
    626 	struct dk_softc *dksc;
    627 	unsigned int unit;
    628 
    629 	unit = raidunit(dev);
    630 	if ((rs = raidget(unit, false)) == NULL)
    631 		return -1;
    632 	dksc = &rs->sc_dksc;
    633 
    634 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    635 		return -1;
    636 
    637 	return dk_size(dksc, dev);
    638 }
    639 
    640 static int
    641 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    642 {
    643 	unsigned int unit;
    644 	struct raid_softc *rs;
    645 	struct dk_softc *dksc;
    646 
    647 	unit = raidunit(dev);
    648 	if ((rs = raidget(unit, false)) == NULL)
    649 		return ENXIO;
    650 	dksc = &rs->sc_dksc;
    651 
    652 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    653 		return ENODEV;
    654 
    655         /*
    656            Note that blkno is relative to this particular partition.
    657            By adding adding RF_PROTECTED_SECTORS, we get a value that
    658 	   is relative to the partition used for the underlying component.
    659         */
    660 	blkno += RF_PROTECTED_SECTORS;
    661 
    662 	return dk_dump(dksc, dev, blkno, va, size);
    663 }
    664 
    665 static int
    666 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    667 {
    668 	struct raid_softc *rs = raidsoftc(dev);
    669 	const struct bdevsw *bdev;
    670 	RF_Raid_t *raidPtr;
    671 	int     c, sparecol, j, scol, dumpto;
    672 	int     error = 0;
    673 
    674 	raidPtr = &rs->sc_r;
    675 
    676 	/* we only support dumping to RAID 1 sets */
    677 	if (raidPtr->Layout.numDataCol != 1 ||
    678 	    raidPtr->Layout.numParityCol != 1)
    679 		return EINVAL;
    680 
    681 	if ((error = raidlock(rs)) != 0)
    682 		return error;
    683 
    684 	/* figure out what device is alive.. */
    685 
    686 	/*
    687 	   Look for a component to dump to.  The preference for the
    688 	   component to dump to is as follows:
    689 	   1) the master
    690 	   2) a used_spare of the master
    691 	   3) the slave
    692 	   4) a used_spare of the slave
    693 	*/
    694 
    695 	dumpto = -1;
    696 	for (c = 0; c < raidPtr->numCol; c++) {
    697 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    698 			/* this might be the one */
    699 			dumpto = c;
    700 			break;
    701 		}
    702 	}
    703 
    704 	/*
    705 	   At this point we have possibly selected a live master or a
    706 	   live slave.  We now check to see if there is a spared
    707 	   master (or a spared slave), if we didn't find a live master
    708 	   or a live slave.
    709 	*/
    710 
    711 	for (c = 0; c < raidPtr->numSpare; c++) {
    712 		sparecol = raidPtr->numCol + c;
    713 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    714 			/* How about this one? */
    715 			scol = -1;
    716 			for(j=0;j<raidPtr->numCol;j++) {
    717 				if (raidPtr->Disks[j].spareCol == sparecol) {
    718 					scol = j;
    719 					break;
    720 				}
    721 			}
    722 			if (scol == 0) {
    723 				/*
    724 				   We must have found a spared master!
    725 				   We'll take that over anything else
    726 				   found so far.  (We couldn't have
    727 				   found a real master before, since
    728 				   this is a used spare, and it's
    729 				   saying that it's replacing the
    730 				   master.)  On reboot (with
    731 				   autoconfiguration turned on)
    732 				   sparecol will become the 1st
    733 				   component (component0) of this set.
    734 				*/
    735 				dumpto = sparecol;
    736 				break;
    737 			} else if (scol != -1) {
    738 				/*
    739 				   Must be a spared slave.  We'll dump
    740 				   to that if we havn't found anything
    741 				   else so far.
    742 				*/
    743 				if (dumpto == -1)
    744 					dumpto = sparecol;
    745 			}
    746 		}
    747 	}
    748 
    749 	if (dumpto == -1) {
    750 		/* we couldn't find any live components to dump to!?!?
    751 		 */
    752 		error = EINVAL;
    753 		goto out;
    754 	}
    755 
    756 	bdev = bdevsw_lookup_acquire(raidPtr->Disks[dumpto].dev);
    757 	if (bdev == NULL) {
    758 		error = ENXIO;
    759 		goto out;
    760 	}
    761 
    762 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    763 				blkno, va, nblk * raidPtr->bytesPerSector);
    764 	bdevsw_release(bdev);
    765 out:
    766 	raidunlock(rs);
    767 
    768 	return error;
    769 }
    770 
    771 /* ARGSUSED */
    772 static int
    773 raidopen(dev_t dev, int flags, int fmt,
    774     struct lwp *l)
    775 {
    776 	int     unit = raidunit(dev);
    777 	struct raid_softc *rs;
    778 	struct dk_softc *dksc;
    779 	int     error = 0;
    780 	int     part, pmask;
    781 
    782 	if ((rs = raidget(unit, true)) == NULL)
    783 		return ENXIO;
    784 	if ((error = raidlock(rs)) != 0)
    785 		return (error);
    786 
    787 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    788 		error = EBUSY;
    789 		goto bad;
    790 	}
    791 
    792 	dksc = &rs->sc_dksc;
    793 
    794 	part = DISKPART(dev);
    795 	pmask = (1 << part);
    796 
    797 	if (!DK_BUSY(dksc, pmask) &&
    798 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    799 		/* First one... mark things as dirty... Note that we *MUST*
    800 		 have done a configure before this.  I DO NOT WANT TO BE
    801 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    802 		 THAT THEY BELONG TOGETHER!!!!! */
    803 		/* XXX should check to see if we're only open for reading
    804 		   here... If so, we needn't do this, but then need some
    805 		   other way of keeping track of what's happened.. */
    806 
    807 		rf_markalldirty(&rs->sc_r);
    808 	}
    809 
    810 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    811 		error = dk_open(dksc, dev, flags, fmt, l);
    812 
    813 bad:
    814 	raidunlock(rs);
    815 
    816 	return (error);
    817 
    818 
    819 }
    820 
    821 static int
    822 raid_lastclose(device_t self)
    823 {
    824 	struct raid_softc *rs = raidsoftc(self);
    825 
    826 	/* Last one... device is not unconfigured yet.
    827 	   Device shutdown has taken care of setting the
    828 	   clean bits if RAIDF_INITED is not set
    829 	   mark things as clean... */
    830 
    831 	rf_update_component_labels(&rs->sc_r,
    832 	    RF_FINAL_COMPONENT_UPDATE);
    833 
    834 	/* pass to unlocked code */
    835 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    836 		rs->sc_flags |= RAIDF_DETACH;
    837 
    838 	return 0;
    839 }
    840 
    841 /* ARGSUSED */
    842 static int
    843 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    844 {
    845 	int     unit = raidunit(dev);
    846 	struct raid_softc *rs;
    847 	struct dk_softc *dksc;
    848 	cfdata_t cf;
    849 	int     error = 0, do_detach = 0, do_put = 0;
    850 
    851 	if ((rs = raidget(unit, false)) == NULL)
    852 		return ENXIO;
    853 	dksc = &rs->sc_dksc;
    854 
    855 	if ((error = raidlock(rs)) != 0)
    856 		return (error);
    857 
    858 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    859 		error = dk_close(dksc, dev, flags, fmt, l);
    860 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    861 			do_detach = 1;
    862 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    863 		do_put = 1;
    864 
    865 	raidunlock(rs);
    866 
    867 	if (do_detach) {
    868 		/* free the pseudo device attach bits */
    869 		cf = device_cfdata(dksc->sc_dev);
    870 		error = config_detach(dksc->sc_dev, 0);
    871 		if (error == 0)
    872 			free(cf, M_RAIDFRAME);
    873 	} else if (do_put) {
    874 		raidput(rs);
    875 	}
    876 
    877 	return (error);
    878 
    879 }
    880 
    881 static void
    882 raid_wakeup(RF_Raid_t *raidPtr)
    883 {
    884 	rf_lock_mutex2(raidPtr->iodone_lock);
    885 	rf_signal_cond2(raidPtr->iodone_cv);
    886 	rf_unlock_mutex2(raidPtr->iodone_lock);
    887 }
    888 
    889 static void
    890 raidstrategy(struct buf *bp)
    891 {
    892 	unsigned int unit;
    893 	struct raid_softc *rs;
    894 	struct dk_softc *dksc;
    895 	RF_Raid_t *raidPtr;
    896 
    897 	unit = raidunit(bp->b_dev);
    898 	if ((rs = raidget(unit, false)) == NULL) {
    899 		bp->b_error = ENXIO;
    900 		goto fail;
    901 	}
    902 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	dksc = &rs->sc_dksc;
    907 	raidPtr = &rs->sc_r;
    908 
    909 	/* Queue IO only */
    910 	if (dk_strategy_defer(dksc, bp))
    911 		goto done;
    912 
    913 	/* schedule the IO to happen at the next convenient time */
    914 	raid_wakeup(raidPtr);
    915 
    916 done:
    917 	return;
    918 
    919 fail:
    920 	bp->b_resid = bp->b_bcount;
    921 	biodone(bp);
    922 }
    923 
    924 static int
    925 raid_diskstart(device_t dev, struct buf *bp)
    926 {
    927 	struct raid_softc *rs = raidsoftc(dev);
    928 	RF_Raid_t *raidPtr;
    929 
    930 	raidPtr = &rs->sc_r;
    931 	if (!raidPtr->valid) {
    932 		db1_printf(("raid is not valid..\n"));
    933 		return ENODEV;
    934 	}
    935 
    936 	/* XXX */
    937 	bp->b_resid = 0;
    938 
    939 	return raiddoaccess(raidPtr, bp);
    940 }
    941 
    942 void
    943 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    944 {
    945 	struct raid_softc *rs;
    946 	struct dk_softc *dksc;
    947 
    948 	rs = raidPtr->softc;
    949 	dksc = &rs->sc_dksc;
    950 
    951 	dk_done(dksc, bp);
    952 
    953 	rf_lock_mutex2(raidPtr->mutex);
    954 	raidPtr->openings++;
    955 	rf_unlock_mutex2(raidPtr->mutex);
    956 
    957 	/* schedule more IO */
    958 	raid_wakeup(raidPtr);
    959 }
    960 
    961 /* ARGSUSED */
    962 static int
    963 raidread(dev_t dev, struct uio *uio, int flags)
    964 {
    965 	int     unit = raidunit(dev);
    966 	struct raid_softc *rs;
    967 
    968 	if ((rs = raidget(unit, false)) == NULL)
    969 		return ENXIO;
    970 
    971 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    972 		return (ENXIO);
    973 
    974 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    975 
    976 }
    977 
    978 /* ARGSUSED */
    979 static int
    980 raidwrite(dev_t dev, struct uio *uio, int flags)
    981 {
    982 	int     unit = raidunit(dev);
    983 	struct raid_softc *rs;
    984 
    985 	if ((rs = raidget(unit, false)) == NULL)
    986 		return ENXIO;
    987 
    988 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    989 		return (ENXIO);
    990 
    991 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    992 
    993 }
    994 
    995 static int
    996 raid_detach_unlocked(struct raid_softc *rs)
    997 {
    998 	struct dk_softc *dksc = &rs->sc_dksc;
    999 	RF_Raid_t *raidPtr;
   1000 	int error;
   1001 
   1002 	raidPtr = &rs->sc_r;
   1003 
   1004 	if (DK_BUSY(dksc, 0) ||
   1005 	    raidPtr->recon_in_progress != 0 ||
   1006 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1007 	    raidPtr->copyback_in_progress != 0)
   1008 		return EBUSY;
   1009 
   1010 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1011 		return 0;
   1012 
   1013 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1014 
   1015 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1016 		return error;
   1017 
   1018 	rs->sc_flags &= ~RAIDF_INITED;
   1019 
   1020 	/* Kill off any queued buffers */
   1021 	dk_drain(dksc);
   1022 	bufq_free(dksc->sc_bufq);
   1023 
   1024 	/* Detach the disk. */
   1025 	dkwedge_delall(&dksc->sc_dkdev);
   1026 	disk_detach(&dksc->sc_dkdev);
   1027 	disk_destroy(&dksc->sc_dkdev);
   1028 	dk_detach(dksc);
   1029 
   1030 	return 0;
   1031 }
   1032 
   1033 static int
   1034 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1035 {
   1036 	int     unit = raidunit(dev);
   1037 	int     error = 0;
   1038 	int     part, pmask;
   1039 	struct raid_softc *rs;
   1040 	struct dk_softc *dksc;
   1041 	RF_Config_t *k_cfg, *u_cfg;
   1042 	RF_Raid_t *raidPtr;
   1043 	RF_RaidDisk_t *diskPtr;
   1044 	RF_AccTotals_t *totals;
   1045 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1046 	u_char *specific_buf;
   1047 	int retcode = 0;
   1048 	int column;
   1049 /*	int raidid; */
   1050 	struct rf_recon_req *rrcopy, *rr;
   1051 	RF_ComponentLabel_t *clabel;
   1052 	RF_ComponentLabel_t *ci_label;
   1053 	RF_ComponentLabel_t **clabel_ptr;
   1054 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1055 	RF_SingleComponent_t component;
   1056 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1057 	int i, j, d;
   1058 
   1059 	if ((rs = raidget(unit, false)) == NULL)
   1060 		return ENXIO;
   1061 	dksc = &rs->sc_dksc;
   1062 	raidPtr = &rs->sc_r;
   1063 
   1064 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1065 		(int) DISKPART(dev), (int) unit, cmd));
   1066 
   1067 	/* Must be initialized for these... */
   1068 	switch (cmd) {
   1069 	case RAIDFRAME_REWRITEPARITY:
   1070 	case RAIDFRAME_GET_INFO:
   1071 	case RAIDFRAME_RESET_ACCTOTALS:
   1072 	case RAIDFRAME_GET_ACCTOTALS:
   1073 	case RAIDFRAME_KEEP_ACCTOTALS:
   1074 	case RAIDFRAME_GET_SIZE:
   1075 	case RAIDFRAME_FAIL_DISK:
   1076 	case RAIDFRAME_COPYBACK:
   1077 	case RAIDFRAME_CHECK_RECON_STATUS:
   1078 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1079 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1080 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1081 	case RAIDFRAME_ADD_HOT_SPARE:
   1082 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1083 	case RAIDFRAME_INIT_LABELS:
   1084 	case RAIDFRAME_REBUILD_IN_PLACE:
   1085 	case RAIDFRAME_CHECK_PARITY:
   1086 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1087 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1088 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1089 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1090 	case RAIDFRAME_SET_AUTOCONFIG:
   1091 	case RAIDFRAME_SET_ROOT:
   1092 	case RAIDFRAME_DELETE_COMPONENT:
   1093 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1094 	case RAIDFRAME_PARITYMAP_STATUS:
   1095 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1096 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1097 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1098 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1099 			return (ENXIO);
   1100 	}
   1101 
   1102 	switch (cmd) {
   1103 #ifdef COMPAT_50
   1104 	case RAIDFRAME_GET_INFO50:
   1105 		return rf_get_info50(raidPtr, data);
   1106 
   1107 	case RAIDFRAME_CONFIGURE50:
   1108 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1109 			return retcode;
   1110 		goto config;
   1111 #endif
   1112 		/* configure the system */
   1113 	case RAIDFRAME_CONFIGURE:
   1114 
   1115 		if (raidPtr->valid) {
   1116 			/* There is a valid RAID set running on this unit! */
   1117 			printf("raid%d: Device already configured!\n",unit);
   1118 			return(EINVAL);
   1119 		}
   1120 
   1121 		/* copy-in the configuration information */
   1122 		/* data points to a pointer to the configuration structure */
   1123 
   1124 		u_cfg = *((RF_Config_t **) data);
   1125 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1126 		if (k_cfg == NULL) {
   1127 			return (ENOMEM);
   1128 		}
   1129 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1130 		if (retcode) {
   1131 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1132 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1133 				retcode));
   1134 			goto no_config;
   1135 		}
   1136 		goto config;
   1137 	config:
   1138 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1139 
   1140 		/* allocate a buffer for the layout-specific data, and copy it
   1141 		 * in */
   1142 		if (k_cfg->layoutSpecificSize) {
   1143 			if (k_cfg->layoutSpecificSize > 10000) {
   1144 				/* sanity check */
   1145 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1146 				retcode = EINVAL;
   1147 				goto no_config;
   1148 			}
   1149 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1150 			    (u_char *));
   1151 			if (specific_buf == NULL) {
   1152 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1153 				retcode = ENOMEM;
   1154 				goto no_config;
   1155 			}
   1156 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1157 			    k_cfg->layoutSpecificSize);
   1158 			if (retcode) {
   1159 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1160 				RF_Free(specific_buf,
   1161 					k_cfg->layoutSpecificSize);
   1162 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1163 					retcode));
   1164 				goto no_config;
   1165 			}
   1166 		} else
   1167 			specific_buf = NULL;
   1168 		k_cfg->layoutSpecific = specific_buf;
   1169 
   1170 		/* should do some kind of sanity check on the configuration.
   1171 		 * Store the sum of all the bytes in the last byte? */
   1172 
   1173 		/* configure the system */
   1174 
   1175 		/*
   1176 		 * Clear the entire RAID descriptor, just to make sure
   1177 		 *  there is no stale data left in the case of a
   1178 		 *  reconfiguration
   1179 		 */
   1180 		memset(raidPtr, 0, sizeof(*raidPtr));
   1181 		raidPtr->softc = rs;
   1182 		raidPtr->raidid = unit;
   1183 
   1184 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1185 
   1186 		if (retcode == 0) {
   1187 
   1188 			/* allow this many simultaneous IO's to
   1189 			   this RAID device */
   1190 			raidPtr->openings = RAIDOUTSTANDING;
   1191 
   1192 			raidinit(rs);
   1193 			raid_wakeup(raidPtr);
   1194 			rf_markalldirty(raidPtr);
   1195 		}
   1196 		/* free the buffers.  No return code here. */
   1197 		if (k_cfg->layoutSpecificSize) {
   1198 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1199 		}
   1200 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1201 
   1202 	no_config:
   1203 		/*
   1204 		 * If configuration failed, set sc_flags so that we
   1205 		 * will detach the device when we close it.
   1206 		 */
   1207 		if (retcode != 0)
   1208 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1209 		return (retcode);
   1210 
   1211 		/* shutdown the system */
   1212 	case RAIDFRAME_SHUTDOWN:
   1213 
   1214 		part = DISKPART(dev);
   1215 		pmask = (1 << part);
   1216 
   1217 		if ((error = raidlock(rs)) != 0)
   1218 			return (error);
   1219 
   1220 		if (DK_BUSY(dksc, pmask) ||
   1221 		    raidPtr->recon_in_progress != 0 ||
   1222 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1223 		    raidPtr->copyback_in_progress != 0)
   1224 			retcode = EBUSY;
   1225 		else {
   1226 			/* detach and free on close */
   1227 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1228 			retcode = 0;
   1229 		}
   1230 
   1231 		raidunlock(rs);
   1232 
   1233 		return (retcode);
   1234 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1235 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1236 		/* need to read the component label for the disk indicated
   1237 		   by row,column in clabel */
   1238 
   1239 		/*
   1240 		 * Perhaps there should be an option to skip the in-core
   1241 		 * copy and hit the disk, as with disklabel(8).
   1242 		 */
   1243 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1244 
   1245 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1246 
   1247 		if (retcode) {
   1248 			RF_Free(clabel, sizeof(*clabel));
   1249 			return retcode;
   1250 		}
   1251 
   1252 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1253 
   1254 		column = clabel->column;
   1255 
   1256 		if ((column < 0) || (column >= raidPtr->numCol +
   1257 		    raidPtr->numSpare)) {
   1258 			RF_Free(clabel, sizeof(*clabel));
   1259 			return EINVAL;
   1260 		}
   1261 
   1262 		RF_Free(clabel, sizeof(*clabel));
   1263 
   1264 		clabel = raidget_component_label(raidPtr, column);
   1265 
   1266 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1267 
   1268 #if 0
   1269 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1270 		clabel = (RF_ComponentLabel_t *) data;
   1271 
   1272 		/* XXX check the label for valid stuff... */
   1273 		/* Note that some things *should not* get modified --
   1274 		   the user should be re-initing the labels instead of
   1275 		   trying to patch things.
   1276 		   */
   1277 
   1278 		raidid = raidPtr->raidid;
   1279 #ifdef DEBUG
   1280 		printf("raid%d: Got component label:\n", raidid);
   1281 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1282 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1283 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1284 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1285 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1286 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1287 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1288 #endif
   1289 		clabel->row = 0;
   1290 		column = clabel->column;
   1291 
   1292 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1293 			return(EINVAL);
   1294 		}
   1295 
   1296 		/* XXX this isn't allowed to do anything for now :-) */
   1297 
   1298 		/* XXX and before it is, we need to fill in the rest
   1299 		   of the fields!?!?!?! */
   1300 		memcpy(raidget_component_label(raidPtr, column),
   1301 		    clabel, sizeof(*clabel));
   1302 		raidflush_component_label(raidPtr, column);
   1303 		return (0);
   1304 #endif
   1305 
   1306 	case RAIDFRAME_INIT_LABELS:
   1307 		clabel = (RF_ComponentLabel_t *) data;
   1308 		/*
   1309 		   we only want the serial number from
   1310 		   the above.  We get all the rest of the information
   1311 		   from the config that was used to create this RAID
   1312 		   set.
   1313 		   */
   1314 
   1315 		raidPtr->serial_number = clabel->serial_number;
   1316 
   1317 		for(column=0;column<raidPtr->numCol;column++) {
   1318 			diskPtr = &raidPtr->Disks[column];
   1319 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1320 				ci_label = raidget_component_label(raidPtr,
   1321 				    column);
   1322 				/* Zeroing this is important. */
   1323 				memset(ci_label, 0, sizeof(*ci_label));
   1324 				raid_init_component_label(raidPtr, ci_label);
   1325 				ci_label->serial_number =
   1326 				    raidPtr->serial_number;
   1327 				ci_label->row = 0; /* we dont' pretend to support more */
   1328 				rf_component_label_set_partitionsize(ci_label,
   1329 				    diskPtr->partitionSize);
   1330 				ci_label->column = column;
   1331 				raidflush_component_label(raidPtr, column);
   1332 			}
   1333 			/* XXXjld what about the spares? */
   1334 		}
   1335 
   1336 		return (retcode);
   1337 	case RAIDFRAME_SET_AUTOCONFIG:
   1338 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1339 		printf("raid%d: New autoconfig value is: %d\n",
   1340 		       raidPtr->raidid, d);
   1341 		*(int *) data = d;
   1342 		return (retcode);
   1343 
   1344 	case RAIDFRAME_SET_ROOT:
   1345 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1346 		printf("raid%d: New rootpartition value is: %d\n",
   1347 		       raidPtr->raidid, d);
   1348 		*(int *) data = d;
   1349 		return (retcode);
   1350 
   1351 		/* initialize all parity */
   1352 	case RAIDFRAME_REWRITEPARITY:
   1353 
   1354 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1355 			/* Parity for RAID 0 is trivially correct */
   1356 			raidPtr->parity_good = RF_RAID_CLEAN;
   1357 			return(0);
   1358 		}
   1359 
   1360 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1361 			/* Re-write is already in progress! */
   1362 			return(EINVAL);
   1363 		}
   1364 
   1365 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1366 					   rf_RewriteParityThread,
   1367 					   raidPtr,"raid_parity");
   1368 		return (retcode);
   1369 
   1370 
   1371 	case RAIDFRAME_ADD_HOT_SPARE:
   1372 		sparePtr = (RF_SingleComponent_t *) data;
   1373 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1374 		retcode = rf_add_hot_spare(raidPtr, &component);
   1375 		return(retcode);
   1376 
   1377 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1378 		return(retcode);
   1379 
   1380 	case RAIDFRAME_DELETE_COMPONENT:
   1381 		componentPtr = (RF_SingleComponent_t *)data;
   1382 		memcpy( &component, componentPtr,
   1383 			sizeof(RF_SingleComponent_t));
   1384 		retcode = rf_delete_component(raidPtr, &component);
   1385 		return(retcode);
   1386 
   1387 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1388 		componentPtr = (RF_SingleComponent_t *)data;
   1389 		memcpy( &component, componentPtr,
   1390 			sizeof(RF_SingleComponent_t));
   1391 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1392 		return(retcode);
   1393 
   1394 	case RAIDFRAME_REBUILD_IN_PLACE:
   1395 
   1396 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1397 			/* Can't do this on a RAID 0!! */
   1398 			return(EINVAL);
   1399 		}
   1400 
   1401 		if (raidPtr->recon_in_progress == 1) {
   1402 			/* a reconstruct is already in progress! */
   1403 			return(EINVAL);
   1404 		}
   1405 
   1406 		componentPtr = (RF_SingleComponent_t *) data;
   1407 		memcpy( &component, componentPtr,
   1408 			sizeof(RF_SingleComponent_t));
   1409 		component.row = 0; /* we don't support any more */
   1410 		column = component.column;
   1411 
   1412 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1413 			return(EINVAL);
   1414 		}
   1415 
   1416 		rf_lock_mutex2(raidPtr->mutex);
   1417 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1418 		    (raidPtr->numFailures > 0)) {
   1419 			/* XXX 0 above shouldn't be constant!!! */
   1420 			/* some component other than this has failed.
   1421 			   Let's not make things worse than they already
   1422 			   are... */
   1423 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1424 			       raidPtr->raidid);
   1425 			printf("raid%d:     Col: %d   Too many failures.\n",
   1426 			       raidPtr->raidid, column);
   1427 			rf_unlock_mutex2(raidPtr->mutex);
   1428 			return (EINVAL);
   1429 		}
   1430 		if (raidPtr->Disks[column].status ==
   1431 		    rf_ds_reconstructing) {
   1432 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1433 			       raidPtr->raidid);
   1434 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1435 
   1436 			rf_unlock_mutex2(raidPtr->mutex);
   1437 			return (EINVAL);
   1438 		}
   1439 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1440 			rf_unlock_mutex2(raidPtr->mutex);
   1441 			return (EINVAL);
   1442 		}
   1443 		rf_unlock_mutex2(raidPtr->mutex);
   1444 
   1445 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1446 		if (rrcopy == NULL)
   1447 			return(ENOMEM);
   1448 
   1449 		rrcopy->raidPtr = (void *) raidPtr;
   1450 		rrcopy->col = column;
   1451 
   1452 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1453 					   rf_ReconstructInPlaceThread,
   1454 					   rrcopy,"raid_reconip");
   1455 		return(retcode);
   1456 
   1457 	case RAIDFRAME_GET_INFO:
   1458 		if (!raidPtr->valid)
   1459 			return (ENODEV);
   1460 		ucfgp = (RF_DeviceConfig_t **) data;
   1461 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1462 			  (RF_DeviceConfig_t *));
   1463 		if (d_cfg == NULL)
   1464 			return (ENOMEM);
   1465 		d_cfg->rows = 1; /* there is only 1 row now */
   1466 		d_cfg->cols = raidPtr->numCol;
   1467 		d_cfg->ndevs = raidPtr->numCol;
   1468 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1469 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1470 			return (ENOMEM);
   1471 		}
   1472 		d_cfg->nspares = raidPtr->numSpare;
   1473 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1474 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1475 			return (ENOMEM);
   1476 		}
   1477 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1478 		d = 0;
   1479 		for (j = 0; j < d_cfg->cols; j++) {
   1480 			d_cfg->devs[d] = raidPtr->Disks[j];
   1481 			d++;
   1482 		}
   1483 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1484 			d_cfg->spares[i] = raidPtr->Disks[j];
   1485 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1486 				/* XXX: raidctl(8) expects to see this as a used spare */
   1487 				d_cfg->spares[i].status = rf_ds_used_spare;
   1488 			}
   1489 		}
   1490 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1491 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1492 
   1493 		return (retcode);
   1494 
   1495 	case RAIDFRAME_CHECK_PARITY:
   1496 		*(int *) data = raidPtr->parity_good;
   1497 		return (0);
   1498 
   1499 	case RAIDFRAME_PARITYMAP_STATUS:
   1500 		if (rf_paritymap_ineligible(raidPtr))
   1501 			return EINVAL;
   1502 		rf_paritymap_status(raidPtr->parity_map,
   1503 		    (struct rf_pmstat *)data);
   1504 		return 0;
   1505 
   1506 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1507 		if (rf_paritymap_ineligible(raidPtr))
   1508 			return EINVAL;
   1509 		if (raidPtr->parity_map == NULL)
   1510 			return ENOENT; /* ??? */
   1511 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1512 			(struct rf_pmparams *)data, 1))
   1513 			return EINVAL;
   1514 		return 0;
   1515 
   1516 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1517 		if (rf_paritymap_ineligible(raidPtr))
   1518 			return EINVAL;
   1519 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1520 		return 0;
   1521 
   1522 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1523 		if (rf_paritymap_ineligible(raidPtr))
   1524 			return EINVAL;
   1525 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1526 		/* XXX should errors be passed up? */
   1527 		return 0;
   1528 
   1529 	case RAIDFRAME_RESET_ACCTOTALS:
   1530 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1531 		return (0);
   1532 
   1533 	case RAIDFRAME_GET_ACCTOTALS:
   1534 		totals = (RF_AccTotals_t *) data;
   1535 		*totals = raidPtr->acc_totals;
   1536 		return (0);
   1537 
   1538 	case RAIDFRAME_KEEP_ACCTOTALS:
   1539 		raidPtr->keep_acc_totals = *(int *)data;
   1540 		return (0);
   1541 
   1542 	case RAIDFRAME_GET_SIZE:
   1543 		*(int *) data = raidPtr->totalSectors;
   1544 		return (0);
   1545 
   1546 		/* fail a disk & optionally start reconstruction */
   1547 	case RAIDFRAME_FAIL_DISK:
   1548 
   1549 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1550 			/* Can't do this on a RAID 0!! */
   1551 			return(EINVAL);
   1552 		}
   1553 
   1554 		rr = (struct rf_recon_req *) data;
   1555 		rr->row = 0;
   1556 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1557 			return (EINVAL);
   1558 
   1559 
   1560 		rf_lock_mutex2(raidPtr->mutex);
   1561 		if (raidPtr->status == rf_rs_reconstructing) {
   1562 			/* you can't fail a disk while we're reconstructing! */
   1563 			/* XXX wrong for RAID6 */
   1564 			rf_unlock_mutex2(raidPtr->mutex);
   1565 			return (EINVAL);
   1566 		}
   1567 		if ((raidPtr->Disks[rr->col].status ==
   1568 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1569 			/* some other component has failed.  Let's not make
   1570 			   things worse. XXX wrong for RAID6 */
   1571 			rf_unlock_mutex2(raidPtr->mutex);
   1572 			return (EINVAL);
   1573 		}
   1574 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1575 			/* Can't fail a spared disk! */
   1576 			rf_unlock_mutex2(raidPtr->mutex);
   1577 			return (EINVAL);
   1578 		}
   1579 		rf_unlock_mutex2(raidPtr->mutex);
   1580 
   1581 		/* make a copy of the recon request so that we don't rely on
   1582 		 * the user's buffer */
   1583 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1584 		if (rrcopy == NULL)
   1585 			return(ENOMEM);
   1586 		memcpy(rrcopy, rr, sizeof(*rr));
   1587 		rrcopy->raidPtr = (void *) raidPtr;
   1588 
   1589 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1590 					   rf_ReconThread,
   1591 					   rrcopy,"raid_recon");
   1592 		return (0);
   1593 
   1594 		/* invoke a copyback operation after recon on whatever disk
   1595 		 * needs it, if any */
   1596 	case RAIDFRAME_COPYBACK:
   1597 
   1598 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1599 			/* This makes no sense on a RAID 0!! */
   1600 			return(EINVAL);
   1601 		}
   1602 
   1603 		if (raidPtr->copyback_in_progress == 1) {
   1604 			/* Copyback is already in progress! */
   1605 			return(EINVAL);
   1606 		}
   1607 
   1608 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1609 					   rf_CopybackThread,
   1610 					   raidPtr,"raid_copyback");
   1611 		return (retcode);
   1612 
   1613 		/* return the percentage completion of reconstruction */
   1614 	case RAIDFRAME_CHECK_RECON_STATUS:
   1615 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1616 			/* This makes no sense on a RAID 0, so tell the
   1617 			   user it's done. */
   1618 			*(int *) data = 100;
   1619 			return(0);
   1620 		}
   1621 		if (raidPtr->status != rf_rs_reconstructing)
   1622 			*(int *) data = 100;
   1623 		else {
   1624 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1625 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1626 			} else {
   1627 				*(int *) data = 0;
   1628 			}
   1629 		}
   1630 		return (0);
   1631 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1632 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1633 		if (raidPtr->status != rf_rs_reconstructing) {
   1634 			progressInfo.remaining = 0;
   1635 			progressInfo.completed = 100;
   1636 			progressInfo.total = 100;
   1637 		} else {
   1638 			progressInfo.total =
   1639 				raidPtr->reconControl->numRUsTotal;
   1640 			progressInfo.completed =
   1641 				raidPtr->reconControl->numRUsComplete;
   1642 			progressInfo.remaining = progressInfo.total -
   1643 				progressInfo.completed;
   1644 		}
   1645 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1646 				  sizeof(RF_ProgressInfo_t));
   1647 		return (retcode);
   1648 
   1649 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1650 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1651 			/* This makes no sense on a RAID 0, so tell the
   1652 			   user it's done. */
   1653 			*(int *) data = 100;
   1654 			return(0);
   1655 		}
   1656 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1657 			*(int *) data = 100 *
   1658 				raidPtr->parity_rewrite_stripes_done /
   1659 				raidPtr->Layout.numStripe;
   1660 		} else {
   1661 			*(int *) data = 100;
   1662 		}
   1663 		return (0);
   1664 
   1665 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1666 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1667 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1668 			progressInfo.total = raidPtr->Layout.numStripe;
   1669 			progressInfo.completed =
   1670 				raidPtr->parity_rewrite_stripes_done;
   1671 			progressInfo.remaining = progressInfo.total -
   1672 				progressInfo.completed;
   1673 		} else {
   1674 			progressInfo.remaining = 0;
   1675 			progressInfo.completed = 100;
   1676 			progressInfo.total = 100;
   1677 		}
   1678 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1679 				  sizeof(RF_ProgressInfo_t));
   1680 		return (retcode);
   1681 
   1682 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1683 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1684 			/* This makes no sense on a RAID 0 */
   1685 			*(int *) data = 100;
   1686 			return(0);
   1687 		}
   1688 		if (raidPtr->copyback_in_progress == 1) {
   1689 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1690 				raidPtr->Layout.numStripe;
   1691 		} else {
   1692 			*(int *) data = 100;
   1693 		}
   1694 		return (0);
   1695 
   1696 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1697 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1698 		if (raidPtr->copyback_in_progress == 1) {
   1699 			progressInfo.total = raidPtr->Layout.numStripe;
   1700 			progressInfo.completed =
   1701 				raidPtr->copyback_stripes_done;
   1702 			progressInfo.remaining = progressInfo.total -
   1703 				progressInfo.completed;
   1704 		} else {
   1705 			progressInfo.remaining = 0;
   1706 			progressInfo.completed = 100;
   1707 			progressInfo.total = 100;
   1708 		}
   1709 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1710 				  sizeof(RF_ProgressInfo_t));
   1711 		return (retcode);
   1712 
   1713 	case RAIDFRAME_SET_LAST_UNIT:
   1714 		for (column = 0; column < raidPtr->numCol; column++)
   1715 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1716 				return EBUSY;
   1717 
   1718 		for (column = 0; column < raidPtr->numCol; column++) {
   1719 			clabel = raidget_component_label(raidPtr, column);
   1720 			clabel->last_unit = *(int *)data;
   1721 			raidflush_component_label(raidPtr, column);
   1722 		}
   1723 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1724 		return 0;
   1725 
   1726 		/* the sparetable daemon calls this to wait for the kernel to
   1727 		 * need a spare table. this ioctl does not return until a
   1728 		 * spare table is needed. XXX -- calling mpsleep here in the
   1729 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1730 		 * -- I should either compute the spare table in the kernel,
   1731 		 * or have a different -- XXX XXX -- interface (a different
   1732 		 * character device) for delivering the table     -- XXX */
   1733 #if 0
   1734 	case RAIDFRAME_SPARET_WAIT:
   1735 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1736 		while (!rf_sparet_wait_queue)
   1737 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1738 		waitreq = rf_sparet_wait_queue;
   1739 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1740 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1741 
   1742 		/* structure assignment */
   1743 		*((RF_SparetWait_t *) data) = *waitreq;
   1744 
   1745 		RF_Free(waitreq, sizeof(*waitreq));
   1746 		return (0);
   1747 
   1748 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1749 		 * code in it that will cause the dameon to exit */
   1750 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1751 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1752 		waitreq->fcol = -1;
   1753 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1754 		waitreq->next = rf_sparet_wait_queue;
   1755 		rf_sparet_wait_queue = waitreq;
   1756 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1757 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1758 		return (0);
   1759 
   1760 		/* used by the spare table daemon to deliver a spare table
   1761 		 * into the kernel */
   1762 	case RAIDFRAME_SEND_SPARET:
   1763 
   1764 		/* install the spare table */
   1765 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1766 
   1767 		/* respond to the requestor.  the return status of the spare
   1768 		 * table installation is passed in the "fcol" field */
   1769 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1770 		waitreq->fcol = retcode;
   1771 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1772 		waitreq->next = rf_sparet_resp_queue;
   1773 		rf_sparet_resp_queue = waitreq;
   1774 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1775 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1776 
   1777 		return (retcode);
   1778 #endif
   1779 
   1780 	default:
   1781 		break; /* fall through to the os-specific code below */
   1782 
   1783 	}
   1784 
   1785 	if (!raidPtr->valid)
   1786 		return (EINVAL);
   1787 
   1788 	/*
   1789 	 * Add support for "regular" device ioctls here.
   1790 	 */
   1791 
   1792 	switch (cmd) {
   1793 	case DIOCCACHESYNC:
   1794 		retcode = rf_sync_component_caches(raidPtr);
   1795 		break;
   1796 
   1797 	default:
   1798 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1799 		break;
   1800 	}
   1801 
   1802 	return (retcode);
   1803 
   1804 }
   1805 
   1806 
   1807 /* raidinit -- complete the rest of the initialization for the
   1808    RAIDframe device.  */
   1809 
   1810 
   1811 static void
   1812 raidinit(struct raid_softc *rs)
   1813 {
   1814 	cfdata_t cf;
   1815 	unsigned int unit;
   1816 	struct dk_softc *dksc = &rs->sc_dksc;
   1817 	RF_Raid_t *raidPtr = &rs->sc_r;
   1818 	device_t dev;
   1819 
   1820 	unit = raidPtr->raidid;
   1821 
   1822 	/* XXX doesn't check bounds. */
   1823 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1824 
   1825 	/* attach the pseudo device */
   1826 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1827 	cf->cf_name = raid_cd.cd_name;
   1828 	cf->cf_atname = raid_cd.cd_name;
   1829 	cf->cf_unit = unit;
   1830 	cf->cf_fstate = FSTATE_STAR;
   1831 
   1832 	dev = config_attach_pseudo(cf);
   1833 	if (dev == NULL) {
   1834 		printf("raid%d: config_attach_pseudo failed\n",
   1835 		    raidPtr->raidid);
   1836 		free(cf, M_RAIDFRAME);
   1837 		return;
   1838 	}
   1839 
   1840 	/* provide a backpointer to the real softc */
   1841 	raidsoftc(dev) = rs;
   1842 
   1843 	/* disk_attach actually creates space for the CPU disklabel, among
   1844 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1845 	 * with disklabels. */
   1846 	dk_init(dksc, dev, DKTYPE_RAID);
   1847 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1848 
   1849 	/* XXX There may be a weird interaction here between this, and
   1850 	 * protectedSectors, as used in RAIDframe.  */
   1851 
   1852 	rs->sc_size = raidPtr->totalSectors;
   1853 
   1854 	/* Attach dk and disk subsystems */
   1855 	dk_attach(dksc);
   1856 	disk_attach(&dksc->sc_dkdev);
   1857 	rf_set_geometry(rs, raidPtr);
   1858 
   1859 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1860 
   1861 	/* mark unit as usuable */
   1862 	rs->sc_flags |= RAIDF_INITED;
   1863 
   1864 	dkwedge_discover(&dksc->sc_dkdev);
   1865 }
   1866 
   1867 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1868 /* wake up the daemon & tell it to get us a spare table
   1869  * XXX
   1870  * the entries in the queues should be tagged with the raidPtr
   1871  * so that in the extremely rare case that two recons happen at once,
   1872  * we know for which device were requesting a spare table
   1873  * XXX
   1874  *
   1875  * XXX This code is not currently used. GO
   1876  */
   1877 int
   1878 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1879 {
   1880 	int     retcode;
   1881 
   1882 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1883 	req->next = rf_sparet_wait_queue;
   1884 	rf_sparet_wait_queue = req;
   1885 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1886 
   1887 	/* mpsleep unlocks the mutex */
   1888 	while (!rf_sparet_resp_queue) {
   1889 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1890 	}
   1891 	req = rf_sparet_resp_queue;
   1892 	rf_sparet_resp_queue = req->next;
   1893 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1894 
   1895 	retcode = req->fcol;
   1896 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1897 					 * alloc'd */
   1898 	return (retcode);
   1899 }
   1900 #endif
   1901 
   1902 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1903  * bp & passes it down.
   1904  * any calls originating in the kernel must use non-blocking I/O
   1905  * do some extra sanity checking to return "appropriate" error values for
   1906  * certain conditions (to make some standard utilities work)
   1907  *
   1908  * Formerly known as: rf_DoAccessKernel
   1909  */
   1910 void
   1911 raidstart(RF_Raid_t *raidPtr)
   1912 {
   1913 	struct raid_softc *rs;
   1914 	struct dk_softc *dksc;
   1915 
   1916 	rs = raidPtr->softc;
   1917 	dksc = &rs->sc_dksc;
   1918 	/* quick check to see if anything has died recently */
   1919 	rf_lock_mutex2(raidPtr->mutex);
   1920 	if (raidPtr->numNewFailures > 0) {
   1921 		rf_unlock_mutex2(raidPtr->mutex);
   1922 		rf_update_component_labels(raidPtr,
   1923 					   RF_NORMAL_COMPONENT_UPDATE);
   1924 		rf_lock_mutex2(raidPtr->mutex);
   1925 		raidPtr->numNewFailures--;
   1926 	}
   1927 	rf_unlock_mutex2(raidPtr->mutex);
   1928 
   1929 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1930 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1931 		return;
   1932 	}
   1933 
   1934 	dk_start(dksc, NULL);
   1935 }
   1936 
   1937 static int
   1938 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1939 {
   1940 	RF_SectorCount_t num_blocks, pb, sum;
   1941 	RF_RaidAddr_t raid_addr;
   1942 	daddr_t blocknum;
   1943 	int     do_async;
   1944 	int rc;
   1945 
   1946 	rf_lock_mutex2(raidPtr->mutex);
   1947 	if (raidPtr->openings == 0) {
   1948 		rf_unlock_mutex2(raidPtr->mutex);
   1949 		return EAGAIN;
   1950 	}
   1951 	rf_unlock_mutex2(raidPtr->mutex);
   1952 
   1953 	blocknum = bp->b_rawblkno;
   1954 
   1955 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1956 		    (int) blocknum));
   1957 
   1958 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1959 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1960 
   1961 	/* *THIS* is where we adjust what block we're going to...
   1962 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1963 	raid_addr = blocknum;
   1964 
   1965 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1966 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1967 	sum = raid_addr + num_blocks + pb;
   1968 	if (1 || rf_debugKernelAccess) {
   1969 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1970 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1971 			    (int) pb, (int) bp->b_resid));
   1972 	}
   1973 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1974 	    || (sum < num_blocks) || (sum < pb)) {
   1975 		rc = ENOSPC;
   1976 		goto done;
   1977 	}
   1978 	/*
   1979 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1980 	 */
   1981 
   1982 	if (bp->b_bcount & raidPtr->sectorMask) {
   1983 		rc = ENOSPC;
   1984 		goto done;
   1985 	}
   1986 	db1_printf(("Calling DoAccess..\n"));
   1987 
   1988 
   1989 	rf_lock_mutex2(raidPtr->mutex);
   1990 	raidPtr->openings--;
   1991 	rf_unlock_mutex2(raidPtr->mutex);
   1992 
   1993 	/*
   1994 	 * Everything is async.
   1995 	 */
   1996 	do_async = 1;
   1997 
   1998 	/* don't ever condition on bp->b_flags & B_WRITE.
   1999 	 * always condition on B_READ instead */
   2000 
   2001 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2002 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2003 			 do_async, raid_addr, num_blocks,
   2004 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2005 
   2006 done:
   2007 	return rc;
   2008 }
   2009 
   2010 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2011 
   2012 int
   2013 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2014 {
   2015 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2016 	struct buf *bp;
   2017 
   2018 	req->queue = queue;
   2019 	bp = req->bp;
   2020 
   2021 	switch (req->type) {
   2022 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2023 		/* XXX need to do something extra here.. */
   2024 		/* I'm leaving this in, as I've never actually seen it used,
   2025 		 * and I'd like folks to report it... GO */
   2026 		printf(("WAKEUP CALLED\n"));
   2027 		queue->numOutstanding++;
   2028 
   2029 		bp->b_flags = 0;
   2030 		bp->b_private = req;
   2031 
   2032 		KernelWakeupFunc(bp);
   2033 		break;
   2034 
   2035 	case RF_IO_TYPE_READ:
   2036 	case RF_IO_TYPE_WRITE:
   2037 #if RF_ACC_TRACE > 0
   2038 		if (req->tracerec) {
   2039 			RF_ETIMER_START(req->tracerec->timer);
   2040 		}
   2041 #endif
   2042 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2043 		    op, queue->rf_cinfo->ci_dev,
   2044 		    req->sectorOffset, req->numSector,
   2045 		    req->buf, KernelWakeupFunc, (void *) req,
   2046 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2047 
   2048 		if (rf_debugKernelAccess) {
   2049 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2050 				(long) bp->b_blkno));
   2051 		}
   2052 		queue->numOutstanding++;
   2053 		queue->last_deq_sector = req->sectorOffset;
   2054 		/* acc wouldn't have been let in if there were any pending
   2055 		 * reqs at any other priority */
   2056 		queue->curPriority = req->priority;
   2057 
   2058 		db1_printf(("Going for %c to unit %d col %d\n",
   2059 			    req->type, queue->raidPtr->raidid,
   2060 			    queue->col));
   2061 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2062 			(int) req->sectorOffset, (int) req->numSector,
   2063 			(int) (req->numSector <<
   2064 			    queue->raidPtr->logBytesPerSector),
   2065 			(int) queue->raidPtr->logBytesPerSector));
   2066 
   2067 		/*
   2068 		 * XXX: drop lock here since this can block at
   2069 		 * least with backing SCSI devices.  Retake it
   2070 		 * to minimize fuss with calling interfaces.
   2071 		 */
   2072 
   2073 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2074 		bdev_strategy(bp);
   2075 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2076 		break;
   2077 
   2078 	default:
   2079 		panic("bad req->type in rf_DispatchKernelIO");
   2080 	}
   2081 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2082 
   2083 	return (0);
   2084 }
   2085 /* this is the callback function associated with a I/O invoked from
   2086    kernel code.
   2087  */
   2088 static void
   2089 KernelWakeupFunc(struct buf *bp)
   2090 {
   2091 	RF_DiskQueueData_t *req = NULL;
   2092 	RF_DiskQueue_t *queue;
   2093 
   2094 	db1_printf(("recovering the request queue:\n"));
   2095 
   2096 	req = bp->b_private;
   2097 
   2098 	queue = (RF_DiskQueue_t *) req->queue;
   2099 
   2100 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2101 
   2102 #if RF_ACC_TRACE > 0
   2103 	if (req->tracerec) {
   2104 		RF_ETIMER_STOP(req->tracerec->timer);
   2105 		RF_ETIMER_EVAL(req->tracerec->timer);
   2106 		rf_lock_mutex2(rf_tracing_mutex);
   2107 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2108 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2109 		req->tracerec->num_phys_ios++;
   2110 		rf_unlock_mutex2(rf_tracing_mutex);
   2111 	}
   2112 #endif
   2113 
   2114 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2115 	 * ballistic, and mark the component as hosed... */
   2116 
   2117 	if (bp->b_error != 0) {
   2118 		/* Mark the disk as dead */
   2119 		/* but only mark it once... */
   2120 		/* and only if it wouldn't leave this RAID set
   2121 		   completely broken */
   2122 		if (((queue->raidPtr->Disks[queue->col].status ==
   2123 		      rf_ds_optimal) ||
   2124 		     (queue->raidPtr->Disks[queue->col].status ==
   2125 		      rf_ds_used_spare)) &&
   2126 		     (queue->raidPtr->numFailures <
   2127 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2128 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2129 			       queue->raidPtr->raidid,
   2130 			       bp->b_error,
   2131 			       queue->raidPtr->Disks[queue->col].devname);
   2132 			queue->raidPtr->Disks[queue->col].status =
   2133 			    rf_ds_failed;
   2134 			queue->raidPtr->status = rf_rs_degraded;
   2135 			queue->raidPtr->numFailures++;
   2136 			queue->raidPtr->numNewFailures++;
   2137 		} else {	/* Disk is already dead... */
   2138 			/* printf("Disk already marked as dead!\n"); */
   2139 		}
   2140 
   2141 	}
   2142 
   2143 	/* Fill in the error value */
   2144 	req->error = bp->b_error;
   2145 
   2146 	/* Drop this one on the "finished" queue... */
   2147 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2148 
   2149 	/* Let the raidio thread know there is work to be done. */
   2150 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2151 
   2152 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2153 }
   2154 
   2155 
   2156 /*
   2157  * initialize a buf structure for doing an I/O in the kernel.
   2158  */
   2159 static void
   2160 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2161        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2162        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2163        struct proc *b_proc)
   2164 {
   2165 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2166 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2167 	bp->b_oflags = 0;
   2168 	bp->b_cflags = 0;
   2169 	bp->b_bcount = numSect << logBytesPerSector;
   2170 	bp->b_bufsize = bp->b_bcount;
   2171 	bp->b_error = 0;
   2172 	bp->b_dev = dev;
   2173 	bp->b_data = bf;
   2174 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2175 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2176 	if (bp->b_bcount == 0) {
   2177 		panic("bp->b_bcount is zero in InitBP!!");
   2178 	}
   2179 	bp->b_proc = b_proc;
   2180 	bp->b_iodone = cbFunc;
   2181 	bp->b_private = cbArg;
   2182 }
   2183 
   2184 /*
   2185  * Wait interruptibly for an exclusive lock.
   2186  *
   2187  * XXX
   2188  * Several drivers do this; it should be abstracted and made MP-safe.
   2189  * (Hmm... where have we seen this warning before :->  GO )
   2190  */
   2191 static int
   2192 raidlock(struct raid_softc *rs)
   2193 {
   2194 	int     error;
   2195 
   2196 	error = 0;
   2197 	mutex_enter(&rs->sc_mutex);
   2198 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2199 		rs->sc_flags |= RAIDF_WANTED;
   2200 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2201 		if (error != 0)
   2202 			goto done;
   2203 	}
   2204 	rs->sc_flags |= RAIDF_LOCKED;
   2205 done:
   2206 	mutex_exit(&rs->sc_mutex);
   2207 	return (error);
   2208 }
   2209 /*
   2210  * Unlock and wake up any waiters.
   2211  */
   2212 static void
   2213 raidunlock(struct raid_softc *rs)
   2214 {
   2215 
   2216 	mutex_enter(&rs->sc_mutex);
   2217 	rs->sc_flags &= ~RAIDF_LOCKED;
   2218 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2219 		rs->sc_flags &= ~RAIDF_WANTED;
   2220 		cv_broadcast(&rs->sc_cv);
   2221 	}
   2222 	mutex_exit(&rs->sc_mutex);
   2223 }
   2224 
   2225 
/*
 * Layout constants for the per-component metadata areas.
 *
 * RF_COMPONENT_INFO_OFFSET is the value rf_component_info_offset()
 * returns.  RF_COMPONENT_INFO_SIZE and RF_PARITY_MAP_SIZE are the
 * smallest sizes rf_component_info_size() and rf_parity_map_size()
 * will return; a larger sector size takes precedence in both.
 */
#define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
#define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
#define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2229 
   2230 static daddr_t
   2231 rf_component_info_offset(void)
   2232 {
   2233 
   2234 	return RF_COMPONENT_INFO_OFFSET;
   2235 }
   2236 
   2237 static daddr_t
   2238 rf_component_info_size(unsigned secsize)
   2239 {
   2240 	daddr_t info_size;
   2241 
   2242 	KASSERT(secsize);
   2243 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2244 		info_size = secsize;
   2245 	else
   2246 		info_size = RF_COMPONENT_INFO_SIZE;
   2247 
   2248 	return info_size;
   2249 }
   2250 
   2251 static daddr_t
   2252 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2253 {
   2254 	daddr_t map_offset;
   2255 
   2256 	KASSERT(raidPtr->bytesPerSector);
   2257 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2258 		map_offset = raidPtr->bytesPerSector;
   2259 	else
   2260 		map_offset = RF_COMPONENT_INFO_SIZE;
   2261 	map_offset += rf_component_info_offset();
   2262 
   2263 	return map_offset;
   2264 }
   2265 
   2266 static daddr_t
   2267 rf_parity_map_size(RF_Raid_t *raidPtr)
   2268 {
   2269 	daddr_t map_size;
   2270 
   2271 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2272 		map_size = raidPtr->bytesPerSector;
   2273 	else
   2274 		map_size = RF_PARITY_MAP_SIZE;
   2275 
   2276 	return map_size;
   2277 }
   2278 
   2279 int
   2280 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2281 {
   2282 	RF_ComponentLabel_t *clabel;
   2283 
   2284 	clabel = raidget_component_label(raidPtr, col);
   2285 	clabel->clean = RF_RAID_CLEAN;
   2286 	raidflush_component_label(raidPtr, col);
   2287 	return(0);
   2288 }
   2289 
   2290 
   2291 int
   2292 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2293 {
   2294 	RF_ComponentLabel_t *clabel;
   2295 
   2296 	clabel = raidget_component_label(raidPtr, col);
   2297 	clabel->clean = RF_RAID_DIRTY;
   2298 	raidflush_component_label(raidPtr, col);
   2299 	return(0);
   2300 }
   2301 
   2302 int
   2303 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2304 {
   2305 	KASSERT(raidPtr->bytesPerSector);
   2306 	return raidread_component_label(raidPtr->bytesPerSector,
   2307 	    raidPtr->Disks[col].dev,
   2308 	    raidPtr->raid_cinfo[col].ci_vp,
   2309 	    &raidPtr->raid_cinfo[col].ci_label);
   2310 }
   2311 
   2312 RF_ComponentLabel_t *
   2313 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2314 {
   2315 	return &raidPtr->raid_cinfo[col].ci_label;
   2316 }
   2317 
   2318 int
   2319 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2320 {
   2321 	RF_ComponentLabel_t *label;
   2322 
   2323 	label = &raidPtr->raid_cinfo[col].ci_label;
   2324 	label->mod_counter = raidPtr->mod_counter;
   2325 #ifndef RF_NO_PARITY_MAP
   2326 	label->parity_map_modcount = label->mod_counter;
   2327 #endif
   2328 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2329 	    raidPtr->Disks[col].dev,
   2330 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2331 }
   2332 
   2333 
   2334 static int
   2335 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2336     RF_ComponentLabel_t *clabel)
   2337 {
   2338 	return raidread_component_area(dev, b_vp, clabel,
   2339 	    sizeof(RF_ComponentLabel_t),
   2340 	    rf_component_info_offset(),
   2341 	    rf_component_info_size(secsize));
   2342 }
   2343 
   2344 /* ARGSUSED */
   2345 static int
   2346 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2347     size_t msize, daddr_t offset, daddr_t dsize)
   2348 {
   2349 	struct buf *bp;
   2350 	int error;
   2351 
   2352 	/* XXX should probably ensure that we don't try to do this if
   2353 	   someone has changed rf_protected_sectors. */
   2354 
   2355 	if (b_vp == NULL) {
   2356 		/* For whatever reason, this component is not valid.
   2357 		   Don't try to read a component label from it. */
   2358 		return(EINVAL);
   2359 	}
   2360 
   2361 	/* get a block of the appropriate size... */
   2362 	bp = geteblk((int)dsize);
   2363 	bp->b_dev = dev;
   2364 
   2365 	/* get our ducks in a row for the read */
   2366 	bp->b_blkno = offset / DEV_BSIZE;
   2367 	bp->b_bcount = dsize;
   2368 	bp->b_flags |= B_READ;
   2369  	bp->b_resid = dsize;
   2370 
   2371 	bdev_strategy(bp);
   2372 	error = biowait(bp);
   2373 
   2374 	if (!error) {
   2375 		memcpy(data, bp->b_data, msize);
   2376 	}
   2377 
   2378 	brelse(bp, 0);
   2379 	return(error);
   2380 }
   2381 
   2382 
   2383 static int
   2384 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2385     RF_ComponentLabel_t *clabel)
   2386 {
   2387 	return raidwrite_component_area(dev, b_vp, clabel,
   2388 	    sizeof(RF_ComponentLabel_t),
   2389 	    rf_component_info_offset(),
   2390 	    rf_component_info_size(secsize), 0);
   2391 }
   2392 
   2393 /* ARGSUSED */
   2394 static int
   2395 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2396     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2397 {
   2398 	struct buf *bp;
   2399 	int error;
   2400 
   2401 	/* get a block of the appropriate size... */
   2402 	bp = geteblk((int)dsize);
   2403 	bp->b_dev = dev;
   2404 
   2405 	/* get our ducks in a row for the write */
   2406 	bp->b_blkno = offset / DEV_BSIZE;
   2407 	bp->b_bcount = dsize;
   2408 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2409  	bp->b_resid = dsize;
   2410 
   2411 	memset(bp->b_data, 0, dsize);
   2412 	memcpy(bp->b_data, data, msize);
   2413 
   2414 	bdev_strategy(bp);
   2415 	if (asyncp)
   2416 		return 0;
   2417 	error = biowait(bp);
   2418 	brelse(bp, 0);
   2419 	if (error) {
   2420 #if 1
   2421 		printf("Failed to write RAID component info!\n");
   2422 #endif
   2423 	}
   2424 
   2425 	return(error);
   2426 }
   2427 
   2428 void
   2429 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2430 {
   2431 	int c;
   2432 
   2433 	for (c = 0; c < raidPtr->numCol; c++) {
   2434 		/* Skip dead disks. */
   2435 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2436 			continue;
   2437 		/* XXXjld: what if an error occurs here? */
   2438 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2439 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2440 		    RF_PARITYMAP_NBYTE,
   2441 		    rf_parity_map_offset(raidPtr),
   2442 		    rf_parity_map_size(raidPtr), 0);
   2443 	}
   2444 }
   2445 
   2446 void
   2447 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2448 {
   2449 	struct rf_paritymap_ondisk tmp;
   2450 	int c,first;
   2451 
   2452 	first=1;
   2453 	for (c = 0; c < raidPtr->numCol; c++) {
   2454 		/* Skip dead disks. */
   2455 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2456 			continue;
   2457 		raidread_component_area(raidPtr->Disks[c].dev,
   2458 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2459 		    RF_PARITYMAP_NBYTE,
   2460 		    rf_parity_map_offset(raidPtr),
   2461 		    rf_parity_map_size(raidPtr));
   2462 		if (first) {
   2463 			memcpy(map, &tmp, sizeof(*map));
   2464 			first = 0;
   2465 		} else {
   2466 			rf_paritymap_merge(map, &tmp);
   2467 		}
   2468 	}
   2469 }
   2470 
   2471 void
   2472 rf_markalldirty(RF_Raid_t *raidPtr)
   2473 {
   2474 	RF_ComponentLabel_t *clabel;
   2475 	int sparecol;
   2476 	int c;
   2477 	int j;
   2478 	int scol = -1;
   2479 
   2480 	raidPtr->mod_counter++;
   2481 	for (c = 0; c < raidPtr->numCol; c++) {
   2482 		/* we don't want to touch (at all) a disk that has
   2483 		   failed */
   2484 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2485 			clabel = raidget_component_label(raidPtr, c);
   2486 			if (clabel->status == rf_ds_spared) {
   2487 				/* XXX do something special...
   2488 				   but whatever you do, don't
   2489 				   try to access it!! */
   2490 			} else {
   2491 				raidmarkdirty(raidPtr, c);
   2492 			}
   2493 		}
   2494 	}
   2495 
   2496 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2497 		sparecol = raidPtr->numCol + c;
   2498 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2499 			/*
   2500 
   2501 			   we claim this disk is "optimal" if it's
   2502 			   rf_ds_used_spare, as that means it should be
   2503 			   directly substitutable for the disk it replaced.
   2504 			   We note that too...
   2505 
   2506 			 */
   2507 
   2508 			for(j=0;j<raidPtr->numCol;j++) {
   2509 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2510 					scol = j;
   2511 					break;
   2512 				}
   2513 			}
   2514 
   2515 			clabel = raidget_component_label(raidPtr, sparecol);
   2516 			/* make sure status is noted */
   2517 
   2518 			raid_init_component_label(raidPtr, clabel);
   2519 
   2520 			clabel->row = 0;
   2521 			clabel->column = scol;
   2522 			/* Note: we *don't* change status from rf_ds_used_spare
   2523 			   to rf_ds_optimal */
   2524 			/* clabel.status = rf_ds_optimal; */
   2525 
   2526 			raidmarkdirty(raidPtr, sparecol);
   2527 		}
   2528 	}
   2529 }
   2530 
   2531 
   2532 void
   2533 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2534 {
   2535 	RF_ComponentLabel_t *clabel;
   2536 	int sparecol;
   2537 	int c;
   2538 	int j;
   2539 	int scol;
   2540 	struct raid_softc *rs = raidPtr->softc;
   2541 
   2542 	scol = -1;
   2543 
   2544 	/* XXX should do extra checks to make sure things really are clean,
   2545 	   rather than blindly setting the clean bit... */
   2546 
   2547 	raidPtr->mod_counter++;
   2548 
   2549 	for (c = 0; c < raidPtr->numCol; c++) {
   2550 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2551 			clabel = raidget_component_label(raidPtr, c);
   2552 			/* make sure status is noted */
   2553 			clabel->status = rf_ds_optimal;
   2554 
   2555 			/* note what unit we are configured as */
   2556 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2557 				clabel->last_unit = raidPtr->raidid;
   2558 
   2559 			raidflush_component_label(raidPtr, c);
   2560 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2561 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2562 					raidmarkclean(raidPtr, c);
   2563 				}
   2564 			}
   2565 		}
   2566 		/* else we don't touch it.. */
   2567 	}
   2568 
   2569 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2570 		sparecol = raidPtr->numCol + c;
   2571 		/* Need to ensure that the reconstruct actually completed! */
   2572 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2573 			/*
   2574 
   2575 			   we claim this disk is "optimal" if it's
   2576 			   rf_ds_used_spare, as that means it should be
   2577 			   directly substitutable for the disk it replaced.
   2578 			   We note that too...
   2579 
   2580 			 */
   2581 
   2582 			for(j=0;j<raidPtr->numCol;j++) {
   2583 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2584 					scol = j;
   2585 					break;
   2586 				}
   2587 			}
   2588 
   2589 			/* XXX shouldn't *really* need this... */
   2590 			clabel = raidget_component_label(raidPtr, sparecol);
   2591 			/* make sure status is noted */
   2592 
   2593 			raid_init_component_label(raidPtr, clabel);
   2594 
   2595 			clabel->column = scol;
   2596 			clabel->status = rf_ds_optimal;
   2597 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2598 				clabel->last_unit = raidPtr->raidid;
   2599 
   2600 			raidflush_component_label(raidPtr, sparecol);
   2601 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2602 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2603 					raidmarkclean(raidPtr, sparecol);
   2604 				}
   2605 			}
   2606 		}
   2607 	}
   2608 }
   2609 
   2610 void
   2611 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2612 {
   2613 
   2614 	if (vp != NULL) {
   2615 		if (auto_configured == 1) {
   2616 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2617 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2618 			vput(vp);
   2619 
   2620 		} else {
   2621 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2622 		}
   2623 	}
   2624 }
   2625 
   2626 
   2627 void
   2628 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2629 {
   2630 	int r,c;
   2631 	struct vnode *vp;
   2632 	int acd;
   2633 
   2634 
   2635 	/* We take this opportunity to close the vnodes like we should.. */
   2636 
   2637 	for (c = 0; c < raidPtr->numCol; c++) {
   2638 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2639 		acd = raidPtr->Disks[c].auto_configured;
   2640 		rf_close_component(raidPtr, vp, acd);
   2641 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2642 		raidPtr->Disks[c].auto_configured = 0;
   2643 	}
   2644 
   2645 	for (r = 0; r < raidPtr->numSpare; r++) {
   2646 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2647 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2648 		rf_close_component(raidPtr, vp, acd);
   2649 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2650 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2651 	}
   2652 }
   2653 
   2654 
   2655 void
   2656 rf_ReconThread(struct rf_recon_req *req)
   2657 {
   2658 	int     s;
   2659 	RF_Raid_t *raidPtr;
   2660 
   2661 	s = splbio();
   2662 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2663 	raidPtr->recon_in_progress = 1;
   2664 
   2665 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2666 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2667 
   2668 	RF_Free(req, sizeof(*req));
   2669 
   2670 	raidPtr->recon_in_progress = 0;
   2671 	splx(s);
   2672 
   2673 	/* That's all... */
   2674 	kthread_exit(0);	/* does not return */
   2675 }
   2676 
   2677 void
   2678 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2679 {
   2680 	int retcode;
   2681 	int s;
   2682 
   2683 	raidPtr->parity_rewrite_stripes_done = 0;
   2684 	raidPtr->parity_rewrite_in_progress = 1;
   2685 	s = splbio();
   2686 	retcode = rf_RewriteParity(raidPtr);
   2687 	splx(s);
   2688 	if (retcode) {
   2689 		printf("raid%d: Error re-writing parity (%d)!\n",
   2690 		    raidPtr->raidid, retcode);
   2691 	} else {
   2692 		/* set the clean bit!  If we shutdown correctly,
   2693 		   the clean bit on each component label will get
   2694 		   set */
   2695 		raidPtr->parity_good = RF_RAID_CLEAN;
   2696 	}
   2697 	raidPtr->parity_rewrite_in_progress = 0;
   2698 
   2699 	/* Anyone waiting for us to stop?  If so, inform them... */
   2700 	if (raidPtr->waitShutdown) {
   2701 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2702 	}
   2703 
   2704 	/* That's all... */
   2705 	kthread_exit(0);	/* does not return */
   2706 }
   2707 
   2708 
   2709 void
   2710 rf_CopybackThread(RF_Raid_t *raidPtr)
   2711 {
   2712 	int s;
   2713 
   2714 	raidPtr->copyback_in_progress = 1;
   2715 	s = splbio();
   2716 	rf_CopybackReconstructedData(raidPtr);
   2717 	splx(s);
   2718 	raidPtr->copyback_in_progress = 0;
   2719 
   2720 	/* That's all... */
   2721 	kthread_exit(0);	/* does not return */
   2722 }
   2723 
   2724 
   2725 void
   2726 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2727 {
   2728 	int s;
   2729 	RF_Raid_t *raidPtr;
   2730 
   2731 	s = splbio();
   2732 	raidPtr = req->raidPtr;
   2733 	raidPtr->recon_in_progress = 1;
   2734 	rf_ReconstructInPlace(raidPtr, req->col);
   2735 	RF_Free(req, sizeof(*req));
   2736 	raidPtr->recon_in_progress = 0;
   2737 	splx(s);
   2738 
   2739 	/* That's all... */
   2740 	kthread_exit(0);	/* does not return */
   2741 }
   2742 
   2743 static RF_AutoConfig_t *
   2744 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2745     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2746     unsigned secsize)
   2747 {
   2748 	int good_one = 0;
   2749 	RF_ComponentLabel_t *clabel;
   2750 	RF_AutoConfig_t *ac;
   2751 
   2752 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2753 	if (clabel == NULL) {
   2754 oomem:
   2755 		    while(ac_list) {
   2756 			    ac = ac_list;
   2757 			    if (ac->clabel)
   2758 				    free(ac->clabel, M_RAIDFRAME);
   2759 			    ac_list = ac_list->next;
   2760 			    free(ac, M_RAIDFRAME);
   2761 		    }
   2762 		    printf("RAID auto config: out of memory!\n");
   2763 		    return NULL; /* XXX probably should panic? */
   2764 	}
   2765 
   2766 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2767 		/* Got the label.  Does it look reasonable? */
   2768 		if (rf_reasonable_label(clabel, numsecs) &&
   2769 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2770 #ifdef DEBUG
   2771 			printf("Component on: %s: %llu\n",
   2772 				cname, (unsigned long long)size);
   2773 			rf_print_component_label(clabel);
   2774 #endif
   2775 			/* if it's reasonable, add it, else ignore it. */
   2776 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2777 				M_NOWAIT);
   2778 			if (ac == NULL) {
   2779 				free(clabel, M_RAIDFRAME);
   2780 				goto oomem;
   2781 			}
   2782 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2783 			ac->dev = dev;
   2784 			ac->vp = vp;
   2785 			ac->clabel = clabel;
   2786 			ac->next = ac_list;
   2787 			ac_list = ac;
   2788 			good_one = 1;
   2789 		}
   2790 	}
   2791 	if (!good_one) {
   2792 		/* cleanup */
   2793 		free(clabel, M_RAIDFRAME);
   2794 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2795 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2796 		vput(vp);
   2797 	}
   2798 	return ac_list;
   2799 }
   2800 
   2801 RF_AutoConfig_t *
   2802 rf_find_raid_components(void)
   2803 {
   2804 	struct vnode *vp;
   2805 	struct disklabel label;
   2806 	device_t dv;
   2807 	deviter_t di;
   2808 	dev_t dev;
   2809 	int bmajor, bminor, wedge, rf_part_found;
   2810 	int error;
   2811 	int i;
   2812 	RF_AutoConfig_t *ac_list;
   2813 	uint64_t numsecs;
   2814 	unsigned secsize;
   2815 	int dowedges;
   2816 
   2817 	/* initialize the AutoConfig list */
   2818 	ac_list = NULL;
   2819 
   2820 	/*
   2821 	 * we begin by trolling through *all* the devices on the system *twice*
   2822 	 * first we scan for wedges, second for other devices. This avoids
   2823 	 * using a raw partition instead of a wedge that covers the whole disk
   2824 	 */
   2825 
   2826 	for (dowedges=1; dowedges>=0; --dowedges) {
   2827 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2828 		     dv = deviter_next(&di)) {
   2829 
   2830 			/* we are only interested in disks... */
   2831 			if (device_class(dv) != DV_DISK)
   2832 				continue;
   2833 
   2834 			/* we don't care about floppies... */
   2835 			if (device_is_a(dv, "fd")) {
   2836 				continue;
   2837 			}
   2838 
   2839 			/* we don't care about CD's... */
   2840 			if (device_is_a(dv, "cd")) {
   2841 				continue;
   2842 			}
   2843 
   2844 			/* we don't care about md's... */
   2845 			if (device_is_a(dv, "md")) {
   2846 				continue;
   2847 			}
   2848 
   2849 			/* hdfd is the Atari/Hades floppy driver */
   2850 			if (device_is_a(dv, "hdfd")) {
   2851 				continue;
   2852 			}
   2853 
   2854 			/* fdisa is the Atari/Milan floppy driver */
   2855 			if (device_is_a(dv, "fdisa")) {
   2856 				continue;
   2857 			}
   2858 
   2859 			/* are we in the wedges pass ? */
   2860 			wedge = device_is_a(dv, "dk");
   2861 			if (wedge != dowedges) {
   2862 				continue;
   2863 			}
   2864 
   2865 			/* need to find the device_name_to_block_device_major stuff */
   2866 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2867 
   2868 			rf_part_found = 0; /*No raid partition as yet*/
   2869 
   2870 			/* get a vnode for the raw partition of this disk */
   2871 			bminor = minor(device_unit(dv));
   2872 			dev = wedge ? makedev(bmajor, bminor) :
   2873 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2874 			if (bdevvp(dev, &vp))
   2875 				panic("RAID can't alloc vnode");
   2876 
   2877 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2878 
   2879 			if (error) {
   2880 				/* "Who cares."  Continue looking
   2881 				   for something that exists*/
   2882 				vput(vp);
   2883 				continue;
   2884 			}
   2885 
   2886 			error = getdisksize(vp, &numsecs, &secsize);
   2887 			if (error) {
   2888 				/*
   2889 				 * Pseudo devices like vnd and cgd can be
   2890 				 * opened but may still need some configuration.
   2891 				 * Ignore these quietly.
   2892 				 */
   2893 				if (error != ENXIO)
   2894 					printf("RAIDframe: can't get disk size"
   2895 					    " for dev %s (%d)\n",
   2896 					    device_xname(dv), error);
   2897 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2898 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2899 				vput(vp);
   2900 				continue;
   2901 			}
   2902 			if (wedge) {
   2903 				struct dkwedge_info dkw;
   2904 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2905 				    NOCRED);
   2906 				if (error) {
   2907 					printf("RAIDframe: can't get wedge info for "
   2908 					    "dev %s (%d)\n", device_xname(dv), error);
   2909 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2910 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2911 					vput(vp);
   2912 					continue;
   2913 				}
   2914 
   2915 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2916 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2917 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2918 					vput(vp);
   2919 					continue;
   2920 				}
   2921 
   2922 				ac_list = rf_get_component(ac_list, dev, vp,
   2923 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2924 				rf_part_found = 1; /*There is a raid component on this disk*/
   2925 				continue;
   2926 			}
   2927 
   2928 			/* Ok, the disk exists.  Go get the disklabel. */
   2929 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2930 			if (error) {
   2931 				/*
   2932 				 * XXX can't happen - open() would
   2933 				 * have errored out (or faked up one)
   2934 				 */
   2935 				if (error != ENOTTY)
   2936 					printf("RAIDframe: can't get label for dev "
   2937 					    "%s (%d)\n", device_xname(dv), error);
   2938 			}
   2939 
   2940 			/* don't need this any more.  We'll allocate it again
   2941 			   a little later if we really do... */
   2942 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2943 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2944 			vput(vp);
   2945 
   2946 			if (error)
   2947 				continue;
   2948 
   2949 			rf_part_found = 0; /*No raid partitions yet*/
   2950 			for (i = 0; i < label.d_npartitions; i++) {
   2951 				char cname[sizeof(ac_list->devname)];
   2952 
   2953 				/* We only support partitions marked as RAID */
   2954 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2955 					continue;
   2956 
   2957 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2958 				if (bdevvp(dev, &vp))
   2959 					panic("RAID can't alloc vnode");
   2960 
   2961 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2962 				if (error) {
   2963 					/* Whatever... */
   2964 					vput(vp);
   2965 					continue;
   2966 				}
   2967 				snprintf(cname, sizeof(cname), "%s%c",
   2968 				    device_xname(dv), 'a' + i);
   2969 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2970 					label.d_partitions[i].p_size, numsecs, secsize);
   2971 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2972 			}
   2973 
   2974 			/*
   2975 			 *If there is no raid component on this disk, either in a
   2976 			 *disklabel or inside a wedge, check the raw partition as well,
   2977 			 *as it is possible to configure raid components on raw disk
   2978 			 *devices.
   2979 			 */
   2980 
   2981 			if (!rf_part_found) {
   2982 				char cname[sizeof(ac_list->devname)];
   2983 
   2984 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2985 				if (bdevvp(dev, &vp))
   2986 					panic("RAID can't alloc vnode");
   2987 
   2988 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2989 				if (error) {
   2990 					/* Whatever... */
   2991 					vput(vp);
   2992 					continue;
   2993 				}
   2994 				snprintf(cname, sizeof(cname), "%s%c",
   2995 				    device_xname(dv), 'a' + RAW_PART);
   2996 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2997 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2998 			}
   2999 		}
   3000 		deviter_release(&di);
   3001 	}
   3002 	return ac_list;
   3003 }
   3004 
   3005 
   3006 int
   3007 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3008 {
   3009 
   3010 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3011 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3012 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3013 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3014 	    clabel->row >=0 &&
   3015 	    clabel->column >= 0 &&
   3016 	    clabel->num_rows > 0 &&
   3017 	    clabel->num_columns > 0 &&
   3018 	    clabel->row < clabel->num_rows &&
   3019 	    clabel->column < clabel->num_columns &&
   3020 	    clabel->blockSize > 0 &&
   3021 	    /*
   3022 	     * numBlocksHi may contain garbage, but it is ok since
   3023 	     * the type is unsigned.  If it is really garbage,
   3024 	     * rf_fix_old_label_size() will fix it.
   3025 	     */
   3026 	    rf_component_label_numblocks(clabel) > 0) {
   3027 		/*
   3028 		 * label looks reasonable enough...
   3029 		 * let's make sure it has no old garbage.
   3030 		 */
   3031 		if (numsecs)
   3032 			rf_fix_old_label_size(clabel, numsecs);
   3033 		return(1);
   3034 	}
   3035 	return(0);
   3036 }
   3037 
   3038 
   3039 /*
   3040  * For reasons yet unknown, some old component labels have garbage in
   3041  * the newer numBlocksHi region, and this causes lossage.  Since those
   3042  * disks will also have numsecs set to less than 32 bits of sectors,
   3043  * we can determine when this corruption has occurred, and fix it.
   3044  *
   3045  * The exact same problem, with the same unknown reason, happens to
   3046  * the partitionSizeHi member as well.
   3047  */
   3048 static void
   3049 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3050 {
   3051 
   3052 	if (numsecs < ((uint64_t)1 << 32)) {
   3053 		if (clabel->numBlocksHi) {
   3054 			printf("WARNING: total sectors < 32 bits, yet "
   3055 			       "numBlocksHi set\n"
   3056 			       "WARNING: resetting numBlocksHi to zero.\n");
   3057 			clabel->numBlocksHi = 0;
   3058 		}
   3059 
   3060 		if (clabel->partitionSizeHi) {
   3061 			printf("WARNING: total sectors < 32 bits, yet "
   3062 			       "partitionSizeHi set\n"
   3063 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3064 			clabel->partitionSizeHi = 0;
   3065 		}
   3066 	}
   3067 }
   3068 
   3069 
#ifdef DEBUG
/*
 * Print the main fields of a component label on the console.
 * Only compiled when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Indexed with (root_partition & 3), hence exactly four entries. */
	static const char *root_mode[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblocks = rf_component_label_numblocks(clabel);

	/* Geometry of the set and this component's place in it. */
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);

	/* Identification and state. */
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);

	/* Layout parameters. */
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, nblocks);

	/* Autoconfiguration settings. */
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n",
	       root_mode[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3103 
   3104 RF_ConfigSet_t *
   3105 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3106 {
   3107 	RF_AutoConfig_t *ac;
   3108 	RF_ConfigSet_t *config_sets;
   3109 	RF_ConfigSet_t *cset;
   3110 	RF_AutoConfig_t *ac_next;
   3111 
   3112 
   3113 	config_sets = NULL;
   3114 
   3115 	/* Go through the AutoConfig list, and figure out which components
   3116 	   belong to what sets.  */
   3117 	ac = ac_list;
   3118 	while(ac!=NULL) {
   3119 		/* we're going to putz with ac->next, so save it here
   3120 		   for use at the end of the loop */
   3121 		ac_next = ac->next;
   3122 
   3123 		if (config_sets == NULL) {
   3124 			/* will need at least this one... */
   3125 			config_sets = (RF_ConfigSet_t *)
   3126 				malloc(sizeof(RF_ConfigSet_t),
   3127 				       M_RAIDFRAME, M_NOWAIT);
   3128 			if (config_sets == NULL) {
   3129 				panic("rf_create_auto_sets: No memory!");
   3130 			}
   3131 			/* this one is easy :) */
   3132 			config_sets->ac = ac;
   3133 			config_sets->next = NULL;
   3134 			config_sets->rootable = 0;
   3135 			ac->next = NULL;
   3136 		} else {
   3137 			/* which set does this component fit into? */
   3138 			cset = config_sets;
   3139 			while(cset!=NULL) {
   3140 				if (rf_does_it_fit(cset, ac)) {
   3141 					/* looks like it matches... */
   3142 					ac->next = cset->ac;
   3143 					cset->ac = ac;
   3144 					break;
   3145 				}
   3146 				cset = cset->next;
   3147 			}
   3148 			if (cset==NULL) {
   3149 				/* didn't find a match above... new set..*/
   3150 				cset = (RF_ConfigSet_t *)
   3151 					malloc(sizeof(RF_ConfigSet_t),
   3152 					       M_RAIDFRAME, M_NOWAIT);
   3153 				if (cset == NULL) {
   3154 					panic("rf_create_auto_sets: No memory!");
   3155 				}
   3156 				cset->ac = ac;
   3157 				ac->next = NULL;
   3158 				cset->next = config_sets;
   3159 				cset->rootable = 0;
   3160 				config_sets = cset;
   3161 			}
   3162 		}
   3163 		ac = ac_next;
   3164 	}
   3165 
   3166 
   3167 	return(config_sets);
   3168 }
   3169 
   3170 static int
   3171 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3172 {
   3173 	RF_ComponentLabel_t *clabel1, *clabel2;
   3174 
   3175 	/* If this one matches the *first* one in the set, that's good
   3176 	   enough, since the other members of the set would have been
   3177 	   through here too... */
   3178 	/* note that we are not checking partitionSize here..
   3179 
   3180 	   Note that we are also not checking the mod_counters here.
   3181 	   If everything else matches except the mod_counter, that's
   3182 	   good enough for this test.  We will deal with the mod_counters
   3183 	   a little later in the autoconfiguration process.
   3184 
   3185 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3186 
   3187 	   The reason we don't check for this is that failed disks
   3188 	   will have lower modification counts.  If those disks are
   3189 	   not added to the set they used to belong to, then they will
   3190 	   form their own set, which may result in 2 different sets,
   3191 	   for example, competing to be configured at raid0, and
   3192 	   perhaps competing to be the root filesystem set.  If the
   3193 	   wrong ones get configured, or both attempt to become /,
   3194 	   weird behaviour and or serious lossage will occur.  Thus we
   3195 	   need to bring them into the fold here, and kick them out at
   3196 	   a later point.
   3197 
   3198 	*/
   3199 
   3200 	clabel1 = cset->ac->clabel;
   3201 	clabel2 = ac->clabel;
   3202 	if ((clabel1->version == clabel2->version) &&
   3203 	    (clabel1->serial_number == clabel2->serial_number) &&
   3204 	    (clabel1->num_rows == clabel2->num_rows) &&
   3205 	    (clabel1->num_columns == clabel2->num_columns) &&
   3206 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3207 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3208 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3209 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3210 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3211 	    (clabel1->blockSize == clabel2->blockSize) &&
   3212 	    rf_component_label_numblocks(clabel1) ==
   3213 	    rf_component_label_numblocks(clabel2) &&
   3214 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3215 	    (clabel1->root_partition == clabel2->root_partition) &&
   3216 	    (clabel1->last_unit == clabel2->last_unit) &&
   3217 	    (clabel1->config_order == clabel2->config_order)) {
   3218 		/* if it get's here, it almost *has* to be a match */
   3219 	} else {
   3220 		/* it's not consistent with somebody in the set..
   3221 		   punt */
   3222 		return(0);
   3223 	}
   3224 	/* all was fine.. it must fit... */
   3225 	return(1);
   3226 }
   3227 
   3228 int
   3229 rf_have_enough_components(RF_ConfigSet_t *cset)
   3230 {
   3231 	RF_AutoConfig_t *ac;
   3232 	RF_AutoConfig_t *auto_config;
   3233 	RF_ComponentLabel_t *clabel;
   3234 	int c;
   3235 	int num_cols;
   3236 	int num_missing;
   3237 	int mod_counter;
   3238 	int mod_counter_found;
   3239 	int even_pair_failed;
   3240 	char parity_type;
   3241 
   3242 
   3243 	/* check to see that we have enough 'live' components
   3244 	   of this set.  If so, we can configure it if necessary */
   3245 
   3246 	num_cols = cset->ac->clabel->num_columns;
   3247 	parity_type = cset->ac->clabel->parityConfig;
   3248 
   3249 	/* XXX Check for duplicate components!?!?!? */
   3250 
   3251 	/* Determine what the mod_counter is supposed to be for this set. */
   3252 
   3253 	mod_counter_found = 0;
   3254 	mod_counter = 0;
   3255 	ac = cset->ac;
   3256 	while(ac!=NULL) {
   3257 		if (mod_counter_found==0) {
   3258 			mod_counter = ac->clabel->mod_counter;
   3259 			mod_counter_found = 1;
   3260 		} else {
   3261 			if (ac->clabel->mod_counter > mod_counter) {
   3262 				mod_counter = ac->clabel->mod_counter;
   3263 			}
   3264 		}
   3265 		ac = ac->next;
   3266 	}
   3267 
   3268 	num_missing = 0;
   3269 	auto_config = cset->ac;
   3270 
   3271 	even_pair_failed = 0;
   3272 	for(c=0; c<num_cols; c++) {
   3273 		ac = auto_config;
   3274 		while(ac!=NULL) {
   3275 			if ((ac->clabel->column == c) &&
   3276 			    (ac->clabel->mod_counter == mod_counter)) {
   3277 				/* it's this one... */
   3278 #ifdef DEBUG
   3279 				printf("Found: %s at %d\n",
   3280 				       ac->devname,c);
   3281 #endif
   3282 				break;
   3283 			}
   3284 			ac=ac->next;
   3285 		}
   3286 		if (ac==NULL) {
   3287 				/* Didn't find one here! */
   3288 				/* special case for RAID 1, especially
   3289 				   where there are more than 2
   3290 				   components (where RAIDframe treats
   3291 				   things a little differently :( ) */
   3292 			if (parity_type == '1') {
   3293 				if (c%2 == 0) { /* even component */
   3294 					even_pair_failed = 1;
   3295 				} else { /* odd component.  If
   3296 					    we're failed, and
   3297 					    so is the even
   3298 					    component, it's
   3299 					    "Good Night, Charlie" */
   3300 					if (even_pair_failed == 1) {
   3301 						return(0);
   3302 					}
   3303 				}
   3304 			} else {
   3305 				/* normal accounting */
   3306 				num_missing++;
   3307 			}
   3308 		}
   3309 		if ((parity_type == '1') && (c%2 == 1)) {
   3310 				/* Just did an even component, and we didn't
   3311 				   bail.. reset the even_pair_failed flag,
   3312 				   and go on to the next component.... */
   3313 			even_pair_failed = 0;
   3314 		}
   3315 	}
   3316 
   3317 	clabel = cset->ac->clabel;
   3318 
   3319 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3320 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3321 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3322 		/* XXX this needs to be made *much* more general */
   3323 		/* Too many failures */
   3324 		return(0);
   3325 	}
   3326 	/* otherwise, all is well, and we've got enough to take a kick
   3327 	   at autoconfiguring this set */
   3328 	return(1);
   3329 }
   3330 
   3331 void
   3332 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3333 			RF_Raid_t *raidPtr)
   3334 {
   3335 	RF_ComponentLabel_t *clabel;
   3336 	int i;
   3337 
   3338 	clabel = ac->clabel;
   3339 
   3340 	/* 1. Fill in the common stuff */
   3341 	config->numRow = clabel->num_rows = 1;
   3342 	config->numCol = clabel->num_columns;
   3343 	config->numSpare = 0; /* XXX should this be set here? */
   3344 	config->sectPerSU = clabel->sectPerSU;
   3345 	config->SUsPerPU = clabel->SUsPerPU;
   3346 	config->SUsPerRU = clabel->SUsPerRU;
   3347 	config->parityConfig = clabel->parityConfig;
   3348 	/* XXX... */
   3349 	strcpy(config->diskQueueType,"fifo");
   3350 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3351 	config->layoutSpecificSize = 0; /* XXX ?? */
   3352 
   3353 	while(ac!=NULL) {
   3354 		/* row/col values will be in range due to the checks
   3355 		   in reasonable_label() */
   3356 		strcpy(config->devnames[0][ac->clabel->column],
   3357 		       ac->devname);
   3358 		ac = ac->next;
   3359 	}
   3360 
   3361 	for(i=0;i<RF_MAXDBGV;i++) {
   3362 		config->debugVars[i][0] = 0;
   3363 	}
   3364 }
   3365 
   3366 int
   3367 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3368 {
   3369 	RF_ComponentLabel_t *clabel;
   3370 	int column;
   3371 	int sparecol;
   3372 
   3373 	raidPtr->autoconfigure = new_value;
   3374 
   3375 	for(column=0; column<raidPtr->numCol; column++) {
   3376 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3377 			clabel = raidget_component_label(raidPtr, column);
   3378 			clabel->autoconfigure = new_value;
   3379 			raidflush_component_label(raidPtr, column);
   3380 		}
   3381 	}
   3382 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3383 		sparecol = raidPtr->numCol + column;
   3384 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3385 			clabel = raidget_component_label(raidPtr, sparecol);
   3386 			clabel->autoconfigure = new_value;
   3387 			raidflush_component_label(raidPtr, sparecol);
   3388 		}
   3389 	}
   3390 	return(new_value);
   3391 }
   3392 
   3393 int
   3394 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3395 {
   3396 	RF_ComponentLabel_t *clabel;
   3397 	int column;
   3398 	int sparecol;
   3399 
   3400 	raidPtr->root_partition = new_value;
   3401 	for(column=0; column<raidPtr->numCol; column++) {
   3402 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3403 			clabel = raidget_component_label(raidPtr, column);
   3404 			clabel->root_partition = new_value;
   3405 			raidflush_component_label(raidPtr, column);
   3406 		}
   3407 	}
   3408 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3409 		sparecol = raidPtr->numCol + column;
   3410 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3411 			clabel = raidget_component_label(raidPtr, sparecol);
   3412 			clabel->root_partition = new_value;
   3413 			raidflush_component_label(raidPtr, sparecol);
   3414 		}
   3415 	}
   3416 	return(new_value);
   3417 }
   3418 
   3419 void
   3420 rf_release_all_vps(RF_ConfigSet_t *cset)
   3421 {
   3422 	RF_AutoConfig_t *ac;
   3423 
   3424 	ac = cset->ac;
   3425 	while(ac!=NULL) {
   3426 		/* Close the vp, and give it back */
   3427 		if (ac->vp) {
   3428 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3429 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3430 			vput(ac->vp);
   3431 			ac->vp = NULL;
   3432 		}
   3433 		ac = ac->next;
   3434 	}
   3435 }
   3436 
   3437 
   3438 void
   3439 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3440 {
   3441 	RF_AutoConfig_t *ac;
   3442 	RF_AutoConfig_t *next_ac;
   3443 
   3444 	ac = cset->ac;
   3445 	while(ac!=NULL) {
   3446 		next_ac = ac->next;
   3447 		/* nuke the label */
   3448 		free(ac->clabel, M_RAIDFRAME);
   3449 		/* cleanup the config structure */
   3450 		free(ac, M_RAIDFRAME);
   3451 		/* "next.." */
   3452 		ac = next_ac;
   3453 	}
   3454 	/* and, finally, nuke the config set */
   3455 	free(cset, M_RAIDFRAME);
   3456 }
   3457 
   3458 
   3459 void
   3460 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3461 {
   3462 	/* current version number */
   3463 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3464 	clabel->serial_number = raidPtr->serial_number;
   3465 	clabel->mod_counter = raidPtr->mod_counter;
   3466 
   3467 	clabel->num_rows = 1;
   3468 	clabel->num_columns = raidPtr->numCol;
   3469 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3470 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3471 
   3472 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3473 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3474 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3475 
   3476 	clabel->blockSize = raidPtr->bytesPerSector;
   3477 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3478 
   3479 	/* XXX not portable */
   3480 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3481 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3482 	clabel->autoconfigure = raidPtr->autoconfigure;
   3483 	clabel->root_partition = raidPtr->root_partition;
   3484 	clabel->last_unit = raidPtr->raidid;
   3485 	clabel->config_order = raidPtr->config_order;
   3486 
   3487 #ifndef RF_NO_PARITY_MAP
   3488 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3489 #endif
   3490 }
   3491 
   3492 struct raid_softc *
   3493 rf_auto_config_set(RF_ConfigSet_t *cset)
   3494 {
   3495 	RF_Raid_t *raidPtr;
   3496 	RF_Config_t *config;
   3497 	int raidID;
   3498 	struct raid_softc *sc;
   3499 
   3500 #ifdef DEBUG
   3501 	printf("RAID autoconfigure\n");
   3502 #endif
   3503 
   3504 	/* 1. Create a config structure */
   3505 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3506 	if (config == NULL) {
   3507 		printf("%s: Out of mem - config!?!?\n", __func__);
   3508 				/* XXX do something more intelligent here. */
   3509 		return NULL;
   3510 	}
   3511 
   3512 	/*
   3513 	   2. Figure out what RAID ID this one is supposed to live at
   3514 	   See if we can get the same RAID dev that it was configured
   3515 	   on last time..
   3516 	*/
   3517 
   3518 	raidID = cset->ac->clabel->last_unit;
   3519 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
   3520 	     sc = raidget(++raidID, false))
   3521 		continue;
   3522 #ifdef DEBUG
   3523 	printf("Configuring raid%d:\n",raidID);
   3524 #endif
   3525 
   3526 	if (sc == NULL)
   3527 		sc = raidget(raidID, true);
   3528 	if (sc == NULL) {
   3529 		printf("%s: Out of mem - softc!?!?\n", __func__);
   3530 				/* XXX do something more intelligent here. */
   3531 		free(config, M_RAIDFRAME);
   3532 		return NULL;
   3533 	}
   3534 
   3535 	raidPtr = &sc->sc_r;
   3536 
   3537 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3538 	raidPtr->softc = sc;
   3539 	raidPtr->raidid = raidID;
   3540 	raidPtr->openings = RAIDOUTSTANDING;
   3541 
   3542 	/* 3. Build the configuration structure */
   3543 	rf_create_configuration(cset->ac, config, raidPtr);
   3544 
   3545 	/* 4. Do the configuration */
   3546 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3547 		raidinit(sc);
   3548 
   3549 		rf_markalldirty(raidPtr);
   3550 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3551 		switch (cset->ac->clabel->root_partition) {
   3552 		case 1:	/* Force Root */
   3553 		case 2:	/* Soft Root: root when boot partition part of raid */
   3554 			/*
   3555 			 * everything configured just fine.  Make a note
   3556 			 * that this set is eligible to be root,
   3557 			 * or forced to be root
   3558 			 */
   3559 			cset->rootable = cset->ac->clabel->root_partition;
   3560 			/* XXX do this here? */
   3561 			raidPtr->root_partition = cset->rootable;
   3562 			break;
   3563 		default:
   3564 			break;
   3565 		}
   3566 	} else {
   3567 		raidput(sc);
   3568 		sc = NULL;
   3569 	}
   3570 
   3571 	/* 5. Cleanup */
   3572 	free(config, M_RAIDFRAME);
   3573 	return sc;
   3574 }
   3575 
   3576 void
   3577 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3578 	     size_t xmin, size_t xmax)
   3579 {
   3580 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3581 	pool_sethiwat(p, xmax);
   3582 	pool_prime(p, xmin);
   3583 	pool_setlowat(p, xmin);
   3584 }
   3585 
   3586 /*
   3587  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3588  * to see if there is IO pending and if that IO could possibly be done
   3589  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3590  * otherwise.
   3591  *
   3592  */
   3593 int
   3594 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3595 {
   3596 	struct raid_softc *rs;
   3597 	struct dk_softc *dksc;
   3598 
   3599 	rs = raidPtr->softc;
   3600 	dksc = &rs->sc_dksc;
   3601 
   3602 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3603 		return 1;
   3604 
   3605 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3606 		/* there is work to do */
   3607 		return 0;
   3608 	}
   3609 	/* default is nothing to do */
   3610 	return 1;
   3611 }
   3612 
   3613 int
   3614 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3615 {
   3616 	uint64_t numsecs;
   3617 	unsigned secsize;
   3618 	int error;
   3619 
   3620 	error = getdisksize(vp, &numsecs, &secsize);
   3621 	if (error == 0) {
   3622 		diskPtr->blockSize = secsize;
   3623 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3624 		diskPtr->partitionSize = numsecs;
   3625 		return 0;
   3626 	}
   3627 	return error;
   3628 }
   3629 
   3630 static int
   3631 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3632 {
   3633 	return 1;
   3634 }
   3635 
   3636 static void
   3637 raid_attach(device_t parent, device_t self, void *aux)
   3638 {
   3639 }
   3640 
   3641 
   3642 static int
   3643 raid_detach(device_t self, int flags)
   3644 {
   3645 	int error;
   3646 	struct raid_softc *rs = raidsoftc(self);
   3647 
   3648 	if (rs == NULL)
   3649 		return ENXIO;
   3650 
   3651 	if ((error = raidlock(rs)) != 0)
   3652 		return (error);
   3653 
   3654 	error = raid_detach_unlocked(rs);
   3655 
   3656 	raidunlock(rs);
   3657 
   3658 	/* XXX raid can be referenced here */
   3659 
   3660 	if (error)
   3661 		return error;
   3662 
   3663 	/* Free the softc */
   3664 	raidput(rs);
   3665 
   3666 	return 0;
   3667 }
   3668 
   3669 static void
   3670 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3671 {
   3672 	struct dk_softc *dksc = &rs->sc_dksc;
   3673 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3674 
   3675 	memset(dg, 0, sizeof(*dg));
   3676 
   3677 	dg->dg_secperunit = raidPtr->totalSectors;
   3678 	dg->dg_secsize = raidPtr->bytesPerSector;
   3679 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3680 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3681 
   3682 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3683 }
   3684 
   3685 /*
   3686  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3687  * We end up returning whatever error was returned by the first cache flush
   3688  * that fails.
   3689  */
   3690 
   3691 int
   3692 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3693 {
   3694 	int c, sparecol;
   3695 	int e,error;
   3696 	int force = 1;
   3697 
   3698 	error = 0;
   3699 	for (c = 0; c < raidPtr->numCol; c++) {
   3700 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3701 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3702 					  &force, FWRITE, NOCRED);
   3703 			if (e) {
   3704 				if (e != ENODEV)
   3705 					printf("raid%d: cache flush to component %s failed.\n",
   3706 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3707 				if (error == 0) {
   3708 					error = e;
   3709 				}
   3710 			}
   3711 		}
   3712 	}
   3713 
   3714 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3715 		sparecol = raidPtr->numCol + c;
   3716 		/* Need to ensure that the reconstruct actually completed! */
   3717 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3718 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3719 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3720 			if (e) {
   3721 				if (e != ENODEV)
   3722 					printf("raid%d: cache flush to component %s failed.\n",
   3723 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3724 				if (error == 0) {
   3725 					error = e;
   3726 				}
   3727 			}
   3728 		}
   3729 	}
   3730 	return error;
   3731 }
   3732 
   3733 /*
   3734  * Module interface
   3735  */
   3736 
   3737 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3738 
   3739 #ifdef _MODULE
   3740 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3741 #endif
   3742 
   3743 static int raid_modcmd(modcmd_t, void *);
   3744 static int raid_modcmd_init(void);
   3745 static int raid_modcmd_fini(void);
   3746 
   3747 static int
   3748 raid_modcmd(modcmd_t cmd, void *data)
   3749 {
   3750 	int error;
   3751 
   3752 	error = 0;
   3753 	switch (cmd) {
   3754 	case MODULE_CMD_INIT:
   3755 		error = raid_modcmd_init();
   3756 		break;
   3757 	case MODULE_CMD_FINI:
   3758 		error = raid_modcmd_fini();
   3759 		break;
   3760 	default:
   3761 		error = ENOTTY;
   3762 		break;
   3763 	}
   3764 	return error;
   3765 }
   3766 
   3767 static int
   3768 raid_modcmd_init(void)
   3769 {
   3770 	int error;
   3771 #ifdef _MODULE
   3772 	int bmajor, cmajor;
   3773 #endif
   3774 
   3775 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3776 	mutex_enter(&raid_lock);
   3777 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3778 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3779 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3780 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3781 
   3782 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3783 #endif
   3784 
   3785 #ifdef _MODULE
   3786 	bmajor = cmajor = -1;
   3787 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3788 	    &raid_cdevsw, &cmajor);
   3789 	if (error != 0) {
   3790 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3791 		mutex_exit(&raid_lock);
   3792 		return error;
   3793 	}
   3794 	error = config_cfdriver_attach(&raid_cd);
   3795 	if (error != 0) {
   3796 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3797 		    __func__, error);
   3798 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3799 		mutex_exit(&raid_lock);
   3800 		return error;
   3801 	}
   3802 #endif
   3803 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3804 	if (error != 0) {
   3805 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3806 		    __func__, error);
   3807 #ifdef _MODULE
   3808 		config_cfdriver_detach(&raid_cd);
   3809 #endif
   3810 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3811 		mutex_exit(&raid_lock);
   3812 		return error;
   3813 	}
   3814 
   3815 	raidautoconfigdone = false;
   3816 
   3817 	mutex_exit(&raid_lock);
   3818 
   3819 	if (error == 0) {
   3820 		if (rf_BootRaidframe(true) == 0)
   3821 			aprint_verbose("Kernelized RAIDframe activated\n");
   3822 		else
   3823 			panic("Serious error activating RAID!!");
   3824 	}
   3825 
   3826 	/*
   3827 	 * Register a finalizer which will be used to auto-config RAID
   3828 	 * sets once all real hardware devices have been found.
   3829 	 */
   3830 	error = config_finalize_register(NULL, rf_autoconfig);
   3831 	if (error != 0) {
   3832 		aprint_error("WARNING: unable to register RAIDframe "
   3833 		    "finalizer\n");
   3834 		error = 0;
   3835 	}
   3836 
   3837 	return error;
   3838 }
   3839 
   3840 static int
   3841 raid_modcmd_fini(void)
   3842 {
   3843 	int error;
   3844 
   3845 	mutex_enter(&raid_lock);
   3846 
   3847 	/* Don't allow unload if raid device(s) exist.  */
   3848 	if (!LIST_EMPTY(&raids)) {
   3849 		mutex_exit(&raid_lock);
   3850 		return EBUSY;
   3851 	}
   3852 
   3853 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
   3854 	if (error != 0) {
   3855 		aprint_error("%s: cannot detach cfattach\n",__func__);
   3856 		mutex_exit(&raid_lock);
   3857 		return error;
   3858 	}
   3859 #ifdef _MODULE
   3860 	error = config_cfdriver_detach(&raid_cd);
   3861 	if (error != 0) {
   3862 		aprint_error("%s: cannot detach cfdriver\n",__func__);
   3863 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3864 		mutex_exit(&raid_lock);
   3865 		return error;
   3866 	}
   3867 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3868 	if (error != 0) {
   3869 		aprint_error("%s: cannot detach devsw\n",__func__);
   3870 		config_cfdriver_attach(&raid_cd);
   3871 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3872 		mutex_exit(&raid_lock);
   3873 		return error;
   3874 	}
   3875 #endif
   3876 	rf_BootRaidframe(false);
   3877 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3878 	rf_destroy_mutex2(rf_sparet_wait_mutex);
   3879 	rf_destroy_cond2(rf_sparet_wait_cv);
   3880 	rf_destroy_cond2(rf_sparet_resp_cv);
   3881 #endif
   3882 	mutex_exit(&raid_lock);
   3883 	mutex_destroy(&raid_lock);
   3884 
   3885 	return error;
   3886 }
   3887