Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.351
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.351 2017/11/09 01:02:56 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.351 2017/11/09 01:02:56 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #include "ioconf.h"
    156 
/*
 * Debug output helpers.
 *
 * db1_printf((fmt, ...)) takes a fully parenthesized printf argument list.
 * With DEBUG it prints only while rf_kdebug_level is greater than zero;
 * without DEBUG it compiles to an empty statement.  Both forms are wrapped
 * in do { } while (0) so the macro behaves as exactly one statement and is
 * safe in an unbraced if/else.
 *
 * DPRINTF(fmt, ...) needs at least one argument after the format string.
 * It prints only when DEBUG_ROOT is defined and otherwise expands to nothing.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
    169 
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
/*
 * Spare-table state; only compiled in when
 * RF_INCLUDE_PARITY_DECLUSTERING_DS is enabled.
 */
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

/* requests to install a spare table */
static RF_SparetWait_t *rf_sparet_wait_queue;
/* responses from installation process */
static RF_SparetWait_t *rf_sparet_resp_queue;
#endif
    180 
    181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    182 
    183 /* prototypes */
    184 static void KernelWakeupFunc(struct buf *);
    185 static void InitBP(struct buf *, struct vnode *, unsigned,
    186     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    187     void *, int, struct proc *);
    188 struct raid_softc;
    189 static void raidinit(struct raid_softc *);
    190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    191 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    192 
    193 static int raid_match(device_t, cfdata_t, void *);
    194 static void raid_attach(device_t, device_t, void *);
    195 static int raid_detach(device_t, int);
    196 
    197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t);
    199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    200     daddr_t, daddr_t, int);
    201 
    202 static int raidwrite_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 static int raidread_component_label(unsigned,
    205     dev_t, struct vnode *, RF_ComponentLabel_t *);
    206 
    207 static int raid_diskstart(device_t, struct buf *bp);
    208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    209 static int raid_lastclose(device_t);
    210 
    211 static dev_type_open(raidopen);
    212 static dev_type_close(raidclose);
    213 static dev_type_read(raidread);
    214 static dev_type_write(raidwrite);
    215 static dev_type_ioctl(raidioctl);
    216 static dev_type_strategy(raidstrategy);
    217 static dev_type_dump(raiddump);
    218 static dev_type_size(raidsize);
    219 
    220 const struct bdevsw raid_bdevsw = {
    221 	.d_open = raidopen,
    222 	.d_close = raidclose,
    223 	.d_strategy = raidstrategy,
    224 	.d_ioctl = raidioctl,
    225 	.d_dump = raiddump,
    226 	.d_psize = raidsize,
    227 	.d_discard = nodiscard,
    228 	.d_flag = D_DISK
    229 };
    230 
    231 const struct cdevsw raid_cdevsw = {
    232 	.d_open = raidopen,
    233 	.d_close = raidclose,
    234 	.d_read = raidread,
    235 	.d_write = raidwrite,
    236 	.d_ioctl = raidioctl,
    237 	.d_stop = nostop,
    238 	.d_tty = notty,
    239 	.d_poll = nopoll,
    240 	.d_mmap = nommap,
    241 	.d_kqfilter = nokqfilter,
    242 	.d_discard = nodiscard,
    243 	.d_flag = D_DISK
    244 };
    245 
    246 static struct dkdriver rf_dkdriver = {
    247 	.d_open = raidopen,
    248 	.d_close = raidclose,
    249 	.d_strategy = raidstrategy,
    250 	.d_diskstart = raid_diskstart,
    251 	.d_dumpblocks = raid_dumpblocks,
    252 	.d_lastclose = raid_lastclose,
    253 	.d_minphys = minphys
    254 };
    255 
    256 struct raid_softc {
    257 	struct dk_softc sc_dksc;
    258 	int	sc_unit;
    259 	int     sc_flags;	/* flags */
    260 	int     sc_cflags;	/* configuration flags */
    261 	kmutex_t sc_mutex;	/* interlock mutex */
    262 	kcondvar_t sc_cv;	/* and the condvar */
    263 	uint64_t sc_size;	/* size of the raid device */
    264 	char    sc_xname[20];	/* XXX external name */
    265 	RF_Raid_t sc_r;
    266 	LIST_ENTRY(raid_softc) sc_link;
    267 };
/*
 * Bits stored in raid_softc.sc_flags.
 */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* unit number encoded in a dev_t */
#define	raidunit(x)	DISKUNIT(x)
/* softc pointer recorded in the RF_Raid_t embedded in a device_t's private data */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    278 
    279 extern struct cfdriver raid_cd;
    280 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    281     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    282     DVF_DETACH_SHUTDOWN);
    283 
/*
 * RAIDOUTSTANDING is the number of simultaneous I/Os allowed on one RAID
 * device.  Be aware that large values can let the driver consume a lot of
 * kernel memory, especially on writes and on degraded-mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is re-used immediately, that's still 128K, which when
 * multiplied by say 10 requests, is 1280K, *on top* of the 640K of
 * incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 *
 * The value may be overridden by defining RAIDOUTSTANDING beforehand.
 */

#if !defined(RAIDOUTSTANDING)
#define RAIDOUTSTANDING   6
#endif
    304 
/* dev_t naming the raw partition (RAW_PART) of the unit that `dev' belongs to */
#define RAIDLABELDEV(dev) \
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    307 
    308 /* declared here, and made public, for the benefit of KVM stuff.. */
    309 
    310 static int raidlock(struct raid_softc *);
    311 static void raidunlock(struct raid_softc *);
    312 
    313 static int raid_detach_unlocked(struct raid_softc *);
    314 
    315 static void rf_markalldirty(RF_Raid_t *);
    316 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    317 
    318 void rf_ReconThread(struct rf_recon_req *);
    319 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    320 void rf_CopybackThread(RF_Raid_t *raidPtr);
    321 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    322 int rf_autoconfig(device_t);
    323 void rf_buildroothack(RF_ConfigSet_t *);
    324 
    325 RF_AutoConfig_t *rf_find_raid_components(void);
    326 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    327 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    328 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    329 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    330 int rf_set_autoconfig(RF_Raid_t *, int);
    331 int rf_set_rootpartition(RF_Raid_t *, int);
    332 void rf_release_all_vps(RF_ConfigSet_t *);
    333 void rf_cleanup_config_set(RF_ConfigSet_t *);
    334 int rf_have_enough_components(RF_ConfigSet_t *);
    335 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    336 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    337 
/*
 * Autoconfiguration switch (debugging, mostly): rf_autoconfig() does
 * nothing while this is 0.  It defaults to 1 when the kernel is built with
 * the RAID_AUTOCONFIG option and to 0 otherwise.
 */
#if defined(RAID_AUTOCONFIG)
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif

/* set by rf_autoconfig() on its first real run so it never runs twice */
static bool raidautoconfigdone = false;
    349 
    350 struct RF_Pools_s rf_pools;
    351 
    352 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    353 static kmutex_t raid_lock;
    354 
    355 static struct raid_softc *
    356 raidcreate(int unit) {
    357 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    358 	sc->sc_unit = unit;
    359 	cv_init(&sc->sc_cv, "raidunit");
    360 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    361 	return sc;
    362 }
    363 
    364 static void
    365 raiddestroy(struct raid_softc *sc) {
    366 	cv_destroy(&sc->sc_cv);
    367 	mutex_destroy(&sc->sc_mutex);
    368 	kmem_free(sc, sizeof(*sc));
    369 }
    370 
    371 static struct raid_softc *
    372 raidget(int unit, bool create) {
    373 	struct raid_softc *sc;
    374 	if (unit < 0) {
    375 #ifdef DIAGNOSTIC
    376 		panic("%s: unit %d!", __func__, unit);
    377 #endif
    378 		return NULL;
    379 	}
    380 	mutex_enter(&raid_lock);
    381 	LIST_FOREACH(sc, &raids, sc_link) {
    382 		if (sc->sc_unit == unit) {
    383 			mutex_exit(&raid_lock);
    384 			return sc;
    385 		}
    386 	}
    387 	mutex_exit(&raid_lock);
    388 	if (!create)
    389 		return NULL;
    390 	if ((sc = raidcreate(unit)) == NULL)
    391 		return NULL;
    392 	mutex_enter(&raid_lock);
    393 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    394 	mutex_exit(&raid_lock);
    395 	return sc;
    396 }
    397 
    398 static void
    399 raidput(struct raid_softc *sc) {
    400 	mutex_enter(&raid_lock);
    401 	LIST_REMOVE(sc, sc_link);
    402 	mutex_exit(&raid_lock);
    403 	raiddestroy(sc);
    404 }
    405 
/*
 * Attach entry point.  Deliberately empty: device attachment and the
 * associated initialization now occur as part of module initialization.
 * `num' is ignored.
 */
void
raidattach(int num)
{
	/* nothing to do here */
}
    415 
    416 int
    417 rf_autoconfig(device_t self)
    418 {
    419 	RF_AutoConfig_t *ac_list;
    420 	RF_ConfigSet_t *config_sets;
    421 
    422 	if (!raidautoconfig || raidautoconfigdone == true)
    423 		return (0);
    424 
    425 	/* XXX This code can only be run once. */
    426 	raidautoconfigdone = true;
    427 
    428 #ifdef __HAVE_CPU_BOOTCONF
    429 	/*
    430 	 * 0. find the boot device if needed first so we can use it later
    431 	 * this needs to be done before we autoconfigure any raid sets,
    432 	 * because if we use wedges we are not going to be able to open
    433 	 * the boot device later
    434 	 */
    435 	if (booted_device == NULL)
    436 		cpu_bootconf();
    437 #endif
    438 	/* 1. locate all RAID components on the system */
    439 	aprint_debug("Searching for RAID components...\n");
    440 	ac_list = rf_find_raid_components();
    441 
    442 	/* 2. Sort them into their respective sets. */
    443 	config_sets = rf_create_auto_sets(ac_list);
    444 
    445 	/*
    446 	 * 3. Evaluate each set and configure the valid ones.
    447 	 * This gets done in rf_buildroothack().
    448 	 */
    449 	rf_buildroothack(config_sets);
    450 
    451 	return 1;
    452 }
    453 
    454 static int
    455 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    456 	const char *bootname = device_xname(bdv);
    457 	size_t len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
    478 void
    479 rf_buildroothack(RF_ConfigSet_t *config_sets)
    480 {
    481 	RF_ConfigSet_t *cset;
    482 	RF_ConfigSet_t *next_cset;
    483 	int num_root;
    484 	struct raid_softc *sc, *rsc;
    485 	struct dk_softc *dksc;
    486 
    487 	sc = rsc = NULL;
    488 	num_root = 0;
    489 	cset = config_sets;
    490 	while (cset != NULL) {
    491 		next_cset = cset->next;
    492 		if (rf_have_enough_components(cset) &&
    493 		    cset->ac->clabel->autoconfigure == 1) {
    494 			sc = rf_auto_config_set(cset);
    495 			if (sc != NULL) {
    496 				aprint_debug("raid%d: configured ok\n",
    497 				    sc->sc_unit);
    498 				if (cset->rootable) {
    499 					rsc = sc;
    500 					num_root++;
    501 				}
    502 			} else {
    503 				/* The autoconfig didn't work :( */
    504 				aprint_debug("Autoconfig failed\n");
    505 				rf_release_all_vps(cset);
    506 			}
    507 		} else {
    508 			/* we're not autoconfiguring this set...
    509 			   release the associated resources */
    510 			rf_release_all_vps(cset);
    511 		}
    512 		/* cleanup */
    513 		rf_cleanup_config_set(cset);
    514 		cset = next_cset;
    515 	}
    516 	dksc = &rsc->sc_dksc;
    517 
    518 	/* if the user has specified what the root device should be
    519 	   then we don't touch booted_device or boothowto... */
    520 
    521 	if (rootspec != NULL)
    522 		return;
    523 
    524 	/* we found something bootable... */
    525 
    526 	/*
    527 	 * XXX: The following code assumes that the root raid
    528 	 * is the first ('a') partition. This is about the best
    529 	 * we can do with a BSD disklabel, but we might be able
    530 	 * to do better with a GPT label, by setting a specified
    531 	 * attribute to indicate the root partition. We can then
    532 	 * stash the partition number in the r->root_partition
    533 	 * high bits (the bottom 2 bits are already used). For
    534 	 * now we just set booted_partition to 0 when we override
    535 	 * root.
    536 	 */
    537 	if (num_root == 1) {
    538 		device_t candidate_root;
    539 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    540 			char cname[sizeof(cset->ac->devname)];
    541 			/* XXX: assume partition 'a' first */
    542 			snprintf(cname, sizeof(cname), "%s%c",
    543 			    device_xname(dksc->sc_dev), 'a');
    544 			candidate_root = dkwedge_find_by_wname(cname);
    545 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    546 			    cname);
    547 			if (candidate_root == NULL) {
    548 				/*
    549 				 * If that is not found, because we don't use
    550 				 * disklabel, return the first dk child
    551 				 * XXX: we can skip the 'a' check above
    552 				 * and always do this...
    553 				 */
    554 				size_t i = 0;
    555 				candidate_root = dkwedge_find_by_parent(
    556 				    device_xname(dksc->sc_dev), &i);
    557 			}
    558 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    559 			    candidate_root);
    560 		} else
    561 			candidate_root = dksc->sc_dev;
    562 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    563 		DPRINTF("%s: booted_device=%p root_partition=%d "
    564 		   "contains_boot=%d\n", __func__, booted_device,
    565 		   rsc->sc_r.root_partition,
    566 		   rf_containsboot(&rsc->sc_r, booted_device));
    567 		if (booted_device == NULL ||
    568 		    rsc->sc_r.root_partition == 1 ||
    569 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    570 			booted_device = candidate_root;
    571 			booted_method = "raidframe/single";
    572 			booted_partition = 0;	/* XXX assume 'a' */
    573 		}
    574 	} else if (num_root > 1) {
    575 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    576 		    booted_device);
    577 
    578 		/*
    579 		 * Maybe the MD code can help. If it cannot, then
    580 		 * setroot() will discover that we have no
    581 		 * booted_device and will ask the user if nothing was
    582 		 * hardwired in the kernel config file
    583 		 */
    584 		if (booted_device == NULL)
    585 			return;
    586 
    587 		num_root = 0;
    588 		mutex_enter(&raid_lock);
    589 		LIST_FOREACH(sc, &raids, sc_link) {
    590 			RF_Raid_t *r = &sc->sc_r;
    591 			if (r->valid == 0)
    592 				continue;
    593 
    594 			if (r->root_partition == 0)
    595 				continue;
    596 
    597 			if (rf_containsboot(r, booted_device)) {
    598 				num_root++;
    599 				rsc = sc;
    600 				dksc = &rsc->sc_dksc;
    601 			}
    602 		}
    603 		mutex_exit(&raid_lock);
    604 
    605 		if (num_root == 1) {
    606 			booted_device = dksc->sc_dev;
    607 			booted_method = "raidframe/multi";
    608 			booted_partition = 0;	/* XXX assume 'a' */
    609 		} else {
    610 			/* we can't guess.. require the user to answer... */
    611 			boothowto |= RB_ASKNAME;
    612 		}
    613 	}
    614 }
    615 
    616 static int
    617 raidsize(dev_t dev)
    618 {
    619 	struct raid_softc *rs;
    620 	struct dk_softc *dksc;
    621 	unsigned int unit;
    622 
    623 	unit = raidunit(dev);
    624 	if ((rs = raidget(unit, false)) == NULL)
    625 		return -1;
    626 	dksc = &rs->sc_dksc;
    627 
    628 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    629 		return -1;
    630 
    631 	return dk_size(dksc, dev);
    632 }
    633 
    634 static int
    635 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    636 {
    637 	unsigned int unit;
    638 	struct raid_softc *rs;
    639 	struct dk_softc *dksc;
    640 
    641 	unit = raidunit(dev);
    642 	if ((rs = raidget(unit, false)) == NULL)
    643 		return ENXIO;
    644 	dksc = &rs->sc_dksc;
    645 
    646 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    647 		return ENODEV;
    648 
    649         /*
    650            Note that blkno is relative to this particular partition.
    651            By adding adding RF_PROTECTED_SECTORS, we get a value that
    652 	   is relative to the partition used for the underlying component.
    653         */
    654 	blkno += RF_PROTECTED_SECTORS;
    655 
    656 	return dk_dump(dksc, dev, blkno, va, size);
    657 }
    658 
    659 static int
    660 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    661 {
    662 	struct raid_softc *rs = raidsoftc(dev);
    663 	const struct bdevsw *bdev;
    664 	RF_Raid_t *raidPtr;
    665 	int     c, sparecol, j, scol, dumpto;
    666 	int     error = 0;
    667 
    668 	raidPtr = &rs->sc_r;
    669 
    670 	/* we only support dumping to RAID 1 sets */
    671 	if (raidPtr->Layout.numDataCol != 1 ||
    672 	    raidPtr->Layout.numParityCol != 1)
    673 		return EINVAL;
    674 
    675 	if ((error = raidlock(rs)) != 0)
    676 		return error;
    677 
    678 	/* figure out what device is alive.. */
    679 
    680 	/*
    681 	   Look for a component to dump to.  The preference for the
    682 	   component to dump to is as follows:
    683 	   1) the master
    684 	   2) a used_spare of the master
    685 	   3) the slave
    686 	   4) a used_spare of the slave
    687 	*/
    688 
    689 	dumpto = -1;
    690 	for (c = 0; c < raidPtr->numCol; c++) {
    691 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    692 			/* this might be the one */
    693 			dumpto = c;
    694 			break;
    695 		}
    696 	}
    697 
    698 	/*
    699 	   At this point we have possibly selected a live master or a
    700 	   live slave.  We now check to see if there is a spared
    701 	   master (or a spared slave), if we didn't find a live master
    702 	   or a live slave.
    703 	*/
    704 
    705 	for (c = 0; c < raidPtr->numSpare; c++) {
    706 		sparecol = raidPtr->numCol + c;
    707 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    708 			/* How about this one? */
    709 			scol = -1;
    710 			for(j=0;j<raidPtr->numCol;j++) {
    711 				if (raidPtr->Disks[j].spareCol == sparecol) {
    712 					scol = j;
    713 					break;
    714 				}
    715 			}
    716 			if (scol == 0) {
    717 				/*
    718 				   We must have found a spared master!
    719 				   We'll take that over anything else
    720 				   found so far.  (We couldn't have
    721 				   found a real master before, since
    722 				   this is a used spare, and it's
    723 				   saying that it's replacing the
    724 				   master.)  On reboot (with
    725 				   autoconfiguration turned on)
    726 				   sparecol will become the 1st
    727 				   component (component0) of this set.
    728 				*/
    729 				dumpto = sparecol;
    730 				break;
    731 			} else if (scol != -1) {
    732 				/*
    733 				   Must be a spared slave.  We'll dump
    734 				   to that if we havn't found anything
    735 				   else so far.
    736 				*/
    737 				if (dumpto == -1)
    738 					dumpto = sparecol;
    739 			}
    740 		}
    741 	}
    742 
    743 	if (dumpto == -1) {
    744 		/* we couldn't find any live components to dump to!?!?
    745 		 */
    746 		error = EINVAL;
    747 		goto out;
    748 	}
    749 
    750 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    751 	if (bdev == NULL) {
    752 		error = ENXIO;
    753 		goto out;
    754 	}
    755 
    756 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    757 				blkno, va, nblk * raidPtr->bytesPerSector);
    758 
    759 out:
    760 	raidunlock(rs);
    761 
    762 	return error;
    763 }
    764 
    765 /* ARGSUSED */
    766 static int
    767 raidopen(dev_t dev, int flags, int fmt,
    768     struct lwp *l)
    769 {
    770 	int     unit = raidunit(dev);
    771 	struct raid_softc *rs;
    772 	struct dk_softc *dksc;
    773 	int     error = 0;
    774 	int     part, pmask;
    775 
    776 	if ((rs = raidget(unit, true)) == NULL)
    777 		return ENXIO;
    778 	if ((error = raidlock(rs)) != 0)
    779 		return (error);
    780 
    781 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    782 		error = EBUSY;
    783 		goto bad;
    784 	}
    785 
    786 	dksc = &rs->sc_dksc;
    787 
    788 	part = DISKPART(dev);
    789 	pmask = (1 << part);
    790 
    791 	if (!DK_BUSY(dksc, pmask) &&
    792 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    793 		/* First one... mark things as dirty... Note that we *MUST*
    794 		 have done a configure before this.  I DO NOT WANT TO BE
    795 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    796 		 THAT THEY BELONG TOGETHER!!!!! */
    797 		/* XXX should check to see if we're only open for reading
    798 		   here... If so, we needn't do this, but then need some
    799 		   other way of keeping track of what's happened.. */
    800 
    801 		rf_markalldirty(&rs->sc_r);
    802 	}
    803 
    804 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    805 		error = dk_open(dksc, dev, flags, fmt, l);
    806 
    807 bad:
    808 	raidunlock(rs);
    809 
    810 	return (error);
    811 
    812 
    813 }
    814 
    815 static int
    816 raid_lastclose(device_t self)
    817 {
    818 	struct raid_softc *rs = raidsoftc(self);
    819 
    820 	/* Last one... device is not unconfigured yet.
    821 	   Device shutdown has taken care of setting the
    822 	   clean bits if RAIDF_INITED is not set
    823 	   mark things as clean... */
    824 
    825 	rf_update_component_labels(&rs->sc_r,
    826 	    RF_FINAL_COMPONENT_UPDATE);
    827 
    828 	/* pass to unlocked code */
    829 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    830 		rs->sc_flags |= RAIDF_DETACH;
    831 
    832 	return 0;
    833 }
    834 
    835 /* ARGSUSED */
    836 static int
    837 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    838 {
    839 	int     unit = raidunit(dev);
    840 	struct raid_softc *rs;
    841 	struct dk_softc *dksc;
    842 	cfdata_t cf;
    843 	int     error = 0, do_detach = 0, do_put = 0;
    844 
    845 	if ((rs = raidget(unit, false)) == NULL)
    846 		return ENXIO;
    847 	dksc = &rs->sc_dksc;
    848 
    849 	if ((error = raidlock(rs)) != 0)
    850 		return (error);
    851 
    852 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    853 		error = dk_close(dksc, dev, flags, fmt, l);
    854 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    855 			do_detach = 1;
    856 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    857 		do_put = 1;
    858 
    859 	raidunlock(rs);
    860 
    861 	if (do_detach) {
    862 		/* free the pseudo device attach bits */
    863 		cf = device_cfdata(dksc->sc_dev);
    864 		error = config_detach(dksc->sc_dev, 0);
    865 		if (error == 0)
    866 			free(cf, M_RAIDFRAME);
    867 	} else if (do_put) {
    868 		raidput(rs);
    869 	}
    870 
    871 	return (error);
    872 
    873 }
    874 
/*
 * raid_wakeup -- signal raidPtr->iodone_cv while holding
 * raidPtr->iodone_lock.  The callers in this file use it to get
 * queued I/O scheduled.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    882 
    883 static void
    884 raidstrategy(struct buf *bp)
    885 {
    886 	unsigned int unit;
    887 	struct raid_softc *rs;
    888 	struct dk_softc *dksc;
    889 	RF_Raid_t *raidPtr;
    890 
    891 	unit = raidunit(bp->b_dev);
    892 	if ((rs = raidget(unit, false)) == NULL) {
    893 		bp->b_error = ENXIO;
    894 		goto fail;
    895 	}
    896 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    897 		bp->b_error = ENXIO;
    898 		goto fail;
    899 	}
    900 	dksc = &rs->sc_dksc;
    901 	raidPtr = &rs->sc_r;
    902 
    903 	/* Queue IO only */
    904 	if (dk_strategy_defer(dksc, bp))
    905 		goto done;
    906 
    907 	/* schedule the IO to happen at the next convenient time */
    908 	raid_wakeup(raidPtr);
    909 
    910 done:
    911 	return;
    912 
    913 fail:
    914 	bp->b_resid = bp->b_bcount;
    915 	biodone(bp);
    916 }
    917 
    918 static int
    919 raid_diskstart(device_t dev, struct buf *bp)
    920 {
    921 	struct raid_softc *rs = raidsoftc(dev);
    922 	RF_Raid_t *raidPtr;
    923 
    924 	raidPtr = &rs->sc_r;
    925 	if (!raidPtr->valid) {
    926 		db1_printf(("raid is not valid..\n"));
    927 		return ENODEV;
    928 	}
    929 
    930 	/* XXX */
    931 	bp->b_resid = 0;
    932 
    933 	return raiddoaccess(raidPtr, bp);
    934 }
    935 
    936 void
    937 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    938 {
    939 	struct raid_softc *rs;
    940 	struct dk_softc *dksc;
    941 
    942 	rs = raidPtr->softc;
    943 	dksc = &rs->sc_dksc;
    944 
    945 	dk_done(dksc, bp);
    946 
    947 	rf_lock_mutex2(raidPtr->mutex);
    948 	raidPtr->openings++;
    949 	rf_unlock_mutex2(raidPtr->mutex);
    950 
    951 	/* schedule more IO */
    952 	raid_wakeup(raidPtr);
    953 }
    954 
    955 /* ARGSUSED */
    956 static int
    957 raidread(dev_t dev, struct uio *uio, int flags)
    958 {
    959 	int     unit = raidunit(dev);
    960 	struct raid_softc *rs;
    961 
    962 	if ((rs = raidget(unit, false)) == NULL)
    963 		return ENXIO;
    964 
    965 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    966 		return (ENXIO);
    967 
    968 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    969 
    970 }
    971 
    972 /* ARGSUSED */
    973 static int
    974 raidwrite(dev_t dev, struct uio *uio, int flags)
    975 {
    976 	int     unit = raidunit(dev);
    977 	struct raid_softc *rs;
    978 
    979 	if ((rs = raidget(unit, false)) == NULL)
    980 		return ENXIO;
    981 
    982 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    983 		return (ENXIO);
    984 
    985 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    986 
    987 }
    988 
    989 static int
    990 raid_detach_unlocked(struct raid_softc *rs)
    991 {
    992 	struct dk_softc *dksc = &rs->sc_dksc;
    993 	RF_Raid_t *raidPtr;
    994 	int error;
    995 
    996 	raidPtr = &rs->sc_r;
    997 
    998 	if (DK_BUSY(dksc, 0) ||
    999 	    raidPtr->recon_in_progress != 0 ||
   1000 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1001 	    raidPtr->copyback_in_progress != 0)
   1002 		return EBUSY;
   1003 
   1004 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1005 		return 0;
   1006 
   1007 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1008 
   1009 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1010 		return error;
   1011 
   1012 	rs->sc_flags &= ~RAIDF_INITED;
   1013 
   1014 	/* Kill off any queued buffers */
   1015 	dk_drain(dksc);
   1016 	bufq_free(dksc->sc_bufq);
   1017 
   1018 	/* Detach the disk. */
   1019 	dkwedge_delall(&dksc->sc_dkdev);
   1020 	disk_detach(&dksc->sc_dkdev);
   1021 	disk_destroy(&dksc->sc_dkdev);
   1022 	dk_detach(dksc);
   1023 
   1024 	return 0;
   1025 }
   1026 
   1027 static int
   1028 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1029 {
   1030 	int     unit = raidunit(dev);
   1031 	int     error = 0;
   1032 	int     part, pmask;
   1033 	struct raid_softc *rs;
   1034 	struct dk_softc *dksc;
   1035 	RF_Config_t *k_cfg, *u_cfg;
   1036 	RF_Raid_t *raidPtr;
   1037 	RF_RaidDisk_t *diskPtr;
   1038 	RF_AccTotals_t *totals;
   1039 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1040 	u_char *specific_buf;
   1041 	int retcode = 0;
   1042 	int column;
   1043 /*	int raidid; */
   1044 	struct rf_recon_req *rrcopy, *rr;
   1045 	RF_ComponentLabel_t *clabel;
   1046 	RF_ComponentLabel_t *ci_label;
   1047 	RF_ComponentLabel_t **clabel_ptr;
   1048 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1049 	RF_SingleComponent_t component;
   1050 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1051 	int i, j, d;
   1052 
   1053 	if ((rs = raidget(unit, false)) == NULL)
   1054 		return ENXIO;
   1055 	dksc = &rs->sc_dksc;
   1056 	raidPtr = &rs->sc_r;
   1057 
   1058 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1059 		(int) DISKPART(dev), (int) unit, cmd));
   1060 
   1061 	/* Must be initialized for these... */
   1062 	switch (cmd) {
   1063 	case RAIDFRAME_REWRITEPARITY:
   1064 	case RAIDFRAME_GET_INFO:
   1065 	case RAIDFRAME_RESET_ACCTOTALS:
   1066 	case RAIDFRAME_GET_ACCTOTALS:
   1067 	case RAIDFRAME_KEEP_ACCTOTALS:
   1068 	case RAIDFRAME_GET_SIZE:
   1069 	case RAIDFRAME_FAIL_DISK:
   1070 	case RAIDFRAME_COPYBACK:
   1071 	case RAIDFRAME_CHECK_RECON_STATUS:
   1072 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1073 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1074 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1075 	case RAIDFRAME_ADD_HOT_SPARE:
   1076 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1077 	case RAIDFRAME_INIT_LABELS:
   1078 	case RAIDFRAME_REBUILD_IN_PLACE:
   1079 	case RAIDFRAME_CHECK_PARITY:
   1080 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1081 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1082 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1083 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1084 	case RAIDFRAME_SET_AUTOCONFIG:
   1085 	case RAIDFRAME_SET_ROOT:
   1086 	case RAIDFRAME_DELETE_COMPONENT:
   1087 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1088 	case RAIDFRAME_PARITYMAP_STATUS:
   1089 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1090 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1091 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1092 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1093 			return (ENXIO);
   1094 	}
   1095 
   1096 	switch (cmd) {
   1097 #ifdef COMPAT_50
   1098 	case RAIDFRAME_GET_INFO50:
   1099 		return rf_get_info50(raidPtr, data);
   1100 
   1101 	case RAIDFRAME_CONFIGURE50:
   1102 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1103 			return retcode;
   1104 		goto config;
   1105 #endif
   1106 		/* configure the system */
   1107 	case RAIDFRAME_CONFIGURE:
   1108 
   1109 		if (raidPtr->valid) {
   1110 			/* There is a valid RAID set running on this unit! */
   1111 			printf("raid%d: Device already configured!\n",unit);
   1112 			return(EINVAL);
   1113 		}
   1114 
   1115 		/* copy-in the configuration information */
   1116 		/* data points to a pointer to the configuration structure */
   1117 
   1118 		u_cfg = *((RF_Config_t **) data);
   1119 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1120 		if (k_cfg == NULL) {
   1121 			return (ENOMEM);
   1122 		}
   1123 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1124 		if (retcode) {
   1125 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1126 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1127 				retcode));
   1128 			goto no_config;
   1129 		}
   1130 		goto config;
   1131 	config:
   1132 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1133 
   1134 		/* allocate a buffer for the layout-specific data, and copy it
   1135 		 * in */
   1136 		if (k_cfg->layoutSpecificSize) {
   1137 			if (k_cfg->layoutSpecificSize > 10000) {
   1138 				/* sanity check */
   1139 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1140 				retcode = EINVAL;
   1141 				goto no_config;
   1142 			}
   1143 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1144 			    (u_char *));
   1145 			if (specific_buf == NULL) {
   1146 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1147 				retcode = ENOMEM;
   1148 				goto no_config;
   1149 			}
   1150 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1151 			    k_cfg->layoutSpecificSize);
   1152 			if (retcode) {
   1153 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1154 				RF_Free(specific_buf,
   1155 					k_cfg->layoutSpecificSize);
   1156 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1157 					retcode));
   1158 				goto no_config;
   1159 			}
   1160 		} else
   1161 			specific_buf = NULL;
   1162 		k_cfg->layoutSpecific = specific_buf;
   1163 
   1164 		/* should do some kind of sanity check on the configuration.
   1165 		 * Store the sum of all the bytes in the last byte? */
   1166 
   1167 		/* configure the system */
   1168 
   1169 		/*
   1170 		 * Clear the entire RAID descriptor, just to make sure
   1171 		 *  there is no stale data left in the case of a
   1172 		 *  reconfiguration
   1173 		 */
   1174 		memset(raidPtr, 0, sizeof(*raidPtr));
   1175 		raidPtr->softc = rs;
   1176 		raidPtr->raidid = unit;
   1177 
   1178 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1179 
   1180 		if (retcode == 0) {
   1181 
   1182 			/* allow this many simultaneous IO's to
   1183 			   this RAID device */
   1184 			raidPtr->openings = RAIDOUTSTANDING;
   1185 
   1186 			raidinit(rs);
   1187 			raid_wakeup(raidPtr);
   1188 			rf_markalldirty(raidPtr);
   1189 		}
   1190 		/* free the buffers.  No return code here. */
   1191 		if (k_cfg->layoutSpecificSize) {
   1192 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1193 		}
   1194 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1195 
   1196 	no_config:
   1197 		/*
   1198 		 * If configuration failed, set sc_flags so that we
   1199 		 * will detach the device when we close it.
   1200 		 */
   1201 		if (retcode != 0)
   1202 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1203 		return (retcode);
   1204 
   1205 		/* shutdown the system */
   1206 	case RAIDFRAME_SHUTDOWN:
   1207 
   1208 		part = DISKPART(dev);
   1209 		pmask = (1 << part);
   1210 
   1211 		if ((error = raidlock(rs)) != 0)
   1212 			return (error);
   1213 
   1214 		if (DK_BUSY(dksc, pmask) ||
   1215 		    raidPtr->recon_in_progress != 0 ||
   1216 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1217 		    raidPtr->copyback_in_progress != 0)
   1218 			retcode = EBUSY;
   1219 		else {
   1220 			/* detach and free on close */
   1221 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1222 			retcode = 0;
   1223 		}
   1224 
   1225 		raidunlock(rs);
   1226 
   1227 		return (retcode);
   1228 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1229 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1230 		/* need to read the component label for the disk indicated
   1231 		   by row,column in clabel */
   1232 
   1233 		/*
   1234 		 * Perhaps there should be an option to skip the in-core
   1235 		 * copy and hit the disk, as with disklabel(8).
   1236 		 */
   1237 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1238 
   1239 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1240 
   1241 		if (retcode) {
   1242 			RF_Free(clabel, sizeof(*clabel));
   1243 			return retcode;
   1244 		}
   1245 
   1246 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1247 
   1248 		column = clabel->column;
   1249 
   1250 		if ((column < 0) || (column >= raidPtr->numCol +
   1251 		    raidPtr->numSpare)) {
   1252 			RF_Free(clabel, sizeof(*clabel));
   1253 			return EINVAL;
   1254 		}
   1255 
   1256 		RF_Free(clabel, sizeof(*clabel));
   1257 
   1258 		clabel = raidget_component_label(raidPtr, column);
   1259 
   1260 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1261 
   1262 #if 0
   1263 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1264 		clabel = (RF_ComponentLabel_t *) data;
   1265 
   1266 		/* XXX check the label for valid stuff... */
   1267 		/* Note that some things *should not* get modified --
   1268 		   the user should be re-initing the labels instead of
   1269 		   trying to patch things.
   1270 		   */
   1271 
   1272 		raidid = raidPtr->raidid;
   1273 #ifdef DEBUG
   1274 		printf("raid%d: Got component label:\n", raidid);
   1275 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1276 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1277 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1278 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1279 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1280 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1281 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1282 #endif
   1283 		clabel->row = 0;
   1284 		column = clabel->column;
   1285 
   1286 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1287 			return(EINVAL);
   1288 		}
   1289 
   1290 		/* XXX this isn't allowed to do anything for now :-) */
   1291 
   1292 		/* XXX and before it is, we need to fill in the rest
   1293 		   of the fields!?!?!?! */
   1294 		memcpy(raidget_component_label(raidPtr, column),
   1295 		    clabel, sizeof(*clabel));
   1296 		raidflush_component_label(raidPtr, column);
   1297 		return (0);
   1298 #endif
   1299 
   1300 	case RAIDFRAME_INIT_LABELS:
   1301 		clabel = (RF_ComponentLabel_t *) data;
   1302 		/*
   1303 		   we only want the serial number from
   1304 		   the above.  We get all the rest of the information
   1305 		   from the config that was used to create this RAID
   1306 		   set.
   1307 		   */
   1308 
   1309 		raidPtr->serial_number = clabel->serial_number;
   1310 
   1311 		for(column=0;column<raidPtr->numCol;column++) {
   1312 			diskPtr = &raidPtr->Disks[column];
   1313 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1314 				ci_label = raidget_component_label(raidPtr,
   1315 				    column);
   1316 				/* Zeroing this is important. */
   1317 				memset(ci_label, 0, sizeof(*ci_label));
   1318 				raid_init_component_label(raidPtr, ci_label);
   1319 				ci_label->serial_number =
   1320 				    raidPtr->serial_number;
   1321 				ci_label->row = 0; /* we dont' pretend to support more */
   1322 				rf_component_label_set_partitionsize(ci_label,
   1323 				    diskPtr->partitionSize);
   1324 				ci_label->column = column;
   1325 				raidflush_component_label(raidPtr, column);
   1326 			}
   1327 			/* XXXjld what about the spares? */
   1328 		}
   1329 
   1330 		return (retcode);
   1331 	case RAIDFRAME_SET_AUTOCONFIG:
   1332 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1333 		printf("raid%d: New autoconfig value is: %d\n",
   1334 		       raidPtr->raidid, d);
   1335 		*(int *) data = d;
   1336 		return (retcode);
   1337 
   1338 	case RAIDFRAME_SET_ROOT:
   1339 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1340 		printf("raid%d: New rootpartition value is: %d\n",
   1341 		       raidPtr->raidid, d);
   1342 		*(int *) data = d;
   1343 		return (retcode);
   1344 
   1345 		/* initialize all parity */
   1346 	case RAIDFRAME_REWRITEPARITY:
   1347 
   1348 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1349 			/* Parity for RAID 0 is trivially correct */
   1350 			raidPtr->parity_good = RF_RAID_CLEAN;
   1351 			return(0);
   1352 		}
   1353 
   1354 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1355 			/* Re-write is already in progress! */
   1356 			return(EINVAL);
   1357 		}
   1358 
   1359 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1360 					   rf_RewriteParityThread,
   1361 					   raidPtr,"raid_parity");
   1362 		return (retcode);
   1363 
   1364 
   1365 	case RAIDFRAME_ADD_HOT_SPARE:
   1366 		sparePtr = (RF_SingleComponent_t *) data;
   1367 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1368 		retcode = rf_add_hot_spare(raidPtr, &component);
   1369 		return(retcode);
   1370 
   1371 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1372 		return(retcode);
   1373 
   1374 	case RAIDFRAME_DELETE_COMPONENT:
   1375 		componentPtr = (RF_SingleComponent_t *)data;
   1376 		memcpy( &component, componentPtr,
   1377 			sizeof(RF_SingleComponent_t));
   1378 		retcode = rf_delete_component(raidPtr, &component);
   1379 		return(retcode);
   1380 
   1381 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1382 		componentPtr = (RF_SingleComponent_t *)data;
   1383 		memcpy( &component, componentPtr,
   1384 			sizeof(RF_SingleComponent_t));
   1385 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1386 		return(retcode);
   1387 
   1388 	case RAIDFRAME_REBUILD_IN_PLACE:
   1389 
   1390 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1391 			/* Can't do this on a RAID 0!! */
   1392 			return(EINVAL);
   1393 		}
   1394 
   1395 		if (raidPtr->recon_in_progress == 1) {
   1396 			/* a reconstruct is already in progress! */
   1397 			return(EINVAL);
   1398 		}
   1399 
   1400 		componentPtr = (RF_SingleComponent_t *) data;
   1401 		memcpy( &component, componentPtr,
   1402 			sizeof(RF_SingleComponent_t));
   1403 		component.row = 0; /* we don't support any more */
   1404 		column = component.column;
   1405 
   1406 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1407 			return(EINVAL);
   1408 		}
   1409 
   1410 		rf_lock_mutex2(raidPtr->mutex);
   1411 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1412 		    (raidPtr->numFailures > 0)) {
   1413 			/* XXX 0 above shouldn't be constant!!! */
   1414 			/* some component other than this has failed.
   1415 			   Let's not make things worse than they already
   1416 			   are... */
   1417 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1418 			       raidPtr->raidid);
   1419 			printf("raid%d:     Col: %d   Too many failures.\n",
   1420 			       raidPtr->raidid, column);
   1421 			rf_unlock_mutex2(raidPtr->mutex);
   1422 			return (EINVAL);
   1423 		}
   1424 		if (raidPtr->Disks[column].status ==
   1425 		    rf_ds_reconstructing) {
   1426 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1427 			       raidPtr->raidid);
   1428 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1429 
   1430 			rf_unlock_mutex2(raidPtr->mutex);
   1431 			return (EINVAL);
   1432 		}
   1433 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1434 			rf_unlock_mutex2(raidPtr->mutex);
   1435 			return (EINVAL);
   1436 		}
   1437 		rf_unlock_mutex2(raidPtr->mutex);
   1438 
   1439 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1440 		if (rrcopy == NULL)
   1441 			return(ENOMEM);
   1442 
   1443 		rrcopy->raidPtr = (void *) raidPtr;
   1444 		rrcopy->col = column;
   1445 
   1446 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1447 					   rf_ReconstructInPlaceThread,
   1448 					   rrcopy,"raid_reconip");
   1449 		return(retcode);
   1450 
   1451 	case RAIDFRAME_GET_INFO:
   1452 		if (!raidPtr->valid)
   1453 			return (ENODEV);
   1454 		ucfgp = (RF_DeviceConfig_t **) data;
   1455 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1456 			  (RF_DeviceConfig_t *));
   1457 		if (d_cfg == NULL)
   1458 			return (ENOMEM);
   1459 		d_cfg->rows = 1; /* there is only 1 row now */
   1460 		d_cfg->cols = raidPtr->numCol;
   1461 		d_cfg->ndevs = raidPtr->numCol;
   1462 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1463 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1464 			return (ENOMEM);
   1465 		}
   1466 		d_cfg->nspares = raidPtr->numSpare;
   1467 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1468 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1469 			return (ENOMEM);
   1470 		}
   1471 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1472 		d = 0;
   1473 		for (j = 0; j < d_cfg->cols; j++) {
   1474 			d_cfg->devs[d] = raidPtr->Disks[j];
   1475 			d++;
   1476 		}
   1477 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1478 			d_cfg->spares[i] = raidPtr->Disks[j];
   1479 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1480 				/* XXX: raidctl(8) expects to see this as a used spare */
   1481 				d_cfg->spares[i].status = rf_ds_used_spare;
   1482 			}
   1483 		}
   1484 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1485 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1486 
   1487 		return (retcode);
   1488 
   1489 	case RAIDFRAME_CHECK_PARITY:
   1490 		*(int *) data = raidPtr->parity_good;
   1491 		return (0);
   1492 
   1493 	case RAIDFRAME_PARITYMAP_STATUS:
   1494 		if (rf_paritymap_ineligible(raidPtr))
   1495 			return EINVAL;
   1496 		rf_paritymap_status(raidPtr->parity_map,
   1497 		    (struct rf_pmstat *)data);
   1498 		return 0;
   1499 
   1500 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1501 		if (rf_paritymap_ineligible(raidPtr))
   1502 			return EINVAL;
   1503 		if (raidPtr->parity_map == NULL)
   1504 			return ENOENT; /* ??? */
   1505 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1506 			(struct rf_pmparams *)data, 1))
   1507 			return EINVAL;
   1508 		return 0;
   1509 
   1510 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1511 		if (rf_paritymap_ineligible(raidPtr))
   1512 			return EINVAL;
   1513 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1514 		return 0;
   1515 
   1516 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1517 		if (rf_paritymap_ineligible(raidPtr))
   1518 			return EINVAL;
   1519 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1520 		/* XXX should errors be passed up? */
   1521 		return 0;
   1522 
   1523 	case RAIDFRAME_RESET_ACCTOTALS:
   1524 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1525 		return (0);
   1526 
   1527 	case RAIDFRAME_GET_ACCTOTALS:
   1528 		totals = (RF_AccTotals_t *) data;
   1529 		*totals = raidPtr->acc_totals;
   1530 		return (0);
   1531 
   1532 	case RAIDFRAME_KEEP_ACCTOTALS:
   1533 		raidPtr->keep_acc_totals = *(int *)data;
   1534 		return (0);
   1535 
   1536 	case RAIDFRAME_GET_SIZE:
   1537 		*(int *) data = raidPtr->totalSectors;
   1538 		return (0);
   1539 
   1540 		/* fail a disk & optionally start reconstruction */
   1541 	case RAIDFRAME_FAIL_DISK:
   1542 
   1543 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1544 			/* Can't do this on a RAID 0!! */
   1545 			return(EINVAL);
   1546 		}
   1547 
   1548 		rr = (struct rf_recon_req *) data;
   1549 		rr->row = 0;
   1550 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1551 			return (EINVAL);
   1552 
   1553 
   1554 		rf_lock_mutex2(raidPtr->mutex);
   1555 		if (raidPtr->status == rf_rs_reconstructing) {
   1556 			/* you can't fail a disk while we're reconstructing! */
   1557 			/* XXX wrong for RAID6 */
   1558 			rf_unlock_mutex2(raidPtr->mutex);
   1559 			return (EINVAL);
   1560 		}
   1561 		if ((raidPtr->Disks[rr->col].status ==
   1562 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1563 			/* some other component has failed.  Let's not make
   1564 			   things worse. XXX wrong for RAID6 */
   1565 			rf_unlock_mutex2(raidPtr->mutex);
   1566 			return (EINVAL);
   1567 		}
   1568 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1569 			/* Can't fail a spared disk! */
   1570 			rf_unlock_mutex2(raidPtr->mutex);
   1571 			return (EINVAL);
   1572 		}
   1573 		rf_unlock_mutex2(raidPtr->mutex);
   1574 
   1575 		/* make a copy of the recon request so that we don't rely on
   1576 		 * the user's buffer */
   1577 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1578 		if (rrcopy == NULL)
   1579 			return(ENOMEM);
   1580 		memcpy(rrcopy, rr, sizeof(*rr));
   1581 		rrcopy->raidPtr = (void *) raidPtr;
   1582 
   1583 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1584 					   rf_ReconThread,
   1585 					   rrcopy,"raid_recon");
   1586 		return (0);
   1587 
   1588 		/* invoke a copyback operation after recon on whatever disk
   1589 		 * needs it, if any */
   1590 	case RAIDFRAME_COPYBACK:
   1591 
   1592 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1593 			/* This makes no sense on a RAID 0!! */
   1594 			return(EINVAL);
   1595 		}
   1596 
   1597 		if (raidPtr->copyback_in_progress == 1) {
   1598 			/* Copyback is already in progress! */
   1599 			return(EINVAL);
   1600 		}
   1601 
   1602 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1603 					   rf_CopybackThread,
   1604 					   raidPtr,"raid_copyback");
   1605 		return (retcode);
   1606 
   1607 		/* return the percentage completion of reconstruction */
   1608 	case RAIDFRAME_CHECK_RECON_STATUS:
   1609 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1610 			/* This makes no sense on a RAID 0, so tell the
   1611 			   user it's done. */
   1612 			*(int *) data = 100;
   1613 			return(0);
   1614 		}
   1615 		if (raidPtr->status != rf_rs_reconstructing)
   1616 			*(int *) data = 100;
   1617 		else {
   1618 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1619 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1620 			} else {
   1621 				*(int *) data = 0;
   1622 			}
   1623 		}
   1624 		return (0);
   1625 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1626 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1627 		if (raidPtr->status != rf_rs_reconstructing) {
   1628 			progressInfo.remaining = 0;
   1629 			progressInfo.completed = 100;
   1630 			progressInfo.total = 100;
   1631 		} else {
   1632 			progressInfo.total =
   1633 				raidPtr->reconControl->numRUsTotal;
   1634 			progressInfo.completed =
   1635 				raidPtr->reconControl->numRUsComplete;
   1636 			progressInfo.remaining = progressInfo.total -
   1637 				progressInfo.completed;
   1638 		}
   1639 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1640 				  sizeof(RF_ProgressInfo_t));
   1641 		return (retcode);
   1642 
   1643 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1644 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1645 			/* This makes no sense on a RAID 0, so tell the
   1646 			   user it's done. */
   1647 			*(int *) data = 100;
   1648 			return(0);
   1649 		}
   1650 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1651 			*(int *) data = 100 *
   1652 				raidPtr->parity_rewrite_stripes_done /
   1653 				raidPtr->Layout.numStripe;
   1654 		} else {
   1655 			*(int *) data = 100;
   1656 		}
   1657 		return (0);
   1658 
   1659 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1660 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1661 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1662 			progressInfo.total = raidPtr->Layout.numStripe;
   1663 			progressInfo.completed =
   1664 				raidPtr->parity_rewrite_stripes_done;
   1665 			progressInfo.remaining = progressInfo.total -
   1666 				progressInfo.completed;
   1667 		} else {
   1668 			progressInfo.remaining = 0;
   1669 			progressInfo.completed = 100;
   1670 			progressInfo.total = 100;
   1671 		}
   1672 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1673 				  sizeof(RF_ProgressInfo_t));
   1674 		return (retcode);
   1675 
   1676 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1677 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1678 			/* This makes no sense on a RAID 0 */
   1679 			*(int *) data = 100;
   1680 			return(0);
   1681 		}
   1682 		if (raidPtr->copyback_in_progress == 1) {
   1683 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1684 				raidPtr->Layout.numStripe;
   1685 		} else {
   1686 			*(int *) data = 100;
   1687 		}
   1688 		return (0);
   1689 
   1690 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1691 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1692 		if (raidPtr->copyback_in_progress == 1) {
   1693 			progressInfo.total = raidPtr->Layout.numStripe;
   1694 			progressInfo.completed =
   1695 				raidPtr->copyback_stripes_done;
   1696 			progressInfo.remaining = progressInfo.total -
   1697 				progressInfo.completed;
   1698 		} else {
   1699 			progressInfo.remaining = 0;
   1700 			progressInfo.completed = 100;
   1701 			progressInfo.total = 100;
   1702 		}
   1703 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1704 				  sizeof(RF_ProgressInfo_t));
   1705 		return (retcode);
   1706 
   1707 	case RAIDFRAME_SET_LAST_UNIT:
   1708 		for (column = 0; column < raidPtr->numCol; column++)
   1709 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1710 				return EBUSY;
   1711 
   1712 		for (column = 0; column < raidPtr->numCol; column++) {
   1713 			clabel = raidget_component_label(raidPtr, column);
   1714 			clabel->last_unit = *(int *)data;
   1715 			raidflush_component_label(raidPtr, column);
   1716 		}
   1717 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1718 		return 0;
   1719 
   1720 		/* the sparetable daemon calls this to wait for the kernel to
   1721 		 * need a spare table. this ioctl does not return until a
   1722 		 * spare table is needed. XXX -- calling mpsleep here in the
   1723 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1724 		 * -- I should either compute the spare table in the kernel,
   1725 		 * or have a different -- XXX XXX -- interface (a different
   1726 		 * character device) for delivering the table     -- XXX */
   1727 #if 0
   1728 	case RAIDFRAME_SPARET_WAIT:
   1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1730 		while (!rf_sparet_wait_queue)
   1731 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1732 		waitreq = rf_sparet_wait_queue;
   1733 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1734 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1735 
   1736 		/* structure assignment */
   1737 		*((RF_SparetWait_t *) data) = *waitreq;
   1738 
   1739 		RF_Free(waitreq, sizeof(*waitreq));
   1740 		return (0);
   1741 
   1742 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1743 		 * code in it that will cause the dameon to exit */
   1744 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1745 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1746 		waitreq->fcol = -1;
   1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1748 		waitreq->next = rf_sparet_wait_queue;
   1749 		rf_sparet_wait_queue = waitreq;
   1750 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1752 		return (0);
   1753 
   1754 		/* used by the spare table daemon to deliver a spare table
   1755 		 * into the kernel */
   1756 	case RAIDFRAME_SEND_SPARET:
   1757 
   1758 		/* install the spare table */
   1759 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1760 
   1761 		/* respond to the requestor.  the return status of the spare
   1762 		 * table installation is passed in the "fcol" field */
   1763 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1764 		waitreq->fcol = retcode;
   1765 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1766 		waitreq->next = rf_sparet_resp_queue;
   1767 		rf_sparet_resp_queue = waitreq;
   1768 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1769 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1770 
   1771 		return (retcode);
   1772 #endif
   1773 
   1774 	default:
   1775 		break; /* fall through to the os-specific code below */
   1776 
   1777 	}
   1778 
   1779 	if (!raidPtr->valid)
   1780 		return (EINVAL);
   1781 
   1782 	/*
   1783 	 * Add support for "regular" device ioctls here.
   1784 	 */
   1785 
   1786 	switch (cmd) {
   1787 	case DIOCGCACHE:
   1788 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1789 		break;
   1790 
   1791 	case DIOCCACHESYNC:
   1792 		retcode = rf_sync_component_caches(raidPtr);
   1793 		break;
   1794 
   1795 	default:
   1796 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1797 		break;
   1798 	}
   1799 
   1800 	return (retcode);
   1801 
   1802 }
   1803 
   1804 
   1805 /* raidinit -- complete the rest of the initialization for the
   1806    RAIDframe device.  */
   1807 
   1808 
   1809 static void
   1810 raidinit(struct raid_softc *rs)
   1811 {
   1812 	cfdata_t cf;
   1813 	unsigned int unit;
   1814 	struct dk_softc *dksc = &rs->sc_dksc;
   1815 	RF_Raid_t *raidPtr = &rs->sc_r;
   1816 	device_t dev;
   1817 
   1818 	unit = raidPtr->raidid;
   1819 
   1820 	/* XXX doesn't check bounds. */
   1821 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1822 
   1823 	/* attach the pseudo device */
   1824 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1825 	cf->cf_name = raid_cd.cd_name;
   1826 	cf->cf_atname = raid_cd.cd_name;
   1827 	cf->cf_unit = unit;
   1828 	cf->cf_fstate = FSTATE_STAR;
   1829 
   1830 	dev = config_attach_pseudo(cf);
   1831 	if (dev == NULL) {
   1832 		printf("raid%d: config_attach_pseudo failed\n",
   1833 		    raidPtr->raidid);
   1834 		free(cf, M_RAIDFRAME);
   1835 		return;
   1836 	}
   1837 
   1838 	/* provide a backpointer to the real softc */
   1839 	raidsoftc(dev) = rs;
   1840 
   1841 	/* disk_attach actually creates space for the CPU disklabel, among
   1842 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1843 	 * with disklabels. */
   1844 	dk_init(dksc, dev, DKTYPE_RAID);
   1845 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1846 
   1847 	/* XXX There may be a weird interaction here between this, and
   1848 	 * protectedSectors, as used in RAIDframe.  */
   1849 
   1850 	rs->sc_size = raidPtr->totalSectors;
   1851 
   1852 	/* Attach dk and disk subsystems */
   1853 	dk_attach(dksc);
   1854 	disk_attach(&dksc->sc_dkdev);
   1855 	rf_set_geometry(rs, raidPtr);
   1856 
   1857 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1858 
   1859 	/* mark unit as usuable */
   1860 	rs->sc_flags |= RAIDF_INITED;
   1861 
   1862 	dkwedge_discover(&dksc->sc_dkdev);
   1863 }
   1864 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * Ask the spare table daemon for a spare table.
 *
 * The request is pushed onto the wait queue and the daemon is woken up;
 * the caller then sleeps until something shows up on the response queue.
 * The "fcol" field of the response carries the status, which is returned.
 * The response entry (not the request that was passed in) is freed here.
 *
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* queue the request and poke the daemon */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* the wait releases the mutex while we sleep */
	while (rf_sparet_resp_queue == NULL)
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);

	/* pop the first response */
	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = resp->fcol;
	RF_Free(resp, sizeof(*resp));	/* this is not the same entry as
					 * the caller alloc'd */
	return (status);
}
#endif
   1899 
   1900 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1901  * bp & passes it down.
   1902  * any calls originating in the kernel must use non-blocking I/O
   1903  * do some extra sanity checking to return "appropriate" error values for
   1904  * certain conditions (to make some standard utilities work)
   1905  *
   1906  * Formerly known as: rf_DoAccessKernel
   1907  */
   1908 void
   1909 raidstart(RF_Raid_t *raidPtr)
   1910 {
   1911 	struct raid_softc *rs;
   1912 	struct dk_softc *dksc;
   1913 
   1914 	rs = raidPtr->softc;
   1915 	dksc = &rs->sc_dksc;
   1916 	/* quick check to see if anything has died recently */
   1917 	rf_lock_mutex2(raidPtr->mutex);
   1918 	if (raidPtr->numNewFailures > 0) {
   1919 		rf_unlock_mutex2(raidPtr->mutex);
   1920 		rf_update_component_labels(raidPtr,
   1921 					   RF_NORMAL_COMPONENT_UPDATE);
   1922 		rf_lock_mutex2(raidPtr->mutex);
   1923 		raidPtr->numNewFailures--;
   1924 	}
   1925 	rf_unlock_mutex2(raidPtr->mutex);
   1926 
   1927 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1928 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1929 		return;
   1930 	}
   1931 
   1932 	dk_start(dksc, NULL);
   1933 }
   1934 
   1935 static int
   1936 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1937 {
   1938 	RF_SectorCount_t num_blocks, pb, sum;
   1939 	RF_RaidAddr_t raid_addr;
   1940 	daddr_t blocknum;
   1941 	int     do_async;
   1942 	int rc;
   1943 
   1944 	rf_lock_mutex2(raidPtr->mutex);
   1945 	if (raidPtr->openings == 0) {
   1946 		rf_unlock_mutex2(raidPtr->mutex);
   1947 		return EAGAIN;
   1948 	}
   1949 	rf_unlock_mutex2(raidPtr->mutex);
   1950 
   1951 	blocknum = bp->b_rawblkno;
   1952 
   1953 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1954 		    (int) blocknum));
   1955 
   1956 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1957 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1958 
   1959 	/* *THIS* is where we adjust what block we're going to...
   1960 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1961 	raid_addr = blocknum;
   1962 
   1963 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1964 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1965 	sum = raid_addr + num_blocks + pb;
   1966 	if (1 || rf_debugKernelAccess) {
   1967 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1968 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1969 			    (int) pb, (int) bp->b_resid));
   1970 	}
   1971 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1972 	    || (sum < num_blocks) || (sum < pb)) {
   1973 		rc = ENOSPC;
   1974 		goto done;
   1975 	}
   1976 	/*
   1977 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1978 	 */
   1979 
   1980 	if (bp->b_bcount & raidPtr->sectorMask) {
   1981 		rc = ENOSPC;
   1982 		goto done;
   1983 	}
   1984 	db1_printf(("Calling DoAccess..\n"));
   1985 
   1986 
   1987 	rf_lock_mutex2(raidPtr->mutex);
   1988 	raidPtr->openings--;
   1989 	rf_unlock_mutex2(raidPtr->mutex);
   1990 
   1991 	/*
   1992 	 * Everything is async.
   1993 	 */
   1994 	do_async = 1;
   1995 
   1996 	/* don't ever condition on bp->b_flags & B_WRITE.
   1997 	 * always condition on B_READ instead */
   1998 
   1999 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2000 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2001 			 do_async, raid_addr, num_blocks,
   2002 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2003 
   2004 done:
   2005 	return rc;
   2006 }
   2007 
   2008 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2009 
   2010 int
   2011 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2012 {
   2013 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2014 	struct buf *bp;
   2015 
   2016 	req->queue = queue;
   2017 	bp = req->bp;
   2018 
   2019 	switch (req->type) {
   2020 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2021 		/* XXX need to do something extra here.. */
   2022 		/* I'm leaving this in, as I've never actually seen it used,
   2023 		 * and I'd like folks to report it... GO */
   2024 		printf(("WAKEUP CALLED\n"));
   2025 		queue->numOutstanding++;
   2026 
   2027 		bp->b_flags = 0;
   2028 		bp->b_private = req;
   2029 
   2030 		KernelWakeupFunc(bp);
   2031 		break;
   2032 
   2033 	case RF_IO_TYPE_READ:
   2034 	case RF_IO_TYPE_WRITE:
   2035 #if RF_ACC_TRACE > 0
   2036 		if (req->tracerec) {
   2037 			RF_ETIMER_START(req->tracerec->timer);
   2038 		}
   2039 #endif
   2040 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2041 		    op, queue->rf_cinfo->ci_dev,
   2042 		    req->sectorOffset, req->numSector,
   2043 		    req->buf, KernelWakeupFunc, (void *) req,
   2044 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2045 
   2046 		if (rf_debugKernelAccess) {
   2047 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2048 				(long) bp->b_blkno));
   2049 		}
   2050 		queue->numOutstanding++;
   2051 		queue->last_deq_sector = req->sectorOffset;
   2052 		/* acc wouldn't have been let in if there were any pending
   2053 		 * reqs at any other priority */
   2054 		queue->curPriority = req->priority;
   2055 
   2056 		db1_printf(("Going for %c to unit %d col %d\n",
   2057 			    req->type, queue->raidPtr->raidid,
   2058 			    queue->col));
   2059 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2060 			(int) req->sectorOffset, (int) req->numSector,
   2061 			(int) (req->numSector <<
   2062 			    queue->raidPtr->logBytesPerSector),
   2063 			(int) queue->raidPtr->logBytesPerSector));
   2064 
   2065 		/*
   2066 		 * XXX: drop lock here since this can block at
   2067 		 * least with backing SCSI devices.  Retake it
   2068 		 * to minimize fuss with calling interfaces.
   2069 		 */
   2070 
   2071 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2072 		bdev_strategy(bp);
   2073 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2074 		break;
   2075 
   2076 	default:
   2077 		panic("bad req->type in rf_DispatchKernelIO");
   2078 	}
   2079 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2080 
   2081 	return (0);
   2082 }
   2083 /* this is the callback function associated with a I/O invoked from
   2084    kernel code.
   2085  */
   2086 static void
   2087 KernelWakeupFunc(struct buf *bp)
   2088 {
   2089 	RF_DiskQueueData_t *req = NULL;
   2090 	RF_DiskQueue_t *queue;
   2091 
   2092 	db1_printf(("recovering the request queue:\n"));
   2093 
   2094 	req = bp->b_private;
   2095 
   2096 	queue = (RF_DiskQueue_t *) req->queue;
   2097 
   2098 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2099 
   2100 #if RF_ACC_TRACE > 0
   2101 	if (req->tracerec) {
   2102 		RF_ETIMER_STOP(req->tracerec->timer);
   2103 		RF_ETIMER_EVAL(req->tracerec->timer);
   2104 		rf_lock_mutex2(rf_tracing_mutex);
   2105 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2106 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2107 		req->tracerec->num_phys_ios++;
   2108 		rf_unlock_mutex2(rf_tracing_mutex);
   2109 	}
   2110 #endif
   2111 
   2112 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2113 	 * ballistic, and mark the component as hosed... */
   2114 
   2115 	if (bp->b_error != 0) {
   2116 		/* Mark the disk as dead */
   2117 		/* but only mark it once... */
   2118 		/* and only if it wouldn't leave this RAID set
   2119 		   completely broken */
   2120 		if (((queue->raidPtr->Disks[queue->col].status ==
   2121 		      rf_ds_optimal) ||
   2122 		     (queue->raidPtr->Disks[queue->col].status ==
   2123 		      rf_ds_used_spare)) &&
   2124 		     (queue->raidPtr->numFailures <
   2125 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2126 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2127 			       queue->raidPtr->raidid,
   2128 			       bp->b_error,
   2129 			       queue->raidPtr->Disks[queue->col].devname);
   2130 			queue->raidPtr->Disks[queue->col].status =
   2131 			    rf_ds_failed;
   2132 			queue->raidPtr->status = rf_rs_degraded;
   2133 			queue->raidPtr->numFailures++;
   2134 			queue->raidPtr->numNewFailures++;
   2135 		} else {	/* Disk is already dead... */
   2136 			/* printf("Disk already marked as dead!\n"); */
   2137 		}
   2138 
   2139 	}
   2140 
   2141 	/* Fill in the error value */
   2142 	req->error = bp->b_error;
   2143 
   2144 	/* Drop this one on the "finished" queue... */
   2145 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2146 
   2147 	/* Let the raidio thread know there is work to be done. */
   2148 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2149 
   2150 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2151 }
   2152 
   2153 
   2154 /*
   2155  * initialize a buf structure for doing an I/O in the kernel.
   2156  */
   2157 static void
   2158 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2159        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2160        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2161        struct proc *b_proc)
   2162 {
   2163 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2164 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2165 	bp->b_oflags = 0;
   2166 	bp->b_cflags = 0;
   2167 	bp->b_bcount = numSect << logBytesPerSector;
   2168 	bp->b_bufsize = bp->b_bcount;
   2169 	bp->b_error = 0;
   2170 	bp->b_dev = dev;
   2171 	bp->b_data = bf;
   2172 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2173 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2174 	if (bp->b_bcount == 0) {
   2175 		panic("bp->b_bcount is zero in InitBP!!");
   2176 	}
   2177 	bp->b_proc = b_proc;
   2178 	bp->b_iodone = cbFunc;
   2179 	bp->b_private = cbArg;
   2180 }
   2181 
   2182 /*
   2183  * Wait interruptibly for an exclusive lock.
   2184  *
   2185  * XXX
   2186  * Several drivers do this; it should be abstracted and made MP-safe.
   2187  * (Hmm... where have we seen this warning before :->  GO )
   2188  */
   2189 static int
   2190 raidlock(struct raid_softc *rs)
   2191 {
   2192 	int     error;
   2193 
   2194 	error = 0;
   2195 	mutex_enter(&rs->sc_mutex);
   2196 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2197 		rs->sc_flags |= RAIDF_WANTED;
   2198 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2199 		if (error != 0)
   2200 			goto done;
   2201 	}
   2202 	rs->sc_flags |= RAIDF_LOCKED;
   2203 done:
   2204 	mutex_exit(&rs->sc_mutex);
   2205 	return (error);
   2206 }
   2207 /*
   2208  * Unlock and wake up any waiters.
   2209  */
   2210 static void
   2211 raidunlock(struct raid_softc *rs)
   2212 {
   2213 
   2214 	mutex_enter(&rs->sc_mutex);
   2215 	rs->sc_flags &= ~RAIDF_LOCKED;
   2216 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2217 		rs->sc_flags &= ~RAIDF_WANTED;
   2218 		cv_broadcast(&rs->sc_cv);
   2219 	}
   2220 	mutex_exit(&rs->sc_mutex);
   2221 }
   2222 
   2223 
   2224 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2225 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2226 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2227 
   2228 static daddr_t
   2229 rf_component_info_offset(void)
   2230 {
   2231 
   2232 	return RF_COMPONENT_INFO_OFFSET;
   2233 }
   2234 
   2235 static daddr_t
   2236 rf_component_info_size(unsigned secsize)
   2237 {
   2238 	daddr_t info_size;
   2239 
   2240 	KASSERT(secsize);
   2241 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2242 		info_size = secsize;
   2243 	else
   2244 		info_size = RF_COMPONENT_INFO_SIZE;
   2245 
   2246 	return info_size;
   2247 }
   2248 
   2249 static daddr_t
   2250 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2251 {
   2252 	daddr_t map_offset;
   2253 
   2254 	KASSERT(raidPtr->bytesPerSector);
   2255 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2256 		map_offset = raidPtr->bytesPerSector;
   2257 	else
   2258 		map_offset = RF_COMPONENT_INFO_SIZE;
   2259 	map_offset += rf_component_info_offset();
   2260 
   2261 	return map_offset;
   2262 }
   2263 
   2264 static daddr_t
   2265 rf_parity_map_size(RF_Raid_t *raidPtr)
   2266 {
   2267 	daddr_t map_size;
   2268 
   2269 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2270 		map_size = raidPtr->bytesPerSector;
   2271 	else
   2272 		map_size = RF_PARITY_MAP_SIZE;
   2273 
   2274 	return map_size;
   2275 }
   2276 
   2277 int
   2278 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2279 {
   2280 	RF_ComponentLabel_t *clabel;
   2281 
   2282 	clabel = raidget_component_label(raidPtr, col);
   2283 	clabel->clean = RF_RAID_CLEAN;
   2284 	raidflush_component_label(raidPtr, col);
   2285 	return(0);
   2286 }
   2287 
   2288 
   2289 int
   2290 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2291 {
   2292 	RF_ComponentLabel_t *clabel;
   2293 
   2294 	clabel = raidget_component_label(raidPtr, col);
   2295 	clabel->clean = RF_RAID_DIRTY;
   2296 	raidflush_component_label(raidPtr, col);
   2297 	return(0);
   2298 }
   2299 
   2300 int
   2301 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2302 {
   2303 	KASSERT(raidPtr->bytesPerSector);
   2304 	return raidread_component_label(raidPtr->bytesPerSector,
   2305 	    raidPtr->Disks[col].dev,
   2306 	    raidPtr->raid_cinfo[col].ci_vp,
   2307 	    &raidPtr->raid_cinfo[col].ci_label);
   2308 }
   2309 
   2310 RF_ComponentLabel_t *
   2311 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2312 {
   2313 	return &raidPtr->raid_cinfo[col].ci_label;
   2314 }
   2315 
   2316 int
   2317 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2318 {
   2319 	RF_ComponentLabel_t *label;
   2320 
   2321 	label = &raidPtr->raid_cinfo[col].ci_label;
   2322 	label->mod_counter = raidPtr->mod_counter;
   2323 #ifndef RF_NO_PARITY_MAP
   2324 	label->parity_map_modcount = label->mod_counter;
   2325 #endif
   2326 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2327 	    raidPtr->Disks[col].dev,
   2328 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2329 }
   2330 
   2331 
   2332 static int
   2333 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2334     RF_ComponentLabel_t *clabel)
   2335 {
   2336 	return raidread_component_area(dev, b_vp, clabel,
   2337 	    sizeof(RF_ComponentLabel_t),
   2338 	    rf_component_info_offset(),
   2339 	    rf_component_info_size(secsize));
   2340 }
   2341 
   2342 /* ARGSUSED */
   2343 static int
   2344 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2345     size_t msize, daddr_t offset, daddr_t dsize)
   2346 {
   2347 	struct buf *bp;
   2348 	int error;
   2349 
   2350 	/* XXX should probably ensure that we don't try to do this if
   2351 	   someone has changed rf_protected_sectors. */
   2352 
   2353 	if (b_vp == NULL) {
   2354 		/* For whatever reason, this component is not valid.
   2355 		   Don't try to read a component label from it. */
   2356 		return(EINVAL);
   2357 	}
   2358 
   2359 	/* get a block of the appropriate size... */
   2360 	bp = geteblk((int)dsize);
   2361 	bp->b_dev = dev;
   2362 
   2363 	/* get our ducks in a row for the read */
   2364 	bp->b_blkno = offset / DEV_BSIZE;
   2365 	bp->b_bcount = dsize;
   2366 	bp->b_flags |= B_READ;
   2367  	bp->b_resid = dsize;
   2368 
   2369 	bdev_strategy(bp);
   2370 	error = biowait(bp);
   2371 
   2372 	if (!error) {
   2373 		memcpy(data, bp->b_data, msize);
   2374 	}
   2375 
   2376 	brelse(bp, 0);
   2377 	return(error);
   2378 }
   2379 
   2380 
   2381 static int
   2382 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2383     RF_ComponentLabel_t *clabel)
   2384 {
   2385 	return raidwrite_component_area(dev, b_vp, clabel,
   2386 	    sizeof(RF_ComponentLabel_t),
   2387 	    rf_component_info_offset(),
   2388 	    rf_component_info_size(secsize), 0);
   2389 }
   2390 
   2391 /* ARGSUSED */
   2392 static int
   2393 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2394     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2395 {
   2396 	struct buf *bp;
   2397 	int error;
   2398 
   2399 	/* get a block of the appropriate size... */
   2400 	bp = geteblk((int)dsize);
   2401 	bp->b_dev = dev;
   2402 
   2403 	/* get our ducks in a row for the write */
   2404 	bp->b_blkno = offset / DEV_BSIZE;
   2405 	bp->b_bcount = dsize;
   2406 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2407  	bp->b_resid = dsize;
   2408 
   2409 	memset(bp->b_data, 0, dsize);
   2410 	memcpy(bp->b_data, data, msize);
   2411 
   2412 	bdev_strategy(bp);
   2413 	if (asyncp)
   2414 		return 0;
   2415 	error = biowait(bp);
   2416 	brelse(bp, 0);
   2417 	if (error) {
   2418 #if 1
   2419 		printf("Failed to write RAID component info!\n");
   2420 #endif
   2421 	}
   2422 
   2423 	return(error);
   2424 }
   2425 
   2426 void
   2427 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2428 {
   2429 	int c;
   2430 
   2431 	for (c = 0; c < raidPtr->numCol; c++) {
   2432 		/* Skip dead disks. */
   2433 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2434 			continue;
   2435 		/* XXXjld: what if an error occurs here? */
   2436 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2437 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2438 		    RF_PARITYMAP_NBYTE,
   2439 		    rf_parity_map_offset(raidPtr),
   2440 		    rf_parity_map_size(raidPtr), 0);
   2441 	}
   2442 }
   2443 
   2444 void
   2445 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2446 {
   2447 	struct rf_paritymap_ondisk tmp;
   2448 	int c,first;
   2449 
   2450 	first=1;
   2451 	for (c = 0; c < raidPtr->numCol; c++) {
   2452 		/* Skip dead disks. */
   2453 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2454 			continue;
   2455 		raidread_component_area(raidPtr->Disks[c].dev,
   2456 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2457 		    RF_PARITYMAP_NBYTE,
   2458 		    rf_parity_map_offset(raidPtr),
   2459 		    rf_parity_map_size(raidPtr));
   2460 		if (first) {
   2461 			memcpy(map, &tmp, sizeof(*map));
   2462 			first = 0;
   2463 		} else {
   2464 			rf_paritymap_merge(map, &tmp);
   2465 		}
   2466 	}
   2467 }
   2468 
   2469 void
   2470 rf_markalldirty(RF_Raid_t *raidPtr)
   2471 {
   2472 	RF_ComponentLabel_t *clabel;
   2473 	int sparecol;
   2474 	int c;
   2475 	int j;
   2476 	int scol = -1;
   2477 
   2478 	raidPtr->mod_counter++;
   2479 	for (c = 0; c < raidPtr->numCol; c++) {
   2480 		/* we don't want to touch (at all) a disk that has
   2481 		   failed */
   2482 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2483 			clabel = raidget_component_label(raidPtr, c);
   2484 			if (clabel->status == rf_ds_spared) {
   2485 				/* XXX do something special...
   2486 				   but whatever you do, don't
   2487 				   try to access it!! */
   2488 			} else {
   2489 				raidmarkdirty(raidPtr, c);
   2490 			}
   2491 		}
   2492 	}
   2493 
   2494 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2495 		sparecol = raidPtr->numCol + c;
   2496 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2497 			/*
   2498 
   2499 			   we claim this disk is "optimal" if it's
   2500 			   rf_ds_used_spare, as that means it should be
   2501 			   directly substitutable for the disk it replaced.
   2502 			   We note that too...
   2503 
   2504 			 */
   2505 
   2506 			for(j=0;j<raidPtr->numCol;j++) {
   2507 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2508 					scol = j;
   2509 					break;
   2510 				}
   2511 			}
   2512 
   2513 			clabel = raidget_component_label(raidPtr, sparecol);
   2514 			/* make sure status is noted */
   2515 
   2516 			raid_init_component_label(raidPtr, clabel);
   2517 
   2518 			clabel->row = 0;
   2519 			clabel->column = scol;
   2520 			/* Note: we *don't* change status from rf_ds_used_spare
   2521 			   to rf_ds_optimal */
   2522 			/* clabel.status = rf_ds_optimal; */
   2523 
   2524 			raidmarkdirty(raidPtr, sparecol);
   2525 		}
   2526 	}
   2527 }
   2528 
   2529 
   2530 void
   2531 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2532 {
   2533 	RF_ComponentLabel_t *clabel;
   2534 	int sparecol;
   2535 	int c;
   2536 	int j;
   2537 	int scol;
   2538 	struct raid_softc *rs = raidPtr->softc;
   2539 
   2540 	scol = -1;
   2541 
   2542 	/* XXX should do extra checks to make sure things really are clean,
   2543 	   rather than blindly setting the clean bit... */
   2544 
   2545 	raidPtr->mod_counter++;
   2546 
   2547 	for (c = 0; c < raidPtr->numCol; c++) {
   2548 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2549 			clabel = raidget_component_label(raidPtr, c);
   2550 			/* make sure status is noted */
   2551 			clabel->status = rf_ds_optimal;
   2552 
   2553 			/* note what unit we are configured as */
   2554 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2555 				clabel->last_unit = raidPtr->raidid;
   2556 
   2557 			raidflush_component_label(raidPtr, c);
   2558 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2559 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2560 					raidmarkclean(raidPtr, c);
   2561 				}
   2562 			}
   2563 		}
   2564 		/* else we don't touch it.. */
   2565 	}
   2566 
   2567 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2568 		sparecol = raidPtr->numCol + c;
   2569 		/* Need to ensure that the reconstruct actually completed! */
   2570 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2571 			/*
   2572 
   2573 			   we claim this disk is "optimal" if it's
   2574 			   rf_ds_used_spare, as that means it should be
   2575 			   directly substitutable for the disk it replaced.
   2576 			   We note that too...
   2577 
   2578 			 */
   2579 
   2580 			for(j=0;j<raidPtr->numCol;j++) {
   2581 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2582 					scol = j;
   2583 					break;
   2584 				}
   2585 			}
   2586 
   2587 			/* XXX shouldn't *really* need this... */
   2588 			clabel = raidget_component_label(raidPtr, sparecol);
   2589 			/* make sure status is noted */
   2590 
   2591 			raid_init_component_label(raidPtr, clabel);
   2592 
   2593 			clabel->column = scol;
   2594 			clabel->status = rf_ds_optimal;
   2595 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2596 				clabel->last_unit = raidPtr->raidid;
   2597 
   2598 			raidflush_component_label(raidPtr, sparecol);
   2599 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2600 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2601 					raidmarkclean(raidPtr, sparecol);
   2602 				}
   2603 			}
   2604 		}
   2605 	}
   2606 }
   2607 
   2608 void
   2609 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2610 {
   2611 
   2612 	if (vp != NULL) {
   2613 		if (auto_configured == 1) {
   2614 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2615 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2616 			vput(vp);
   2617 
   2618 		} else {
   2619 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2620 		}
   2621 	}
   2622 }
   2623 
   2624 
   2625 void
   2626 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2627 {
   2628 	int r,c;
   2629 	struct vnode *vp;
   2630 	int acd;
   2631 
   2632 
   2633 	/* We take this opportunity to close the vnodes like we should.. */
   2634 
   2635 	for (c = 0; c < raidPtr->numCol; c++) {
   2636 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2637 		acd = raidPtr->Disks[c].auto_configured;
   2638 		rf_close_component(raidPtr, vp, acd);
   2639 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2640 		raidPtr->Disks[c].auto_configured = 0;
   2641 	}
   2642 
   2643 	for (r = 0; r < raidPtr->numSpare; r++) {
   2644 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2645 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2646 		rf_close_component(raidPtr, vp, acd);
   2647 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2648 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2649 	}
   2650 }
   2651 
   2652 
   2653 void
   2654 rf_ReconThread(struct rf_recon_req *req)
   2655 {
   2656 	int     s;
   2657 	RF_Raid_t *raidPtr;
   2658 
   2659 	s = splbio();
   2660 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2661 	raidPtr->recon_in_progress = 1;
   2662 
   2663 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2664 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2665 
   2666 	RF_Free(req, sizeof(*req));
   2667 
   2668 	raidPtr->recon_in_progress = 0;
   2669 	splx(s);
   2670 
   2671 	/* That's all... */
   2672 	kthread_exit(0);	/* does not return */
   2673 }
   2674 
   2675 void
   2676 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2677 {
   2678 	int retcode;
   2679 	int s;
   2680 
   2681 	raidPtr->parity_rewrite_stripes_done = 0;
   2682 	raidPtr->parity_rewrite_in_progress = 1;
   2683 	s = splbio();
   2684 	retcode = rf_RewriteParity(raidPtr);
   2685 	splx(s);
   2686 	if (retcode) {
   2687 		printf("raid%d: Error re-writing parity (%d)!\n",
   2688 		    raidPtr->raidid, retcode);
   2689 	} else {
   2690 		/* set the clean bit!  If we shutdown correctly,
   2691 		   the clean bit on each component label will get
   2692 		   set */
   2693 		raidPtr->parity_good = RF_RAID_CLEAN;
   2694 	}
   2695 	raidPtr->parity_rewrite_in_progress = 0;
   2696 
   2697 	/* Anyone waiting for us to stop?  If so, inform them... */
   2698 	if (raidPtr->waitShutdown) {
   2699 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2700 	}
   2701 
   2702 	/* That's all... */
   2703 	kthread_exit(0);	/* does not return */
   2704 }
   2705 
   2706 
   2707 void
   2708 rf_CopybackThread(RF_Raid_t *raidPtr)
   2709 {
   2710 	int s;
   2711 
   2712 	raidPtr->copyback_in_progress = 1;
   2713 	s = splbio();
   2714 	rf_CopybackReconstructedData(raidPtr);
   2715 	splx(s);
   2716 	raidPtr->copyback_in_progress = 0;
   2717 
   2718 	/* That's all... */
   2719 	kthread_exit(0);	/* does not return */
   2720 }
   2721 
   2722 
   2723 void
   2724 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2725 {
   2726 	int s;
   2727 	RF_Raid_t *raidPtr;
   2728 
   2729 	s = splbio();
   2730 	raidPtr = req->raidPtr;
   2731 	raidPtr->recon_in_progress = 1;
   2732 	rf_ReconstructInPlace(raidPtr, req->col);
   2733 	RF_Free(req, sizeof(*req));
   2734 	raidPtr->recon_in_progress = 0;
   2735 	splx(s);
   2736 
   2737 	/* That's all... */
   2738 	kthread_exit(0);	/* does not return */
   2739 }
   2740 
   2741 static RF_AutoConfig_t *
   2742 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2743     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2744     unsigned secsize)
   2745 {
   2746 	int good_one = 0;
   2747 	RF_ComponentLabel_t *clabel;
   2748 	RF_AutoConfig_t *ac;
   2749 
   2750 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2751 	if (clabel == NULL) {
   2752 oomem:
   2753 		    while(ac_list) {
   2754 			    ac = ac_list;
   2755 			    if (ac->clabel)
   2756 				    free(ac->clabel, M_RAIDFRAME);
   2757 			    ac_list = ac_list->next;
   2758 			    free(ac, M_RAIDFRAME);
   2759 		    }
   2760 		    printf("RAID auto config: out of memory!\n");
   2761 		    return NULL; /* XXX probably should panic? */
   2762 	}
   2763 
   2764 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2765 		/* Got the label.  Does it look reasonable? */
   2766 		if (rf_reasonable_label(clabel, numsecs) &&
   2767 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2768 #ifdef DEBUG
   2769 			printf("Component on: %s: %llu\n",
   2770 				cname, (unsigned long long)size);
   2771 			rf_print_component_label(clabel);
   2772 #endif
   2773 			/* if it's reasonable, add it, else ignore it. */
   2774 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2775 				M_NOWAIT);
   2776 			if (ac == NULL) {
   2777 				free(clabel, M_RAIDFRAME);
   2778 				goto oomem;
   2779 			}
   2780 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2781 			ac->dev = dev;
   2782 			ac->vp = vp;
   2783 			ac->clabel = clabel;
   2784 			ac->next = ac_list;
   2785 			ac_list = ac;
   2786 			good_one = 1;
   2787 		}
   2788 	}
   2789 	if (!good_one) {
   2790 		/* cleanup */
   2791 		free(clabel, M_RAIDFRAME);
   2792 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2793 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2794 		vput(vp);
   2795 	}
   2796 	return ac_list;
   2797 }
   2798 
/*
 * Walk every disk-class device on the system looking for RAIDframe
 * components, and return them as a linked list of RF_AutoConfig_t
 * (NULL if none were found).
 *
 * The device list is scanned twice: first only wedges ("dk" devices)
 * are examined, then only non-wedge disks.  For a wedge, the wedge
 * itself is offered to rf_get_component() when its partition type is
 * DKW_PTYPE_RAIDFRAME.  For other disks, every disklabel partition
 * marked FS_RAID is offered; if the label has no such partition the
 * raw partition is offered instead.
 *
 * Each candidate vnode is handed to rf_get_component(), which either
 * records it in the list or closes it.  Note that rf_get_component()
 * returns NULL after freeing the whole list when it runs out of memory,
 * so ac_list can shrink back to empty in the middle of the scan.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	/* dowedges == 1: wedge pass; dowedges == 0: everything else. */
	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/*
			 * are we in the wedges pass ?
			 * Skip wedges during the non-wedge pass and
			 * vice versa.
			 */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/*
			 * get a vnode for the raw partition of this disk
			 * (a wedge has no partitions, so its device number
			 * is built from major/minor directly).
			 */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			/*
			 * numsecs/secsize obtained here are also passed to
			 * rf_get_component() for every candidate found on
			 * this device below.
			 */
			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* Only wedges typed as RAIDframe are candidates. */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/*
				 * The still-open vnode is handed over;
				 * rf_get_component() keeps it in the list or
				 * closes it.
				 */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				/*
				 * NOTE(review): this store is not read again
				 * before rf_part_found is reset on the next
				 * iteration.
				 */
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			/* No usable disklabel: nothing more to do here. */
			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				/* Open this partition on its own vnode. */
				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* Component name is "<device><partition letter>". */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		/* Release the iterator before starting the next pass. */
		deviter_release(&di);
	}
	return ac_list;
}
   3002 
   3003 
   3004 int
   3005 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3006 {
   3007 
   3008 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3009 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3010 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3011 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3012 	    clabel->row >=0 &&
   3013 	    clabel->column >= 0 &&
   3014 	    clabel->num_rows > 0 &&
   3015 	    clabel->num_columns > 0 &&
   3016 	    clabel->row < clabel->num_rows &&
   3017 	    clabel->column < clabel->num_columns &&
   3018 	    clabel->blockSize > 0 &&
   3019 	    /*
   3020 	     * numBlocksHi may contain garbage, but it is ok since
   3021 	     * the type is unsigned.  If it is really garbage,
   3022 	     * rf_fix_old_label_size() will fix it.
   3023 	     */
   3024 	    rf_component_label_numblocks(clabel) > 0) {
   3025 		/*
   3026 		 * label looks reasonable enough...
   3027 		 * let's make sure it has no old garbage.
   3028 		 */
   3029 		if (numsecs)
   3030 			rf_fix_old_label_size(clabel, numsecs);
   3031 		return(1);
   3032 	}
   3033 	return(0);
   3034 }
   3035 
   3036 
   3037 /*
   3038  * For reasons yet unknown, some old component labels have garbage in
   3039  * the newer numBlocksHi region, and this causes lossage.  Since those
   3040  * disks will also have numsecs set to less than 32 bits of sectors,
   3041  * we can determine when this corruption has occurred, and fix it.
   3042  *
   3043  * The exact same problem, with the same unknown reason, happens to
   3044  * the partitionSizeHi member as well.
   3045  */
   3046 static void
   3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3048 {
   3049 
   3050 	if (numsecs < ((uint64_t)1 << 32)) {
   3051 		if (clabel->numBlocksHi) {
   3052 			printf("WARNING: total sectors < 32 bits, yet "
   3053 			       "numBlocksHi set\n"
   3054 			       "WARNING: resetting numBlocksHi to zero.\n");
   3055 			clabel->numBlocksHi = 0;
   3056 		}
   3057 
   3058 		if (clabel->partitionSizeHi) {
   3059 			printf("WARNING: total sectors < 32 bits, yet "
   3060 			       "partitionSizeHi set\n"
   3061 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3062 			clabel->partitionSizeHi = 0;
   3063 		}
   3064 	}
   3065 }
   3066 
   3067 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Only compiled when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Indexed by the low two bits of root_partition. */
	static const char *root_modes[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);

	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number, clabel->mod_counter);

	/* Any non-zero clean value is reported as "Yes". */
	printf("   Clean: %s Status: %d\n",
	    clabel->clean ? "Yes" : "No", clabel->status);

	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);

	/* parityConfig is printed as a character. */
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, nblocks);

	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n",
	    root_modes[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
   3101 
   3102 RF_ConfigSet_t *
   3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3104 {
   3105 	RF_AutoConfig_t *ac;
   3106 	RF_ConfigSet_t *config_sets;
   3107 	RF_ConfigSet_t *cset;
   3108 	RF_AutoConfig_t *ac_next;
   3109 
   3110 
   3111 	config_sets = NULL;
   3112 
   3113 	/* Go through the AutoConfig list, and figure out which components
   3114 	   belong to what sets.  */
   3115 	ac = ac_list;
   3116 	while(ac!=NULL) {
   3117 		/* we're going to putz with ac->next, so save it here
   3118 		   for use at the end of the loop */
   3119 		ac_next = ac->next;
   3120 
   3121 		if (config_sets == NULL) {
   3122 			/* will need at least this one... */
   3123 			config_sets = (RF_ConfigSet_t *)
   3124 				malloc(sizeof(RF_ConfigSet_t),
   3125 				       M_RAIDFRAME, M_NOWAIT);
   3126 			if (config_sets == NULL) {
   3127 				panic("rf_create_auto_sets: No memory!");
   3128 			}
   3129 			/* this one is easy :) */
   3130 			config_sets->ac = ac;
   3131 			config_sets->next = NULL;
   3132 			config_sets->rootable = 0;
   3133 			ac->next = NULL;
   3134 		} else {
   3135 			/* which set does this component fit into? */
   3136 			cset = config_sets;
   3137 			while(cset!=NULL) {
   3138 				if (rf_does_it_fit(cset, ac)) {
   3139 					/* looks like it matches... */
   3140 					ac->next = cset->ac;
   3141 					cset->ac = ac;
   3142 					break;
   3143 				}
   3144 				cset = cset->next;
   3145 			}
   3146 			if (cset==NULL) {
   3147 				/* didn't find a match above... new set..*/
   3148 				cset = (RF_ConfigSet_t *)
   3149 					malloc(sizeof(RF_ConfigSet_t),
   3150 					       M_RAIDFRAME, M_NOWAIT);
   3151 				if (cset == NULL) {
   3152 					panic("rf_create_auto_sets: No memory!");
   3153 				}
   3154 				cset->ac = ac;
   3155 				ac->next = NULL;
   3156 				cset->next = config_sets;
   3157 				cset->rootable = 0;
   3158 				config_sets = cset;
   3159 			}
   3160 		}
   3161 		ac = ac_next;
   3162 	}
   3163 
   3164 
   3165 	return(config_sets);
   3166 }
   3167 
   3168 static int
   3169 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3170 {
   3171 	RF_ComponentLabel_t *clabel1, *clabel2;
   3172 
   3173 	/* If this one matches the *first* one in the set, that's good
   3174 	   enough, since the other members of the set would have been
   3175 	   through here too... */
   3176 	/* note that we are not checking partitionSize here..
   3177 
   3178 	   Note that we are also not checking the mod_counters here.
   3179 	   If everything else matches except the mod_counter, that's
   3180 	   good enough for this test.  We will deal with the mod_counters
   3181 	   a little later in the autoconfiguration process.
   3182 
   3183 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3184 
   3185 	   The reason we don't check for this is that failed disks
   3186 	   will have lower modification counts.  If those disks are
   3187 	   not added to the set they used to belong to, then they will
   3188 	   form their own set, which may result in 2 different sets,
   3189 	   for example, competing to be configured at raid0, and
   3190 	   perhaps competing to be the root filesystem set.  If the
   3191 	   wrong ones get configured, or both attempt to become /,
   3192 	   weird behaviour and or serious lossage will occur.  Thus we
   3193 	   need to bring them into the fold here, and kick them out at
   3194 	   a later point.
   3195 
   3196 	*/
   3197 
   3198 	clabel1 = cset->ac->clabel;
   3199 	clabel2 = ac->clabel;
   3200 	if ((clabel1->version == clabel2->version) &&
   3201 	    (clabel1->serial_number == clabel2->serial_number) &&
   3202 	    (clabel1->num_rows == clabel2->num_rows) &&
   3203 	    (clabel1->num_columns == clabel2->num_columns) &&
   3204 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3205 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3206 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3207 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3208 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3209 	    (clabel1->blockSize == clabel2->blockSize) &&
   3210 	    rf_component_label_numblocks(clabel1) ==
   3211 	    rf_component_label_numblocks(clabel2) &&
   3212 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3213 	    (clabel1->root_partition == clabel2->root_partition) &&
   3214 	    (clabel1->last_unit == clabel2->last_unit) &&
   3215 	    (clabel1->config_order == clabel2->config_order)) {
   3216 		/* if it get's here, it almost *has* to be a match */
   3217 	} else {
   3218 		/* it's not consistent with somebody in the set..
   3219 		   punt */
   3220 		return(0);
   3221 	}
   3222 	/* all was fine.. it must fit... */
   3223 	return(1);
   3224 }
   3225 
   3226 int
   3227 rf_have_enough_components(RF_ConfigSet_t *cset)
   3228 {
   3229 	RF_AutoConfig_t *ac;
   3230 	RF_AutoConfig_t *auto_config;
   3231 	RF_ComponentLabel_t *clabel;
   3232 	int c;
   3233 	int num_cols;
   3234 	int num_missing;
   3235 	int mod_counter;
   3236 	int mod_counter_found;
   3237 	int even_pair_failed;
   3238 	char parity_type;
   3239 
   3240 
   3241 	/* check to see that we have enough 'live' components
   3242 	   of this set.  If so, we can configure it if necessary */
   3243 
   3244 	num_cols = cset->ac->clabel->num_columns;
   3245 	parity_type = cset->ac->clabel->parityConfig;
   3246 
   3247 	/* XXX Check for duplicate components!?!?!? */
   3248 
   3249 	/* Determine what the mod_counter is supposed to be for this set. */
   3250 
   3251 	mod_counter_found = 0;
   3252 	mod_counter = 0;
   3253 	ac = cset->ac;
   3254 	while(ac!=NULL) {
   3255 		if (mod_counter_found==0) {
   3256 			mod_counter = ac->clabel->mod_counter;
   3257 			mod_counter_found = 1;
   3258 		} else {
   3259 			if (ac->clabel->mod_counter > mod_counter) {
   3260 				mod_counter = ac->clabel->mod_counter;
   3261 			}
   3262 		}
   3263 		ac = ac->next;
   3264 	}
   3265 
   3266 	num_missing = 0;
   3267 	auto_config = cset->ac;
   3268 
   3269 	even_pair_failed = 0;
   3270 	for(c=0; c<num_cols; c++) {
   3271 		ac = auto_config;
   3272 		while(ac!=NULL) {
   3273 			if ((ac->clabel->column == c) &&
   3274 			    (ac->clabel->mod_counter == mod_counter)) {
   3275 				/* it's this one... */
   3276 #ifdef DEBUG
   3277 				printf("Found: %s at %d\n",
   3278 				       ac->devname,c);
   3279 #endif
   3280 				break;
   3281 			}
   3282 			ac=ac->next;
   3283 		}
   3284 		if (ac==NULL) {
   3285 				/* Didn't find one here! */
   3286 				/* special case for RAID 1, especially
   3287 				   where there are more than 2
   3288 				   components (where RAIDframe treats
   3289 				   things a little differently :( ) */
   3290 			if (parity_type == '1') {
   3291 				if (c%2 == 0) { /* even component */
   3292 					even_pair_failed = 1;
   3293 				} else { /* odd component.  If
   3294 					    we're failed, and
   3295 					    so is the even
   3296 					    component, it's
   3297 					    "Good Night, Charlie" */
   3298 					if (even_pair_failed == 1) {
   3299 						return(0);
   3300 					}
   3301 				}
   3302 			} else {
   3303 				/* normal accounting */
   3304 				num_missing++;
   3305 			}
   3306 		}
   3307 		if ((parity_type == '1') && (c%2 == 1)) {
   3308 				/* Just did an even component, and we didn't
   3309 				   bail.. reset the even_pair_failed flag,
   3310 				   and go on to the next component.... */
   3311 			even_pair_failed = 0;
   3312 		}
   3313 	}
   3314 
   3315 	clabel = cset->ac->clabel;
   3316 
   3317 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3318 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3319 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3320 		/* XXX this needs to be made *much* more general */
   3321 		/* Too many failures */
   3322 		return(0);
   3323 	}
   3324 	/* otherwise, all is well, and we've got enough to take a kick
   3325 	   at autoconfiguring this set */
   3326 	return(1);
   3327 }
   3328 
   3329 void
   3330 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3331 			RF_Raid_t *raidPtr)
   3332 {
   3333 	RF_ComponentLabel_t *clabel;
   3334 	int i;
   3335 
   3336 	clabel = ac->clabel;
   3337 
   3338 	/* 1. Fill in the common stuff */
   3339 	config->numRow = clabel->num_rows = 1;
   3340 	config->numCol = clabel->num_columns;
   3341 	config->numSpare = 0; /* XXX should this be set here? */
   3342 	config->sectPerSU = clabel->sectPerSU;
   3343 	config->SUsPerPU = clabel->SUsPerPU;
   3344 	config->SUsPerRU = clabel->SUsPerRU;
   3345 	config->parityConfig = clabel->parityConfig;
   3346 	/* XXX... */
   3347 	strcpy(config->diskQueueType,"fifo");
   3348 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3349 	config->layoutSpecificSize = 0; /* XXX ?? */
   3350 
   3351 	while(ac!=NULL) {
   3352 		/* row/col values will be in range due to the checks
   3353 		   in reasonable_label() */
   3354 		strcpy(config->devnames[0][ac->clabel->column],
   3355 		       ac->devname);
   3356 		ac = ac->next;
   3357 	}
   3358 
   3359 	for(i=0;i<RF_MAXDBGV;i++) {
   3360 		config->debugVars[i][0] = 0;
   3361 	}
   3362 }
   3363 
   3364 int
   3365 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3366 {
   3367 	RF_ComponentLabel_t *clabel;
   3368 	int column;
   3369 	int sparecol;
   3370 
   3371 	raidPtr->autoconfigure = new_value;
   3372 
   3373 	for(column=0; column<raidPtr->numCol; column++) {
   3374 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3375 			clabel = raidget_component_label(raidPtr, column);
   3376 			clabel->autoconfigure = new_value;
   3377 			raidflush_component_label(raidPtr, column);
   3378 		}
   3379 	}
   3380 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3381 		sparecol = raidPtr->numCol + column;
   3382 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3383 			clabel = raidget_component_label(raidPtr, sparecol);
   3384 			clabel->autoconfigure = new_value;
   3385 			raidflush_component_label(raidPtr, sparecol);
   3386 		}
   3387 	}
   3388 	return(new_value);
   3389 }
   3390 
   3391 int
   3392 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3393 {
   3394 	RF_ComponentLabel_t *clabel;
   3395 	int column;
   3396 	int sparecol;
   3397 
   3398 	raidPtr->root_partition = new_value;
   3399 	for(column=0; column<raidPtr->numCol; column++) {
   3400 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3401 			clabel = raidget_component_label(raidPtr, column);
   3402 			clabel->root_partition = new_value;
   3403 			raidflush_component_label(raidPtr, column);
   3404 		}
   3405 	}
   3406 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3407 		sparecol = raidPtr->numCol + column;
   3408 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3409 			clabel = raidget_component_label(raidPtr, sparecol);
   3410 			clabel->root_partition = new_value;
   3411 			raidflush_component_label(raidPtr, sparecol);
   3412 		}
   3413 	}
   3414 	return(new_value);
   3415 }
   3416 
   3417 void
   3418 rf_release_all_vps(RF_ConfigSet_t *cset)
   3419 {
   3420 	RF_AutoConfig_t *ac;
   3421 
   3422 	ac = cset->ac;
   3423 	while(ac!=NULL) {
   3424 		/* Close the vp, and give it back */
   3425 		if (ac->vp) {
   3426 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3427 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3428 			vput(ac->vp);
   3429 			ac->vp = NULL;
   3430 		}
   3431 		ac = ac->next;
   3432 	}
   3433 }
   3434 
   3435 
   3436 void
   3437 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3438 {
   3439 	RF_AutoConfig_t *ac;
   3440 	RF_AutoConfig_t *next_ac;
   3441 
   3442 	ac = cset->ac;
   3443 	while(ac!=NULL) {
   3444 		next_ac = ac->next;
   3445 		/* nuke the label */
   3446 		free(ac->clabel, M_RAIDFRAME);
   3447 		/* cleanup the config structure */
   3448 		free(ac, M_RAIDFRAME);
   3449 		/* "next.." */
   3450 		ac = next_ac;
   3451 	}
   3452 	/* and, finally, nuke the config set */
   3453 	free(cset, M_RAIDFRAME);
   3454 }
   3455 
   3456 
   3457 void
   3458 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3459 {
   3460 	/* current version number */
   3461 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3462 	clabel->serial_number = raidPtr->serial_number;
   3463 	clabel->mod_counter = raidPtr->mod_counter;
   3464 
   3465 	clabel->num_rows = 1;
   3466 	clabel->num_columns = raidPtr->numCol;
   3467 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3468 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3469 
   3470 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3471 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3472 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3473 
   3474 	clabel->blockSize = raidPtr->bytesPerSector;
   3475 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3476 
   3477 	/* XXX not portable */
   3478 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3479 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3480 	clabel->autoconfigure = raidPtr->autoconfigure;
   3481 	clabel->root_partition = raidPtr->root_partition;
   3482 	clabel->last_unit = raidPtr->raidid;
   3483 	clabel->config_order = raidPtr->config_order;
   3484 
   3485 #ifndef RF_NO_PARITY_MAP
   3486 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3487 #endif
   3488 }
   3489 
   3490 struct raid_softc *
   3491 rf_auto_config_set(RF_ConfigSet_t *cset)
   3492 {
   3493 	RF_Raid_t *raidPtr;
   3494 	RF_Config_t *config;
   3495 	int raidID;
   3496 	struct raid_softc *sc;
   3497 
   3498 #ifdef DEBUG
   3499 	printf("RAID autoconfigure\n");
   3500 #endif
   3501 
   3502 	/* 1. Create a config structure */
   3503 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3504 	if (config == NULL) {
   3505 		printf("%s: Out of mem - config!?!?\n", __func__);
   3506 				/* XXX do something more intelligent here. */
   3507 		return NULL;
   3508 	}
   3509 
   3510 	/*
   3511 	   2. Figure out what RAID ID this one is supposed to live at
   3512 	   See if we can get the same RAID dev that it was configured
   3513 	   on last time..
   3514 	*/
   3515 
   3516 	raidID = cset->ac->clabel->last_unit;
   3517 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
   3518 	     sc = raidget(++raidID, false))
   3519 		continue;
   3520 #ifdef DEBUG
   3521 	printf("Configuring raid%d:\n",raidID);
   3522 #endif
   3523 
   3524 	if (sc == NULL)
   3525 		sc = raidget(raidID, true);
   3526 	if (sc == NULL) {
   3527 		printf("%s: Out of mem - softc!?!?\n", __func__);
   3528 				/* XXX do something more intelligent here. */
   3529 		free(config, M_RAIDFRAME);
   3530 		return NULL;
   3531 	}
   3532 
   3533 	raidPtr = &sc->sc_r;
   3534 
   3535 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3536 	raidPtr->softc = sc;
   3537 	raidPtr->raidid = raidID;
   3538 	raidPtr->openings = RAIDOUTSTANDING;
   3539 
   3540 	/* 3. Build the configuration structure */
   3541 	rf_create_configuration(cset->ac, config, raidPtr);
   3542 
   3543 	/* 4. Do the configuration */
   3544 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3545 		raidinit(sc);
   3546 
   3547 		rf_markalldirty(raidPtr);
   3548 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3549 		switch (cset->ac->clabel->root_partition) {
   3550 		case 1:	/* Force Root */
   3551 		case 2:	/* Soft Root: root when boot partition part of raid */
   3552 			/*
   3553 			 * everything configured just fine.  Make a note
   3554 			 * that this set is eligible to be root,
   3555 			 * or forced to be root
   3556 			 */
   3557 			cset->rootable = cset->ac->clabel->root_partition;
   3558 			/* XXX do this here? */
   3559 			raidPtr->root_partition = cset->rootable;
   3560 			break;
   3561 		default:
   3562 			break;
   3563 		}
   3564 	} else {
   3565 		raidput(sc);
   3566 		sc = NULL;
   3567 	}
   3568 
   3569 	/* 5. Cleanup */
   3570 	free(config, M_RAIDFRAME);
   3571 	return sc;
   3572 }
   3573 
/*
 * Set up a pool for RAIDframe use.
 *
 * p      - pool to initialise.
 * size   - passed to pool_init() as the item size.
 * w_chan - passed to pool_init() as the wait channel name.
 * xmin   - number of items to prime the pool with; also the low
 *          water mark.
 * xmax   - high water mark.
 *
 * The pool is created at IPL_BIO.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	/* Pre-allocate xmin items before declaring xmin the low mark. */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3583 
   3584 /*
   3585  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3586  * to see if there is IO pending and if that IO could possibly be done
   3587  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3588  * otherwise.
   3589  *
   3590  */
   3591 int
   3592 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3593 {
   3594 	struct raid_softc *rs;
   3595 	struct dk_softc *dksc;
   3596 
   3597 	rs = raidPtr->softc;
   3598 	dksc = &rs->sc_dksc;
   3599 
   3600 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3601 		return 1;
   3602 
   3603 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3604 		/* there is work to do */
   3605 		return 0;
   3606 	}
   3607 	/* default is nothing to do */
   3608 	return 1;
   3609 }
   3610 
   3611 int
   3612 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3613 {
   3614 	uint64_t numsecs;
   3615 	unsigned secsize;
   3616 	int error;
   3617 
   3618 	error = getdisksize(vp, &numsecs, &secsize);
   3619 	if (error == 0) {
   3620 		diskPtr->blockSize = secsize;
   3621 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3622 		diskPtr->partitionSize = numsecs;
   3623 		return 0;
   3624 	}
   3625 	return error;
   3626 }
   3627 
/*
 * Match routine for the raid device (presumably the autoconf match
 * hook -- confirm against the CFATTACH declaration).  None of the
 * arguments are examined; it always returns 1.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3633 
/*
 * Attach routine for the raid device.  Intentionally empty: no work is
 * done here.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3638 
   3639 
   3640 static int
   3641 raid_detach(device_t self, int flags)
   3642 {
   3643 	int error;
   3644 	struct raid_softc *rs = raidsoftc(self);
   3645 
   3646 	if (rs == NULL)
   3647 		return ENXIO;
   3648 
   3649 	if ((error = raidlock(rs)) != 0)
   3650 		return (error);
   3651 
   3652 	error = raid_detach_unlocked(rs);
   3653 
   3654 	raidunlock(rs);
   3655 
   3656 	/* XXX raid can be referenced here */
   3657 
   3658 	if (error)
   3659 		return error;
   3660 
   3661 	/* Free the softc */
   3662 	raidput(rs);
   3663 
   3664 	return 0;
   3665 }
   3666 
   3667 static void
   3668 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3669 {
   3670 	struct dk_softc *dksc = &rs->sc_dksc;
   3671 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3672 
   3673 	memset(dg, 0, sizeof(*dg));
   3674 
   3675 	dg->dg_secperunit = raidPtr->totalSectors;
   3676 	dg->dg_secsize = raidPtr->bytesPerSector;
   3677 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3678 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3679 
   3680 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3681 }
   3682 
   3683 /*
   3684  * Get cache info for all the components (including spares).
   3685  * Returns intersection of all the cache flags of all disks, or first
   3686  * error if any encountered.
   3687  * XXXfua feature flags can change as spares are added - lock down somehow
   3688  */
   3689 static int
   3690 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3691 {
   3692 	int c;
   3693 	int error;
   3694 	int dkwhole = 0, dkpart;
   3695 
   3696 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3697 		/*
   3698 		 * Check any non-dead disk, even when currently being
   3699 		 * reconstructed.
   3700 		 */
   3701 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3702 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3703 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3704 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3705 			if (error) {
   3706 				if (error != ENODEV) {
   3707 					printf("raid%d: get cache for component %s failed\n",
   3708 					    raidPtr->raidid,
   3709 					    raidPtr->Disks[c].devname);
   3710 				}
   3711 
   3712 				return error;
   3713 			}
   3714 
   3715 			if (c == 0)
   3716 				dkwhole = dkpart;
   3717 			else
   3718 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3719 		}
   3720 	}
   3721 
   3722 	*data = dkwhole;
   3723 
   3724 	return 0;
   3725 }
   3726 
   3727 /*
   3728  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3729  * We end up returning whatever error was returned by the first cache flush
   3730  * that fails.
   3731  */
   3732 
   3733 int
   3734 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3735 {
   3736 	int c, sparecol;
   3737 	int e,error;
   3738 	int force = 1;
   3739 
   3740 	error = 0;
   3741 	for (c = 0; c < raidPtr->numCol; c++) {
   3742 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3743 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3744 					  &force, FWRITE, NOCRED);
   3745 			if (e) {
   3746 				if (e != ENODEV)
   3747 					printf("raid%d: cache flush to component %s failed.\n",
   3748 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3749 				if (error == 0) {
   3750 					error = e;
   3751 				}
   3752 			}
   3753 		}
   3754 	}
   3755 
   3756 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3757 		sparecol = raidPtr->numCol + c;
   3758 		/* Need to ensure that the reconstruct actually completed! */
   3759 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3760 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3761 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3762 			if (e) {
   3763 				if (e != ENODEV)
   3764 					printf("raid%d: cache flush to component %s failed.\n",
   3765 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3766 				if (error == 0) {
   3767 					error = e;
   3768 				}
   3769 			}
   3770 		}
   3771 	}
   3772 	return error;
   3773 }
   3774 
   3775 /*
   3776  * Module interface
   3777  */
   3778 
        /*
         * Declare the "raid" driver module.  NOTE(review): the "dk_subr"
         * string is presumably the list of modules this one requires -
         * confirm against module(9).
         */
   3779 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");
   3780 
        /* The cfdriver declaration is only emitted when _MODULE is defined. */
   3781 #ifdef _MODULE
   3782 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3783 #endif
   3784 
        /*
         * Forward declarations: the module command dispatcher and the
         * init/fini workers it calls, all defined below.
         */
   3785 static int raid_modcmd(modcmd_t, void *);
   3786 static int raid_modcmd_init(void);
   3787 static int raid_modcmd_fini(void);
   3788 
   3789 static int
   3790 raid_modcmd(modcmd_t cmd, void *data)
   3791 {
   3792 	int error;
   3793 
   3794 	error = 0;
   3795 	switch (cmd) {
   3796 	case MODULE_CMD_INIT:
   3797 		error = raid_modcmd_init();
   3798 		break;
   3799 	case MODULE_CMD_FINI:
   3800 		error = raid_modcmd_fini();
   3801 		break;
   3802 	default:
   3803 		error = ENOTTY;
   3804 		break;
   3805 	}
   3806 	return error;
   3807 }
   3808 
   3809 static int
   3810 raid_modcmd_init(void)
   3811 {
   3812 	int error;
   3813 	int bmajor, cmajor;
   3814 
   3815 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3816 	mutex_enter(&raid_lock);
   3817 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3818 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3819 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3820 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3821 
   3822 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3823 #endif
   3824 
   3825 	bmajor = cmajor = -1;
   3826 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3827 	    &raid_cdevsw, &cmajor);
   3828 	if (error != 0 && error != EEXIST) {
   3829 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3830 		mutex_exit(&raid_lock);
   3831 		return error;
   3832 	}
   3833 #ifdef _MODULE
   3834 	error = config_cfdriver_attach(&raid_cd);
   3835 	if (error != 0) {
   3836 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3837 		    __func__, error);
   3838 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3839 		mutex_exit(&raid_lock);
   3840 		return error;
   3841 	}
   3842 #endif
   3843 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3844 	if (error != 0) {
   3845 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3846 		    __func__, error);
   3847 #ifdef _MODULE
   3848 		config_cfdriver_detach(&raid_cd);
   3849 #endif
   3850 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3851 		mutex_exit(&raid_lock);
   3852 		return error;
   3853 	}
   3854 
   3855 	raidautoconfigdone = false;
   3856 
   3857 	mutex_exit(&raid_lock);
   3858 
   3859 	if (error == 0) {
   3860 		if (rf_BootRaidframe(true) == 0)
   3861 			aprint_verbose("Kernelized RAIDframe activated\n");
   3862 		else
   3863 			panic("Serious error activating RAID!!");
   3864 	}
   3865 
   3866 	/*
   3867 	 * Register a finalizer which will be used to auto-config RAID
   3868 	 * sets once all real hardware devices have been found.
   3869 	 */
   3870 	error = config_finalize_register(NULL, rf_autoconfig);
   3871 	if (error != 0) {
   3872 		aprint_error("WARNING: unable to register RAIDframe "
   3873 		    "finalizer\n");
   3874 		error = 0;
   3875 	}
   3876 
   3877 	return error;
   3878 }
   3879 
   3880 static int
   3881 raid_modcmd_fini(void)
   3882 {
   3883 	int error;
   3884 
   3885 	mutex_enter(&raid_lock);
   3886 
   3887 	/* Don't allow unload if raid device(s) exist.  */
   3888 	if (!LIST_EMPTY(&raids)) {
   3889 		mutex_exit(&raid_lock);
   3890 		return EBUSY;
   3891 	}
   3892 
   3893 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
   3894 	if (error != 0) {
   3895 		aprint_error("%s: cannot detach cfattach\n",__func__);
   3896 		mutex_exit(&raid_lock);
   3897 		return error;
   3898 	}
   3899 #ifdef _MODULE
   3900 	error = config_cfdriver_detach(&raid_cd);
   3901 	if (error != 0) {
   3902 		aprint_error("%s: cannot detach cfdriver\n",__func__);
   3903 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3904 		mutex_exit(&raid_lock);
   3905 		return error;
   3906 	}
   3907 #endif
   3908 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3909 	if (error != 0) {
   3910 		aprint_error("%s: cannot detach devsw\n",__func__);
   3911 #ifdef _MODULE
   3912 		config_cfdriver_attach(&raid_cd);
   3913 #endif
   3914 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3915 		mutex_exit(&raid_lock);
   3916 		return error;
   3917 	}
   3918 	rf_BootRaidframe(false);
   3919 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3920 	rf_destroy_mutex2(rf_sparet_wait_mutex);
   3921 	rf_destroy_cond2(rf_sparet_wait_cv);
   3922 	rf_destroy_cond2(rf_sparet_resp_cv);
   3923 #endif
   3924 	mutex_exit(&raid_lock);
   3925 	mutex_destroy(&raid_lock);
   3926 
   3927 	return error;
   3928 }
   3929