Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.356.2.10
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.356.2.10 2019/01/18 08:50:42 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.356.2.10 2019/01/18 08:50:42 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 #include <sys/compat_stub.h>
    132 
    133 #include <prop/proplib.h>
    134 
    135 #include <dev/raidframe/raidframevar.h>
    136 #include <dev/raidframe/raidframeio.h>
    137 #include <dev/raidframe/rf_paritymap.h>
    138 
    139 #include "rf_raid.h"
    140 #include "rf_copyback.h"
    141 #include "rf_dag.h"
    142 #include "rf_dagflags.h"
    143 #include "rf_desc.h"
    144 #include "rf_diskqueue.h"
    145 #include "rf_etimer.h"
    146 #include "rf_general.h"
    147 #include "rf_kintf.h"
    148 #include "rf_options.h"
    149 #include "rf_driver.h"
    150 #include "rf_parityscan.h"
    151 #include "rf_threadstuff.h"
    152 
    153 #include "rf_compat50.h"
    154 
    155 #include "rf_compat80.h"
    156 
    157 #ifdef COMPAT_NETBSD32
    158 #include "rf_compat32.h"
    159 #endif
    160 
    161 #include "ioconf.h"
    162 
        /*
         * db1_printf((fmt, ...)) -- debugging printf that takes a fully
         * parenthesized argument list.  With DEBUG it prints only when
         * rf_kdebug_level is greater than zero; without DEBUG it expands to
         * an empty block and the arguments are dropped.
         */
    163 #ifdef DEBUG
    164 int     rf_kdebug_level = 0;
    165 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    166 #else				/* DEBUG */
    167 #define db1_printf(a) { }
    168 #endif				/* DEBUG */
    169 
        /*
         * DPRINTF(fmt, ...) -- a plain printf when DEBUG_ROOT is defined;
         * otherwise it expands to nothing, so its arguments are not
         * evaluated.
         */
    170 #ifdef DEBUG_ROOT
    171 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    172 #else
    173 #define DPRINTF(a, ...)
    174 #endif
    175 
        /*
         * State that exists only when parity declustering support is
         * compiled in: a mutex, two condition variables and the two
         * spare-table queues described below.
         * NOTE(review): presumably the mutex guards both queues -- confirm
         * against the users of these objects.
         */
    176 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    177 static rf_declare_mutex2(rf_sparet_wait_mutex);
    178 static rf_declare_cond2(rf_sparet_wait_cv);
    179 static rf_declare_cond2(rf_sparet_resp_cv);
    180 
    181 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    182 						 * spare table */
    183 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    184 						 * installation process */
    185 #endif
    186 
        /* malloc type tag M_RAIDFRAME; raidclose() passes it to free(). */
    187 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    188 
    189 /* prototypes */
    190 static void KernelWakeupFunc(struct buf *);
    191 static void InitBP(struct buf *, struct vnode *, unsigned,
    192     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    193     void *, int, struct proc *);
    194 struct raid_softc;
    195 static void raidinit(struct raid_softc *);
    196 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    197 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    198 
        /* match/attach/detach hooks handed to CFATTACH_DECL3_NEW() below */
    199 static int raid_match(device_t, cfdata_t, void *);
    200 static void raid_attach(device_t, device_t, void *);
    201 static int raid_detach(device_t, int);
    202 
        /* raw component-area and component-label read/write helpers */
    203 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    204     daddr_t, daddr_t);
    205 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    206     daddr_t, daddr_t, int);
    207 
    208 static int raidwrite_component_label(unsigned,
    209     dev_t, struct vnode *, RF_ComponentLabel_t *);
    210 static int raidread_component_label(unsigned,
    211     dev_t, struct vnode *, RF_ComponentLabel_t *);
    212 
        /* callbacks installed in rf_dkdriver below */
    213 static int raid_diskstart(device_t, struct buf *bp);
    214 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    215 static int raid_lastclose(device_t);
    216 
        /* entry points installed in raid_bdevsw / raid_cdevsw below */
    217 static dev_type_open(raidopen);
    218 static dev_type_close(raidclose);
    219 static dev_type_read(raidread);
    220 static dev_type_write(raidwrite);
    221 static dev_type_ioctl(raidioctl);
    222 static dev_type_strategy(raidstrategy);
    223 static dev_type_dump(raiddump);
    224 static dev_type_size(raidsize);
    225 
        /*
         * Block device switch for raid(4): open/close/strategy/ioctl/dump/size
         * are handled by this file; discard is not supported (nodiscard).
         */
    226 const struct bdevsw raid_bdevsw = {
    227 	.d_open = raidopen,
    228 	.d_close = raidclose,
    229 	.d_strategy = raidstrategy,
    230 	.d_ioctl = raidioctl,
    231 	.d_dump = raiddump,
    232 	.d_psize = raidsize,
    233 	.d_discard = nodiscard,
    234 	.d_flag = D_DISK
    235 };
    236 
        /*
         * Character (raw) device switch for raid(4).  It shares open, close
         * and ioctl with the block switch and adds raidread/raidwrite; the
         * stop, tty, poll, mmap, kqfilter and discard slots are filled with
         * the no* stubs.
         */
    237 const struct cdevsw raid_cdevsw = {
    238 	.d_open = raidopen,
    239 	.d_close = raidclose,
    240 	.d_read = raidread,
    241 	.d_write = raidwrite,
    242 	.d_ioctl = raidioctl,
    243 	.d_stop = nostop,
    244 	.d_tty = notty,
    245 	.d_poll = nopoll,
    246 	.d_mmap = nommap,
    247 	.d_kqfilter = nokqfilter,
    248 	.d_discard = nodiscard,
    249 	.d_flag = D_DISK
    250 };
    251 
        /*
         * Callback table of type struct dkdriver.  Besides the device entry
         * points it supplies raid_diskstart (start one buf), raid_dumpblocks
         * (crash dump) and raid_lastclose; transfer sizes are bounded with
         * the stock minphys.
         */
    252 static struct dkdriver rf_dkdriver = {
    253 	.d_open = raidopen,
    254 	.d_close = raidclose,
    255 	.d_strategy = raidstrategy,
    256 	.d_diskstart = raid_diskstart,
    257 	.d_dumpblocks = raid_dumpblocks,
    258 	.d_lastclose = raid_lastclose,
    259 	.d_minphys = minphys
    260 };
    261 
        /*
         * Per-unit driver state.  One of these is allocated by raidcreate()
         * and kept on the global `raids' list, keyed by sc_unit.
         */
    262 struct raid_softc {
        	/* dk_softc handed to dk_open()/dk_close()/dk_size()/dk_dump() */
    263 	struct dk_softc sc_dksc;
        	/* unit number; raidget() searches the list for this value */
    264 	int	sc_unit;
    265 	int     sc_flags;	/* flags */
    266 	int     sc_cflags;	/* configuration flags */
    267 	kmutex_t sc_mutex;	/* interlock mutex */
    268 	kcondvar_t sc_cv;	/* and the condvar */
    269 	uint64_t sc_size;	/* size of the raid device */
    270 	char    sc_xname[20];	/* XXX external name */
        	/* RAIDframe state for this unit; sc_r.softc is read by raidsoftc() */
    271 	RF_Raid_t sc_r;
        	/* linkage on the global `raids' list */
    272 	LIST_ENTRY(raid_softc) sc_link;
    273 };
    274 /* sc_flags */
    275 #define RAIDF_INITED		0x01	/* unit has been initialized */
    276 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    277 #define RAIDF_DETACH  		0x04	/* detach after final close */
    278 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    279 #define RAIDF_LOCKED		0x10	/* unit is locked */
    280 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    281 
        /* raidunit(x): unit number encoded in dev_t x (DISKUNIT). */
    282 #define	raidunit(x)	DISKUNIT(x)
        /*
         * raidsoftc(dev): given a device_t, take its private data as a
         * struct raid_softc and return the sc_r.softc pointer stored in it.
         */
    283 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    284 
        /*
         * Attachment declaration for the "raid" driver: softc size plus the
         * match/attach/detach hooks (the three remaining hook slots are
         * NULL), flagged DVF_DETACH_SHUTDOWN.
         */
    285 extern struct cfdriver raid_cd;
    286 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    287     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    288     DVF_DETACH_SHUTDOWN);
    289 
    290 /* Internal representation of a rf_recon_req */
        /*
         * This is the argument type of rf_ReconThread() and
         * rf_ReconstructInPlaceThread() (see their prototypes in this file).
         */
    291 struct rf_recon_req_internal {
    292 	RF_RowCol_t col;		/* column the request refers to */
    293 	RF_ReconReqFlags_t flags;	/* request flags */
    294 	void   *raidPtr;		/* NOTE(review): presumably the RF_Raid_t
        					 * the request is for -- confirm */
    295 };
    296 
    297 /*
    298  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    299  * Be aware that large numbers can allow the driver to consume a lot of
    300  * kernel memory, especially on writes, and in degraded mode reads.
    301  *
    302  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    303  * a single 64K write will typically require 64K for the old data,
    304  * 64K for the old parity, and 64K for the new parity, for a total
    305  * of 192K (if the parity buffer is not re-used immediately).
    306  * Even if it is used immediately, that's still 128K, which when multiplied
    307  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    308  *
    309  * Now in degraded mode, for example, a 64K read on the above setup may
    310  * require data reconstruction, which will require *all* of the 4 remaining
    311  * disks to participate -- 4 * 32K/disk == 128K again.
    312  */
    313 
        /* Default used only when the build did not define RAIDOUTSTANDING. */
    314 #ifndef RAIDOUTSTANDING
    315 #define RAIDOUTSTANDING   6
    316 #endif
    317 
        /*
         * RAIDLABELDEV(dev): the dev_t with the same major and unit as `dev'
         * but with the partition replaced by RAW_PART.
         */
    318 #define RAIDLABELDEV(dev)	\
    319 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    320 
    321 /* declared here, and made public, for the benefit of KVM stuff.. */
        /*
         * NOTE(review): the remark above does not match the declarations that
         * follow it (they are static or plain prototypes); it looks like a
         * leftover -- confirm before relying on it.
         */
    322 
        /* per-unit lock helpers; callers in this file bracket work with them */
    323 static int raidlock(struct raid_softc *);
    324 static void raidunlock(struct raid_softc *);
    325 
    326 static int raid_detach_unlocked(struct raid_softc *);
    327 
    328 static void rf_markalldirty(RF_Raid_t *);
    329 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    330 
        /* thread entry points and autoconfiguration drivers */
    331 void rf_ReconThread(struct rf_recon_req_internal *);
    332 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    333 void rf_CopybackThread(RF_Raid_t *raidPtr);
    334 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    335 int rf_autoconfig(device_t);
    336 void rf_buildroothack(RF_ConfigSet_t *);
    337 
        /* autoconfiguration helpers */
    338 RF_AutoConfig_t *rf_find_raid_components(void);
    339 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    340 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    341 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    342 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    343 int rf_set_autoconfig(RF_Raid_t *, int);
    344 int rf_set_rootpartition(RF_Raid_t *, int);
    345 void rf_release_all_vps(RF_ConfigSet_t *);
    346 void rf_cleanup_config_set(RF_ConfigSet_t *);
    347 int rf_have_enough_components(RF_ConfigSet_t *);
    348 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    349 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    350 
    351 /*
    352  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    353  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    354  * in the kernel config file.
    355  */
    356 #ifdef RAID_AUTOCONFIG
    357 int raidautoconfig = 1;
    358 #else
    359 int raidautoconfig = 0;
    360 #endif
        /* Set by rf_autoconfig() on its first real run so it never runs twice. */
    361 static bool raidautoconfigdone = false;
    362 
    363 struct RF_Pools_s rf_pools;
    364 
        /*
         * All softcs created by raidget(), linked through sc_link.  raid_lock
         * is held around every traversal and modification of the list in
         * this part of the file.
         */
    365 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    366 static kmutex_t raid_lock;
    367 
    368 static struct raid_softc *
    369 raidcreate(int unit) {
    370 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    371 	sc->sc_unit = unit;
    372 	cv_init(&sc->sc_cv, "raidunit");
    373 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    374 	return sc;
    375 }
    376 
    377 static void
    378 raiddestroy(struct raid_softc *sc) {
    379 	cv_destroy(&sc->sc_cv);
    380 	mutex_destroy(&sc->sc_mutex);
    381 	kmem_free(sc, sizeof(*sc));
    382 }
    383 
    384 static struct raid_softc *
    385 raidget(int unit, bool create) {
    386 	struct raid_softc *sc;
    387 	if (unit < 0) {
    388 #ifdef DIAGNOSTIC
    389 		panic("%s: unit %d!", __func__, unit);
    390 #endif
    391 		return NULL;
    392 	}
    393 	mutex_enter(&raid_lock);
    394 	LIST_FOREACH(sc, &raids, sc_link) {
    395 		if (sc->sc_unit == unit) {
    396 			mutex_exit(&raid_lock);
    397 			return sc;
    398 		}
    399 	}
    400 	mutex_exit(&raid_lock);
    401 	if (!create)
    402 		return NULL;
    403 	if ((sc = raidcreate(unit)) == NULL)
    404 		return NULL;
    405 	mutex_enter(&raid_lock);
    406 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    407 	mutex_exit(&raid_lock);
    408 	return sc;
    409 }
    410 
    411 static void
    412 raidput(struct raid_softc *sc) {
    413 	mutex_enter(&raid_lock);
    414 	LIST_REMOVE(sc, sc_link);
    415 	mutex_exit(&raid_lock);
    416 	raiddestroy(sc);
    417 }
    418 
/*
 * raidattach: pseudo-device attach hook.  Intentionally does nothing;
 * device attachment and the associated initialization now happen as
 * part of module initialization.
 */
void
raidattach(int num)
{

	(void)num;	/* unused */
}
    428 
    429 int
    430 rf_autoconfig(device_t self)
    431 {
    432 	RF_AutoConfig_t *ac_list;
    433 	RF_ConfigSet_t *config_sets;
    434 
    435 	if (!raidautoconfig || raidautoconfigdone == true)
    436 		return (0);
    437 
    438 	/* XXX This code can only be run once. */
    439 	raidautoconfigdone = true;
    440 
    441 #ifdef __HAVE_CPU_BOOTCONF
    442 	/*
    443 	 * 0. find the boot device if needed first so we can use it later
    444 	 * this needs to be done before we autoconfigure any raid sets,
    445 	 * because if we use wedges we are not going to be able to open
    446 	 * the boot device later
    447 	 */
    448 	if (booted_device == NULL)
    449 		cpu_bootconf();
    450 #endif
    451 	/* 1. locate all RAID components on the system */
    452 	aprint_debug("Searching for RAID components...\n");
    453 	ac_list = rf_find_raid_components();
    454 
    455 	/* 2. Sort them into their respective sets. */
    456 	config_sets = rf_create_auto_sets(ac_list);
    457 
    458 	/*
    459 	 * 3. Evaluate each set and configure the valid ones.
    460 	 * This gets done in rf_buildroothack().
    461 	 */
    462 	rf_buildroothack(config_sets);
    463 
    464 	return 1;
    465 }
    466 
    467 static int
    468 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    469 	const char *bootname = device_xname(bdv);
    470 	size_t len = strlen(bootname);
    471 
    472 	for (int col = 0; col < r->numCol; col++) {
    473 		const char *devname = r->Disks[col].devname;
    474 		devname += sizeof("/dev/") - 1;
    475 		if (strncmp(devname, "dk", 2) == 0) {
    476 			const char *parent =
    477 			    dkwedge_get_parent_name(r->Disks[col].dev);
    478 			if (parent != NULL)
    479 				devname = parent;
    480 		}
    481 		if (strncmp(devname, bootname, len) == 0) {
    482 			struct raid_softc *sc = r->softc;
    483 			aprint_debug("raid%d includes boot device %s\n",
    484 			    sc->sc_unit, devname);
    485 			return 1;
    486 		}
    487 	}
    488 	return 0;
    489 }
    490 
    491 void
    492 rf_buildroothack(RF_ConfigSet_t *config_sets)
    493 {
    494 	RF_ConfigSet_t *cset;
    495 	RF_ConfigSet_t *next_cset;
    496 	int num_root;
    497 	struct raid_softc *sc, *rsc;
    498 	struct dk_softc *dksc;
    499 
    500 	sc = rsc = NULL;
    501 	num_root = 0;
    502 	cset = config_sets;
    503 	while (cset != NULL) {
    504 		next_cset = cset->next;
    505 		if (rf_have_enough_components(cset) &&
    506 		    cset->ac->clabel->autoconfigure == 1) {
    507 			sc = rf_auto_config_set(cset);
    508 			if (sc != NULL) {
    509 				aprint_debug("raid%d: configured ok\n",
    510 				    sc->sc_unit);
    511 				if (cset->rootable) {
    512 					rsc = sc;
    513 					num_root++;
    514 				}
    515 			} else {
    516 				/* The autoconfig didn't work :( */
    517 				aprint_debug("Autoconfig failed\n");
    518 				rf_release_all_vps(cset);
    519 			}
    520 		} else {
    521 			/* we're not autoconfiguring this set...
    522 			   release the associated resources */
    523 			rf_release_all_vps(cset);
    524 		}
    525 		/* cleanup */
    526 		rf_cleanup_config_set(cset);
    527 		cset = next_cset;
    528 	}
    529 	dksc = &rsc->sc_dksc;
    530 
    531 	/* if the user has specified what the root device should be
    532 	   then we don't touch booted_device or boothowto... */
    533 
    534 	if (rootspec != NULL)
    535 		return;
    536 
    537 	/* we found something bootable... */
    538 
    539 	/*
    540 	 * XXX: The following code assumes that the root raid
    541 	 * is the first ('a') partition. This is about the best
    542 	 * we can do with a BSD disklabel, but we might be able
    543 	 * to do better with a GPT label, by setting a specified
    544 	 * attribute to indicate the root partition. We can then
    545 	 * stash the partition number in the r->root_partition
    546 	 * high bits (the bottom 2 bits are already used). For
    547 	 * now we just set booted_partition to 0 when we override
    548 	 * root.
    549 	 */
    550 	if (num_root == 1) {
    551 		device_t candidate_root;
    552 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    553 			char cname[sizeof(cset->ac->devname)];
    554 			/* XXX: assume partition 'a' first */
    555 			snprintf(cname, sizeof(cname), "%s%c",
    556 			    device_xname(dksc->sc_dev), 'a');
    557 			candidate_root = dkwedge_find_by_wname(cname);
    558 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    559 			    cname);
    560 			if (candidate_root == NULL) {
    561 				/*
    562 				 * If that is not found, because we don't use
    563 				 * disklabel, return the first dk child
    564 				 * XXX: we can skip the 'a' check above
    565 				 * and always do this...
    566 				 */
    567 				size_t i = 0;
    568 				candidate_root = dkwedge_find_by_parent(
    569 				    device_xname(dksc->sc_dev), &i);
    570 			}
    571 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    572 			    candidate_root);
    573 		} else
    574 			candidate_root = dksc->sc_dev;
    575 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    576 		DPRINTF("%s: booted_device=%p root_partition=%d "
    577 		   "contains_boot=%d\n", __func__, booted_device,
    578 		   rsc->sc_r.root_partition,
    579 		   rf_containsboot(&rsc->sc_r, booted_device));
    580 		if (booted_device == NULL ||
    581 		    rsc->sc_r.root_partition == 1 ||
    582 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    583 			booted_device = candidate_root;
    584 			booted_method = "raidframe/single";
    585 			booted_partition = 0;	/* XXX assume 'a' */
    586 		}
    587 	} else if (num_root > 1) {
    588 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    589 		    booted_device);
    590 
    591 		/*
    592 		 * Maybe the MD code can help. If it cannot, then
    593 		 * setroot() will discover that we have no
    594 		 * booted_device and will ask the user if nothing was
    595 		 * hardwired in the kernel config file
    596 		 */
    597 		if (booted_device == NULL)
    598 			return;
    599 
    600 		num_root = 0;
    601 		mutex_enter(&raid_lock);
    602 		LIST_FOREACH(sc, &raids, sc_link) {
    603 			RF_Raid_t *r = &sc->sc_r;
    604 			if (r->valid == 0)
    605 				continue;
    606 
    607 			if (r->root_partition == 0)
    608 				continue;
    609 
    610 			if (rf_containsboot(r, booted_device)) {
    611 				num_root++;
    612 				rsc = sc;
    613 				dksc = &rsc->sc_dksc;
    614 			}
    615 		}
    616 		mutex_exit(&raid_lock);
    617 
    618 		if (num_root == 1) {
    619 			booted_device = dksc->sc_dev;
    620 			booted_method = "raidframe/multi";
    621 			booted_partition = 0;	/* XXX assume 'a' */
    622 		} else {
    623 			/* we can't guess.. require the user to answer... */
    624 			boothowto |= RB_ASKNAME;
    625 		}
    626 	}
    627 }
    628 
    629 static int
    630 raidsize(dev_t dev)
    631 {
    632 	struct raid_softc *rs;
    633 	struct dk_softc *dksc;
    634 	unsigned int unit;
    635 
    636 	unit = raidunit(dev);
    637 	if ((rs = raidget(unit, false)) == NULL)
    638 		return -1;
    639 	dksc = &rs->sc_dksc;
    640 
    641 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    642 		return -1;
    643 
    644 	return dk_size(dksc, dev);
    645 }
    646 
    647 static int
    648 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    649 {
    650 	unsigned int unit;
    651 	struct raid_softc *rs;
    652 	struct dk_softc *dksc;
    653 
    654 	unit = raidunit(dev);
    655 	if ((rs = raidget(unit, false)) == NULL)
    656 		return ENXIO;
    657 	dksc = &rs->sc_dksc;
    658 
    659 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    660 		return ENODEV;
    661 
    662         /*
    663            Note that blkno is relative to this particular partition.
    664            By adding adding RF_PROTECTED_SECTORS, we get a value that
    665 	   is relative to the partition used for the underlying component.
    666         */
    667 	blkno += RF_PROTECTED_SECTORS;
    668 
    669 	return dk_dump(dksc, dev, blkno, va, size);
    670 }
    671 
    672 static int
    673 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    674 {
    675 	struct raid_softc *rs = raidsoftc(dev);
    676 	const struct bdevsw *bdev;
    677 	RF_Raid_t *raidPtr;
    678 	int     c, sparecol, j, scol, dumpto;
    679 	int     error = 0;
    680 
    681 	raidPtr = &rs->sc_r;
    682 
    683 	/* we only support dumping to RAID 1 sets */
    684 	if (raidPtr->Layout.numDataCol != 1 ||
    685 	    raidPtr->Layout.numParityCol != 1)
    686 		return EINVAL;
    687 
    688 	if ((error = raidlock(rs)) != 0)
    689 		return error;
    690 
    691 	/* figure out what device is alive.. */
    692 
    693 	/*
    694 	   Look for a component to dump to.  The preference for the
    695 	   component to dump to is as follows:
    696 	   1) the master
    697 	   2) a used_spare of the master
    698 	   3) the slave
    699 	   4) a used_spare of the slave
    700 	*/
    701 
    702 	dumpto = -1;
    703 	for (c = 0; c < raidPtr->numCol; c++) {
    704 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    705 			/* this might be the one */
    706 			dumpto = c;
    707 			break;
    708 		}
    709 	}
    710 
    711 	/*
    712 	   At this point we have possibly selected a live master or a
    713 	   live slave.  We now check to see if there is a spared
    714 	   master (or a spared slave), if we didn't find a live master
    715 	   or a live slave.
    716 	*/
    717 
    718 	for (c = 0; c < raidPtr->numSpare; c++) {
    719 		sparecol = raidPtr->numCol + c;
    720 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    721 			/* How about this one? */
    722 			scol = -1;
    723 			for(j=0;j<raidPtr->numCol;j++) {
    724 				if (raidPtr->Disks[j].spareCol == sparecol) {
    725 					scol = j;
    726 					break;
    727 				}
    728 			}
    729 			if (scol == 0) {
    730 				/*
    731 				   We must have found a spared master!
    732 				   We'll take that over anything else
    733 				   found so far.  (We couldn't have
    734 				   found a real master before, since
    735 				   this is a used spare, and it's
    736 				   saying that it's replacing the
    737 				   master.)  On reboot (with
    738 				   autoconfiguration turned on)
    739 				   sparecol will become the 1st
    740 				   component (component0) of this set.
    741 				*/
    742 				dumpto = sparecol;
    743 				break;
    744 			} else if (scol != -1) {
    745 				/*
    746 				   Must be a spared slave.  We'll dump
    747 				   to that if we havn't found anything
    748 				   else so far.
    749 				*/
    750 				if (dumpto == -1)
    751 					dumpto = sparecol;
    752 			}
    753 		}
    754 	}
    755 
    756 	if (dumpto == -1) {
    757 		/* we couldn't find any live components to dump to!?!?
    758 		 */
    759 		error = EINVAL;
    760 		goto out;
    761 	}
    762 
    763 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    764 	if (bdev == NULL) {
    765 		error = ENXIO;
    766 		goto out;
    767 	}
    768 
    769 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    770 				blkno, va, nblk * raidPtr->bytesPerSector);
    771 
    772 out:
    773 	raidunlock(rs);
    774 
    775 	return error;
    776 }
    777 
    778 /* ARGSUSED */
    779 static int
    780 raidopen(dev_t dev, int flags, int fmt,
    781     struct lwp *l)
    782 {
    783 	int     unit = raidunit(dev);
    784 	struct raid_softc *rs;
    785 	struct dk_softc *dksc;
    786 	int     error = 0;
    787 	int     part, pmask;
    788 
    789 	if ((rs = raidget(unit, true)) == NULL)
    790 		return ENXIO;
    791 	if ((error = raidlock(rs)) != 0)
    792 		return (error);
    793 
    794 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    795 		error = EBUSY;
    796 		goto bad;
    797 	}
    798 
    799 	dksc = &rs->sc_dksc;
    800 
    801 	part = DISKPART(dev);
    802 	pmask = (1 << part);
    803 
    804 	if (!DK_BUSY(dksc, pmask) &&
    805 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    806 		/* First one... mark things as dirty... Note that we *MUST*
    807 		 have done a configure before this.  I DO NOT WANT TO BE
    808 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    809 		 THAT THEY BELONG TOGETHER!!!!! */
    810 		/* XXX should check to see if we're only open for reading
    811 		   here... If so, we needn't do this, but then need some
    812 		   other way of keeping track of what's happened.. */
    813 
    814 		rf_markalldirty(&rs->sc_r);
    815 	}
    816 
    817 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    818 		error = dk_open(dksc, dev, flags, fmt, l);
    819 
    820 bad:
    821 	raidunlock(rs);
    822 
    823 	return (error);
    824 
    825 
    826 }
    827 
    828 static int
    829 raid_lastclose(device_t self)
    830 {
    831 	struct raid_softc *rs = raidsoftc(self);
    832 
    833 	/* Last one... device is not unconfigured yet.
    834 	   Device shutdown has taken care of setting the
    835 	   clean bits if RAIDF_INITED is not set
    836 	   mark things as clean... */
    837 
    838 	rf_update_component_labels(&rs->sc_r,
    839 	    RF_FINAL_COMPONENT_UPDATE);
    840 
    841 	/* pass to unlocked code */
    842 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    843 		rs->sc_flags |= RAIDF_DETACH;
    844 
    845 	return 0;
    846 }
    847 
    848 /* ARGSUSED */
    849 static int
    850 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    851 {
    852 	int     unit = raidunit(dev);
    853 	struct raid_softc *rs;
    854 	struct dk_softc *dksc;
    855 	cfdata_t cf;
    856 	int     error = 0, do_detach = 0, do_put = 0;
    857 
    858 	if ((rs = raidget(unit, false)) == NULL)
    859 		return ENXIO;
    860 	dksc = &rs->sc_dksc;
    861 
    862 	if ((error = raidlock(rs)) != 0)
    863 		return (error);
    864 
    865 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    866 		error = dk_close(dksc, dev, flags, fmt, l);
    867 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    868 			do_detach = 1;
    869 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    870 		do_put = 1;
    871 
    872 	raidunlock(rs);
    873 
    874 	if (do_detach) {
    875 		/* free the pseudo device attach bits */
    876 		cf = device_cfdata(dksc->sc_dev);
    877 		error = config_detach(dksc->sc_dev, 0);
    878 		if (error == 0)
    879 			free(cf, M_RAIDFRAME);
    880 	} else if (do_put) {
    881 		raidput(rs);
    882 	}
    883 
    884 	return (error);
    885 
    886 }
    887 
    /*
     * raid_wakeup -- signal raidPtr->iodone_cv while holding
     * raidPtr->iodone_lock.
     *
     * Callers in this file use it to "schedule" queued I/O (after a buffer
     * has been queued, after an I/O completes, and after configuration).
     * The thread that waits on the condition variable is not visible here.
     */
    888 static void
    889 raid_wakeup(RF_Raid_t *raidPtr)
    890 {
    891 	rf_lock_mutex2(raidPtr->iodone_lock);
    892 	rf_signal_cond2(raidPtr->iodone_cv);
    893 	rf_unlock_mutex2(raidPtr->iodone_lock);
    894 }
    895 
    896 static void
    897 raidstrategy(struct buf *bp)
    898 {
    899 	unsigned int unit;
    900 	struct raid_softc *rs;
    901 	struct dk_softc *dksc;
    902 	RF_Raid_t *raidPtr;
    903 
    904 	unit = raidunit(bp->b_dev);
    905 	if ((rs = raidget(unit, false)) == NULL) {
    906 		bp->b_error = ENXIO;
    907 		goto fail;
    908 	}
    909 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    910 		bp->b_error = ENXIO;
    911 		goto fail;
    912 	}
    913 	dksc = &rs->sc_dksc;
    914 	raidPtr = &rs->sc_r;
    915 
    916 	/* Queue IO only */
    917 	if (dk_strategy_defer(dksc, bp))
    918 		goto done;
    919 
    920 	/* schedule the IO to happen at the next convenient time */
    921 	raid_wakeup(raidPtr);
    922 
    923 done:
    924 	return;
    925 
    926 fail:
    927 	bp->b_resid = bp->b_bcount;
    928 	biodone(bp);
    929 }
    930 
    931 static int
    932 raid_diskstart(device_t dev, struct buf *bp)
    933 {
    934 	struct raid_softc *rs = raidsoftc(dev);
    935 	RF_Raid_t *raidPtr;
    936 
    937 	raidPtr = &rs->sc_r;
    938 	if (!raidPtr->valid) {
    939 		db1_printf(("raid is not valid..\n"));
    940 		return ENODEV;
    941 	}
    942 
    943 	/* XXX */
    944 	bp->b_resid = 0;
    945 
    946 	return raiddoaccess(raidPtr, bp);
    947 }
    948 
    949 void
    950 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    951 {
    952 	struct raid_softc *rs;
    953 	struct dk_softc *dksc;
    954 
    955 	rs = raidPtr->softc;
    956 	dksc = &rs->sc_dksc;
    957 
    958 	dk_done(dksc, bp);
    959 
    960 	rf_lock_mutex2(raidPtr->mutex);
    961 	raidPtr->openings++;
    962 	rf_unlock_mutex2(raidPtr->mutex);
    963 
    964 	/* schedule more IO */
    965 	raid_wakeup(raidPtr);
    966 }
    967 
    968 /* ARGSUSED */
    969 static int
    970 raidread(dev_t dev, struct uio *uio, int flags)
    971 {
    972 	int     unit = raidunit(dev);
    973 	struct raid_softc *rs;
    974 
    975 	if ((rs = raidget(unit, false)) == NULL)
    976 		return ENXIO;
    977 
    978 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    979 		return (ENXIO);
    980 
    981 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    982 
    983 }
    984 
    985 /* ARGSUSED */
    986 static int
    987 raidwrite(dev_t dev, struct uio *uio, int flags)
    988 {
    989 	int     unit = raidunit(dev);
    990 	struct raid_softc *rs;
    991 
    992 	if ((rs = raidget(unit, false)) == NULL)
    993 		return ENXIO;
    994 
    995 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    996 		return (ENXIO);
    997 
    998 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    999 
   1000 }
   1001 
   1002 static int
   1003 raid_detach_unlocked(struct raid_softc *rs)
   1004 {
   1005 	struct dk_softc *dksc = &rs->sc_dksc;
   1006 	RF_Raid_t *raidPtr;
   1007 	int error;
   1008 
   1009 	raidPtr = &rs->sc_r;
   1010 
   1011 	if (DK_BUSY(dksc, 0) ||
   1012 	    raidPtr->recon_in_progress != 0 ||
   1013 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1014 	    raidPtr->copyback_in_progress != 0)
   1015 		return EBUSY;
   1016 
   1017 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1018 		return 0;
   1019 
   1020 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1021 
   1022 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1023 		return error;
   1024 
   1025 	rs->sc_flags &= ~RAIDF_INITED;
   1026 
   1027 	/* Kill off any queued buffers */
   1028 	dk_drain(dksc);
   1029 	bufq_free(dksc->sc_bufq);
   1030 
   1031 	/* Detach the disk. */
   1032 	dkwedge_delall(&dksc->sc_dkdev);
   1033 	disk_detach(&dksc->sc_dkdev);
   1034 	disk_destroy(&dksc->sc_dkdev);
   1035 	dk_detach(dksc);
   1036 
   1037 	return 0;
   1038 }
   1039 
   1040 /*
         * Hooks to call the 5.0 and 8.0 ioctl compat code.
         *
         * Each hook is declared with MODULE_CALL_HOOK_DECL and given a call
         * stub with MODULE_CALL_HOOK; raidioctl() invokes the stubs as
         * raidframe50_ioctl_hook_call() and raidframe80_ioctl_hook_call().
         * Both take the ioctl command, the unit's "initialized" flag, the
         * RAID descriptor, the unit number, the ioctl data pointer and a
         * place to return a kernel copy of the configuration.
         *
         * NOTE(review): the trailing enosys() argument is presumably the
         * value produced when no compat module has installed the hook;
         * raidioctl() treats ENOSYS from these calls as "compat code not
         * loaded" -- confirm against the MODULE_CALL_HOOK definition.
         */
   1041 MODULE_CALL_HOOK_DECL(raidframe50_ioctl_hook, int,
   1042     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1043      RF_Config_t **k_cfg));
   1044 MODULE_CALL_HOOK(raidframe50_ioctl_hook, int,
   1045     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1046      RF_Config_t **k_cfg),
   1047     (cmd, initted, raidPtr, unit, data, k_cfg),
   1048     enosys());
   1049 
   1050 MODULE_CALL_HOOK_DECL(raidframe80_ioctl_hook, int,
   1051     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1052      RF_Config_t **k_cfg));
   1053 MODULE_CALL_HOOK(raidframe80_ioctl_hook, int,
   1054     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1055      RF_Config_t **k_cfg),
   1056     (cmd, initted, raidPtr, unit, data, k_cfg),
   1057     enosys());
   1058 
   1059 static int
   1060 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1061 {
   1062 	int     unit = raidunit(dev);
   1063 	int     error = 0;
   1064 	int     part, pmask;
   1065 	struct raid_softc *rs;
   1066 	struct dk_softc *dksc;
   1067 	RF_Config_t *k_cfg, *u_cfg;
   1068 	RF_Raid_t *raidPtr;
   1069 	RF_RaidDisk_t *diskPtr;
   1070 	RF_AccTotals_t *totals;
   1071 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1072 	u_char *specific_buf;
   1073 	int retcode = 0;
   1074 	int column;
   1075 /*	int raidid; */
   1076 	struct rf_recon_req *rr;
   1077 	struct rf_recon_req_internal *rrint;
   1078 	RF_ComponentLabel_t *clabel;
   1079 	RF_ComponentLabel_t *ci_label;
   1080 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1081 	RF_SingleComponent_t component;
   1082 	int d;
   1083 
   1084 	if ((rs = raidget(unit, false)) == NULL)
   1085 		return ENXIO;
   1086 	dksc = &rs->sc_dksc;
   1087 	raidPtr = &rs->sc_r;
   1088 
   1089 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1090 		(int) DISKPART(dev), (int) unit, cmd));
   1091 
   1092 	/* Must be initialized for these... */
   1093 	switch (cmd) {
   1094 	case RAIDFRAME_REWRITEPARITY:
   1095 	case RAIDFRAME_GET_INFO:
   1096 	case RAIDFRAME_RESET_ACCTOTALS:
   1097 	case RAIDFRAME_GET_ACCTOTALS:
   1098 	case RAIDFRAME_KEEP_ACCTOTALS:
   1099 	case RAIDFRAME_GET_SIZE:
   1100 	case RAIDFRAME_FAIL_DISK:
   1101 	case RAIDFRAME_COPYBACK:
   1102 	case RAIDFRAME_CHECK_RECON_STATUS:
   1103 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1104 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1105 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1106 	case RAIDFRAME_ADD_HOT_SPARE:
   1107 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1108 	case RAIDFRAME_INIT_LABELS:
   1109 	case RAIDFRAME_REBUILD_IN_PLACE:
   1110 	case RAIDFRAME_CHECK_PARITY:
   1111 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1112 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1113 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1114 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1115 	case RAIDFRAME_SET_AUTOCONFIG:
   1116 	case RAIDFRAME_SET_ROOT:
   1117 	case RAIDFRAME_DELETE_COMPONENT:
   1118 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1119 	case RAIDFRAME_PARITYMAP_STATUS:
   1120 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1121 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1122 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1123 #ifdef COMPAT_NETBSD32
   1124 #ifdef _LP64
   1125 	case RAIDFRAME_GET_INFO32:
   1126 #endif
   1127 #endif
   1128 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1129 			return (ENXIO);
   1130 	}
   1131 
   1132 	/*
   1133 	 * Handle compat ioctl calls
   1134 	 *
   1135 	 * * If compat code is not loaded, stub returns ENOSYS and we just
   1136 	 *   check the "native" cmd's
   1137 	 * * If compat code is loaded but does not recognize the cmd, it
   1138 	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
   1139 	 * * If compat code returns EAGAIN, we need to finish via config
   1140 	 * * Otherwise the cmd has been handled and we just return
   1141 	 */
   1142 	retcode = raidframe50_ioctl_hook_call(cmd,
   1143 	    (rs->sc_flags & RAIDF_INITED), raidPtr, unit, data, &k_cfg);
   1144 	if (retcode == ENOSYS)
   1145 		retcode = 0;
   1146 	else if (retcode == EAGAIN)
   1147 		goto config;
   1148 	else if (retcode != EPASSTHROUGH)
   1149 		return retcode;
   1150 
   1151 	retcode = raidframe80_ioctl_hook_call(cmd,
   1152 	    (rs->sc_flags & RAIDF_INITED), raidPtr, unit, data, &k_cfg);
   1153 	if (retcode == ENOSYS)
   1154 		retcode = 0;
   1155 	else if (retcode == EAGAIN)
   1156 		goto config;
   1157 	else if (retcode != EPASSTHROUGH)
   1158 		return retcode;
   1159 
   1160 	/*
   1161 	 * XXX
   1162 	 * Handling of FAIL_DISK80 command requires us to retain retcode's
   1163 	 * value of EPASSTHROUGH.  If you add more compat code later, make
   1164 	 * sure you don't overwrite retcode and break this!
   1165 	 */
   1166 
   1167 	switch (cmd) {
   1168 
   1169 		/* configure the system */
   1170 	case RAIDFRAME_CONFIGURE:
   1171 #ifdef COMPAT_NETBSD32
   1172 #ifdef _LP64
   1173 	case RAIDFRAME_CONFIGURE32:
   1174 #endif
   1175 #endif
   1176 
   1177 		if (raidPtr->valid) {
   1178 			/* There is a valid RAID set running on this unit! */
   1179 			printf("raid%d: Device already configured!\n",unit);
   1180 			return(EINVAL);
   1181 		}
   1182 
   1183 		/* copy-in the configuration information */
   1184 		/* data points to a pointer to the configuration structure */
   1185 
   1186 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1187 		if (k_cfg == NULL) {
   1188 			return (ENOMEM);
   1189 		}
   1190 #ifdef COMPAT_NETBSD32
   1191 #ifdef _LP64
   1192 		if (cmd == RAIDFRAME_CONFIGURE32 &&
   1193 		    (l->l_proc->p_flag & PK_32) != 0)
   1194 			retcode = rf_config_netbsd32(data, k_cfg);
   1195 		else
   1196 #endif
   1197 #endif
   1198 		{
   1199 			u_cfg = *((RF_Config_t **) data);
   1200 			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1201 		}
   1202 		if (retcode) {
   1203 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1204 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1205 				retcode));
   1206 			goto no_config;
   1207 		}
   1208 		goto config;
   1209 	config:
   1210 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1211 
   1212 		/* allocate a buffer for the layout-specific data, and copy it
   1213 		 * in */
   1214 		if (k_cfg->layoutSpecificSize) {
   1215 			if (k_cfg->layoutSpecificSize > 10000) {
   1216 				/* sanity check */
   1217 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1218 				retcode = EINVAL;
   1219 				goto no_config;
   1220 			}
   1221 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1222 			    (u_char *));
   1223 			if (specific_buf == NULL) {
   1224 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1225 				retcode = ENOMEM;
   1226 				goto no_config;
   1227 			}
   1228 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1229 			    k_cfg->layoutSpecificSize);
   1230 			if (retcode) {
   1231 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1232 				RF_Free(specific_buf,
   1233 					k_cfg->layoutSpecificSize);
   1234 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1235 					retcode));
   1236 				goto no_config;
   1237 			}
   1238 		} else
   1239 			specific_buf = NULL;
   1240 		k_cfg->layoutSpecific = specific_buf;
   1241 
   1242 		/* should do some kind of sanity check on the configuration.
   1243 		 * Store the sum of all the bytes in the last byte? */
   1244 
   1245 		/* configure the system */
   1246 
   1247 		/*
   1248 		 * Clear the entire RAID descriptor, just to make sure
   1249 		 *  there is no stale data left in the case of a
   1250 		 *  reconfiguration
   1251 		 */
   1252 		memset(raidPtr, 0, sizeof(*raidPtr));
   1253 		raidPtr->softc = rs;
   1254 		raidPtr->raidid = unit;
   1255 
   1256 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1257 
   1258 		if (retcode == 0) {
   1259 
   1260 			/* allow this many simultaneous IO's to
   1261 			   this RAID device */
   1262 			raidPtr->openings = RAIDOUTSTANDING;
   1263 
   1264 			raidinit(rs);
   1265 			raid_wakeup(raidPtr);
   1266 			rf_markalldirty(raidPtr);
   1267 		}
   1268 		/* free the buffers.  No return code here. */
   1269 		if (k_cfg->layoutSpecificSize) {
   1270 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1271 		}
   1272 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1273 
   1274 	no_config:
   1275 		/*
   1276 		 * If configuration failed, set sc_flags so that we
   1277 		 * will detach the device when we close it.
   1278 		 */
   1279 		if (retcode != 0)
   1280 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1281 		return (retcode);
   1282 
   1283 		/* shutdown the system */
   1284 	case RAIDFRAME_SHUTDOWN:
   1285 
   1286 		part = DISKPART(dev);
   1287 		pmask = (1 << part);
   1288 
   1289 		if ((error = raidlock(rs)) != 0)
   1290 			return (error);
   1291 
   1292 		if (DK_BUSY(dksc, pmask) ||
   1293 		    raidPtr->recon_in_progress != 0 ||
   1294 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1295 		    raidPtr->copyback_in_progress != 0)
   1296 			retcode = EBUSY;
   1297 		else {
   1298 			/* detach and free on close */
   1299 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1300 			retcode = 0;
   1301 		}
   1302 
   1303 		raidunlock(rs);
   1304 
   1305 		return (retcode);
   1306 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1307 		return rf_get_component_label(raidPtr, data);
   1308 
   1309 #if 0
   1310 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1311 		clabel = (RF_ComponentLabel_t *) data;
   1312 
   1313 		/* XXX check the label for valid stuff... */
   1314 		/* Note that some things *should not* get modified --
   1315 		   the user should be re-initing the labels instead of
   1316 		   trying to patch things.
   1317 		   */
   1318 
   1319 		raidid = raidPtr->raidid;
   1320 #ifdef DEBUG
   1321 		printf("raid%d: Got component label:\n", raidid);
   1322 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1323 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1324 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1325 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1326 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1327 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1328 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1329 #endif
   1330 		clabel->row = 0;
   1331 		column = clabel->column;
   1332 
   1333 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1334 			return(EINVAL);
   1335 		}
   1336 
   1337 		/* XXX this isn't allowed to do anything for now :-) */
   1338 
   1339 		/* XXX and before it is, we need to fill in the rest
   1340 		   of the fields!?!?!?! */
   1341 		memcpy(raidget_component_label(raidPtr, column),
   1342 		    clabel, sizeof(*clabel));
   1343 		raidflush_component_label(raidPtr, column);
   1344 		return (0);
   1345 #endif
   1346 
   1347 	case RAIDFRAME_INIT_LABELS:
   1348 		clabel = (RF_ComponentLabel_t *) data;
   1349 		/*
   1350 		   we only want the serial number from
   1351 		   the above.  We get all the rest of the information
   1352 		   from the config that was used to create this RAID
   1353 		   set.
   1354 		   */
   1355 
   1356 		raidPtr->serial_number = clabel->serial_number;
   1357 
   1358 		for(column=0;column<raidPtr->numCol;column++) {
   1359 			diskPtr = &raidPtr->Disks[column];
   1360 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1361 				ci_label = raidget_component_label(raidPtr,
   1362 				    column);
   1363 				/* Zeroing this is important. */
   1364 				memset(ci_label, 0, sizeof(*ci_label));
   1365 				raid_init_component_label(raidPtr, ci_label);
   1366 				ci_label->serial_number =
   1367 				    raidPtr->serial_number;
   1368 				ci_label->row = 0; /* we dont' pretend to support more */
   1369 				rf_component_label_set_partitionsize(ci_label,
   1370 				    diskPtr->partitionSize);
   1371 				ci_label->column = column;
   1372 				raidflush_component_label(raidPtr, column);
   1373 			}
   1374 			/* XXXjld what about the spares? */
   1375 		}
   1376 
   1377 		return (retcode);
   1378 	case RAIDFRAME_SET_AUTOCONFIG:
   1379 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1380 		printf("raid%d: New autoconfig value is: %d\n",
   1381 		       raidPtr->raidid, d);
   1382 		*(int *) data = d;
   1383 		return (retcode);
   1384 
   1385 	case RAIDFRAME_SET_ROOT:
   1386 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1387 		printf("raid%d: New rootpartition value is: %d\n",
   1388 		       raidPtr->raidid, d);
   1389 		*(int *) data = d;
   1390 		return (retcode);
   1391 
   1392 		/* initialize all parity */
   1393 	case RAIDFRAME_REWRITEPARITY:
   1394 
   1395 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1396 			/* Parity for RAID 0 is trivially correct */
   1397 			raidPtr->parity_good = RF_RAID_CLEAN;
   1398 			return(0);
   1399 		}
   1400 
   1401 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1402 			/* Re-write is already in progress! */
   1403 			return(EINVAL);
   1404 		}
   1405 
   1406 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1407 					   rf_RewriteParityThread,
   1408 					   raidPtr,"raid_parity");
   1409 		return (retcode);
   1410 
   1411 
   1412 	case RAIDFRAME_ADD_HOT_SPARE:
   1413 		sparePtr = (RF_SingleComponent_t *) data;
   1414 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1415 		retcode = rf_add_hot_spare(raidPtr, &component);
   1416 		return(retcode);
   1417 
   1418 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1419 		return(retcode);
   1420 
   1421 	case RAIDFRAME_DELETE_COMPONENT:
   1422 		componentPtr = (RF_SingleComponent_t *)data;
   1423 		memcpy( &component, componentPtr,
   1424 			sizeof(RF_SingleComponent_t));
   1425 		retcode = rf_delete_component(raidPtr, &component);
   1426 		return(retcode);
   1427 
   1428 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1429 		componentPtr = (RF_SingleComponent_t *)data;
   1430 		memcpy( &component, componentPtr,
   1431 			sizeof(RF_SingleComponent_t));
   1432 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1433 		return(retcode);
   1434 
   1435 	case RAIDFRAME_REBUILD_IN_PLACE:
   1436 
   1437 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1438 			/* Can't do this on a RAID 0!! */
   1439 			return(EINVAL);
   1440 		}
   1441 
   1442 		if (raidPtr->recon_in_progress == 1) {
   1443 			/* a reconstruct is already in progress! */
   1444 			return(EINVAL);
   1445 		}
   1446 
   1447 		componentPtr = (RF_SingleComponent_t *) data;
   1448 		memcpy( &component, componentPtr,
   1449 			sizeof(RF_SingleComponent_t));
   1450 		component.row = 0; /* we don't support any more */
   1451 		column = component.column;
   1452 
   1453 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1454 			return(EINVAL);
   1455 		}
   1456 
   1457 		rf_lock_mutex2(raidPtr->mutex);
   1458 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1459 		    (raidPtr->numFailures > 0)) {
   1460 			/* XXX 0 above shouldn't be constant!!! */
   1461 			/* some component other than this has failed.
   1462 			   Let's not make things worse than they already
   1463 			   are... */
   1464 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1465 			       raidPtr->raidid);
   1466 			printf("raid%d:     Col: %d   Too many failures.\n",
   1467 			       raidPtr->raidid, column);
   1468 			rf_unlock_mutex2(raidPtr->mutex);
   1469 			return (EINVAL);
   1470 		}
   1471 		if (raidPtr->Disks[column].status ==
   1472 		    rf_ds_reconstructing) {
   1473 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1474 			       raidPtr->raidid);
   1475 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1476 
   1477 			rf_unlock_mutex2(raidPtr->mutex);
   1478 			return (EINVAL);
   1479 		}
   1480 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1481 			rf_unlock_mutex2(raidPtr->mutex);
   1482 			return (EINVAL);
   1483 		}
   1484 		rf_unlock_mutex2(raidPtr->mutex);
   1485 
   1486 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1487 		if (rrint == NULL)
   1488 			return(ENOMEM);
   1489 
   1490 		rrint->col = column;
   1491 		rrint->raidPtr = raidPtr;
   1492 
   1493 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1494 					   rf_ReconstructInPlaceThread,
   1495 					   rrint, "raid_reconip");
   1496 		return(retcode);
   1497 
   1498 	case RAIDFRAME_GET_INFO:
   1499 #ifdef COMPAT_NETBSD32
   1500 #ifdef _LP64
   1501 	case RAIDFRAME_GET_INFO32:
   1502 #endif
   1503 #endif
   1504 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1505 			  (RF_DeviceConfig_t *));
   1506 		if (d_cfg == NULL)
   1507 			return (ENOMEM);
   1508 		retcode = rf_get_info(raidPtr, d_cfg);
   1509 		if (retcode == 0) {
   1510 #ifdef COMPAT_NETBSD32
   1511 #ifdef _LP64
   1512 			if (cmd == RAIDFRAME_GET_INFO32)
   1513 				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
   1514 			else
   1515 #endif
   1516 #endif
   1517 				ucfgp = *(RF_DeviceConfig_t **)data;
   1518 			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
   1519 		}
   1520 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1521 
   1522 		return (retcode);
   1523 
   1524 	case RAIDFRAME_CHECK_PARITY:
   1525 		*(int *) data = raidPtr->parity_good;
   1526 		return (0);
   1527 
   1528 	case RAIDFRAME_PARITYMAP_STATUS:
   1529 		if (rf_paritymap_ineligible(raidPtr))
   1530 			return EINVAL;
   1531 		rf_paritymap_status(raidPtr->parity_map,
   1532 		    (struct rf_pmstat *)data);
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		if (raidPtr->parity_map == NULL)
   1539 			return ENOENT; /* ??? */
   1540 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1541 			(struct rf_pmparams *)data, 1))
   1542 			return EINVAL;
   1543 		return 0;
   1544 
   1545 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1546 		if (rf_paritymap_ineligible(raidPtr))
   1547 			return EINVAL;
   1548 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1549 		return 0;
   1550 
   1551 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1552 		if (rf_paritymap_ineligible(raidPtr))
   1553 			return EINVAL;
   1554 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1555 		/* XXX should errors be passed up? */
   1556 		return 0;
   1557 
   1558 	case RAIDFRAME_RESET_ACCTOTALS:
   1559 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1560 		return (0);
   1561 
   1562 	case RAIDFRAME_GET_ACCTOTALS:
   1563 		totals = (RF_AccTotals_t *) data;
   1564 		*totals = raidPtr->acc_totals;
   1565 		return (0);
   1566 
   1567 	case RAIDFRAME_KEEP_ACCTOTALS:
   1568 		raidPtr->keep_acc_totals = *(int *)data;
   1569 		return (0);
   1570 
   1571 	case RAIDFRAME_GET_SIZE:
   1572 		*(int *) data = raidPtr->totalSectors;
   1573 		return (0);
   1574 
   1575 		/* fail a disk & optionally start reconstruction */
   1576 	case RAIDFRAME_FAIL_DISK80:
   1577 		/* Check if we called compat code for this cmd */
   1578 		if (retcode != EPASSTHROUGH)
   1579 			return EINVAL;
   1580 		/* FALLTHRU */
   1581 	case RAIDFRAME_FAIL_DISK:
   1582 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1583 			/* Can't do this on a RAID 0!! */
   1584 			return(EINVAL);
   1585 		}
   1586 
   1587 		rr = (struct rf_recon_req *) data;
   1588 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1589 			return (EINVAL);
   1590 
   1591 		rf_lock_mutex2(raidPtr->mutex);
   1592 		if (raidPtr->status == rf_rs_reconstructing) {
   1593 			/* you can't fail a disk while we're reconstructing! */
   1594 			/* XXX wrong for RAID6 */
   1595 			rf_unlock_mutex2(raidPtr->mutex);
   1596 			return (EINVAL);
   1597 		}
   1598 		if ((raidPtr->Disks[rr->col].status ==
   1599 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1600 			/* some other component has failed.  Let's not make
   1601 			   things worse. XXX wrong for RAID6 */
   1602 			rf_unlock_mutex2(raidPtr->mutex);
   1603 			return (EINVAL);
   1604 		}
   1605 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1606 			/* Can't fail a spared disk! */
   1607 			rf_unlock_mutex2(raidPtr->mutex);
   1608 			return (EINVAL);
   1609 		}
   1610 		rf_unlock_mutex2(raidPtr->mutex);
   1611 
   1612 		/* make a copy of the recon request so that we don't rely on
   1613 		 * the user's buffer */
   1614 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1615 		if (rrint == NULL)
   1616 			return(ENOMEM);
   1617 		rrint->col = rr->col;
   1618 		rrint->flags = rr->flags;
   1619 		rrint->raidPtr = raidPtr;
   1620 
   1621 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1622 					   rf_ReconThread,
   1623 					   rrint, "raid_recon");
   1624 		return (0);
   1625 
   1626 		/* invoke a copyback operation after recon on whatever disk
   1627 		 * needs it, if any */
   1628 	case RAIDFRAME_COPYBACK:
   1629 
   1630 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1631 			/* This makes no sense on a RAID 0!! */
   1632 			return(EINVAL);
   1633 		}
   1634 
   1635 		if (raidPtr->copyback_in_progress == 1) {
   1636 			/* Copyback is already in progress! */
   1637 			return(EINVAL);
   1638 		}
   1639 
   1640 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1641 					   rf_CopybackThread,
   1642 					   raidPtr,"raid_copyback");
   1643 		return (retcode);
   1644 
   1645 		/* return the percentage completion of reconstruction */
   1646 	case RAIDFRAME_CHECK_RECON_STATUS:
   1647 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1648 			/* This makes no sense on a RAID 0, so tell the
   1649 			   user it's done. */
   1650 			*(int *) data = 100;
   1651 			return(0);
   1652 		}
   1653 		if (raidPtr->status != rf_rs_reconstructing)
   1654 			*(int *) data = 100;
   1655 		else {
   1656 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1657 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1658 			} else {
   1659 				*(int *) data = 0;
   1660 			}
   1661 		}
   1662 		return (0);
   1663 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1664 		rf_check_recon_status_ext(raidPtr, data);
   1665 		return (0);
   1666 
   1667 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1668 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1669 			/* This makes no sense on a RAID 0, so tell the
   1670 			   user it's done. */
   1671 			*(int *) data = 100;
   1672 			return(0);
   1673 		}
   1674 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1675 			*(int *) data = 100 *
   1676 				raidPtr->parity_rewrite_stripes_done /
   1677 				raidPtr->Layout.numStripe;
   1678 		} else {
   1679 			*(int *) data = 100;
   1680 		}
   1681 		return (0);
   1682 
   1683 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1684 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1685 		return (0);
   1686 
   1687 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1688 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1689 			/* This makes no sense on a RAID 0 */
   1690 			*(int *) data = 100;
   1691 			return(0);
   1692 		}
   1693 		if (raidPtr->copyback_in_progress == 1) {
   1694 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1695 				raidPtr->Layout.numStripe;
   1696 		} else {
   1697 			*(int *) data = 100;
   1698 		}
   1699 		return (0);
   1700 
   1701 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1702 		rf_check_copyback_status_ext(raidPtr, data);
   1703 		return 0;
   1704 
   1705 	case RAIDFRAME_SET_LAST_UNIT:
   1706 		for (column = 0; column < raidPtr->numCol; column++)
   1707 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1708 				return EBUSY;
   1709 
   1710 		for (column = 0; column < raidPtr->numCol; column++) {
   1711 			clabel = raidget_component_label(raidPtr, column);
   1712 			clabel->last_unit = *(int *)data;
   1713 			raidflush_component_label(raidPtr, column);
   1714 		}
   1715 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1716 		return 0;
   1717 
   1718 		/* the sparetable daemon calls this to wait for the kernel to
   1719 		 * need a spare table. this ioctl does not return until a
   1720 		 * spare table is needed. XXX -- calling mpsleep here in the
   1721 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1722 		 * -- I should either compute the spare table in the kernel,
   1723 		 * or have a different -- XXX XXX -- interface (a different
   1724 		 * character device) for delivering the table     -- XXX */
   1725 #if 0
   1726 	case RAIDFRAME_SPARET_WAIT:
   1727 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1728 		while (!rf_sparet_wait_queue)
   1729 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1730 		waitreq = rf_sparet_wait_queue;
   1731 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1732 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1733 
   1734 		/* structure assignment */
   1735 		*((RF_SparetWait_t *) data) = *waitreq;
   1736 
   1737 		RF_Free(waitreq, sizeof(*waitreq));
   1738 		return (0);
   1739 
   1740 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1741 		 * code in it that will cause the dameon to exit */
   1742 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1743 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1744 		waitreq->fcol = -1;
   1745 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1746 		waitreq->next = rf_sparet_wait_queue;
   1747 		rf_sparet_wait_queue = waitreq;
   1748 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1749 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1750 		return (0);
   1751 
   1752 		/* used by the spare table daemon to deliver a spare table
   1753 		 * into the kernel */
   1754 	case RAIDFRAME_SEND_SPARET:
   1755 
   1756 		/* install the spare table */
   1757 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1758 
   1759 		/* respond to the requestor.  the return status of the spare
   1760 		 * table installation is passed in the "fcol" field */
   1761 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1762 		waitreq->fcol = retcode;
   1763 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1764 		waitreq->next = rf_sparet_resp_queue;
   1765 		rf_sparet_resp_queue = waitreq;
   1766 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1767 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1768 
   1769 		return (retcode);
   1770 #endif
   1771 
   1772 	default:
   1773 		break; /* fall through to the os-specific code below */
   1774 
   1775 	}
   1776 
   1777 	if (!raidPtr->valid)
   1778 		return (EINVAL);
   1779 
   1780 	/*
   1781 	 * Add support for "regular" device ioctls here.
   1782 	 */
   1783 
   1784 	switch (cmd) {
   1785 	case DIOCGCACHE:
   1786 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1787 		break;
   1788 
   1789 	case DIOCCACHESYNC:
   1790 		retcode = rf_sync_component_caches(raidPtr);
   1791 		break;
   1792 
   1793 	default:
   1794 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1795 		break;
   1796 	}
   1797 
   1798 	return (retcode);
   1799 
   1800 }
   1801 
   1802 
   1803 /* raidinit -- complete the rest of the initialization for the
   1804    RAIDframe device.  */
   1805 
   1806 
   1807 static void
   1808 raidinit(struct raid_softc *rs)
   1809 {
   1810 	cfdata_t cf;
   1811 	unsigned int unit;
   1812 	struct dk_softc *dksc = &rs->sc_dksc;
   1813 	RF_Raid_t *raidPtr = &rs->sc_r;
   1814 	device_t dev;
   1815 
   1816 	unit = raidPtr->raidid;
   1817 
   1818 	/* XXX doesn't check bounds. */
   1819 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1820 
   1821 	/* attach the pseudo device */
   1822 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1823 	cf->cf_name = raid_cd.cd_name;
   1824 	cf->cf_atname = raid_cd.cd_name;
   1825 	cf->cf_unit = unit;
   1826 	cf->cf_fstate = FSTATE_STAR;
   1827 
   1828 	dev = config_attach_pseudo(cf);
   1829 	if (dev == NULL) {
   1830 		printf("raid%d: config_attach_pseudo failed\n",
   1831 		    raidPtr->raidid);
   1832 		free(cf, M_RAIDFRAME);
   1833 		return;
   1834 	}
   1835 
   1836 	/* provide a backpointer to the real softc */
   1837 	raidsoftc(dev) = rs;
   1838 
   1839 	/* disk_attach actually creates space for the CPU disklabel, among
   1840 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1841 	 * with disklabels. */
   1842 	dk_init(dksc, dev, DKTYPE_RAID);
   1843 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1844 
   1845 	/* XXX There may be a weird interaction here between this, and
   1846 	 * protectedSectors, as used in RAIDframe.  */
   1847 
   1848 	rs->sc_size = raidPtr->totalSectors;
   1849 
   1850 	/* Attach dk and disk subsystems */
   1851 	dk_attach(dksc);
   1852 	disk_attach(&dksc->sc_dkdev);
   1853 	rf_set_geometry(rs, raidPtr);
   1854 
   1855 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1856 
   1857 	/* mark unit as usuable */
   1858 	rs->sc_flags |= RAIDF_INITED;
   1859 
   1860 	dkwedge_discover(&dksc->sc_dkdev);
   1861 }
   1862 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * Wake up the daemon and tell it to get us a spare table.
 *
 * The request is pushed onto rf_sparet_wait_queue and
 * rf_sparet_wait_cv is broadcast; the caller then waits on
 * rf_sparet_resp_cv until rf_sparet_resp_queue is non-empty.  The head
 * of the response queue is removed, its "fcol" field is returned as the
 * status, and that response entry is freed here.
 *
 * XXX
 * The entries in the queues should be tagged with the raidPtr so that,
 * in the extremely rare case that two recons happen at once, we know
 * for which device we're requesting a spare table.
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* Queue the request and announce it. */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* Wait (with rf_sparet_wait_mutex as the interlock) for a reply. */
	while (!rf_sparet_resp_queue)
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);

	/* Take the reply off the head of the response queue. */
	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = resp->fcol;
	/* This is the response entry, not the request we were handed. */
	RF_Free(resp, sizeof(*resp));
	return (status);
}
#endif
   1897 
   1898 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1899  * bp & passes it down.
   1900  * any calls originating in the kernel must use non-blocking I/O
   1901  * do some extra sanity checking to return "appropriate" error values for
   1902  * certain conditions (to make some standard utilities work)
   1903  *
   1904  * Formerly known as: rf_DoAccessKernel
   1905  */
   1906 void
   1907 raidstart(RF_Raid_t *raidPtr)
   1908 {
   1909 	struct raid_softc *rs;
   1910 	struct dk_softc *dksc;
   1911 
   1912 	rs = raidPtr->softc;
   1913 	dksc = &rs->sc_dksc;
   1914 	/* quick check to see if anything has died recently */
   1915 	rf_lock_mutex2(raidPtr->mutex);
   1916 	if (raidPtr->numNewFailures > 0) {
   1917 		rf_unlock_mutex2(raidPtr->mutex);
   1918 		rf_update_component_labels(raidPtr,
   1919 					   RF_NORMAL_COMPONENT_UPDATE);
   1920 		rf_lock_mutex2(raidPtr->mutex);
   1921 		raidPtr->numNewFailures--;
   1922 	}
   1923 	rf_unlock_mutex2(raidPtr->mutex);
   1924 
   1925 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1926 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1927 		return;
   1928 	}
   1929 
   1930 	dk_start(dksc, NULL);
   1931 }
   1932 
   1933 static int
   1934 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1935 {
   1936 	RF_SectorCount_t num_blocks, pb, sum;
   1937 	RF_RaidAddr_t raid_addr;
   1938 	daddr_t blocknum;
   1939 	int     do_async;
   1940 	int rc;
   1941 
   1942 	rf_lock_mutex2(raidPtr->mutex);
   1943 	if (raidPtr->openings == 0) {
   1944 		rf_unlock_mutex2(raidPtr->mutex);
   1945 		return EAGAIN;
   1946 	}
   1947 	rf_unlock_mutex2(raidPtr->mutex);
   1948 
   1949 	blocknum = bp->b_rawblkno;
   1950 
   1951 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1952 		    (int) blocknum));
   1953 
   1954 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1955 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1956 
   1957 	/* *THIS* is where we adjust what block we're going to...
   1958 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1959 	raid_addr = blocknum;
   1960 
   1961 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1962 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1963 	sum = raid_addr + num_blocks + pb;
   1964 	if (1 || rf_debugKernelAccess) {
   1965 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1966 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1967 			    (int) pb, (int) bp->b_resid));
   1968 	}
   1969 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1970 	    || (sum < num_blocks) || (sum < pb)) {
   1971 		rc = ENOSPC;
   1972 		goto done;
   1973 	}
   1974 	/*
   1975 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1976 	 */
   1977 
   1978 	if (bp->b_bcount & raidPtr->sectorMask) {
   1979 		rc = ENOSPC;
   1980 		goto done;
   1981 	}
   1982 	db1_printf(("Calling DoAccess..\n"));
   1983 
   1984 
   1985 	rf_lock_mutex2(raidPtr->mutex);
   1986 	raidPtr->openings--;
   1987 	rf_unlock_mutex2(raidPtr->mutex);
   1988 
   1989 	/*
   1990 	 * Everything is async.
   1991 	 */
   1992 	do_async = 1;
   1993 
   1994 	/* don't ever condition on bp->b_flags & B_WRITE.
   1995 	 * always condition on B_READ instead */
   1996 
   1997 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1998 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1999 			 do_async, raid_addr, num_blocks,
   2000 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2001 
   2002 done:
   2003 	return rc;
   2004 }
   2005 
   2006 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2007 
   2008 int
   2009 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2010 {
   2011 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2012 	struct buf *bp;
   2013 
   2014 	req->queue = queue;
   2015 	bp = req->bp;
   2016 
   2017 	switch (req->type) {
   2018 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2019 		/* XXX need to do something extra here.. */
   2020 		/* I'm leaving this in, as I've never actually seen it used,
   2021 		 * and I'd like folks to report it... GO */
   2022 		printf(("WAKEUP CALLED\n"));
   2023 		queue->numOutstanding++;
   2024 
   2025 		bp->b_flags = 0;
   2026 		bp->b_private = req;
   2027 
   2028 		KernelWakeupFunc(bp);
   2029 		break;
   2030 
   2031 	case RF_IO_TYPE_READ:
   2032 	case RF_IO_TYPE_WRITE:
   2033 #if RF_ACC_TRACE > 0
   2034 		if (req->tracerec) {
   2035 			RF_ETIMER_START(req->tracerec->timer);
   2036 		}
   2037 #endif
   2038 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2039 		    op, queue->rf_cinfo->ci_dev,
   2040 		    req->sectorOffset, req->numSector,
   2041 		    req->buf, KernelWakeupFunc, (void *) req,
   2042 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2043 
   2044 		if (rf_debugKernelAccess) {
   2045 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2046 				(long) bp->b_blkno));
   2047 		}
   2048 		queue->numOutstanding++;
   2049 		queue->last_deq_sector = req->sectorOffset;
   2050 		/* acc wouldn't have been let in if there were any pending
   2051 		 * reqs at any other priority */
   2052 		queue->curPriority = req->priority;
   2053 
   2054 		db1_printf(("Going for %c to unit %d col %d\n",
   2055 			    req->type, queue->raidPtr->raidid,
   2056 			    queue->col));
   2057 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2058 			(int) req->sectorOffset, (int) req->numSector,
   2059 			(int) (req->numSector <<
   2060 			    queue->raidPtr->logBytesPerSector),
   2061 			(int) queue->raidPtr->logBytesPerSector));
   2062 
   2063 		/*
   2064 		 * XXX: drop lock here since this can block at
   2065 		 * least with backing SCSI devices.  Retake it
   2066 		 * to minimize fuss with calling interfaces.
   2067 		 */
   2068 
   2069 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2070 		bdev_strategy(bp);
   2071 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2072 		break;
   2073 
   2074 	default:
   2075 		panic("bad req->type in rf_DispatchKernelIO");
   2076 	}
   2077 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2078 
   2079 	return (0);
   2080 }
   2081 /* this is the callback function associated with a I/O invoked from
   2082    kernel code.
   2083  */
   2084 static void
   2085 KernelWakeupFunc(struct buf *bp)
   2086 {
   2087 	RF_DiskQueueData_t *req = NULL;
   2088 	RF_DiskQueue_t *queue;
   2089 
   2090 	db1_printf(("recovering the request queue:\n"));
   2091 
   2092 	req = bp->b_private;
   2093 
   2094 	queue = (RF_DiskQueue_t *) req->queue;
   2095 
   2096 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2097 
   2098 #if RF_ACC_TRACE > 0
   2099 	if (req->tracerec) {
   2100 		RF_ETIMER_STOP(req->tracerec->timer);
   2101 		RF_ETIMER_EVAL(req->tracerec->timer);
   2102 		rf_lock_mutex2(rf_tracing_mutex);
   2103 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2104 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2105 		req->tracerec->num_phys_ios++;
   2106 		rf_unlock_mutex2(rf_tracing_mutex);
   2107 	}
   2108 #endif
   2109 
   2110 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2111 	 * ballistic, and mark the component as hosed... */
   2112 
   2113 	if (bp->b_error != 0) {
   2114 		/* Mark the disk as dead */
   2115 		/* but only mark it once... */
   2116 		/* and only if it wouldn't leave this RAID set
   2117 		   completely broken */
   2118 		if (((queue->raidPtr->Disks[queue->col].status ==
   2119 		      rf_ds_optimal) ||
   2120 		     (queue->raidPtr->Disks[queue->col].status ==
   2121 		      rf_ds_used_spare)) &&
   2122 		     (queue->raidPtr->numFailures <
   2123 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2124 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2125 			       queue->raidPtr->raidid,
   2126 			       bp->b_error,
   2127 			       queue->raidPtr->Disks[queue->col].devname);
   2128 			queue->raidPtr->Disks[queue->col].status =
   2129 			    rf_ds_failed;
   2130 			queue->raidPtr->status = rf_rs_degraded;
   2131 			queue->raidPtr->numFailures++;
   2132 			queue->raidPtr->numNewFailures++;
   2133 		} else {	/* Disk is already dead... */
   2134 			/* printf("Disk already marked as dead!\n"); */
   2135 		}
   2136 
   2137 	}
   2138 
   2139 	/* Fill in the error value */
   2140 	req->error = bp->b_error;
   2141 
   2142 	/* Drop this one on the "finished" queue... */
   2143 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2144 
   2145 	/* Let the raidio thread know there is work to be done. */
   2146 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2147 
   2148 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2149 }
   2150 
   2151 
   2152 /*
   2153  * initialize a buf structure for doing an I/O in the kernel.
   2154  */
   2155 static void
   2156 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2157        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2158        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2159        struct proc *b_proc)
   2160 {
   2161 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2162 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2163 	bp->b_oflags = 0;
   2164 	bp->b_cflags = 0;
   2165 	bp->b_bcount = numSect << logBytesPerSector;
   2166 	bp->b_bufsize = bp->b_bcount;
   2167 	bp->b_error = 0;
   2168 	bp->b_dev = dev;
   2169 	bp->b_data = bf;
   2170 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2171 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2172 	if (bp->b_bcount == 0) {
   2173 		panic("bp->b_bcount is zero in InitBP!!");
   2174 	}
   2175 	bp->b_proc = b_proc;
   2176 	bp->b_iodone = cbFunc;
   2177 	bp->b_private = cbArg;
   2178 }
   2179 
   2180 /*
   2181  * Wait interruptibly for an exclusive lock.
   2182  *
   2183  * XXX
   2184  * Several drivers do this; it should be abstracted and made MP-safe.
   2185  * (Hmm... where have we seen this warning before :->  GO )
   2186  */
   2187 static int
   2188 raidlock(struct raid_softc *rs)
   2189 {
   2190 	int     error;
   2191 
   2192 	error = 0;
   2193 	mutex_enter(&rs->sc_mutex);
   2194 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2195 		rs->sc_flags |= RAIDF_WANTED;
   2196 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2197 		if (error != 0)
   2198 			goto done;
   2199 	}
   2200 	rs->sc_flags |= RAIDF_LOCKED;
   2201 done:
   2202 	mutex_exit(&rs->sc_mutex);
   2203 	return (error);
   2204 }
   2205 /*
   2206  * Unlock and wake up any waiters.
   2207  */
   2208 static void
   2209 raidunlock(struct raid_softc *rs)
   2210 {
   2211 
   2212 	mutex_enter(&rs->sc_mutex);
   2213 	rs->sc_flags &= ~RAIDF_LOCKED;
   2214 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2215 		rs->sc_flags &= ~RAIDF_WANTED;
   2216 		cv_broadcast(&rs->sc_cv);
   2217 	}
   2218 	mutex_exit(&rs->sc_mutex);
   2219 }
   2220 
   2221 
   2222 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2223 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2224 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2225 
   2226 static daddr_t
   2227 rf_component_info_offset(void)
   2228 {
   2229 
   2230 	return RF_COMPONENT_INFO_OFFSET;
   2231 }
   2232 
   2233 static daddr_t
   2234 rf_component_info_size(unsigned secsize)
   2235 {
   2236 	daddr_t info_size;
   2237 
   2238 	KASSERT(secsize);
   2239 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2240 		info_size = secsize;
   2241 	else
   2242 		info_size = RF_COMPONENT_INFO_SIZE;
   2243 
   2244 	return info_size;
   2245 }
   2246 
   2247 static daddr_t
   2248 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2249 {
   2250 	daddr_t map_offset;
   2251 
   2252 	KASSERT(raidPtr->bytesPerSector);
   2253 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2254 		map_offset = raidPtr->bytesPerSector;
   2255 	else
   2256 		map_offset = RF_COMPONENT_INFO_SIZE;
   2257 	map_offset += rf_component_info_offset();
   2258 
   2259 	return map_offset;
   2260 }
   2261 
   2262 static daddr_t
   2263 rf_parity_map_size(RF_Raid_t *raidPtr)
   2264 {
   2265 	daddr_t map_size;
   2266 
   2267 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2268 		map_size = raidPtr->bytesPerSector;
   2269 	else
   2270 		map_size = RF_PARITY_MAP_SIZE;
   2271 
   2272 	return map_size;
   2273 }
   2274 
   2275 int
   2276 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2277 {
   2278 	RF_ComponentLabel_t *clabel;
   2279 
   2280 	clabel = raidget_component_label(raidPtr, col);
   2281 	clabel->clean = RF_RAID_CLEAN;
   2282 	raidflush_component_label(raidPtr, col);
   2283 	return(0);
   2284 }
   2285 
   2286 
   2287 int
   2288 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2289 {
   2290 	RF_ComponentLabel_t *clabel;
   2291 
   2292 	clabel = raidget_component_label(raidPtr, col);
   2293 	clabel->clean = RF_RAID_DIRTY;
   2294 	raidflush_component_label(raidPtr, col);
   2295 	return(0);
   2296 }
   2297 
   2298 int
   2299 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2300 {
   2301 	KASSERT(raidPtr->bytesPerSector);
   2302 	return raidread_component_label(raidPtr->bytesPerSector,
   2303 	    raidPtr->Disks[col].dev,
   2304 	    raidPtr->raid_cinfo[col].ci_vp,
   2305 	    &raidPtr->raid_cinfo[col].ci_label);
   2306 }
   2307 
   2308 RF_ComponentLabel_t *
   2309 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2310 {
   2311 	return &raidPtr->raid_cinfo[col].ci_label;
   2312 }
   2313 
   2314 int
   2315 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2316 {
   2317 	RF_ComponentLabel_t *label;
   2318 
   2319 	label = &raidPtr->raid_cinfo[col].ci_label;
   2320 	label->mod_counter = raidPtr->mod_counter;
   2321 #ifndef RF_NO_PARITY_MAP
   2322 	label->parity_map_modcount = label->mod_counter;
   2323 #endif
   2324 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2325 	    raidPtr->Disks[col].dev,
   2326 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2327 }
   2328 
   2329 
   2330 static int
   2331 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2332     RF_ComponentLabel_t *clabel)
   2333 {
   2334 	return raidread_component_area(dev, b_vp, clabel,
   2335 	    sizeof(RF_ComponentLabel_t),
   2336 	    rf_component_info_offset(),
   2337 	    rf_component_info_size(secsize));
   2338 }
   2339 
   2340 /* ARGSUSED */
   2341 static int
   2342 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2343     size_t msize, daddr_t offset, daddr_t dsize)
   2344 {
   2345 	struct buf *bp;
   2346 	int error;
   2347 
   2348 	/* XXX should probably ensure that we don't try to do this if
   2349 	   someone has changed rf_protected_sectors. */
   2350 
   2351 	if (b_vp == NULL) {
   2352 		/* For whatever reason, this component is not valid.
   2353 		   Don't try to read a component label from it. */
   2354 		return(EINVAL);
   2355 	}
   2356 
   2357 	/* get a block of the appropriate size... */
   2358 	bp = geteblk((int)dsize);
   2359 	bp->b_dev = dev;
   2360 
   2361 	/* get our ducks in a row for the read */
   2362 	bp->b_blkno = offset / DEV_BSIZE;
   2363 	bp->b_bcount = dsize;
   2364 	bp->b_flags |= B_READ;
   2365  	bp->b_resid = dsize;
   2366 
   2367 	bdev_strategy(bp);
   2368 	error = biowait(bp);
   2369 
   2370 	if (!error) {
   2371 		memcpy(data, bp->b_data, msize);
   2372 	}
   2373 
   2374 	brelse(bp, 0);
   2375 	return(error);
   2376 }
   2377 
   2378 
   2379 static int
   2380 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2381     RF_ComponentLabel_t *clabel)
   2382 {
   2383 	return raidwrite_component_area(dev, b_vp, clabel,
   2384 	    sizeof(RF_ComponentLabel_t),
   2385 	    rf_component_info_offset(),
   2386 	    rf_component_info_size(secsize), 0);
   2387 }
   2388 
   2389 /* ARGSUSED */
   2390 static int
   2391 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2392     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2393 {
   2394 	struct buf *bp;
   2395 	int error;
   2396 
   2397 	/* get a block of the appropriate size... */
   2398 	bp = geteblk((int)dsize);
   2399 	bp->b_dev = dev;
   2400 
   2401 	/* get our ducks in a row for the write */
   2402 	bp->b_blkno = offset / DEV_BSIZE;
   2403 	bp->b_bcount = dsize;
   2404 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2405  	bp->b_resid = dsize;
   2406 
   2407 	memset(bp->b_data, 0, dsize);
   2408 	memcpy(bp->b_data, data, msize);
   2409 
   2410 	bdev_strategy(bp);
   2411 	if (asyncp)
   2412 		return 0;
   2413 	error = biowait(bp);
   2414 	brelse(bp, 0);
   2415 	if (error) {
   2416 #if 1
   2417 		printf("Failed to write RAID component info!\n");
   2418 #endif
   2419 	}
   2420 
   2421 	return(error);
   2422 }
   2423 
   2424 void
   2425 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2426 {
   2427 	int c;
   2428 
   2429 	for (c = 0; c < raidPtr->numCol; c++) {
   2430 		/* Skip dead disks. */
   2431 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2432 			continue;
   2433 		/* XXXjld: what if an error occurs here? */
   2434 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2435 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2436 		    RF_PARITYMAP_NBYTE,
   2437 		    rf_parity_map_offset(raidPtr),
   2438 		    rf_parity_map_size(raidPtr), 0);
   2439 	}
   2440 }
   2441 
   2442 void
   2443 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2444 {
   2445 	struct rf_paritymap_ondisk tmp;
   2446 	int c,first;
   2447 
   2448 	first=1;
   2449 	for (c = 0; c < raidPtr->numCol; c++) {
   2450 		/* Skip dead disks. */
   2451 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2452 			continue;
   2453 		raidread_component_area(raidPtr->Disks[c].dev,
   2454 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2455 		    RF_PARITYMAP_NBYTE,
   2456 		    rf_parity_map_offset(raidPtr),
   2457 		    rf_parity_map_size(raidPtr));
   2458 		if (first) {
   2459 			memcpy(map, &tmp, sizeof(*map));
   2460 			first = 0;
   2461 		} else {
   2462 			rf_paritymap_merge(map, &tmp);
   2463 		}
   2464 	}
   2465 }
   2466 
   2467 void
   2468 rf_markalldirty(RF_Raid_t *raidPtr)
   2469 {
   2470 	RF_ComponentLabel_t *clabel;
   2471 	int sparecol;
   2472 	int c;
   2473 	int j;
   2474 	int scol = -1;
   2475 
   2476 	raidPtr->mod_counter++;
   2477 	for (c = 0; c < raidPtr->numCol; c++) {
   2478 		/* we don't want to touch (at all) a disk that has
   2479 		   failed */
   2480 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2481 			clabel = raidget_component_label(raidPtr, c);
   2482 			if (clabel->status == rf_ds_spared) {
   2483 				/* XXX do something special...
   2484 				   but whatever you do, don't
   2485 				   try to access it!! */
   2486 			} else {
   2487 				raidmarkdirty(raidPtr, c);
   2488 			}
   2489 		}
   2490 	}
   2491 
   2492 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2493 		sparecol = raidPtr->numCol + c;
   2494 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2495 			/*
   2496 
   2497 			   we claim this disk is "optimal" if it's
   2498 			   rf_ds_used_spare, as that means it should be
   2499 			   directly substitutable for the disk it replaced.
   2500 			   We note that too...
   2501 
   2502 			 */
   2503 
   2504 			for(j=0;j<raidPtr->numCol;j++) {
   2505 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2506 					scol = j;
   2507 					break;
   2508 				}
   2509 			}
   2510 
   2511 			clabel = raidget_component_label(raidPtr, sparecol);
   2512 			/* make sure status is noted */
   2513 
   2514 			raid_init_component_label(raidPtr, clabel);
   2515 
   2516 			clabel->row = 0;
   2517 			clabel->column = scol;
   2518 			/* Note: we *don't* change status from rf_ds_used_spare
   2519 			   to rf_ds_optimal */
   2520 			/* clabel.status = rf_ds_optimal; */
   2521 
   2522 			raidmarkdirty(raidPtr, sparecol);
   2523 		}
   2524 	}
   2525 }
   2526 
   2527 
   2528 void
   2529 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2530 {
   2531 	RF_ComponentLabel_t *clabel;
   2532 	int sparecol;
   2533 	int c;
   2534 	int j;
   2535 	int scol;
   2536 	struct raid_softc *rs = raidPtr->softc;
   2537 
   2538 	scol = -1;
   2539 
   2540 	/* XXX should do extra checks to make sure things really are clean,
   2541 	   rather than blindly setting the clean bit... */
   2542 
   2543 	raidPtr->mod_counter++;
   2544 
   2545 	for (c = 0; c < raidPtr->numCol; c++) {
   2546 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2547 			clabel = raidget_component_label(raidPtr, c);
   2548 			/* make sure status is noted */
   2549 			clabel->status = rf_ds_optimal;
   2550 
   2551 			/* note what unit we are configured as */
   2552 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2553 				clabel->last_unit = raidPtr->raidid;
   2554 
   2555 			raidflush_component_label(raidPtr, c);
   2556 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2557 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2558 					raidmarkclean(raidPtr, c);
   2559 				}
   2560 			}
   2561 		}
   2562 		/* else we don't touch it.. */
   2563 	}
   2564 
   2565 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2566 		sparecol = raidPtr->numCol + c;
   2567 		/* Need to ensure that the reconstruct actually completed! */
   2568 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2569 			/*
   2570 
   2571 			   we claim this disk is "optimal" if it's
   2572 			   rf_ds_used_spare, as that means it should be
   2573 			   directly substitutable for the disk it replaced.
   2574 			   We note that too...
   2575 
   2576 			 */
   2577 
   2578 			for(j=0;j<raidPtr->numCol;j++) {
   2579 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2580 					scol = j;
   2581 					break;
   2582 				}
   2583 			}
   2584 
   2585 			/* XXX shouldn't *really* need this... */
   2586 			clabel = raidget_component_label(raidPtr, sparecol);
   2587 			/* make sure status is noted */
   2588 
   2589 			raid_init_component_label(raidPtr, clabel);
   2590 
   2591 			clabel->column = scol;
   2592 			clabel->status = rf_ds_optimal;
   2593 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2594 				clabel->last_unit = raidPtr->raidid;
   2595 
   2596 			raidflush_component_label(raidPtr, sparecol);
   2597 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2598 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2599 					raidmarkclean(raidPtr, sparecol);
   2600 				}
   2601 			}
   2602 		}
   2603 	}
   2604 }
   2605 
   2606 void
   2607 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2608 {
   2609 
   2610 	if (vp != NULL) {
   2611 		if (auto_configured == 1) {
   2612 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2613 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2614 			vput(vp);
   2615 
   2616 		} else {
   2617 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2618 		}
   2619 	}
   2620 }
   2621 
   2622 
   2623 void
   2624 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2625 {
   2626 	int r,c;
   2627 	struct vnode *vp;
   2628 	int acd;
   2629 
   2630 
   2631 	/* We take this opportunity to close the vnodes like we should.. */
   2632 
   2633 	for (c = 0; c < raidPtr->numCol; c++) {
   2634 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2635 		acd = raidPtr->Disks[c].auto_configured;
   2636 		rf_close_component(raidPtr, vp, acd);
   2637 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2638 		raidPtr->Disks[c].auto_configured = 0;
   2639 	}
   2640 
   2641 	for (r = 0; r < raidPtr->numSpare; r++) {
   2642 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2643 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2644 		rf_close_component(raidPtr, vp, acd);
   2645 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2646 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2647 	}
   2648 }
   2649 
   2650 
   2651 void
   2652 rf_ReconThread(struct rf_recon_req_internal *req)
   2653 {
   2654 	int     s;
   2655 	RF_Raid_t *raidPtr;
   2656 
   2657 	s = splbio();
   2658 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2659 	raidPtr->recon_in_progress = 1;
   2660 
   2661 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2662 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2663 
   2664 	RF_Free(req, sizeof(*req));
   2665 
   2666 	raidPtr->recon_in_progress = 0;
   2667 	splx(s);
   2668 
   2669 	/* That's all... */
   2670 	kthread_exit(0);	/* does not return */
   2671 }
   2672 
   2673 void
   2674 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2675 {
   2676 	int retcode;
   2677 	int s;
   2678 
   2679 	raidPtr->parity_rewrite_stripes_done = 0;
   2680 	raidPtr->parity_rewrite_in_progress = 1;
   2681 	s = splbio();
   2682 	retcode = rf_RewriteParity(raidPtr);
   2683 	splx(s);
   2684 	if (retcode) {
   2685 		printf("raid%d: Error re-writing parity (%d)!\n",
   2686 		    raidPtr->raidid, retcode);
   2687 	} else {
   2688 		/* set the clean bit!  If we shutdown correctly,
   2689 		   the clean bit on each component label will get
   2690 		   set */
   2691 		raidPtr->parity_good = RF_RAID_CLEAN;
   2692 	}
   2693 	raidPtr->parity_rewrite_in_progress = 0;
   2694 
   2695 	/* Anyone waiting for us to stop?  If so, inform them... */
   2696 	if (raidPtr->waitShutdown) {
   2697 		rf_lock_mutex2(raidPtr->rad_lock);
   2698 		cv_broadcast(&raidPtr->parity_rewrite_cv);
   2699 		rf_unlock_mutex2(raidPtr->rad_lock);
   2700 	}
   2701 
   2702 	/* That's all... */
   2703 	kthread_exit(0);	/* does not return */
   2704 }
   2705 
   2706 
   2707 void
   2708 rf_CopybackThread(RF_Raid_t *raidPtr)
   2709 {
   2710 	int s;
   2711 
   2712 	raidPtr->copyback_in_progress = 1;
   2713 	s = splbio();
   2714 	rf_CopybackReconstructedData(raidPtr);
   2715 	splx(s);
   2716 	raidPtr->copyback_in_progress = 0;
   2717 
   2718 	/* That's all... */
   2719 	kthread_exit(0);	/* does not return */
   2720 }
   2721 
   2722 
   2723 void
   2724 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2725 {
   2726 	int s;
   2727 	RF_Raid_t *raidPtr;
   2728 
   2729 	s = splbio();
   2730 	raidPtr = req->raidPtr;
   2731 	raidPtr->recon_in_progress = 1;
   2732 	rf_ReconstructInPlace(raidPtr, req->col);
   2733 	RF_Free(req, sizeof(*req));
   2734 	raidPtr->recon_in_progress = 0;
   2735 	splx(s);
   2736 
   2737 	/* That's all... */
   2738 	kthread_exit(0);	/* does not return */
   2739 }
   2740 
   2741 static RF_AutoConfig_t *
   2742 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2743     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2744     unsigned secsize)
   2745 {
   2746 	int good_one = 0;
   2747 	RF_ComponentLabel_t *clabel;
   2748 	RF_AutoConfig_t *ac;
   2749 
   2750 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2751 	if (clabel == NULL) {
   2752 oomem:
   2753 		    while(ac_list) {
   2754 			    ac = ac_list;
   2755 			    if (ac->clabel)
   2756 				    free(ac->clabel, M_RAIDFRAME);
   2757 			    ac_list = ac_list->next;
   2758 			    free(ac, M_RAIDFRAME);
   2759 		    }
   2760 		    printf("RAID auto config: out of memory!\n");
   2761 		    return NULL; /* XXX probably should panic? */
   2762 	}
   2763 
   2764 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2765 		/* Got the label.  Does it look reasonable? */
   2766 		if (rf_reasonable_label(clabel, numsecs) &&
   2767 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2768 #ifdef DEBUG
   2769 			printf("Component on: %s: %llu\n",
   2770 				cname, (unsigned long long)size);
   2771 			rf_print_component_label(clabel);
   2772 #endif
   2773 			/* if it's reasonable, add it, else ignore it. */
   2774 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2775 				M_NOWAIT);
   2776 			if (ac == NULL) {
   2777 				free(clabel, M_RAIDFRAME);
   2778 				goto oomem;
   2779 			}
   2780 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2781 			ac->dev = dev;
   2782 			ac->vp = vp;
   2783 			ac->clabel = clabel;
   2784 			ac->next = ac_list;
   2785 			ac_list = ac;
   2786 			good_one = 1;
   2787 		}
   2788 	}
   2789 	if (!good_one) {
   2790 		/* cleanup */
   2791 		free(clabel, M_RAIDFRAME);
   2792 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2793 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2794 		vput(vp);
   2795 	}
   2796 	return ac_list;
   2797 }
   2798 
/*
 * Walk the system's device list and collect every candidate RAID
 * component into an RF_AutoConfig_t list.
 *
 * The device list is scanned twice: first for wedges ("dk" devices),
 * then for all other disks.  Floppy, CD and md devices are skipped.
 * A wedge is a candidate only if its partition type is
 * DKW_PTYPE_RAIDFRAME.  For other disks every disklabel partition marked
 * FS_RAID is a candidate; if the disklabel yields none, the raw partition
 * is tried as well.  Each candidate is handed to rf_get_component(),
 * whose return value replaces the list head.
 *
 * Returns the head of the accumulated list; ac_list starts out NULL.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/*
			 * are we in the wedges pass ?  Wedges are handled
			 * only when dowedges is 1, everything else only
			 * when it is 0.
			 */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/*
			 * get a vnode for the raw partition of this disk
			 * (for a wedge, the device number is built directly
			 * from major/minor instead).
			 */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				/*
				 * NOTE(review): the vnode was opened with
				 * FREAD | FSILENT but is closed here (and
				 * below) with FREAD | FWRITE -- confirm that
				 * the mismatch is intended.
				 */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges typed as RAIDframe are candidates */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/*
				 * The still-open vnode is passed on to
				 * rf_get_component(); it is not closed here.
				 */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				/*
				 * NOTE(review): this store is never read; the
				 * flag is reset at the top of the next
				 * iteration.
				 */
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			/* no usable disklabel: nothing more to try here */
			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* component name is <device><partition letter> */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3002 
   3003 
   3004 int
   3005 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3006 {
   3007 
   3008 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3009 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3010 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3011 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3012 	    clabel->row >=0 &&
   3013 	    clabel->column >= 0 &&
   3014 	    clabel->num_rows > 0 &&
   3015 	    clabel->num_columns > 0 &&
   3016 	    clabel->row < clabel->num_rows &&
   3017 	    clabel->column < clabel->num_columns &&
   3018 	    clabel->blockSize > 0 &&
   3019 	    /*
   3020 	     * numBlocksHi may contain garbage, but it is ok since
   3021 	     * the type is unsigned.  If it is really garbage,
   3022 	     * rf_fix_old_label_size() will fix it.
   3023 	     */
   3024 	    rf_component_label_numblocks(clabel) > 0) {
   3025 		/*
   3026 		 * label looks reasonable enough...
   3027 		 * let's make sure it has no old garbage.
   3028 		 */
   3029 		if (numsecs)
   3030 			rf_fix_old_label_size(clabel, numsecs);
   3031 		return(1);
   3032 	}
   3033 	return(0);
   3034 }
   3035 
   3036 
   3037 /*
   3038  * For reasons yet unknown, some old component labels have garbage in
   3039  * the newer numBlocksHi region, and this causes lossage.  Since those
   3040  * disks will also have numsecs set to less than 32 bits of sectors,
   3041  * we can determine when this corruption has occurred, and fix it.
   3042  *
   3043  * The exact same problem, with the same unknown reason, happens to
   3044  * the partitionSizeHi member as well.
   3045  */
   3046 static void
   3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3048 {
   3049 
   3050 	if (numsecs < ((uint64_t)1 << 32)) {
   3051 		if (clabel->numBlocksHi) {
   3052 			printf("WARNING: total sectors < 32 bits, yet "
   3053 			       "numBlocksHi set\n"
   3054 			       "WARNING: resetting numBlocksHi to zero.\n");
   3055 			clabel->numBlocksHi = 0;
   3056 		}
   3057 
   3058 		if (clabel->partitionSizeHi) {
   3059 			printf("WARNING: total sectors < 32 bits, yet "
   3060 			       "partitionSizeHi set\n"
   3061 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3062 			clabel->partitionSizeHi = 0;
   3063 		}
   3064 	}
   3065 }
   3066 
   3067 
#ifdef DEBUG
/*
 * Dump the fields of a component label to the console.  Only compiled
 * when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* indexed by the low two bits of root_partition */
	static const char *root_modes[] = {
	    "No", "Force", "Soft", "*invalid*"
	};

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       (uint64_t)rf_component_label_numblocks(clabel));
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n",
	       root_modes[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3101 
   3102 RF_ConfigSet_t *
   3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3104 {
   3105 	RF_AutoConfig_t *ac;
   3106 	RF_ConfigSet_t *config_sets;
   3107 	RF_ConfigSet_t *cset;
   3108 	RF_AutoConfig_t *ac_next;
   3109 
   3110 
   3111 	config_sets = NULL;
   3112 
   3113 	/* Go through the AutoConfig list, and figure out which components
   3114 	   belong to what sets.  */
   3115 	ac = ac_list;
   3116 	while(ac!=NULL) {
   3117 		/* we're going to putz with ac->next, so save it here
   3118 		   for use at the end of the loop */
   3119 		ac_next = ac->next;
   3120 
   3121 		if (config_sets == NULL) {
   3122 			/* will need at least this one... */
   3123 			config_sets = (RF_ConfigSet_t *)
   3124 				malloc(sizeof(RF_ConfigSet_t),
   3125 				       M_RAIDFRAME, M_NOWAIT);
   3126 			if (config_sets == NULL) {
   3127 				panic("rf_create_auto_sets: No memory!");
   3128 			}
   3129 			/* this one is easy :) */
   3130 			config_sets->ac = ac;
   3131 			config_sets->next = NULL;
   3132 			config_sets->rootable = 0;
   3133 			ac->next = NULL;
   3134 		} else {
   3135 			/* which set does this component fit into? */
   3136 			cset = config_sets;
   3137 			while(cset!=NULL) {
   3138 				if (rf_does_it_fit(cset, ac)) {
   3139 					/* looks like it matches... */
   3140 					ac->next = cset->ac;
   3141 					cset->ac = ac;
   3142 					break;
   3143 				}
   3144 				cset = cset->next;
   3145 			}
   3146 			if (cset==NULL) {
   3147 				/* didn't find a match above... new set..*/
   3148 				cset = (RF_ConfigSet_t *)
   3149 					malloc(sizeof(RF_ConfigSet_t),
   3150 					       M_RAIDFRAME, M_NOWAIT);
   3151 				if (cset == NULL) {
   3152 					panic("rf_create_auto_sets: No memory!");
   3153 				}
   3154 				cset->ac = ac;
   3155 				ac->next = NULL;
   3156 				cset->next = config_sets;
   3157 				cset->rootable = 0;
   3158 				config_sets = cset;
   3159 			}
   3160 		}
   3161 		ac = ac_next;
   3162 	}
   3163 
   3164 
   3165 	return(config_sets);
   3166 }
   3167 
   3168 static int
   3169 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3170 {
   3171 	RF_ComponentLabel_t *clabel1, *clabel2;
   3172 
   3173 	/* If this one matches the *first* one in the set, that's good
   3174 	   enough, since the other members of the set would have been
   3175 	   through here too... */
   3176 	/* note that we are not checking partitionSize here..
   3177 
   3178 	   Note that we are also not checking the mod_counters here.
   3179 	   If everything else matches except the mod_counter, that's
   3180 	   good enough for this test.  We will deal with the mod_counters
   3181 	   a little later in the autoconfiguration process.
   3182 
   3183 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3184 
   3185 	   The reason we don't check for this is that failed disks
   3186 	   will have lower modification counts.  If those disks are
   3187 	   not added to the set they used to belong to, then they will
   3188 	   form their own set, which may result in 2 different sets,
   3189 	   for example, competing to be configured at raid0, and
   3190 	   perhaps competing to be the root filesystem set.  If the
   3191 	   wrong ones get configured, or both attempt to become /,
   3192 	   weird behaviour and or serious lossage will occur.  Thus we
   3193 	   need to bring them into the fold here, and kick them out at
   3194 	   a later point.
   3195 
   3196 	*/
   3197 
   3198 	clabel1 = cset->ac->clabel;
   3199 	clabel2 = ac->clabel;
   3200 	if ((clabel1->version == clabel2->version) &&
   3201 	    (clabel1->serial_number == clabel2->serial_number) &&
   3202 	    (clabel1->num_rows == clabel2->num_rows) &&
   3203 	    (clabel1->num_columns == clabel2->num_columns) &&
   3204 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3205 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3206 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3207 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3208 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3209 	    (clabel1->blockSize == clabel2->blockSize) &&
   3210 	    rf_component_label_numblocks(clabel1) ==
   3211 	    rf_component_label_numblocks(clabel2) &&
   3212 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3213 	    (clabel1->root_partition == clabel2->root_partition) &&
   3214 	    (clabel1->last_unit == clabel2->last_unit) &&
   3215 	    (clabel1->config_order == clabel2->config_order)) {
   3216 		/* if it get's here, it almost *has* to be a match */
   3217 	} else {
   3218 		/* it's not consistent with somebody in the set..
   3219 		   punt */
   3220 		return(0);
   3221 	}
   3222 	/* all was fine.. it must fit... */
   3223 	return(1);
   3224 }
   3225 
   3226 int
   3227 rf_have_enough_components(RF_ConfigSet_t *cset)
   3228 {
   3229 	RF_AutoConfig_t *ac;
   3230 	RF_AutoConfig_t *auto_config;
   3231 	RF_ComponentLabel_t *clabel;
   3232 	int c;
   3233 	int num_cols;
   3234 	int num_missing;
   3235 	int mod_counter;
   3236 	int mod_counter_found;
   3237 	int even_pair_failed;
   3238 	char parity_type;
   3239 
   3240 
   3241 	/* check to see that we have enough 'live' components
   3242 	   of this set.  If so, we can configure it if necessary */
   3243 
   3244 	num_cols = cset->ac->clabel->num_columns;
   3245 	parity_type = cset->ac->clabel->parityConfig;
   3246 
   3247 	/* XXX Check for duplicate components!?!?!? */
   3248 
   3249 	/* Determine what the mod_counter is supposed to be for this set. */
   3250 
   3251 	mod_counter_found = 0;
   3252 	mod_counter = 0;
   3253 	ac = cset->ac;
   3254 	while(ac!=NULL) {
   3255 		if (mod_counter_found==0) {
   3256 			mod_counter = ac->clabel->mod_counter;
   3257 			mod_counter_found = 1;
   3258 		} else {
   3259 			if (ac->clabel->mod_counter > mod_counter) {
   3260 				mod_counter = ac->clabel->mod_counter;
   3261 			}
   3262 		}
   3263 		ac = ac->next;
   3264 	}
   3265 
   3266 	num_missing = 0;
   3267 	auto_config = cset->ac;
   3268 
   3269 	even_pair_failed = 0;
   3270 	for(c=0; c<num_cols; c++) {
   3271 		ac = auto_config;
   3272 		while(ac!=NULL) {
   3273 			if ((ac->clabel->column == c) &&
   3274 			    (ac->clabel->mod_counter == mod_counter)) {
   3275 				/* it's this one... */
   3276 #ifdef DEBUG
   3277 				printf("Found: %s at %d\n",
   3278 				       ac->devname,c);
   3279 #endif
   3280 				break;
   3281 			}
   3282 			ac=ac->next;
   3283 		}
   3284 		if (ac==NULL) {
   3285 				/* Didn't find one here! */
   3286 				/* special case for RAID 1, especially
   3287 				   where there are more than 2
   3288 				   components (where RAIDframe treats
   3289 				   things a little differently :( ) */
   3290 			if (parity_type == '1') {
   3291 				if (c%2 == 0) { /* even component */
   3292 					even_pair_failed = 1;
   3293 				} else { /* odd component.  If
   3294 					    we're failed, and
   3295 					    so is the even
   3296 					    component, it's
   3297 					    "Good Night, Charlie" */
   3298 					if (even_pair_failed == 1) {
   3299 						return(0);
   3300 					}
   3301 				}
   3302 			} else {
   3303 				/* normal accounting */
   3304 				num_missing++;
   3305 			}
   3306 		}
   3307 		if ((parity_type == '1') && (c%2 == 1)) {
   3308 				/* Just did an even component, and we didn't
   3309 				   bail.. reset the even_pair_failed flag,
   3310 				   and go on to the next component.... */
   3311 			even_pair_failed = 0;
   3312 		}
   3313 	}
   3314 
   3315 	clabel = cset->ac->clabel;
   3316 
   3317 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3318 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3319 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3320 		/* XXX this needs to be made *much* more general */
   3321 		/* Too many failures */
   3322 		return(0);
   3323 	}
   3324 	/* otherwise, all is well, and we've got enough to take a kick
   3325 	   at autoconfiguring this set */
   3326 	return(1);
   3327 }
   3328 
   3329 void
   3330 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3331 			RF_Raid_t *raidPtr)
   3332 {
   3333 	RF_ComponentLabel_t *clabel;
   3334 	int i;
   3335 
   3336 	clabel = ac->clabel;
   3337 
   3338 	/* 1. Fill in the common stuff */
   3339 	config->numCol = clabel->num_columns;
   3340 	config->numSpare = 0; /* XXX should this be set here? */
   3341 	config->sectPerSU = clabel->sectPerSU;
   3342 	config->SUsPerPU = clabel->SUsPerPU;
   3343 	config->SUsPerRU = clabel->SUsPerRU;
   3344 	config->parityConfig = clabel->parityConfig;
   3345 	/* XXX... */
   3346 	strcpy(config->diskQueueType,"fifo");
   3347 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3348 	config->layoutSpecificSize = 0; /* XXX ?? */
   3349 
   3350 	while(ac!=NULL) {
   3351 		/* row/col values will be in range due to the checks
   3352 		   in reasonable_label() */
   3353 		strcpy(config->devnames[0][ac->clabel->column],
   3354 		       ac->devname);
   3355 		ac = ac->next;
   3356 	}
   3357 
   3358 	for(i=0;i<RF_MAXDBGV;i++) {
   3359 		config->debugVars[i][0] = 0;
   3360 	}
   3361 }
   3362 
   3363 int
   3364 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3365 {
   3366 	RF_ComponentLabel_t *clabel;
   3367 	int column;
   3368 	int sparecol;
   3369 
   3370 	raidPtr->autoconfigure = new_value;
   3371 
   3372 	for(column=0; column<raidPtr->numCol; column++) {
   3373 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3374 			clabel = raidget_component_label(raidPtr, column);
   3375 			clabel->autoconfigure = new_value;
   3376 			raidflush_component_label(raidPtr, column);
   3377 		}
   3378 	}
   3379 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3380 		sparecol = raidPtr->numCol + column;
   3381 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3382 			clabel = raidget_component_label(raidPtr, sparecol);
   3383 			clabel->autoconfigure = new_value;
   3384 			raidflush_component_label(raidPtr, sparecol);
   3385 		}
   3386 	}
   3387 	return(new_value);
   3388 }
   3389 
   3390 int
   3391 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3392 {
   3393 	RF_ComponentLabel_t *clabel;
   3394 	int column;
   3395 	int sparecol;
   3396 
   3397 	raidPtr->root_partition = new_value;
   3398 	for(column=0; column<raidPtr->numCol; column++) {
   3399 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3400 			clabel = raidget_component_label(raidPtr, column);
   3401 			clabel->root_partition = new_value;
   3402 			raidflush_component_label(raidPtr, column);
   3403 		}
   3404 	}
   3405 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3406 		sparecol = raidPtr->numCol + column;
   3407 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3408 			clabel = raidget_component_label(raidPtr, sparecol);
   3409 			clabel->root_partition = new_value;
   3410 			raidflush_component_label(raidPtr, sparecol);
   3411 		}
   3412 	}
   3413 	return(new_value);
   3414 }
   3415 
   3416 void
   3417 rf_release_all_vps(RF_ConfigSet_t *cset)
   3418 {
   3419 	RF_AutoConfig_t *ac;
   3420 
   3421 	ac = cset->ac;
   3422 	while(ac!=NULL) {
   3423 		/* Close the vp, and give it back */
   3424 		if (ac->vp) {
   3425 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3426 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3427 			vput(ac->vp);
   3428 			ac->vp = NULL;
   3429 		}
   3430 		ac = ac->next;
   3431 	}
   3432 }
   3433 
   3434 
   3435 void
   3436 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3437 {
   3438 	RF_AutoConfig_t *ac;
   3439 	RF_AutoConfig_t *next_ac;
   3440 
   3441 	ac = cset->ac;
   3442 	while(ac!=NULL) {
   3443 		next_ac = ac->next;
   3444 		/* nuke the label */
   3445 		free(ac->clabel, M_RAIDFRAME);
   3446 		/* cleanup the config structure */
   3447 		free(ac, M_RAIDFRAME);
   3448 		/* "next.." */
   3449 		ac = next_ac;
   3450 	}
   3451 	/* and, finally, nuke the config set */
   3452 	free(cset, M_RAIDFRAME);
   3453 }
   3454 
   3455 
   3456 void
   3457 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3458 {
   3459 	/* current version number */
   3460 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3461 	clabel->serial_number = raidPtr->serial_number;
   3462 	clabel->mod_counter = raidPtr->mod_counter;
   3463 
   3464 	clabel->num_rows = 1;
   3465 	clabel->num_columns = raidPtr->numCol;
   3466 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3467 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3468 
   3469 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3470 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3471 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3472 
   3473 	clabel->blockSize = raidPtr->bytesPerSector;
   3474 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3475 
   3476 	/* XXX not portable */
   3477 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3478 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3479 	clabel->autoconfigure = raidPtr->autoconfigure;
   3480 	clabel->root_partition = raidPtr->root_partition;
   3481 	clabel->last_unit = raidPtr->raidid;
   3482 	clabel->config_order = raidPtr->config_order;
   3483 
   3484 #ifndef RF_NO_PARITY_MAP
   3485 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3486 #endif
   3487 }
   3488 
   3489 struct raid_softc *
   3490 rf_auto_config_set(RF_ConfigSet_t *cset)
   3491 {
   3492 	RF_Raid_t *raidPtr;
   3493 	RF_Config_t *config;
   3494 	int raidID;
   3495 	struct raid_softc *sc;
   3496 
   3497 #ifdef DEBUG
   3498 	printf("RAID autoconfigure\n");
   3499 #endif
   3500 
   3501 	/* 1. Create a config structure */
   3502 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3503 	if (config == NULL) {
   3504 		printf("%s: Out of mem - config!?!?\n", __func__);
   3505 				/* XXX do something more intelligent here. */
   3506 		return NULL;
   3507 	}
   3508 
   3509 	/*
   3510 	   2. Figure out what RAID ID this one is supposed to live at
   3511 	   See if we can get the same RAID dev that it was configured
   3512 	   on last time..
   3513 	*/
   3514 
   3515 	raidID = cset->ac->clabel->last_unit;
   3516 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
   3517 	     sc = raidget(++raidID, false))
   3518 		continue;
   3519 #ifdef DEBUG
   3520 	printf("Configuring raid%d:\n",raidID);
   3521 #endif
   3522 
   3523 	if (sc == NULL)
   3524 		sc = raidget(raidID, true);
   3525 	if (sc == NULL) {
   3526 		printf("%s: Out of mem - softc!?!?\n", __func__);
   3527 				/* XXX do something more intelligent here. */
   3528 		free(config, M_RAIDFRAME);
   3529 		return NULL;
   3530 	}
   3531 
   3532 	raidPtr = &sc->sc_r;
   3533 
   3534 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3535 	raidPtr->softc = sc;
   3536 	raidPtr->raidid = raidID;
   3537 	raidPtr->openings = RAIDOUTSTANDING;
   3538 
   3539 	/* 3. Build the configuration structure */
   3540 	rf_create_configuration(cset->ac, config, raidPtr);
   3541 
   3542 	/* 4. Do the configuration */
   3543 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3544 		raidinit(sc);
   3545 
   3546 		rf_markalldirty(raidPtr);
   3547 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3548 		switch (cset->ac->clabel->root_partition) {
   3549 		case 1:	/* Force Root */
   3550 		case 2:	/* Soft Root: root when boot partition part of raid */
   3551 			/*
   3552 			 * everything configured just fine.  Make a note
   3553 			 * that this set is eligible to be root,
   3554 			 * or forced to be root
   3555 			 */
   3556 			cset->rootable = cset->ac->clabel->root_partition;
   3557 			/* XXX do this here? */
   3558 			raidPtr->root_partition = cset->rootable;
   3559 			break;
   3560 		default:
   3561 			break;
   3562 		}
   3563 	} else {
   3564 		raidput(sc);
   3565 		sc = NULL;
   3566 	}
   3567 
   3568 	/* 5. Cleanup */
   3569 	free(config, M_RAIDFRAME);
   3570 	return sc;
   3571 }
   3572 
   3573 void
   3574 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3575 	     size_t xmin, size_t xmax)
   3576 {
   3577 	int error;
   3578 
   3579 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3580 	pool_sethiwat(p, xmax);
   3581 	if ((error = pool_prime(p, xmin)) != 0)
   3582 		panic("%s: failed to prime pool: %d", __func__, error);
   3583 	pool_setlowat(p, xmin);
   3584 }
   3585 
   3586 /*
   3587  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3588  * to see if there is IO pending and if that IO could possibly be done
   3589  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3590  * otherwise.
   3591  *
   3592  */
   3593 int
   3594 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3595 {
   3596 	struct raid_softc *rs;
   3597 	struct dk_softc *dksc;
   3598 
   3599 	rs = raidPtr->softc;
   3600 	dksc = &rs->sc_dksc;
   3601 
   3602 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3603 		return 1;
   3604 
   3605 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3606 		/* there is work to do */
   3607 		return 0;
   3608 	}
   3609 	/* default is nothing to do */
   3610 	return 1;
   3611 }
   3612 
   3613 int
   3614 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3615 {
   3616 	uint64_t numsecs;
   3617 	unsigned secsize;
   3618 	int error;
   3619 
   3620 	error = getdisksize(vp, &numsecs, &secsize);
   3621 	if (error == 0) {
   3622 		diskPtr->blockSize = secsize;
   3623 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3624 		diskPtr->partitionSize = numsecs;
   3625 		return 0;
   3626 	}
   3627 	return error;
   3628 }
   3629 
   3630 static int
   3631 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3632 {
   3633 	return 1;
   3634 }
   3635 
   3636 static void
   3637 raid_attach(device_t parent, device_t self, void *aux)
   3638 {
   3639 }
   3640 
   3641 
   3642 static int
   3643 raid_detach(device_t self, int flags)
   3644 {
   3645 	int error;
   3646 	struct raid_softc *rs = raidsoftc(self);
   3647 
   3648 	if (rs == NULL)
   3649 		return ENXIO;
   3650 
   3651 	if ((error = raidlock(rs)) != 0)
   3652 		return (error);
   3653 
   3654 	error = raid_detach_unlocked(rs);
   3655 
   3656 	raidunlock(rs);
   3657 
   3658 	/* XXX raid can be referenced here */
   3659 
   3660 	if (error)
   3661 		return error;
   3662 
   3663 	/* Free the softc */
   3664 	raidput(rs);
   3665 
   3666 	return 0;
   3667 }
   3668 
   3669 static void
   3670 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3671 {
   3672 	struct dk_softc *dksc = &rs->sc_dksc;
   3673 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3674 
   3675 	memset(dg, 0, sizeof(*dg));
   3676 
   3677 	dg->dg_secperunit = raidPtr->totalSectors;
   3678 	dg->dg_secsize = raidPtr->bytesPerSector;
   3679 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3680 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3681 
   3682 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3683 }
   3684 
   3685 /*
   3686  * Get cache info for all the components (including spares).
   3687  * Returns intersection of all the cache flags of all disks, or first
   3688  * error if any encountered.
   3689  * XXXfua feature flags can change as spares are added - lock down somehow
   3690  */
   3691 static int
   3692 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3693 {
   3694 	int c;
   3695 	int error;
   3696 	int dkwhole = 0, dkpart;
   3697 
   3698 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3699 		/*
   3700 		 * Check any non-dead disk, even when currently being
   3701 		 * reconstructed.
   3702 		 */
   3703 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3704 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3705 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3706 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3707 			if (error) {
   3708 				if (error != ENODEV) {
   3709 					printf("raid%d: get cache for component %s failed\n",
   3710 					    raidPtr->raidid,
   3711 					    raidPtr->Disks[c].devname);
   3712 				}
   3713 
   3714 				return error;
   3715 			}
   3716 
   3717 			if (c == 0)
   3718 				dkwhole = dkpart;
   3719 			else
   3720 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3721 		}
   3722 	}
   3723 
   3724 	*data = dkwhole;
   3725 
   3726 	return 0;
   3727 }
   3728 
   3729 /*
   3730  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3731  * We end up returning whatever error was returned by the first cache flush
   3732  * that fails.
   3733  */
   3734 
   3735 int
   3736 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3737 {
   3738 	int c, sparecol;
   3739 	int e,error;
   3740 	int force = 1;
   3741 
   3742 	error = 0;
   3743 	for (c = 0; c < raidPtr->numCol; c++) {
   3744 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3745 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3746 					  &force, FWRITE, NOCRED);
   3747 			if (e) {
   3748 				if (e != ENODEV)
   3749 					printf("raid%d: cache flush to component %s failed.\n",
   3750 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3751 				if (error == 0) {
   3752 					error = e;
   3753 				}
   3754 			}
   3755 		}
   3756 	}
   3757 
   3758 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3759 		sparecol = raidPtr->numCol + c;
   3760 		/* Need to ensure that the reconstruct actually completed! */
   3761 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3762 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3763 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3764 			if (e) {
   3765 				if (e != ENODEV)
   3766 					printf("raid%d: cache flush to component %s failed.\n",
   3767 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3768 				if (error == 0) {
   3769 					error = e;
   3770 				}
   3771 			}
   3772 		}
   3773 	}
   3774 	return error;
   3775 }
   3776 
   3777 /* Fill in info with the current status */
   3778 void
   3779 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3780 {
   3781 
   3782 	if (raidPtr->status != rf_rs_reconstructing) {
   3783 		info->total = 100;
   3784 		info->completed = 100;
   3785 	} else {
   3786 		info->total = raidPtr->reconControl->numRUsTotal;
   3787 		info->completed = raidPtr->reconControl->numRUsComplete;
   3788 	}
   3789 	info->remaining = info->total - info->completed;
   3790 }
   3791 
   3792 /* Fill in info with the current status */
   3793 void
   3794 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3795 {
   3796 
   3797 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3798 		info->total = raidPtr->Layout.numStripe;
   3799 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3800 	} else {
   3801 		info->completed = 100;
   3802 		info->total = 100;
   3803 	}
   3804 	info->remaining = info->total - info->completed;
   3805 }
   3806 
   3807 /* Fill in info with the current status */
   3808 void
   3809 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3810 {
   3811 
   3812 	if (raidPtr->copyback_in_progress == 1) {
   3813 		info->total = raidPtr->Layout.numStripe;
   3814 		info->completed = raidPtr->copyback_stripes_done;
   3815 		info->remaining = info->total - info->completed;
   3816 	} else {
   3817 		info->remaining = 0;
   3818 		info->completed = 100;
   3819 		info->total = 100;
   3820 	}
   3821 }
   3822 
   3823 /* Fill in config with the current info */
   3824 int
   3825 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3826 {
   3827 	int	d, i, j;
   3828 
   3829 	if (!raidPtr->valid)
   3830 		return (ENODEV);
   3831 	config->cols = raidPtr->numCol;
   3832 	config->ndevs = raidPtr->numCol;
   3833 	if (config->ndevs >= RF_MAX_DISKS)
   3834 		return (ENOMEM);
   3835 	config->nspares = raidPtr->numSpare;
   3836 	if (config->nspares >= RF_MAX_DISKS)
   3837 		return (ENOMEM);
   3838 	config->maxqdepth = raidPtr->maxQueueDepth;
   3839 	d = 0;
   3840 	for (j = 0; j < config->cols; j++) {
   3841 		config->devs[d] = raidPtr->Disks[j];
   3842 		d++;
   3843 	}
   3844 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3845 		config->spares[i] = raidPtr->Disks[j];
   3846 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3847 			/* XXX: raidctl(8) expects to see this as a used spare */
   3848 			config->spares[i].status = rf_ds_used_spare;
   3849 		}
   3850 	}
   3851 	return 0;
   3852 }
   3853 
   3854 int
   3855 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3856 {
   3857 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3858 	RF_ComponentLabel_t *raid_clabel;
   3859 	int column = clabel->column;
   3860 
   3861 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3862 		return EINVAL;
   3863 	raid_clabel = raidget_component_label(raidPtr, column);
   3864 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3865 
   3866 	return 0;
   3867 }
   3868 
   3869 /*
   3870  * Module interface
   3871  */
   3872 
   3873 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3874 
   3875 #ifdef _MODULE
   3876 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3877 #endif
   3878 
   3879 static int raid_modcmd(modcmd_t, void *);
   3880 static int raid_modcmd_init(void);
   3881 static int raid_modcmd_fini(void);
   3882 
   3883 static int
   3884 raid_modcmd(modcmd_t cmd, void *data)
   3885 {
   3886 	int error;
   3887 
   3888 	error = 0;
   3889 	switch (cmd) {
   3890 	case MODULE_CMD_INIT:
   3891 		error = raid_modcmd_init();
   3892 		break;
   3893 	case MODULE_CMD_FINI:
   3894 		error = raid_modcmd_fini();
   3895 		break;
   3896 	default:
   3897 		error = ENOTTY;
   3898 		break;
   3899 	}
   3900 	return error;
   3901 }
   3902 
   3903 static int
   3904 raid_modcmd_init(void)
   3905 {
   3906 	int error;
   3907 	int bmajor, cmajor;
   3908 
   3909 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3910 	mutex_enter(&raid_lock);
   3911 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3912 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3913 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3914 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3915 
   3916 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3917 #endif
   3918 
   3919 	bmajor = cmajor = -1;
   3920 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3921 	    &raid_cdevsw, &cmajor);
   3922 	if (error != 0 && error != EEXIST) {
   3923 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3924 		mutex_exit(&raid_lock);
   3925 		return error;
   3926 	}
   3927 #ifdef _MODULE
   3928 	error = config_cfdriver_attach(&raid_cd);
   3929 	if (error != 0) {
   3930 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3931 		    __func__, error);
   3932 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3933 		mutex_exit(&raid_lock);
   3934 		return error;
   3935 	}
   3936 #endif
   3937 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3938 	if (error != 0) {
   3939 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3940 		    __func__, error);
   3941 #ifdef _MODULE
   3942 		config_cfdriver_detach(&raid_cd);
   3943 #endif
   3944 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3945 		mutex_exit(&raid_lock);
   3946 		return error;
   3947 	}
   3948 
   3949 	raidautoconfigdone = false;
   3950 
   3951 	mutex_exit(&raid_lock);
   3952 
   3953 	if (error == 0) {
   3954 		if (rf_BootRaidframe(true) == 0)
   3955 			aprint_verbose("Kernelized RAIDframe activated\n");
   3956 		else
   3957 			panic("Serious error activating RAID!!");
   3958 	}
   3959 
   3960 	/*
   3961 	 * Register a finalizer which will be used to auto-config RAID
   3962 	 * sets once all real hardware devices have been found.
   3963 	 */
   3964 	error = config_finalize_register(NULL, rf_autoconfig);
   3965 	if (error != 0) {
   3966 		aprint_error("WARNING: unable to register RAIDframe "
   3967 		    "finalizer\n");
   3968 		error = 0;
   3969 	}
   3970 
   3971 	return error;
   3972 }
   3973 
   3974 static int
   3975 raid_modcmd_fini(void)
   3976 {
   3977 	int error;
   3978 
   3979 	mutex_enter(&raid_lock);
   3980 
   3981 	/* Don't allow unload if raid device(s) exist.  */
   3982 	if (!LIST_EMPTY(&raids)) {
   3983 		mutex_exit(&raid_lock);
   3984 		return EBUSY;
   3985 	}
   3986 
   3987 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
   3988 	if (error != 0) {
   3989 		aprint_error("%s: cannot detach cfattach\n",__func__);
   3990 		mutex_exit(&raid_lock);
   3991 		return error;
   3992 	}
   3993 #ifdef _MODULE
   3994 	error = config_cfdriver_detach(&raid_cd);
   3995 	if (error != 0) {
   3996 		aprint_error("%s: cannot detach cfdriver\n",__func__);
   3997 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3998 		mutex_exit(&raid_lock);
   3999 		return error;
   4000 	}
   4001 #endif
   4002 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
   4003 	if (error != 0) {
   4004 		aprint_error("%s: cannot detach devsw\n",__func__);
   4005 #ifdef _MODULE
   4006 		config_cfdriver_attach(&raid_cd);
   4007 #endif
   4008 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   4009 		mutex_exit(&raid_lock);
   4010 		return error;
   4011 	}
   4012 	rf_BootRaidframe(false);
   4013 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   4014 	rf_destroy_mutex2(rf_sparet_wait_mutex);
   4015 	rf_destroy_cond2(rf_sparet_wait_cv);
   4016 	rf_destroy_cond2(rf_sparet_resp_cv);
   4017 #endif
   4018 	mutex_exit(&raid_lock);
   4019 	mutex_destroy(&raid_lock);
   4020 
   4021 	return error;
   4022 }
   4023