Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.364
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.364 2019/02/05 09:28:00 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.364 2019/02/05 09:28:00 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "rf_compat80.h"
    153 
/*
 * 32-bit compatibility ioctl support is only meaningful on LP64
 * kernels that also have COMPAT_NETBSD32 configured.
 * (A duplicate "#define RAID_COMPAT32" was removed here; redefining
 * an identical macro is legal but redundant.)
 */
#ifdef COMPAT_NETBSD32
#ifdef _LP64
#include "rf_compat32.h"
#define RAID_COMPAT32
#endif
#endif
    161 
    162 #include "ioconf.h"
    163 
    164 #ifdef DEBUG
    165 int     rf_kdebug_level = 0;
    166 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    167 #else				/* DEBUG */
    168 #define db1_printf(a) { }
    169 #endif				/* DEBUG */
    170 
    171 #ifdef DEBUG_ROOT
    172 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    173 #else
    174 #define DPRINTF(a, ...)
    175 #endif
    176 
    177 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    178 static rf_declare_mutex2(rf_sparet_wait_mutex);
    179 static rf_declare_cond2(rf_sparet_wait_cv);
    180 static rf_declare_cond2(rf_sparet_resp_cv);
    181 
    182 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    183 						 * spare table */
    184 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    185 						 * installation process */
    186 #endif
    187 
    188 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    189 
    190 /* prototypes */
    191 static void KernelWakeupFunc(struct buf *);
    192 static void InitBP(struct buf *, struct vnode *, unsigned,
    193     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    194     void *, int, struct proc *);
    195 struct raid_softc;
    196 static void raidinit(struct raid_softc *);
    197 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    198 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    199 
    200 static int raid_match(device_t, cfdata_t, void *);
    201 static void raid_attach(device_t, device_t, void *);
    202 static int raid_detach(device_t, int);
    203 
    204 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    205     daddr_t, daddr_t);
    206 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    207     daddr_t, daddr_t, int);
    208 
    209 static int raidwrite_component_label(unsigned,
    210     dev_t, struct vnode *, RF_ComponentLabel_t *);
    211 static int raidread_component_label(unsigned,
    212     dev_t, struct vnode *, RF_ComponentLabel_t *);
    213 
    214 static int raid_diskstart(device_t, struct buf *bp);
    215 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    216 static int raid_lastclose(device_t);
    217 
    218 static dev_type_open(raidopen);
    219 static dev_type_close(raidclose);
    220 static dev_type_read(raidread);
    221 static dev_type_write(raidwrite);
    222 static dev_type_ioctl(raidioctl);
    223 static dev_type_strategy(raidstrategy);
    224 static dev_type_dump(raiddump);
    225 static dev_type_size(raidsize);
    226 
/* Block-device switch: entry points for the raidN block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    237 
/* Character-device switch: entry points for the raw rraidN device nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    252 
/* dk(4) driver glue: callbacks the generic disk layer uses for raid units. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    262 
/* Per-unit driver state, one instance per raidN pseudo-device. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk(9)/dk(4) state; must be first */
	int	sc_unit;		/* unit number N of raidN */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* the RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global "raids" list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* Map a dev_t to its raid unit number. */
#define	raidunit(x)	DISKUNIT(x)
/* Map a device_t back to its raid_softc via the RF_Raid_t back-pointer. */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    285 
    286 extern struct cfdriver raid_cd;
    287 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    288     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    289     DVF_DETACH_SHUTDOWN);
    290 
/* Internal representation of a rf_recon_req (the user-visible
 * reconstruction request), carried to the reconstruction threads. */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column (component) to act on */
	RF_ReconReqFlags_t flags;	/* request flags from userland */
	void   *raidPtr;		/* the RF_Raid_t this applies to */
};
    297 
    298 /*
    299  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    300  * Be aware that large numbers can allow the driver to consume a lot of
    301  * kernel memory, especially on writes, and in degraded mode reads.
    302  *
    303  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    304  * a single 64K write will typically require 64K for the old data,
    305  * 64K for the old parity, and 64K for the new parity, for a total
    306  * of 192K (if the parity buffer is not re-used immediately).
    307  * Even it if is used immediately, that's still 128K, which when multiplied
    308  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    309  *
    310  * Now in degraded mode, for example, a 64K read on the above setup may
    311  * require data reconstruction, which will require *all* of the 4 remaining
    312  * disks to participate -- 4 * 32K/disk == 128K again.
    313  */
    314 
    315 #ifndef RAIDOUTSTANDING
    316 #define RAIDOUTSTANDING   6
    317 #endif
    318 
    319 #define RAIDLABELDEV(dev)	\
    320 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    321 
    322 /* declared here, and made public, for the benefit of KVM stuff.. */
    323 
    324 static int raidlock(struct raid_softc *);
    325 static void raidunlock(struct raid_softc *);
    326 
    327 static int raid_detach_unlocked(struct raid_softc *);
    328 
    329 static void rf_markalldirty(RF_Raid_t *);
    330 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    331 
    332 void rf_ReconThread(struct rf_recon_req_internal *);
    333 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    334 void rf_CopybackThread(RF_Raid_t *raidPtr);
    335 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    336 int rf_autoconfig(device_t);
    337 void rf_buildroothack(RF_ConfigSet_t *);
    338 
    339 RF_AutoConfig_t *rf_find_raid_components(void);
    340 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    341 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    342 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    343 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    344 int rf_set_autoconfig(RF_Raid_t *, int);
    345 int rf_set_rootpartition(RF_Raid_t *, int);
    346 void rf_release_all_vps(RF_ConfigSet_t *);
    347 void rf_cleanup_config_set(RF_ConfigSet_t *);
    348 int rf_have_enough_components(RF_ConfigSet_t *);
    349 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    350 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    351 
    352 /*
    353  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    354  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    355  * in the kernel config file.
    356  */
    357 #ifdef RAID_AUTOCONFIG
    358 int raidautoconfig = 1;
    359 #else
    360 int raidautoconfig = 0;
    361 #endif
    362 static bool raidautoconfigdone = false;
    363 
    364 struct RF_Pools_s rf_pools;
    365 
    366 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    367 static kmutex_t raid_lock;
    368 
    369 static struct raid_softc *
    370 raidcreate(int unit) {
    371 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    372 	sc->sc_unit = unit;
    373 	cv_init(&sc->sc_cv, "raidunit");
    374 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    375 	return sc;
    376 }
    377 
    378 static void
    379 raiddestroy(struct raid_softc *sc) {
    380 	cv_destroy(&sc->sc_cv);
    381 	mutex_destroy(&sc->sc_mutex);
    382 	kmem_free(sc, sizeof(*sc));
    383 }
    384 
    385 static struct raid_softc *
    386 raidget(int unit, bool create) {
    387 	struct raid_softc *sc;
    388 	if (unit < 0) {
    389 #ifdef DIAGNOSTIC
    390 		panic("%s: unit %d!", __func__, unit);
    391 #endif
    392 		return NULL;
    393 	}
    394 	mutex_enter(&raid_lock);
    395 	LIST_FOREACH(sc, &raids, sc_link) {
    396 		if (sc->sc_unit == unit) {
    397 			mutex_exit(&raid_lock);
    398 			return sc;
    399 		}
    400 	}
    401 	mutex_exit(&raid_lock);
    402 	if (!create)
    403 		return NULL;
    404 	if ((sc = raidcreate(unit)) == NULL)
    405 		return NULL;
    406 	mutex_enter(&raid_lock);
    407 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    408 	mutex_exit(&raid_lock);
    409 	return sc;
    410 }
    411 
    412 static void
    413 raidput(struct raid_softc *sc) {
    414 	mutex_enter(&raid_lock);
    415 	LIST_REMOVE(sc, sc_link);
    416 	mutex_exit(&raid_lock);
    417 	raiddestroy(sc);
    418 }
    419 
/*
 * Historical pseudo-device attach hook; "num" is ignored.  Kept only
 * because the pseudo-device framework expects it to exist.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    429 
/*
 * One-shot RAID autoconfiguration: locate RAID components on all
 * disks, group them into configuration sets, and hand the sets to
 * rf_buildroothack() for configuration and root-device selection.
 *
 * Returns 1 if a scan was performed, 0 if autoconfiguration is
 * disabled or has already run.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    467 
    468 static int
    469 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    470 	const char *bootname;
    471 	size_t len;
    472 
    473 	/* if bdv is NULL, the set can't contain it. exit early. */
    474 	if (bdv == NULL)
    475 		return 0;
    476 
    477 	bootname = device_xname(bdv);
    478 	len = strlen(bootname);
    479 
    480 	for (int col = 0; col < r->numCol; col++) {
    481 		const char *devname = r->Disks[col].devname;
    482 		devname += sizeof("/dev/") - 1;
    483 		if (strncmp(devname, "dk", 2) == 0) {
    484 			const char *parent =
    485 			    dkwedge_get_parent_name(r->Disks[col].dev);
    486 			if (parent != NULL)
    487 				devname = parent;
    488 		}
    489 		if (strncmp(devname, bootname, len) == 0) {
    490 			struct raid_softc *sc = r->softc;
    491 			aprint_debug("raid%d includes boot device %s\n",
    492 			    sc->sc_unit, devname);
    493 			return 1;
    494 		}
    495 	}
    496 	return 0;
    497 }
    498 
    499 void
    500 rf_buildroothack(RF_ConfigSet_t *config_sets)
    501 {
    502 	RF_ConfigSet_t *cset;
    503 	RF_ConfigSet_t *next_cset;
    504 	int num_root;
    505 	struct raid_softc *sc, *rsc;
    506 	struct dk_softc *dksc;
    507 
    508 	sc = rsc = NULL;
    509 	num_root = 0;
    510 	cset = config_sets;
    511 	while (cset != NULL) {
    512 		next_cset = cset->next;
    513 		if (rf_have_enough_components(cset) &&
    514 		    cset->ac->clabel->autoconfigure == 1) {
    515 			sc = rf_auto_config_set(cset);
    516 			if (sc != NULL) {
    517 				aprint_debug("raid%d: configured ok, rootable %d\n",
    518 				    sc->sc_unit, cset->rootable);
    519 				if (cset->rootable) {
    520 					rsc = sc;
    521 					num_root++;
    522 				}
    523 			} else {
    524 				/* The autoconfig didn't work :( */
    525 				aprint_debug("Autoconfig failed\n");
    526 				rf_release_all_vps(cset);
    527 			}
    528 		} else {
    529 			/* we're not autoconfiguring this set...
    530 			   release the associated resources */
    531 			rf_release_all_vps(cset);
    532 		}
    533 		/* cleanup */
    534 		rf_cleanup_config_set(cset);
    535 		cset = next_cset;
    536 	}
    537 	dksc = &rsc->sc_dksc;
    538 
    539 	/* if the user has specified what the root device should be
    540 	   then we don't touch booted_device or boothowto... */
    541 
    542 	if (rootspec != NULL) {
    543 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    544 		return;
    545 	}
    546 
    547 	/* we found something bootable... */
    548 
    549 	/*
    550 	 * XXX: The following code assumes that the root raid
    551 	 * is the first ('a') partition. This is about the best
    552 	 * we can do with a BSD disklabel, but we might be able
    553 	 * to do better with a GPT label, by setting a specified
    554 	 * attribute to indicate the root partition. We can then
    555 	 * stash the partition number in the r->root_partition
    556 	 * high bits (the bottom 2 bits are already used). For
    557 	 * now we just set booted_partition to 0 when we override
    558 	 * root.
    559 	 */
    560 	if (num_root == 1) {
    561 		device_t candidate_root;
    562 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    563 			char cname[sizeof(cset->ac->devname)];
    564 			/* XXX: assume partition 'a' first */
    565 			snprintf(cname, sizeof(cname), "%s%c",
    566 			    device_xname(dksc->sc_dev), 'a');
    567 			candidate_root = dkwedge_find_by_wname(cname);
    568 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    569 			    cname);
    570 			if (candidate_root == NULL) {
    571 				/*
    572 				 * If that is not found, because we don't use
    573 				 * disklabel, return the first dk child
    574 				 * XXX: we can skip the 'a' check above
    575 				 * and always do this...
    576 				 */
    577 				size_t i = 0;
    578 				candidate_root = dkwedge_find_by_parent(
    579 				    device_xname(dksc->sc_dev), &i);
    580 			}
    581 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    582 			    candidate_root);
    583 		} else
    584 			candidate_root = dksc->sc_dev;
    585 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    586 		DPRINTF("%s: booted_device=%p root_partition=%d "
    587 			"contains_boot=%d",
    588 		    __func__, booted_device, rsc->sc_r.root_partition,
    589 			   rf_containsboot(&rsc->sc_r, booted_device));
    590 		/* XXX the check for booted_device == NULL can probably be
    591 		 * dropped, now that rf_containsboot handles that case.
    592 		 */
    593 		if (booted_device == NULL ||
    594 		    rsc->sc_r.root_partition == 1 ||
    595 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    596 			booted_device = candidate_root;
    597 			booted_method = "raidframe/single";
    598 			booted_partition = 0;	/* XXX assume 'a' */
    599 		}
    600 	} else if (num_root > 1) {
    601 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    602 		    booted_device);
    603 
    604 		/*
    605 		 * Maybe the MD code can help. If it cannot, then
    606 		 * setroot() will discover that we have no
    607 		 * booted_device and will ask the user if nothing was
    608 		 * hardwired in the kernel config file
    609 		 */
    610 		if (booted_device == NULL)
    611 			return;
    612 
    613 		num_root = 0;
    614 		mutex_enter(&raid_lock);
    615 		LIST_FOREACH(sc, &raids, sc_link) {
    616 			RF_Raid_t *r = &sc->sc_r;
    617 			if (r->valid == 0)
    618 				continue;
    619 
    620 			if (r->root_partition == 0)
    621 				continue;
    622 
    623 			if (rf_containsboot(r, booted_device)) {
    624 				num_root++;
    625 				rsc = sc;
    626 				dksc = &rsc->sc_dksc;
    627 			}
    628 		}
    629 		mutex_exit(&raid_lock);
    630 
    631 		if (num_root == 1) {
    632 			booted_device = dksc->sc_dev;
    633 			booted_method = "raidframe/multi";
    634 			booted_partition = 0;	/* XXX assume 'a' */
    635 		} else {
    636 			/* we can't guess.. require the user to answer... */
    637 			boothowto |= RB_ASKNAME;
    638 		}
    639 	}
    640 }
    641 
    642 static int
    643 raidsize(dev_t dev)
    644 {
    645 	struct raid_softc *rs;
    646 	struct dk_softc *dksc;
    647 	unsigned int unit;
    648 
    649 	unit = raidunit(dev);
    650 	if ((rs = raidget(unit, false)) == NULL)
    651 		return -1;
    652 	dksc = &rs->sc_dksc;
    653 
    654 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    655 		return -1;
    656 
    657 	return dk_size(dksc, dev);
    658 }
    659 
/*
 * bdev d_dump entry point: write "size" bytes from "va" to the raid
 * device at partition-relative block "blkno" during a crash dump.
 * Returns ENXIO for a nonexistent unit, ENODEV if unconfigured.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size);
}
    684 
/*
 * dk(4) dumpblocks callback: write "nblk" blocks from "va" directly
 * to a single live component of a RAID 1 set.  Only 1 data + 1
 * parity column layouts are supported, since each component then
 * holds a complete copy of the data.  The component preference
 * order is documented inline below.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight through to the chosen component's block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    790 
    791 /* ARGSUSED */
    792 static int
    793 raidopen(dev_t dev, int flags, int fmt,
    794     struct lwp *l)
    795 {
    796 	int     unit = raidunit(dev);
    797 	struct raid_softc *rs;
    798 	struct dk_softc *dksc;
    799 	int     error = 0;
    800 	int     part, pmask;
    801 
    802 	if ((rs = raidget(unit, true)) == NULL)
    803 		return ENXIO;
    804 	if ((error = raidlock(rs)) != 0)
    805 		return (error);
    806 
    807 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    808 		error = EBUSY;
    809 		goto bad;
    810 	}
    811 
    812 	dksc = &rs->sc_dksc;
    813 
    814 	part = DISKPART(dev);
    815 	pmask = (1 << part);
    816 
    817 	if (!DK_BUSY(dksc, pmask) &&
    818 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    819 		/* First one... mark things as dirty... Note that we *MUST*
    820 		 have done a configure before this.  I DO NOT WANT TO BE
    821 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    822 		 THAT THEY BELONG TOGETHER!!!!! */
    823 		/* XXX should check to see if we're only open for reading
    824 		   here... If so, we needn't do this, but then need some
    825 		   other way of keeping track of what's happened.. */
    826 
    827 		rf_markalldirty(&rs->sc_r);
    828 	}
    829 
    830 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    831 		error = dk_open(dksc, dev, flags, fmt, l);
    832 
    833 bad:
    834 	raidunlock(rs);
    835 
    836 	return (error);
    837 
    838 
    839 }
    840 
    841 static int
    842 raid_lastclose(device_t self)
    843 {
    844 	struct raid_softc *rs = raidsoftc(self);
    845 
    846 	/* Last one... device is not unconfigured yet.
    847 	   Device shutdown has taken care of setting the
    848 	   clean bits if RAIDF_INITED is not set
    849 	   mark things as clean... */
    850 
    851 	rf_update_component_labels(&rs->sc_r,
    852 	    RF_FINAL_COMPONENT_UPDATE);
    853 
    854 	/* pass to unlocked code */
    855 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    856 		rs->sc_flags |= RAIDF_DETACH;
    857 
    858 	return 0;
    859 }
    860 
/*
 * Close entry point for both block and character devices.  After the
 * dk-level close, either detach the pseudo-device (RAIDF_DETACH, set
 * by raid_lastclose()) or release an unconfigured softc that was
 * pending shutdown.  The detach/put work happens after raidunlock()
 * because both paths destroy the softc the lock lives in.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have requested a detach */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    900 
/*
 * Wake the RAIDframe engine: signal the iodone condition variable
 * (under its lock) so queued work gets picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    908 
    909 static void
    910 raidstrategy(struct buf *bp)
    911 {
    912 	unsigned int unit;
    913 	struct raid_softc *rs;
    914 	struct dk_softc *dksc;
    915 	RF_Raid_t *raidPtr;
    916 
    917 	unit = raidunit(bp->b_dev);
    918 	if ((rs = raidget(unit, false)) == NULL) {
    919 		bp->b_error = ENXIO;
    920 		goto fail;
    921 	}
    922 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    923 		bp->b_error = ENXIO;
    924 		goto fail;
    925 	}
    926 	dksc = &rs->sc_dksc;
    927 	raidPtr = &rs->sc_r;
    928 
    929 	/* Queue IO only */
    930 	if (dk_strategy_defer(dksc, bp))
    931 		goto done;
    932 
    933 	/* schedule the IO to happen at the next convenient time */
    934 	raid_wakeup(raidPtr);
    935 
    936 done:
    937 	return;
    938 
    939 fail:
    940 	bp->b_resid = bp->b_bcount;
    941 	biodone(bp);
    942 }
    943 
    944 static int
    945 raid_diskstart(device_t dev, struct buf *bp)
    946 {
    947 	struct raid_softc *rs = raidsoftc(dev);
    948 	RF_Raid_t *raidPtr;
    949 
    950 	raidPtr = &rs->sc_r;
    951 	if (!raidPtr->valid) {
    952 		db1_printf(("raid is not valid..\n"));
    953 		return ENODEV;
    954 	}
    955 
    956 	/* XXX */
    957 	bp->b_resid = 0;
    958 
    959 	return raiddoaccess(raidPtr, bp);
    960 }
    961 
    962 void
    963 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    964 {
    965 	struct raid_softc *rs;
    966 	struct dk_softc *dksc;
    967 
    968 	rs = raidPtr->softc;
    969 	dksc = &rs->sc_dksc;
    970 
    971 	dk_done(dksc, bp);
    972 
    973 	rf_lock_mutex2(raidPtr->mutex);
    974 	raidPtr->openings++;
    975 	rf_unlock_mutex2(raidPtr->mutex);
    976 
    977 	/* schedule more IO */
    978 	raid_wakeup(raidPtr);
    979 }
    980 
    981 /* ARGSUSED */
    982 static int
    983 raidread(dev_t dev, struct uio *uio, int flags)
    984 {
    985 	int     unit = raidunit(dev);
    986 	struct raid_softc *rs;
    987 
    988 	if ((rs = raidget(unit, false)) == NULL)
    989 		return ENXIO;
    990 
    991 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    992 		return (ENXIO);
    993 
    994 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    995 
    996 }
    997 
    998 /* ARGSUSED */
    999 static int
   1000 raidwrite(dev_t dev, struct uio *uio, int flags)
   1001 {
   1002 	int     unit = raidunit(dev);
   1003 	struct raid_softc *rs;
   1004 
   1005 	if ((rs = raidget(unit, false)) == NULL)
   1006 		return ENXIO;
   1007 
   1008 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1009 		return (ENXIO);
   1010 
   1011 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1012 
   1013 }
   1014 
/*
 * raid_detach_unlocked -- tear down a configured RAID set.
 *
 * Refuses with EBUSY while the device is still in use or any
 * reconstruction, parity rewrite, or copyback is in progress.
 * Otherwise shuts down RAIDframe and detaches the dk/disk layers.
 * Returns 0 on success or the error from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach a busy set or one mid-operation. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing further to do if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1052 
   1053 static int
   1054 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1055 {
   1056 	int     unit = raidunit(dev);
   1057 	int     error = 0;
   1058 	int     part, pmask;
   1059 	struct raid_softc *rs;
   1060 	struct dk_softc *dksc;
   1061 	RF_Config_t *k_cfg, *u_cfg;
   1062 	RF_Raid_t *raidPtr;
   1063 	RF_RaidDisk_t *diskPtr;
   1064 	RF_AccTotals_t *totals;
   1065 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1066 	u_char *specific_buf;
   1067 	int retcode = 0;
   1068 	int column;
   1069 /*	int raidid; */
   1070 	struct rf_recon_req *rr;
   1071 	struct rf_recon_req_internal *rrint;
   1072 	RF_ComponentLabel_t *clabel;
   1073 	RF_ComponentLabel_t *ci_label;
   1074 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1075 	RF_SingleComponent_t component;
   1076 	int d;
   1077 
   1078 	if ((rs = raidget(unit, false)) == NULL)
   1079 		return ENXIO;
   1080 	dksc = &rs->sc_dksc;
   1081 	raidPtr = &rs->sc_r;
   1082 
   1083 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1084 		(int) DISKPART(dev), (int) unit, cmd));
   1085 
   1086 	/* Must be initialized for these... */
   1087 	switch (cmd) {
   1088 	case RAIDFRAME_REWRITEPARITY:
   1089 	case RAIDFRAME_GET_INFO:
   1090 	case RAIDFRAME_RESET_ACCTOTALS:
   1091 	case RAIDFRAME_GET_ACCTOTALS:
   1092 	case RAIDFRAME_KEEP_ACCTOTALS:
   1093 	case RAIDFRAME_GET_SIZE:
   1094 	case RAIDFRAME_FAIL_DISK:
   1095 	case RAIDFRAME_COPYBACK:
   1096 	case RAIDFRAME_CHECK_RECON_STATUS:
   1097 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1098 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1099 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1100 	case RAIDFRAME_ADD_HOT_SPARE:
   1101 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1102 	case RAIDFRAME_INIT_LABELS:
   1103 	case RAIDFRAME_REBUILD_IN_PLACE:
   1104 	case RAIDFRAME_CHECK_PARITY:
   1105 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1106 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1107 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1108 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1109 	case RAIDFRAME_SET_AUTOCONFIG:
   1110 	case RAIDFRAME_SET_ROOT:
   1111 	case RAIDFRAME_DELETE_COMPONENT:
   1112 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1113 	case RAIDFRAME_PARITYMAP_STATUS:
   1114 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1115 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1116 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1117 #ifdef RAID_COMPAT32
   1118 	case RAIDFRAME_GET_INFO32:
   1119 #endif
   1120 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1121 			return (ENXIO);
   1122 	}
   1123 
   1124 	/*
   1125 	 * Handle compat ioctl calls
   1126 	 *
   1127 	 * * If compat code is not loaded, stub returns ENOSYS and we just
   1128 	 *   check the "native" cmd's
   1129 	 * * If compat code is loaded but does not recognize the cmd, it
   1130 	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
   1131 	 * * If compat code returns EAGAIN, we need to finish via config
   1132 	 * * Otherwise the cmd has been handled and we just return
   1133 	 */
   1134 	module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1135 	MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
   1136 	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
   1137 	    enosys(), retcode);
   1138 	if (retcode == ENOSYS)
   1139 		retcode = 0;
   1140 	else if (retcode == EAGAIN)
   1141 		goto config;
   1142 	else if (retcode != EPASSTHROUGH)
   1143 		return retcode;
   1144 
   1145 	module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1146 	MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
   1147 	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
   1148 	    enosys(), retcode);
   1149 	if (retcode == ENOSYS)
   1150 		retcode = 0;
   1151 	else if (retcode == EAGAIN)
   1152 		goto config;
   1153 	else if (retcode != EPASSTHROUGH)
   1154 		return retcode;
   1155 
   1156 	/*
   1157 	 * XXX
   1158 	 * Handling of FAIL_DISK80 command requires us to retain retcode's
   1159 	 * value of EPASSTHROUGH.  If you add more compat code later, make
   1160 	 * sure you don't overwrite retcode and break this!
   1161 	 */
   1162 
   1163 	switch (cmd) {
   1164 
   1165 		/* configure the system */
   1166 	case RAIDFRAME_CONFIGURE:
   1167 #ifdef RAID_COMPAT32
   1168 	case RAIDFRAME_CONFIGURE32:
   1169 #endif
   1170 
   1171 		if (raidPtr->valid) {
   1172 			/* There is a valid RAID set running on this unit! */
   1173 			printf("raid%d: Device already configured!\n",unit);
   1174 			return(EINVAL);
   1175 		}
   1176 
   1177 		/* copy-in the configuration information */
   1178 		/* data points to a pointer to the configuration structure */
   1179 
   1180 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1181 		if (k_cfg == NULL) {
   1182 			return (ENOMEM);
   1183 		}
   1184 #ifdef RAID_COMPAT32
   1185 		if (cmd == RAIDFRAME_CONFIGURE32 &&
   1186 		    (l->l_proc->p_flag & PK_32) != 0)
   1187 			MODULE_CALL_HOOK(raidframe_netbsd32_config_hook,
   1188 			    (data, k_cfg), enosys(), retcode);
   1189 		else
   1190 #endif
   1191 		{
   1192 			u_cfg = *((RF_Config_t **) data);
   1193 			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1194 		}
   1195 		if (retcode) {
   1196 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1197 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1198 				retcode));
   1199 			goto no_config;
   1200 		}
   1201 		goto config;
   1202 	config:
   1203 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1204 
   1205 		/* allocate a buffer for the layout-specific data, and copy it
   1206 		 * in */
   1207 		if (k_cfg->layoutSpecificSize) {
   1208 			if (k_cfg->layoutSpecificSize > 10000) {
   1209 				/* sanity check */
   1210 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1211 				retcode = EINVAL;
   1212 				goto no_config;
   1213 			}
   1214 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1215 			    (u_char *));
   1216 			if (specific_buf == NULL) {
   1217 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1218 				retcode = ENOMEM;
   1219 				goto no_config;
   1220 			}
   1221 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1222 			    k_cfg->layoutSpecificSize);
   1223 			if (retcode) {
   1224 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1225 				RF_Free(specific_buf,
   1226 					k_cfg->layoutSpecificSize);
   1227 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1228 					retcode));
   1229 				goto no_config;
   1230 			}
   1231 		} else
   1232 			specific_buf = NULL;
   1233 		k_cfg->layoutSpecific = specific_buf;
   1234 
   1235 		/* should do some kind of sanity check on the configuration.
   1236 		 * Store the sum of all the bytes in the last byte? */
   1237 
   1238 		/* configure the system */
   1239 
   1240 		/*
   1241 		 * Clear the entire RAID descriptor, just to make sure
   1242 		 *  there is no stale data left in the case of a
   1243 		 *  reconfiguration
   1244 		 */
   1245 		memset(raidPtr, 0, sizeof(*raidPtr));
   1246 		raidPtr->softc = rs;
   1247 		raidPtr->raidid = unit;
   1248 
   1249 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1250 
   1251 		if (retcode == 0) {
   1252 
   1253 			/* allow this many simultaneous IO's to
   1254 			   this RAID device */
   1255 			raidPtr->openings = RAIDOUTSTANDING;
   1256 
   1257 			raidinit(rs);
   1258 			raid_wakeup(raidPtr);
   1259 			rf_markalldirty(raidPtr);
   1260 		}
   1261 		/* free the buffers.  No return code here. */
   1262 		if (k_cfg->layoutSpecificSize) {
   1263 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1264 		}
   1265 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1266 
   1267 	no_config:
   1268 		/*
   1269 		 * If configuration failed, set sc_flags so that we
   1270 		 * will detach the device when we close it.
   1271 		 */
   1272 		if (retcode != 0)
   1273 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1274 		return (retcode);
   1275 
   1276 		/* shutdown the system */
   1277 	case RAIDFRAME_SHUTDOWN:
   1278 
   1279 		part = DISKPART(dev);
   1280 		pmask = (1 << part);
   1281 
   1282 		if ((error = raidlock(rs)) != 0)
   1283 			return (error);
   1284 
   1285 		if (DK_BUSY(dksc, pmask) ||
   1286 		    raidPtr->recon_in_progress != 0 ||
   1287 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1288 		    raidPtr->copyback_in_progress != 0)
   1289 			retcode = EBUSY;
   1290 		else {
   1291 			/* detach and free on close */
   1292 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1293 			retcode = 0;
   1294 		}
   1295 
   1296 		raidunlock(rs);
   1297 
   1298 		return (retcode);
   1299 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1300 		return rf_get_component_label(raidPtr, data);
   1301 
   1302 #if 0
   1303 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1304 		clabel = (RF_ComponentLabel_t *) data;
   1305 
   1306 		/* XXX check the label for valid stuff... */
   1307 		/* Note that some things *should not* get modified --
   1308 		   the user should be re-initing the labels instead of
   1309 		   trying to patch things.
   1310 		   */
   1311 
   1312 		raidid = raidPtr->raidid;
   1313 #ifdef DEBUG
   1314 		printf("raid%d: Got component label:\n", raidid);
   1315 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1316 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1317 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1318 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1319 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1320 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1321 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1322 #endif	/* DEBUG */
   1323 		clabel->row = 0;
   1324 		column = clabel->column;
   1325 
   1326 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1327 			return(EINVAL);
   1328 		}
   1329 
   1330 		/* XXX this isn't allowed to do anything for now :-) */
   1331 
   1332 		/* XXX and before it is, we need to fill in the rest
   1333 		   of the fields!?!?!?! */
   1334 		memcpy(raidget_component_label(raidPtr, column),
   1335 		    clabel, sizeof(*clabel));
   1336 		raidflush_component_label(raidPtr, column);
   1337 		return (0);
   1338 #endif	/* 0 */
   1339 
   1340 	case RAIDFRAME_INIT_LABELS:
   1341 		clabel = (RF_ComponentLabel_t *) data;
   1342 		/*
   1343 		   we only want the serial number from
   1344 		   the above.  We get all the rest of the information
   1345 		   from the config that was used to create this RAID
   1346 		   set.
   1347 		   */
   1348 
   1349 		raidPtr->serial_number = clabel->serial_number;
   1350 
   1351 		for(column=0;column<raidPtr->numCol;column++) {
   1352 			diskPtr = &raidPtr->Disks[column];
   1353 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1354 				ci_label = raidget_component_label(raidPtr,
   1355 				    column);
   1356 				/* Zeroing this is important. */
   1357 				memset(ci_label, 0, sizeof(*ci_label));
   1358 				raid_init_component_label(raidPtr, ci_label);
   1359 				ci_label->serial_number =
   1360 				    raidPtr->serial_number;
   1361 				ci_label->row = 0; /* we dont' pretend to support more */
   1362 				rf_component_label_set_partitionsize(ci_label,
   1363 				    diskPtr->partitionSize);
   1364 				ci_label->column = column;
   1365 				raidflush_component_label(raidPtr, column);
   1366 			}
   1367 			/* XXXjld what about the spares? */
   1368 		}
   1369 
   1370 		return (retcode);
   1371 	case RAIDFRAME_SET_AUTOCONFIG:
   1372 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1373 		printf("raid%d: New autoconfig value is: %d\n",
   1374 		       raidPtr->raidid, d);
   1375 		*(int *) data = d;
   1376 		return (retcode);
   1377 
   1378 	case RAIDFRAME_SET_ROOT:
   1379 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1380 		printf("raid%d: New rootpartition value is: %d\n",
   1381 		       raidPtr->raidid, d);
   1382 		*(int *) data = d;
   1383 		return (retcode);
   1384 
   1385 		/* initialize all parity */
   1386 	case RAIDFRAME_REWRITEPARITY:
   1387 
   1388 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1389 			/* Parity for RAID 0 is trivially correct */
   1390 			raidPtr->parity_good = RF_RAID_CLEAN;
   1391 			return(0);
   1392 		}
   1393 
   1394 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1395 			/* Re-write is already in progress! */
   1396 			return(EINVAL);
   1397 		}
   1398 
   1399 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1400 					   rf_RewriteParityThread,
   1401 					   raidPtr,"raid_parity");
   1402 		return (retcode);
   1403 
   1404 
   1405 	case RAIDFRAME_ADD_HOT_SPARE:
   1406 		sparePtr = (RF_SingleComponent_t *) data;
   1407 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1408 		retcode = rf_add_hot_spare(raidPtr, &component);
   1409 		return(retcode);
   1410 
   1411 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1412 		return(retcode);
   1413 
   1414 	case RAIDFRAME_DELETE_COMPONENT:
   1415 		componentPtr = (RF_SingleComponent_t *)data;
   1416 		memcpy( &component, componentPtr,
   1417 			sizeof(RF_SingleComponent_t));
   1418 		retcode = rf_delete_component(raidPtr, &component);
   1419 		return(retcode);
   1420 
   1421 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1422 		componentPtr = (RF_SingleComponent_t *)data;
   1423 		memcpy( &component, componentPtr,
   1424 			sizeof(RF_SingleComponent_t));
   1425 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1426 		return(retcode);
   1427 
   1428 	case RAIDFRAME_REBUILD_IN_PLACE:
   1429 
   1430 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1431 			/* Can't do this on a RAID 0!! */
   1432 			return(EINVAL);
   1433 		}
   1434 
   1435 		if (raidPtr->recon_in_progress == 1) {
   1436 			/* a reconstruct is already in progress! */
   1437 			return(EINVAL);
   1438 		}
   1439 
   1440 		componentPtr = (RF_SingleComponent_t *) data;
   1441 		memcpy( &component, componentPtr,
   1442 			sizeof(RF_SingleComponent_t));
   1443 		component.row = 0; /* we don't support any more */
   1444 		column = component.column;
   1445 
   1446 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1447 			return(EINVAL);
   1448 		}
   1449 
   1450 		rf_lock_mutex2(raidPtr->mutex);
   1451 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1452 		    (raidPtr->numFailures > 0)) {
   1453 			/* XXX 0 above shouldn't be constant!!! */
   1454 			/* some component other than this has failed.
   1455 			   Let's not make things worse than they already
   1456 			   are... */
   1457 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1458 			       raidPtr->raidid);
   1459 			printf("raid%d:     Col: %d   Too many failures.\n",
   1460 			       raidPtr->raidid, column);
   1461 			rf_unlock_mutex2(raidPtr->mutex);
   1462 			return (EINVAL);
   1463 		}
   1464 		if (raidPtr->Disks[column].status ==
   1465 		    rf_ds_reconstructing) {
   1466 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1467 			       raidPtr->raidid);
   1468 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1469 
   1470 			rf_unlock_mutex2(raidPtr->mutex);
   1471 			return (EINVAL);
   1472 		}
   1473 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1474 			rf_unlock_mutex2(raidPtr->mutex);
   1475 			return (EINVAL);
   1476 		}
   1477 		rf_unlock_mutex2(raidPtr->mutex);
   1478 
   1479 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1480 		if (rrint == NULL)
   1481 			return(ENOMEM);
   1482 
   1483 		rrint->col = column;
   1484 		rrint->raidPtr = raidPtr;
   1485 
   1486 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1487 					   rf_ReconstructInPlaceThread,
   1488 					   rrint, "raid_reconip");
   1489 		return(retcode);
   1490 
   1491 	case RAIDFRAME_GET_INFO:
   1492 #ifdef RAID_COMPAT32
   1493 	case RAIDFRAME_GET_INFO32:
   1494 #endif
   1495 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1496 			  (RF_DeviceConfig_t *));
   1497 		if (d_cfg == NULL)
   1498 			return (ENOMEM);
   1499 		retcode = rf_get_info(raidPtr, d_cfg);
   1500 		if (retcode == 0) {
   1501 #ifdef RAID_COMPAT32
   1502 			if (raidframe_netbsd32_config_hook.hooked &&
   1503 			    cmd == RAIDFRAME_GET_INFO32)
   1504 				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
   1505 			else
   1506 #endif
   1507 				ucfgp = *(RF_DeviceConfig_t **)data;
   1508 			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
   1509 		}
   1510 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1511 
   1512 		return (retcode);
   1513 
   1514 	case RAIDFRAME_CHECK_PARITY:
   1515 		*(int *) data = raidPtr->parity_good;
   1516 		return (0);
   1517 
   1518 	case RAIDFRAME_PARITYMAP_STATUS:
   1519 		if (rf_paritymap_ineligible(raidPtr))
   1520 			return EINVAL;
   1521 		rf_paritymap_status(raidPtr->parity_map,
   1522 		    (struct rf_pmstat *)data);
   1523 		return 0;
   1524 
   1525 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1526 		if (rf_paritymap_ineligible(raidPtr))
   1527 			return EINVAL;
   1528 		if (raidPtr->parity_map == NULL)
   1529 			return ENOENT; /* ??? */
   1530 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1531 			(struct rf_pmparams *)data, 1))
   1532 			return EINVAL;
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1539 		return 0;
   1540 
   1541 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1542 		if (rf_paritymap_ineligible(raidPtr))
   1543 			return EINVAL;
   1544 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1545 		/* XXX should errors be passed up? */
   1546 		return 0;
   1547 
   1548 	case RAIDFRAME_RESET_ACCTOTALS:
   1549 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1550 		return (0);
   1551 
   1552 	case RAIDFRAME_GET_ACCTOTALS:
   1553 		totals = (RF_AccTotals_t *) data;
   1554 		*totals = raidPtr->acc_totals;
   1555 		return (0);
   1556 
   1557 	case RAIDFRAME_KEEP_ACCTOTALS:
   1558 		raidPtr->keep_acc_totals = *(int *)data;
   1559 		return (0);
   1560 
   1561 	case RAIDFRAME_GET_SIZE:
   1562 		*(int *) data = raidPtr->totalSectors;
   1563 		return (0);
   1564 
   1565 		/* fail a disk & optionally start reconstruction */
   1566 	case RAIDFRAME_FAIL_DISK80:
   1567 		/* Check if we called compat code for this cmd */
   1568 		if (retcode != EPASSTHROUGH)
   1569 			return EINVAL;
   1570 		/* FALLTHRU */
   1571 	case RAIDFRAME_FAIL_DISK:
   1572 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1573 			/* Can't do this on a RAID 0!! */
   1574 			return(EINVAL);
   1575 		}
   1576 
   1577 		rr = (struct rf_recon_req *) data;
   1578 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1579 			return (EINVAL);
   1580 
   1581 		rf_lock_mutex2(raidPtr->mutex);
   1582 		if (raidPtr->status == rf_rs_reconstructing) {
   1583 			/* you can't fail a disk while we're reconstructing! */
   1584 			/* XXX wrong for RAID6 */
   1585 			rf_unlock_mutex2(raidPtr->mutex);
   1586 			return (EINVAL);
   1587 		}
   1588 		if ((raidPtr->Disks[rr->col].status ==
   1589 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1590 			/* some other component has failed.  Let's not make
   1591 			   things worse. XXX wrong for RAID6 */
   1592 			rf_unlock_mutex2(raidPtr->mutex);
   1593 			return (EINVAL);
   1594 		}
   1595 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1596 			/* Can't fail a spared disk! */
   1597 			rf_unlock_mutex2(raidPtr->mutex);
   1598 			return (EINVAL);
   1599 		}
   1600 		rf_unlock_mutex2(raidPtr->mutex);
   1601 
   1602 		/* make a copy of the recon request so that we don't rely on
   1603 		 * the user's buffer */
   1604 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1605 		if (rrint == NULL)
   1606 			return(ENOMEM);
   1607 		rrint->col = rr->col;
   1608 		rrint->flags = rr->flags;
   1609 		rrint->raidPtr = raidPtr;
   1610 
   1611 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1612 					   rf_ReconThread,
   1613 					   rrint, "raid_recon");
   1614 		return (0);
   1615 
   1616 		/* invoke a copyback operation after recon on whatever disk
   1617 		 * needs it, if any */
   1618 	case RAIDFRAME_COPYBACK:
   1619 
   1620 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1621 			/* This makes no sense on a RAID 0!! */
   1622 			return(EINVAL);
   1623 		}
   1624 
   1625 		if (raidPtr->copyback_in_progress == 1) {
   1626 			/* Copyback is already in progress! */
   1627 			return(EINVAL);
   1628 		}
   1629 
   1630 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1631 					   rf_CopybackThread,
   1632 					   raidPtr,"raid_copyback");
   1633 		return (retcode);
   1634 
   1635 		/* return the percentage completion of reconstruction */
   1636 	case RAIDFRAME_CHECK_RECON_STATUS:
   1637 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1638 			/* This makes no sense on a RAID 0, so tell the
   1639 			   user it's done. */
   1640 			*(int *) data = 100;
   1641 			return(0);
   1642 		}
   1643 		if (raidPtr->status != rf_rs_reconstructing)
   1644 			*(int *) data = 100;
   1645 		else {
   1646 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1647 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1648 			} else {
   1649 				*(int *) data = 0;
   1650 			}
   1651 		}
   1652 		return (0);
   1653 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1654 		rf_check_recon_status_ext(raidPtr, data);
   1655 		return (0);
   1656 
   1657 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1658 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1659 			/* This makes no sense on a RAID 0, so tell the
   1660 			   user it's done. */
   1661 			*(int *) data = 100;
   1662 			return(0);
   1663 		}
   1664 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1665 			*(int *) data = 100 *
   1666 				raidPtr->parity_rewrite_stripes_done /
   1667 				raidPtr->Layout.numStripe;
   1668 		} else {
   1669 			*(int *) data = 100;
   1670 		}
   1671 		return (0);
   1672 
   1673 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1674 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1675 		return (0);
   1676 
   1677 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1678 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1679 			/* This makes no sense on a RAID 0 */
   1680 			*(int *) data = 100;
   1681 			return(0);
   1682 		}
   1683 		if (raidPtr->copyback_in_progress == 1) {
   1684 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1685 				raidPtr->Layout.numStripe;
   1686 		} else {
   1687 			*(int *) data = 100;
   1688 		}
   1689 		return (0);
   1690 
   1691 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1692 		rf_check_copyback_status_ext(raidPtr, data);
   1693 		return 0;
   1694 
   1695 	case RAIDFRAME_SET_LAST_UNIT:
   1696 		for (column = 0; column < raidPtr->numCol; column++)
   1697 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1698 				return EBUSY;
   1699 
   1700 		for (column = 0; column < raidPtr->numCol; column++) {
   1701 			clabel = raidget_component_label(raidPtr, column);
   1702 			clabel->last_unit = *(int *)data;
   1703 			raidflush_component_label(raidPtr, column);
   1704 		}
   1705 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1706 		return 0;
   1707 
   1708 		/* the sparetable daemon calls this to wait for the kernel to
   1709 		 * need a spare table. this ioctl does not return until a
   1710 		 * spare table is needed. XXX -- calling mpsleep here in the
   1711 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1712 		 * -- I should either compute the spare table in the kernel,
   1713 		 * or have a different -- XXX XXX -- interface (a different
   1714 		 * character device) for delivering the table     -- XXX */
   1715 #if 0
   1716 	case RAIDFRAME_SPARET_WAIT:
   1717 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1718 		while (!rf_sparet_wait_queue)
   1719 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1720 		waitreq = rf_sparet_wait_queue;
   1721 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1722 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1723 
   1724 		/* structure assignment */
   1725 		*((RF_SparetWait_t *) data) = *waitreq;
   1726 
   1727 		RF_Free(waitreq, sizeof(*waitreq));
   1728 		return (0);
   1729 
   1730 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1731 		 * code in it that will cause the dameon to exit */
   1732 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1733 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1734 		waitreq->fcol = -1;
   1735 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1736 		waitreq->next = rf_sparet_wait_queue;
   1737 		rf_sparet_wait_queue = waitreq;
   1738 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1739 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1740 		return (0);
   1741 
   1742 		/* used by the spare table daemon to deliver a spare table
   1743 		 * into the kernel */
   1744 	case RAIDFRAME_SEND_SPARET:
   1745 
   1746 		/* install the spare table */
   1747 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1748 
   1749 		/* respond to the requestor.  the return status of the spare
   1750 		 * table installation is passed in the "fcol" field */
   1751 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1752 		waitreq->fcol = retcode;
   1753 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1754 		waitreq->next = rf_sparet_resp_queue;
   1755 		rf_sparet_resp_queue = waitreq;
   1756 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1757 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1758 
   1759 		return (retcode);
   1760 #endif
   1761 
   1762 	default:
   1763 		break; /* fall through to the os-specific code below */
   1764 
   1765 	}
   1766 
   1767 	if (!raidPtr->valid)
   1768 		return (EINVAL);
   1769 
   1770 	/*
   1771 	 * Add support for "regular" device ioctls here.
   1772 	 */
   1773 
   1774 	switch (cmd) {
   1775 	case DIOCGCACHE:
   1776 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1777 		break;
   1778 
   1779 	case DIOCCACHESYNC:
   1780 		retcode = rf_sync_component_caches(raidPtr);
   1781 		break;
   1782 
   1783 	default:
   1784 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1785 		break;
   1786 	}
   1787 
   1788 	return (retcode);
   1789 
   1790 }
   1791 
   1792 
   1793 /* raidinit -- complete the rest of the initialization for the
   1794    RAIDframe device.  */
   1795 
   1796 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device; the cfdata is consumed by
	 * config_attach_pseudo on success, so we only free it on failure */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan the new unit for wedges (GPT partitions etc.) */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1852 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* enqueue the request and wake the daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* block until the daemon posts a response on the resp queue */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon returns its status in the fcol field */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
#endif
   1887 
   1888 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1889  * bp & passes it down.
   1890  * any calls originating in the kernel must use non-blocking I/O
   1891  * do some extra sanity checking to return "appropriate" error values for
   1892  * certain conditions (to make some standard utilities work)
   1893  *
   1894  * Formerly known as: rf_DoAccessKernel
   1895  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the lock across the label update: it re-acquires
		 * raidPtr->mutex internally via raidflush_component_label */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse I/O until raidinit has completed for this unit */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand the queued bufs to the dk layer, which calls back into us */
	dk_start(dksc, NULL);
}
   1922 
/* Translate one struct buf into a non-blocking RAIDframe access.
 * Returns EAGAIN when no openings are available (caller retries),
 * ENOSPC for out-of-range or non-sector-multiple requests. */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* no openings -> tell the caller to come back later */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	/* pb accounts for a trailing partial sector */
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this debug block on; presumably a
	 * leftover from debugging -- confirm before removing. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* the "sum < x" comparisons catch arithmetic wrap-around */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject requests that are not a whole number of sectors */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening; released again when the access completes */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1995 
   1996 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1997 
   1998 int
   1999 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2000 {
   2001 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2002 	struct buf *bp;
   2003 
   2004 	req->queue = queue;
   2005 	bp = req->bp;
   2006 
   2007 	switch (req->type) {
   2008 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2009 		/* XXX need to do something extra here.. */
   2010 		/* I'm leaving this in, as I've never actually seen it used,
   2011 		 * and I'd like folks to report it... GO */
   2012 		printf(("WAKEUP CALLED\n"));
   2013 		queue->numOutstanding++;
   2014 
   2015 		bp->b_flags = 0;
   2016 		bp->b_private = req;
   2017 
   2018 		KernelWakeupFunc(bp);
   2019 		break;
   2020 
   2021 	case RF_IO_TYPE_READ:
   2022 	case RF_IO_TYPE_WRITE:
   2023 #if RF_ACC_TRACE > 0
   2024 		if (req->tracerec) {
   2025 			RF_ETIMER_START(req->tracerec->timer);
   2026 		}
   2027 #endif
   2028 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2029 		    op, queue->rf_cinfo->ci_dev,
   2030 		    req->sectorOffset, req->numSector,
   2031 		    req->buf, KernelWakeupFunc, (void *) req,
   2032 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2033 
   2034 		if (rf_debugKernelAccess) {
   2035 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2036 				(long) bp->b_blkno));
   2037 		}
   2038 		queue->numOutstanding++;
   2039 		queue->last_deq_sector = req->sectorOffset;
   2040 		/* acc wouldn't have been let in if there were any pending
   2041 		 * reqs at any other priority */
   2042 		queue->curPriority = req->priority;
   2043 
   2044 		db1_printf(("Going for %c to unit %d col %d\n",
   2045 			    req->type, queue->raidPtr->raidid,
   2046 			    queue->col));
   2047 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2048 			(int) req->sectorOffset, (int) req->numSector,
   2049 			(int) (req->numSector <<
   2050 			    queue->raidPtr->logBytesPerSector),
   2051 			(int) queue->raidPtr->logBytesPerSector));
   2052 
   2053 		/*
   2054 		 * XXX: drop lock here since this can block at
   2055 		 * least with backing SCSI devices.  Retake it
   2056 		 * to minimize fuss with calling interfaces.
   2057 		 */
   2058 
   2059 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2060 		bdev_strategy(bp);
   2061 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2062 		break;
   2063 
   2064 	default:
   2065 		panic("bad req->type in rf_DispatchKernelIO");
   2066 	}
   2067 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2068 
   2069 	return (0);
   2070 }
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
/* Completion callback for component I/O dispatched by rf_DispatchKernelIO.
 * Runs in biodone context: records any error, optionally fails the
 * component, and hands the request to the raidio thread via the iodone
 * queue. */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP / the NOP path */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2140 
   2141 
   2142 /*
   2143  * initialize a buf structure for doing an I/O in the kernel.
   2144  */
   2145 static void
   2146 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2147        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2148        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2149        struct proc *b_proc)
   2150 {
   2151 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2152 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2153 	bp->b_oflags = 0;
   2154 	bp->b_cflags = 0;
   2155 	bp->b_bcount = numSect << logBytesPerSector;
   2156 	bp->b_bufsize = bp->b_bcount;
   2157 	bp->b_error = 0;
   2158 	bp->b_dev = dev;
   2159 	bp->b_data = bf;
   2160 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2161 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2162 	if (bp->b_bcount == 0) {
   2163 		panic("bp->b_bcount is zero in InitBP!!");
   2164 	}
   2165 	bp->b_proc = b_proc;
   2166 	bp->b_iodone = cbFunc;
   2167 	bp->b_private = cbArg;
   2168 }
   2169 
   2170 /*
   2171  * Wait interruptibly for an exclusive lock.
   2172  *
   2173  * XXX
   2174  * Several drivers do this; it should be abstracted and made MP-safe.
   2175  * (Hmm... where have we seen this warning before :->  GO )
   2176  */
   2177 static int
   2178 raidlock(struct raid_softc *rs)
   2179 {
   2180 	int     error;
   2181 
   2182 	error = 0;
   2183 	mutex_enter(&rs->sc_mutex);
   2184 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2185 		rs->sc_flags |= RAIDF_WANTED;
   2186 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2187 		if (error != 0)
   2188 			goto done;
   2189 	}
   2190 	rs->sc_flags |= RAIDF_LOCKED;
   2191 done:
   2192 	mutex_exit(&rs->sc_mutex);
   2193 	return (error);
   2194 }
   2195 /*
   2196  * Unlock and wake up any waiters.
   2197  */
   2198 static void
   2199 raidunlock(struct raid_softc *rs)
   2200 {
   2201 
   2202 	mutex_enter(&rs->sc_mutex);
   2203 	rs->sc_flags &= ~RAIDF_LOCKED;
   2204 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2205 		rs->sc_flags &= ~RAIDF_WANTED;
   2206 		cv_broadcast(&rs->sc_cv);
   2207 	}
   2208 	mutex_exit(&rs->sc_mutex);
   2209 }
   2210 
   2211 
   2212 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2213 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2214 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2215 
   2216 static daddr_t
   2217 rf_component_info_offset(void)
   2218 {
   2219 
   2220 	return RF_COMPONENT_INFO_OFFSET;
   2221 }
   2222 
   2223 static daddr_t
   2224 rf_component_info_size(unsigned secsize)
   2225 {
   2226 	daddr_t info_size;
   2227 
   2228 	KASSERT(secsize);
   2229 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2230 		info_size = secsize;
   2231 	else
   2232 		info_size = RF_COMPONENT_INFO_SIZE;
   2233 
   2234 	return info_size;
   2235 }
   2236 
   2237 static daddr_t
   2238 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2239 {
   2240 	daddr_t map_offset;
   2241 
   2242 	KASSERT(raidPtr->bytesPerSector);
   2243 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2244 		map_offset = raidPtr->bytesPerSector;
   2245 	else
   2246 		map_offset = RF_COMPONENT_INFO_SIZE;
   2247 	map_offset += rf_component_info_offset();
   2248 
   2249 	return map_offset;
   2250 }
   2251 
   2252 static daddr_t
   2253 rf_parity_map_size(RF_Raid_t *raidPtr)
   2254 {
   2255 	daddr_t map_size;
   2256 
   2257 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2258 		map_size = raidPtr->bytesPerSector;
   2259 	else
   2260 		map_size = RF_PARITY_MAP_SIZE;
   2261 
   2262 	return map_size;
   2263 }
   2264 
   2265 int
   2266 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2267 {
   2268 	RF_ComponentLabel_t *clabel;
   2269 
   2270 	clabel = raidget_component_label(raidPtr, col);
   2271 	clabel->clean = RF_RAID_CLEAN;
   2272 	raidflush_component_label(raidPtr, col);
   2273 	return(0);
   2274 }
   2275 
   2276 
   2277 int
   2278 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2279 {
   2280 	RF_ComponentLabel_t *clabel;
   2281 
   2282 	clabel = raidget_component_label(raidPtr, col);
   2283 	clabel->clean = RF_RAID_DIRTY;
   2284 	raidflush_component_label(raidPtr, col);
   2285 	return(0);
   2286 }
   2287 
/* Read column col's on-disk component label into its in-core copy
 * (raid_cinfo[col].ci_label).  Returns the error from the read. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2297 
/* Return a pointer to column col's in-core component label.  Callers
 * modify it in place and then call raidflush_component_label. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2303 
/* Write column col's in-core component label to disk, stamping it with
 * the set's current mod_counter first.  Returns the write's error code. */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's counter in lockstep with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2318 
   2319 
/* Read a component label from dev/b_vp into *clabel.  Thin wrapper that
 * supplies the label area's fixed offset and sector-rounded size. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2329 
/* ARGSUSED */
/* Synchronously read dsize bytes at byte offset `offset' from dev, and
 * copy the first msize bytes into `data'.  Returns 0 or a bio error. */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read and wait for it to complete */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2367 
   2368 
/* Write *clabel to the component label area on dev/b_vp (synchronously).
 * Thin wrapper supplying the label area's offset and rounded size. */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2378 
/* ARGSUSED */
/* Write msize bytes from `data' (zero-padded to dsize) at byte offset
 * `offset' on dev.  If asyncp, the write is fired and forgotten and 0 is
 * returned immediately; otherwise we wait and return the bio error.
 * NOTE(review): on the async path the buffer is presumably released by
 * biodone via B_ASYNC -- confirm, since no brelse happens here. */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-fill so the tail of the area is deterministic on disk */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2413 
/* Synchronously write the parity map `map' to every live component. */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2431 
   2432 void
   2433 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2434 {
   2435 	struct rf_paritymap_ondisk tmp;
   2436 	int c,first;
   2437 
   2438 	first=1;
   2439 	for (c = 0; c < raidPtr->numCol; c++) {
   2440 		/* Skip dead disks. */
   2441 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2442 			continue;
   2443 		raidread_component_area(raidPtr->Disks[c].dev,
   2444 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2445 		    RF_PARITYMAP_NBYTE,
   2446 		    rf_parity_map_offset(raidPtr),
   2447 		    rf_parity_map_size(raidPtr));
   2448 		if (first) {
   2449 			memcpy(map, &tmp, sizeof(*map));
   2450 			first = 0;
   2451 		} else {
   2452 			rf_paritymap_merge(map, &tmp);
   2453 		}
   2454 	}
   2455 }
   2456 
/* Bump the set's mod_counter and mark every live component (and every
 * in-use spare) dirty on disk.  Called when the set goes into service. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			/* NOTE(review): if no column matches, scol keeps its
			   previous value (initially -1) -- confirm that a
			   used spare always has a matching spareCol. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2516 
   2517 
/* Rewrite the component labels of all optimal components and in-use
 * spares with a freshly bumped mod_counter.  When final is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also set the
 * clean bit (shutdown path). */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2595 
   2596 void
   2597 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2598 {
   2599 
   2600 	if (vp != NULL) {
   2601 		if (auto_configured == 1) {
   2602 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2603 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2604 			vput(vp);
   2605 
   2606 		} else {
   2607 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2608 		}
   2609 	}
   2610 }
   2611 
   2612 
   2613 void
   2614 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2615 {
   2616 	int r,c;
   2617 	struct vnode *vp;
   2618 	int acd;
   2619 
   2620 
   2621 	/* We take this opportunity to close the vnodes like we should.. */
   2622 
   2623 	for (c = 0; c < raidPtr->numCol; c++) {
   2624 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2625 		acd = raidPtr->Disks[c].auto_configured;
   2626 		rf_close_component(raidPtr, vp, acd);
   2627 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2628 		raidPtr->Disks[c].auto_configured = 0;
   2629 	}
   2630 
   2631 	for (r = 0; r < raidPtr->numSpare; r++) {
   2632 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2633 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2634 		rf_close_component(raidPtr, vp, acd);
   2635 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2636 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2637 	}
   2638 }
   2639 
   2640 
/* Kernel-thread body: fail req->col (optionally kicking off a
 * reconstruction), then exit.  Frees req before exiting. */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2662 
/* Kernel-thread body: rewrite all parity; on success mark the set's
 * parity good so component labels can be marked clean at shutdown. */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2695 
   2696 
/* Kernel-thread body: copy reconstructed data from spares back to the
 * replaced components, then exit. */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2711 
   2712 
/* Kernel-thread body: reconstruct column req->col in place (onto the
 * same component), free req, and exit. */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2730 
   2731 static RF_AutoConfig_t *
   2732 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2733     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2734     unsigned secsize)
   2735 {
   2736 	int good_one = 0;
   2737 	RF_ComponentLabel_t *clabel;
   2738 	RF_AutoConfig_t *ac;
   2739 
   2740 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2741 	if (clabel == NULL) {
   2742 oomem:
   2743 		    while(ac_list) {
   2744 			    ac = ac_list;
   2745 			    if (ac->clabel)
   2746 				    free(ac->clabel, M_RAIDFRAME);
   2747 			    ac_list = ac_list->next;
   2748 			    free(ac, M_RAIDFRAME);
   2749 		    }
   2750 		    printf("RAID auto config: out of memory!\n");
   2751 		    return NULL; /* XXX probably should panic? */
   2752 	}
   2753 
   2754 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2755 		/* Got the label.  Does it look reasonable? */
   2756 		if (rf_reasonable_label(clabel, numsecs) &&
   2757 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2758 #ifdef DEBUG
   2759 			printf("Component on: %s: %llu\n",
   2760 				cname, (unsigned long long)size);
   2761 			rf_print_component_label(clabel);
   2762 #endif
   2763 			/* if it's reasonable, add it, else ignore it. */
   2764 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2765 				M_NOWAIT);
   2766 			if (ac == NULL) {
   2767 				free(clabel, M_RAIDFRAME);
   2768 				goto oomem;
   2769 			}
   2770 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2771 			ac->dev = dev;
   2772 			ac->vp = vp;
   2773 			ac->clabel = clabel;
   2774 			ac->next = ac_list;
   2775 			ac_list = ac;
   2776 			good_one = 1;
   2777 		}
   2778 	}
   2779 	if (!good_one) {
   2780 		/* cleanup */
   2781 		free(clabel, M_RAIDFRAME);
   2782 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2783 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2784 		vput(vp);
   2785 	}
   2786 	return ac_list;
   2787 }
   2788 
/*
 * Scan every disk-class device in the system looking for RAIDframe
 * components, and return a linked list of RF_AutoConfig_t entries
 * (one per component found).  Each entry holds an open, referenced
 * vnode for the component; callers eventually release them via
 * rf_release_all_vps().  Returns NULL if nothing was found.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges have no partitions, so use the minor as-is */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept only wedges whose
				   partition type is RAIDframe */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the vnode */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2992 
   2993 
   2994 int
   2995 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2996 {
   2997 
   2998 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2999 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3000 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3001 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3002 	    clabel->row >=0 &&
   3003 	    clabel->column >= 0 &&
   3004 	    clabel->num_rows > 0 &&
   3005 	    clabel->num_columns > 0 &&
   3006 	    clabel->row < clabel->num_rows &&
   3007 	    clabel->column < clabel->num_columns &&
   3008 	    clabel->blockSize > 0 &&
   3009 	    /*
   3010 	     * numBlocksHi may contain garbage, but it is ok since
   3011 	     * the type is unsigned.  If it is really garbage,
   3012 	     * rf_fix_old_label_size() will fix it.
   3013 	     */
   3014 	    rf_component_label_numblocks(clabel) > 0) {
   3015 		/*
   3016 		 * label looks reasonable enough...
   3017 		 * let's make sure it has no old garbage.
   3018 		 */
   3019 		if (numsecs)
   3020 			rf_fix_old_label_size(clabel, numsecs);
   3021 		return(1);
   3022 	}
   3023 	return(0);
   3024 }
   3025 
   3026 
   3027 /*
   3028  * For reasons yet unknown, some old component labels have garbage in
   3029  * the newer numBlocksHi region, and this causes lossage.  Since those
   3030  * disks will also have numsecs set to less than 32 bits of sectors,
   3031  * we can determine when this corruption has occurred, and fix it.
   3032  *
   3033  * The exact same problem, with the same unknown reason, happens to
   3034  * the partitionSizeHi member as well.
   3035  */
   3036 static void
   3037 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3038 {
   3039 
   3040 	if (numsecs < ((uint64_t)1 << 32)) {
   3041 		if (clabel->numBlocksHi) {
   3042 			printf("WARNING: total sectors < 32 bits, yet "
   3043 			       "numBlocksHi set\n"
   3044 			       "WARNING: resetting numBlocksHi to zero.\n");
   3045 			clabel->numBlocksHi = 0;
   3046 		}
   3047 
   3048 		if (clabel->partitionSizeHi) {
   3049 			printf("WARNING: total sectors < 32 bits, yet "
   3050 			       "partitionSizeHi set\n"
   3051 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3052 			clabel->partitionSizeHi = 0;
   3053 		}
   3054 	}
   3055 }
   3056 
   3057 
   3058 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console
 * (DEBUG kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* index 3 ("*invalid*") catches out-of-range root_partition values */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so a garbage value can't index past the table */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3090 #endif
   3091 
   3092 RF_ConfigSet_t *
   3093 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3094 {
   3095 	RF_AutoConfig_t *ac;
   3096 	RF_ConfigSet_t *config_sets;
   3097 	RF_ConfigSet_t *cset;
   3098 	RF_AutoConfig_t *ac_next;
   3099 
   3100 
   3101 	config_sets = NULL;
   3102 
   3103 	/* Go through the AutoConfig list, and figure out which components
   3104 	   belong to what sets.  */
   3105 	ac = ac_list;
   3106 	while(ac!=NULL) {
   3107 		/* we're going to putz with ac->next, so save it here
   3108 		   for use at the end of the loop */
   3109 		ac_next = ac->next;
   3110 
   3111 		if (config_sets == NULL) {
   3112 			/* will need at least this one... */
   3113 			config_sets = (RF_ConfigSet_t *)
   3114 				malloc(sizeof(RF_ConfigSet_t),
   3115 				       M_RAIDFRAME, M_NOWAIT);
   3116 			if (config_sets == NULL) {
   3117 				panic("rf_create_auto_sets: No memory!");
   3118 			}
   3119 			/* this one is easy :) */
   3120 			config_sets->ac = ac;
   3121 			config_sets->next = NULL;
   3122 			config_sets->rootable = 0;
   3123 			ac->next = NULL;
   3124 		} else {
   3125 			/* which set does this component fit into? */
   3126 			cset = config_sets;
   3127 			while(cset!=NULL) {
   3128 				if (rf_does_it_fit(cset, ac)) {
   3129 					/* looks like it matches... */
   3130 					ac->next = cset->ac;
   3131 					cset->ac = ac;
   3132 					break;
   3133 				}
   3134 				cset = cset->next;
   3135 			}
   3136 			if (cset==NULL) {
   3137 				/* didn't find a match above... new set..*/
   3138 				cset = (RF_ConfigSet_t *)
   3139 					malloc(sizeof(RF_ConfigSet_t),
   3140 					       M_RAIDFRAME, M_NOWAIT);
   3141 				if (cset == NULL) {
   3142 					panic("rf_create_auto_sets: No memory!");
   3143 				}
   3144 				cset->ac = ac;
   3145 				ac->next = NULL;
   3146 				cset->next = config_sets;
   3147 				cset->rootable = 0;
   3148 				config_sets = cset;
   3149 			}
   3150 		}
   3151 		ac = ac_next;
   3152 	}
   3153 
   3154 
   3155 	return(config_sets);
   3156 }
   3157 
   3158 static int
   3159 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3160 {
   3161 	RF_ComponentLabel_t *clabel1, *clabel2;
   3162 
   3163 	/* If this one matches the *first* one in the set, that's good
   3164 	   enough, since the other members of the set would have been
   3165 	   through here too... */
   3166 	/* note that we are not checking partitionSize here..
   3167 
   3168 	   Note that we are also not checking the mod_counters here.
   3169 	   If everything else matches except the mod_counter, that's
   3170 	   good enough for this test.  We will deal with the mod_counters
   3171 	   a little later in the autoconfiguration process.
   3172 
   3173 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3174 
   3175 	   The reason we don't check for this is that failed disks
   3176 	   will have lower modification counts.  If those disks are
   3177 	   not added to the set they used to belong to, then they will
   3178 	   form their own set, which may result in 2 different sets,
   3179 	   for example, competing to be configured at raid0, and
   3180 	   perhaps competing to be the root filesystem set.  If the
   3181 	   wrong ones get configured, or both attempt to become /,
   3182 	   weird behaviour and or serious lossage will occur.  Thus we
   3183 	   need to bring them into the fold here, and kick them out at
   3184 	   a later point.
   3185 
   3186 	*/
   3187 
   3188 	clabel1 = cset->ac->clabel;
   3189 	clabel2 = ac->clabel;
   3190 	if ((clabel1->version == clabel2->version) &&
   3191 	    (clabel1->serial_number == clabel2->serial_number) &&
   3192 	    (clabel1->num_rows == clabel2->num_rows) &&
   3193 	    (clabel1->num_columns == clabel2->num_columns) &&
   3194 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3195 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3196 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3197 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3198 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3199 	    (clabel1->blockSize == clabel2->blockSize) &&
   3200 	    rf_component_label_numblocks(clabel1) ==
   3201 	    rf_component_label_numblocks(clabel2) &&
   3202 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3203 	    (clabel1->root_partition == clabel2->root_partition) &&
   3204 	    (clabel1->last_unit == clabel2->last_unit) &&
   3205 	    (clabel1->config_order == clabel2->config_order)) {
   3206 		/* if it get's here, it almost *has* to be a match */
   3207 	} else {
   3208 		/* it's not consistent with somebody in the set..
   3209 		   punt */
   3210 		return(0);
   3211 	}
   3212 	/* all was fine.. it must fit... */
   3213 	return(1);
   3214 }
   3215 
/*
 * Decide whether the config set has enough live components to be
 * worth configuring.  Components must carry the set's highest
 * mod_counter to be counted as present.  Returns 1 if the set can be
 * configured, 0 if too many components are missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The highest mod_counter wins; stale (failed) components have
	   lower counts and are treated as missing below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for the component occupying column c with the
		   winning mod_counter */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did the odd component of a
				   mirror pair, and we didn't bail..
				   reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3318 
   3319 void
   3320 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3321 			RF_Raid_t *raidPtr)
   3322 {
   3323 	RF_ComponentLabel_t *clabel;
   3324 	int i;
   3325 
   3326 	clabel = ac->clabel;
   3327 
   3328 	/* 1. Fill in the common stuff */
   3329 	config->numCol = clabel->num_columns;
   3330 	config->numSpare = 0; /* XXX should this be set here? */
   3331 	config->sectPerSU = clabel->sectPerSU;
   3332 	config->SUsPerPU = clabel->SUsPerPU;
   3333 	config->SUsPerRU = clabel->SUsPerRU;
   3334 	config->parityConfig = clabel->parityConfig;
   3335 	/* XXX... */
   3336 	strcpy(config->diskQueueType,"fifo");
   3337 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3338 	config->layoutSpecificSize = 0; /* XXX ?? */
   3339 
   3340 	while(ac!=NULL) {
   3341 		/* row/col values will be in range due to the checks
   3342 		   in reasonable_label() */
   3343 		strcpy(config->devnames[0][ac->clabel->column],
   3344 		       ac->devname);
   3345 		ac = ac->next;
   3346 	}
   3347 
   3348 	for(i=0;i<RF_MAXDBGV;i++) {
   3349 		config->debugVars[i][0] = 0;
   3350 	}
   3351 }
   3352 
   3353 int
   3354 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3355 {
   3356 	RF_ComponentLabel_t *clabel;
   3357 	int column;
   3358 	int sparecol;
   3359 
   3360 	raidPtr->autoconfigure = new_value;
   3361 
   3362 	for(column=0; column<raidPtr->numCol; column++) {
   3363 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3364 			clabel = raidget_component_label(raidPtr, column);
   3365 			clabel->autoconfigure = new_value;
   3366 			raidflush_component_label(raidPtr, column);
   3367 		}
   3368 	}
   3369 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3370 		sparecol = raidPtr->numCol + column;
   3371 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3372 			clabel = raidget_component_label(raidPtr, sparecol);
   3373 			clabel->autoconfigure = new_value;
   3374 			raidflush_component_label(raidPtr, sparecol);
   3375 		}
   3376 	}
   3377 	return(new_value);
   3378 }
   3379 
   3380 int
   3381 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3382 {
   3383 	RF_ComponentLabel_t *clabel;
   3384 	int column;
   3385 	int sparecol;
   3386 
   3387 	raidPtr->root_partition = new_value;
   3388 	for(column=0; column<raidPtr->numCol; column++) {
   3389 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3390 			clabel = raidget_component_label(raidPtr, column);
   3391 			clabel->root_partition = new_value;
   3392 			raidflush_component_label(raidPtr, column);
   3393 		}
   3394 	}
   3395 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3396 		sparecol = raidPtr->numCol + column;
   3397 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3398 			clabel = raidget_component_label(raidPtr, sparecol);
   3399 			clabel->root_partition = new_value;
   3400 			raidflush_component_label(raidPtr, sparecol);
   3401 		}
   3402 	}
   3403 	return(new_value);
   3404 }
   3405 
   3406 void
   3407 rf_release_all_vps(RF_ConfigSet_t *cset)
   3408 {
   3409 	RF_AutoConfig_t *ac;
   3410 
   3411 	ac = cset->ac;
   3412 	while(ac!=NULL) {
   3413 		/* Close the vp, and give it back */
   3414 		if (ac->vp) {
   3415 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3416 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3417 			vput(ac->vp);
   3418 			ac->vp = NULL;
   3419 		}
   3420 		ac = ac->next;
   3421 	}
   3422 }
   3423 
   3424 
   3425 void
   3426 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3427 {
   3428 	RF_AutoConfig_t *ac;
   3429 	RF_AutoConfig_t *next_ac;
   3430 
   3431 	ac = cset->ac;
   3432 	while(ac!=NULL) {
   3433 		next_ac = ac->next;
   3434 		/* nuke the label */
   3435 		free(ac->clabel, M_RAIDFRAME);
   3436 		/* cleanup the config structure */
   3437 		free(ac, M_RAIDFRAME);
   3438 		/* "next.." */
   3439 		ac = next_ac;
   3440 	}
   3441 	/* and, finally, nuke the config set */
   3442 	free(cset, M_RAIDFRAME);
   3443 }
   3444 
   3445 
/*
 * Initialize a component label from the current state of the RAID
 * set.  The caller fills in per-component fields (row/column) itself.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are a legacy concept; modern sets always have one */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3478 
/*
 * Autoconfigure a complete config set: build an RF_Config_t from the
 * component labels, find a free raid unit (preferring the unit the
 * set was last configured as), and configure it.  Returns the softc
 * of the configured set, or NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until we find an unused unit */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at this unit; create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the softc */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3562 
/*
 * Initialize pool 'p' for objects of 'size' bytes, using wait-channel
 * name 'w_chan'.  The pool is primed with 'xmin' preallocated items
 * (panicking if that fails) and the high-water mark is set to 'xmax'.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3575 
   3576 /*
   3577  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3578  * to see if there is IO pending and if that IO could possibly be done
   3579  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3580  * otherwise.
   3581  *
   3582  */
   3583 int
   3584 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3585 {
   3586 	struct raid_softc *rs;
   3587 	struct dk_softc *dksc;
   3588 
   3589 	rs = raidPtr->softc;
   3590 	dksc = &rs->sc_dksc;
   3591 
   3592 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3593 		return 1;
   3594 
   3595 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3596 		/* there is work to do */
   3597 		return 0;
   3598 	}
   3599 	/* default is nothing to do */
   3600 	return 1;
   3601 }
   3602 
   3603 int
   3604 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3605 {
   3606 	uint64_t numsecs;
   3607 	unsigned secsize;
   3608 	int error;
   3609 
   3610 	error = getdisksize(vp, &numsecs, &secsize);
   3611 	if (error == 0) {
   3612 		diskPtr->blockSize = secsize;
   3613 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3614 		diskPtr->partitionSize = numsecs;
   3615 		return 0;
   3616 	}
   3617 	return error;
   3618 }
   3619 
/* Autoconf match: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3625 
/* Autoconf attach: nothing to do at attach time for raid devices. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3630 
   3631 
/*
 * Autoconf detach: tear down the RAID set under the softc lock and,
 * on success, release the softc.  Returns 0 on success or an errno.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3658 
/*
 * Fill in a synthetic disk geometry for the RAID set and publish it
 * via disk_set_info().  The track/sector numbers are fabricated
 * (RAID sets have no physical geometry); only secperunit and secsize
 * are exact.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* 4 tracks per column is arbitrary; presumably chosen to give
	   a plausible-looking cylinder count -- no exact meaning */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3674 
   3675 /*
   3676  * Get cache info for all the components (including spares).
   3677  * Returns intersection of all the cache flags of all disks, or first
   3678  * error if any encountered.
   3679  * XXXfua feature flags can change as spares are added - lock down somehow
   3680  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache control) is not
				   worth complaining about */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* intersect this component's flags with the rest */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3718 
   3719 /*
   3720  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3721  * We end up returning whatever error was returned by the first cache flush
   3722  * that fails.
   3723  */
   3724 
/*
 * Forward DIOCCACHESYNC to every optimal component and every in-use
 * spare.  All components are attempted; the error returned is the
 * first failure encountered (0 if all succeeded).
 */
int
rf_sync_component_caches(RF_Raid_t *raidPtr)
{
	int c, sparecol;
	int e,error;
	int force = 1;

	error = 0;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
					  &force, FWRITE, NOCRED);
			if (e) {
				/* ENODEV (no cache to flush) is not
				   worth complaining about */
				if (e != ENODEV)
					printf("raid%d: cache flush to component %s failed.\n",
					       raidPtr->raidid, raidPtr->Disks[c].devname);
				if (error == 0) {
					error = e;
				}
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
			if (e) {
				if (e != ENODEV)
					printf("raid%d: cache flush to component %s failed.\n",
					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
				if (error == 0) {
					error = e;
				}
			}
		}
	}
	return error;
}
   3766 
   3767 /* Fill in info with the current status */
   3768 void
   3769 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3770 {
   3771 
   3772 	if (raidPtr->status != rf_rs_reconstructing) {
   3773 		info->total = 100;
   3774 		info->completed = 100;
   3775 	} else {
   3776 		info->total = raidPtr->reconControl->numRUsTotal;
   3777 		info->completed = raidPtr->reconControl->numRUsComplete;
   3778 	}
   3779 	info->remaining = info->total - info->completed;
   3780 }
   3781 
   3782 /* Fill in info with the current status */
   3783 void
   3784 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3785 {
   3786 
   3787 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3788 		info->total = raidPtr->Layout.numStripe;
   3789 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3790 	} else {
   3791 		info->completed = 100;
   3792 		info->total = 100;
   3793 	}
   3794 	info->remaining = info->total - info->completed;
   3795 }
   3796 
   3797 /* Fill in info with the current status */
   3798 void
   3799 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3800 {
   3801 
   3802 	if (raidPtr->copyback_in_progress == 1) {
   3803 		info->total = raidPtr->Layout.numStripe;
   3804 		info->completed = raidPtr->copyback_stripes_done;
   3805 		info->remaining = info->total - info->completed;
   3806 	} else {
   3807 		info->remaining = 0;
   3808 		info->completed = 100;
   3809 		info->total = 100;
   3810 	}
   3811 }
   3812 
   3813 /* Fill in config with the current info */
   3814 int
   3815 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3816 {
   3817 	int	d, i, j;
   3818 
   3819 	if (!raidPtr->valid)
   3820 		return (ENODEV);
   3821 	config->cols = raidPtr->numCol;
   3822 	config->ndevs = raidPtr->numCol;
   3823 	if (config->ndevs >= RF_MAX_DISKS)
   3824 		return (ENOMEM);
   3825 	config->nspares = raidPtr->numSpare;
   3826 	if (config->nspares >= RF_MAX_DISKS)
   3827 		return (ENOMEM);
   3828 	config->maxqdepth = raidPtr->maxQueueDepth;
   3829 	d = 0;
   3830 	for (j = 0; j < config->cols; j++) {
   3831 		config->devs[d] = raidPtr->Disks[j];
   3832 		d++;
   3833 	}
   3834 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3835 		config->spares[i] = raidPtr->Disks[j];
   3836 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3837 			/* XXX: raidctl(8) expects to see this as a used spare */
   3838 			config->spares[i].status = rf_ds_used_spare;
   3839 		}
   3840 	}
   3841 	return 0;
   3842 }
   3843 
   3844 int
   3845 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3846 {
   3847 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3848 	RF_ComponentLabel_t *raid_clabel;
   3849 	int column = clabel->column;
   3850 
   3851 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3852 		return EINVAL;
   3853 	raid_clabel = raidget_component_label(raidPtr, column);
   3854 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3855 
   3856 	return 0;
   3857 }
   3858 
   3859 /*
   3860  * Module interface
   3861  */
   3862 
   3863 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3864 
   3865 #ifdef _MODULE
   3866 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3867 #endif
   3868 
   3869 static int raid_modcmd(modcmd_t, void *);
   3870 static int raid_modcmd_init(void);
   3871 static int raid_modcmd_fini(void);
   3872 
   3873 static int
   3874 raid_modcmd(modcmd_t cmd, void *data)
   3875 {
   3876 	int error;
   3877 
   3878 	error = 0;
   3879 	switch (cmd) {
   3880 	case MODULE_CMD_INIT:
   3881 		error = raid_modcmd_init();
   3882 		break;
   3883 	case MODULE_CMD_FINI:
   3884 		error = raid_modcmd_fini();
   3885 		break;
   3886 	default:
   3887 		error = ENOTTY;
   3888 		break;
   3889 	}
   3890 	return error;
   3891 }
   3892 
/*
 * Module initialization: set up locks, attach the devsw and autoconf
 * glue, boot the RAIDframe core, and register the autoconfiguration
 * finalizer.  Each failure path unwinds whatever was attached before it.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be attached (builtin). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attached above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind the cfdriver and devsw attached above. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 here -- every failure above
	 * returned early -- so this check is vacuous. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3963 
/*
 * Module teardown: refuse to unload while any raid device exists, then
 * detach the autoconf glue and devsw in the reverse order of
 * raid_modcmd_init().  On a partial failure, re-attach what was already
 * detached so the module is left in a consistent, usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach removed above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: re-attach cfdriver and cfattach removed above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All glue detached; shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4013