Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.391.2.1
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.391.2.1 2021/05/13 00:47:32 thorpej Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.391.2.1 2021/05/13 00:47:32 thorpej Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
    218 const struct bdevsw raid_bdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_strategy = raidstrategy,
    222 	.d_ioctl = raidioctl,
    223 	.d_dump = raiddump,
    224 	.d_psize = raidsize,
    225 	.d_discard = nodiscard,
    226 	.d_flag = D_DISK
    227 };
    228 
    229 const struct cdevsw raid_cdevsw = {
    230 	.d_open = raidopen,
    231 	.d_close = raidclose,
    232 	.d_read = raidread,
    233 	.d_write = raidwrite,
    234 	.d_ioctl = raidioctl,
    235 	.d_stop = nostop,
    236 	.d_tty = notty,
    237 	.d_poll = nopoll,
    238 	.d_mmap = nommap,
    239 	.d_kqfilter = nokqfilter,
    240 	.d_discard = nodiscard,
    241 	.d_flag = D_DISK
    242 };
    243 
    244 static struct dkdriver rf_dkdriver = {
    245 	.d_open = raidopen,
    246 	.d_close = raidclose,
    247 	.d_strategy = raidstrategy,
    248 	.d_diskstart = raid_diskstart,
    249 	.d_dumpblocks = raid_dumpblocks,
    250 	.d_lastclose = raid_lastclose,
    251 	.d_minphys = minphys
    252 };
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /* Internal representation of a rf_recon_req */
    263 struct rf_recon_req_internal {
    264 	RF_RowCol_t col;
    265 	RF_ReconReqFlags_t flags;
    266 	void   *raidPtr;
    267 };
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
    278  * Even it if is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 void rf_ReconThread(struct rf_recon_req_internal *);
    304 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 int rf_autoconfig(device_t);
    308 void rf_buildroothack(RF_ConfigSet_t *);
    309 
    310 RF_AutoConfig_t *rf_find_raid_components(void);
    311 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    313 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    314 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    315 int rf_set_autoconfig(RF_Raid_t *, int);
    316 int rf_set_rootpartition(RF_Raid_t *, int);
    317 void rf_release_all_vps(RF_ConfigSet_t *);
    318 void rf_cleanup_config_set(RF_ConfigSet_t *);
    319 int rf_have_enough_components(RF_ConfigSet_t *);
    320 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    322 
    323 /*
    324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    326  * in the kernel config file.
    327  */
    328 #ifdef RAID_AUTOCONFIG
    329 int raidautoconfig = 1;
    330 #else
    331 int raidautoconfig = 0;
    332 #endif
    333 static bool raidautoconfigdone = false;
    334 
    335 struct RF_Pools_s rf_pools;
    336 
    337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    338 static kmutex_t raid_lock;
    339 
    340 static struct raid_softc *
    341 raidcreate(int unit) {
    342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    343 	sc->sc_unit = unit;
    344 	cv_init(&sc->sc_cv, "raidunit");
    345 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    346 	return sc;
    347 }
    348 
    349 static void
    350 raiddestroy(struct raid_softc *sc) {
    351 	cv_destroy(&sc->sc_cv);
    352 	mutex_destroy(&sc->sc_mutex);
    353 	kmem_free(sc, sizeof(*sc));
    354 }
    355 
    356 static struct raid_softc *
    357 raidget(int unit, bool create) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if (!create)
    374 		return NULL;
    375 	sc = raidcreate(unit);
    376 	mutex_enter(&raid_lock);
    377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    378 	mutex_exit(&raid_lock);
    379 	return sc;
    380 }
    381 
    382 static void
    383 raidput(struct raid_softc *sc) {
    384 	mutex_enter(&raid_lock);
    385 	LIST_REMOVE(sc, sc_link);
    386 	mutex_exit(&raid_lock);
    387 	raiddestroy(sc);
    388 }
    389 
/*
 * Legacy pseudo-device attach entry point; intentionally empty.
 *
 * `num' (the number of units requested in the kernel config) is
 * ignored: units are created on demand by raidget().
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    399 
    400 int
    401 rf_autoconfig(device_t self)
    402 {
    403 	RF_AutoConfig_t *ac_list;
    404 	RF_ConfigSet_t *config_sets;
    405 
    406 	if (!raidautoconfig || raidautoconfigdone == true)
    407 		return 0;
    408 
    409 	/* XXX This code can only be run once. */
    410 	raidautoconfigdone = true;
    411 
    412 #ifdef __HAVE_CPU_BOOTCONF
    413 	/*
    414 	 * 0. find the boot device if needed first so we can use it later
    415 	 * this needs to be done before we autoconfigure any raid sets,
    416 	 * because if we use wedges we are not going to be able to open
    417 	 * the boot device later
    418 	 */
    419 	if (booted_device == NULL)
    420 		cpu_bootconf();
    421 #endif
    422 	/* 1. locate all RAID components on the system */
    423 	aprint_debug("Searching for RAID components...\n");
    424 	ac_list = rf_find_raid_components();
    425 
    426 	/* 2. Sort them into their respective sets. */
    427 	config_sets = rf_create_auto_sets(ac_list);
    428 
    429 	/*
    430 	 * 3. Evaluate each set and configure the valid ones.
    431 	 * This gets done in rf_buildroothack().
    432 	 */
    433 	rf_buildroothack(config_sets);
    434 
    435 	return 1;
    436 }
    437 
    438 int
    439 rf_inited(const struct raid_softc *rs) {
    440 	return (rs->sc_flags & RAIDF_INITED) != 0;
    441 }
    442 
/* Accessor: the RF_Raid_t embedded in this softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    447 
/* Accessor: the unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    452 
    453 static int
    454 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    455 	const char *bootname;
    456 	size_t len;
    457 
    458 	/* if bdv is NULL, the set can't contain it. exit early. */
    459 	if (bdv == NULL)
    460 		return 0;
    461 
    462 	bootname = device_xname(bdv);
    463 	len = strlen(bootname);
    464 
    465 	for (int col = 0; col < r->numCol; col++) {
    466 		const char *devname = r->Disks[col].devname;
    467 		devname += sizeof("/dev/") - 1;
    468 		if (strncmp(devname, "dk", 2) == 0) {
    469 			const char *parent =
    470 			    dkwedge_get_parent_name(r->Disks[col].dev);
    471 			if (parent != NULL)
    472 				devname = parent;
    473 		}
    474 		if (strncmp(devname, bootname, len) == 0) {
    475 			struct raid_softc *sc = r->softc;
    476 			aprint_debug("raid%d includes boot device %s\n",
    477 			    sc->sc_unit, devname);
    478 			return 1;
    479 		}
    480 	}
    481 	return 0;
    482 }
    483 
/*
 * Walk the list of candidate config sets, auto-configure the eligible
 * ones, and then decide whether one of the configured sets should
 * become the root device.  Consumes `config_sets': every set is
 * released/cleaned up regardless of whether configuration succeeded.
 *
 * Side effects: may set the globals booted_device, booted_method,
 * booted_partition, or turn on RB_ASKNAME in boothowto.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/*
	 * Pass 1: configure each complete, autoconfigure-enabled set.
	 * Track how many rootable sets we saw (num_root) and remember
	 * the last one (rsc).
	 */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		/* Prefer a wedge named "<raidN>a" if wedges are in use. */
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		/*
		 * Override root when the set is marked as root
		 * (root_partition) or actually contains the boot device.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/*
		 * Disambiguate: re-count only the valid, root-marked
		 * sets that actually contain the boot device.
		 */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    628 
    629 static int
    630 raidsize(dev_t dev)
    631 {
    632 	struct raid_softc *rs;
    633 	struct dk_softc *dksc;
    634 	unsigned int unit;
    635 
    636 	unit = raidunit(dev);
    637 	if ((rs = raidget(unit, false)) == NULL)
    638 		return -1;
    639 	dksc = &rs->sc_dksc;
    640 
    641 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    642 		return -1;
    643 
    644 	return dk_size(dksc, dev);
    645 }
    646 
    647 static int
    648 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    649 {
    650 	unsigned int unit;
    651 	struct raid_softc *rs;
    652 	struct dk_softc *dksc;
    653 
    654 	unit = raidunit(dev);
    655 	if ((rs = raidget(unit, false)) == NULL)
    656 		return ENXIO;
    657 	dksc = &rs->sc_dksc;
    658 
    659 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    660 		return ENODEV;
    661 
    662         /*
    663            Note that blkno is relative to this particular partition.
    664            By adding adding RF_PROTECTED_SECTORS, we get a value that
    665 	   is relative to the partition used for the underlying component.
    666         */
    667 	blkno += RF_PROTECTED_SECTORS;
    668 
    669 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    670 }
    671 
/*
 * dkdriver d_dumpblocks callback: write `nblk' blocks from `va' at
 * `blkno' directly to one live component of the set.  Only RAID 1
 * layouts (1 data + 1 parity column) are supported, since only there
 * does a single component hold a complete copy of the data.
 *
 * Returns 0 on success, EINVAL if the layout is unsupported or no
 * live component exists, ENXIO if the chosen component has no bdev,
 * or an error from raidlock()/the component's d_dump.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    777 
    778 /* ARGSUSED */
    779 static int
    780 raidopen(dev_t dev, int flags, int fmt,
    781     struct lwp *l)
    782 {
    783 	int     unit = raidunit(dev);
    784 	struct raid_softc *rs;
    785 	struct dk_softc *dksc;
    786 	int     error = 0;
    787 	int     part, pmask;
    788 
    789 	if ((rs = raidget(unit, true)) == NULL)
    790 		return ENXIO;
    791 	if ((error = raidlock(rs)) != 0)
    792 		return error;
    793 
    794 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    795 		error = EBUSY;
    796 		goto bad;
    797 	}
    798 
    799 	dksc = &rs->sc_dksc;
    800 
    801 	part = DISKPART(dev);
    802 	pmask = (1 << part);
    803 
    804 	if (!DK_BUSY(dksc, pmask) &&
    805 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    806 		/* First one... mark things as dirty... Note that we *MUST*
    807 		 have done a configure before this.  I DO NOT WANT TO BE
    808 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    809 		 THAT THEY BELONG TOGETHER!!!!! */
    810 		/* XXX should check to see if we're only open for reading
    811 		   here... If so, we needn't do this, but then need some
    812 		   other way of keeping track of what's happened.. */
    813 
    814 		rf_markalldirty(&rs->sc_r);
    815 	}
    816 
    817 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    818 		error = dk_open(dksc, dev, flags, fmt, l);
    819 
    820 bad:
    821 	raidunlock(rs);
    822 
    823 	return error;
    824 
    825 
    826 }
    827 
    828 static int
    829 raid_lastclose(device_t self)
    830 {
    831 	struct raid_softc *rs = raidsoftc(self);
    832 
    833 	/* Last one... device is not unconfigured yet.
    834 	   Device shutdown has taken care of setting the
    835 	   clean bits if RAIDF_INITED is not set
    836 	   mark things as clean... */
    837 
    838 	rf_update_component_labels(&rs->sc_r,
    839 	    RF_FINAL_COMPONENT_UPDATE);
    840 
    841 	/* pass to unlocked code */
    842 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    843 		rs->sc_flags |= RAIDF_DETACH;
    844 
    845 	return 0;
    846 }
    847 
    848 /* ARGSUSED */
    849 static int
    850 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    851 {
    852 	int     unit = raidunit(dev);
    853 	struct raid_softc *rs;
    854 	struct dk_softc *dksc;
    855 	cfdata_t cf;
    856 	int     error = 0, do_detach = 0, do_put = 0;
    857 
    858 	if ((rs = raidget(unit, false)) == NULL)
    859 		return ENXIO;
    860 	dksc = &rs->sc_dksc;
    861 
    862 	if ((error = raidlock(rs)) != 0)
    863 		return error;
    864 
    865 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    866 		error = dk_close(dksc, dev, flags, fmt, l);
    867 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    868 			do_detach = 1;
    869 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    870 		do_put = 1;
    871 
    872 	raidunlock(rs);
    873 
    874 	if (do_detach) {
    875 		/* free the pseudo device attach bits */
    876 		cf = device_cfdata(dksc->sc_dev);
    877 		error = config_detach(dksc->sc_dev, 0);
    878 		if (error == 0)
    879 			free(cf, M_RAIDFRAME);
    880 	} else if (do_put) {
    881 		raidput(rs);
    882 	}
    883 
    884 	return error;
    885 
    886 }
    887 
/* Wake the RAID I/O thread: signal iodone_cv under its lock so queued
 * work gets serviced. */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    895 
    896 static void
    897 raidstrategy(struct buf *bp)
    898 {
    899 	unsigned int unit;
    900 	struct raid_softc *rs;
    901 	struct dk_softc *dksc;
    902 	RF_Raid_t *raidPtr;
    903 
    904 	unit = raidunit(bp->b_dev);
    905 	if ((rs = raidget(unit, false)) == NULL) {
    906 		bp->b_error = ENXIO;
    907 		goto fail;
    908 	}
    909 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    910 		bp->b_error = ENXIO;
    911 		goto fail;
    912 	}
    913 	dksc = &rs->sc_dksc;
    914 	raidPtr = &rs->sc_r;
    915 
    916 	/* Queue IO only */
    917 	if (dk_strategy_defer(dksc, bp))
    918 		goto done;
    919 
    920 	/* schedule the IO to happen at the next convenient time */
    921 	raid_wakeup(raidPtr);
    922 
    923 done:
    924 	return;
    925 
    926 fail:
    927 	bp->b_resid = bp->b_bcount;
    928 	biodone(bp);
    929 }
    930 
    931 static int
    932 raid_diskstart(device_t dev, struct buf *bp)
    933 {
    934 	struct raid_softc *rs = raidsoftc(dev);
    935 	RF_Raid_t *raidPtr;
    936 
    937 	raidPtr = &rs->sc_r;
    938 	if (!raidPtr->valid) {
    939 		db1_printf(("raid is not valid..\n"));
    940 		return ENODEV;
    941 	}
    942 
    943 	/* XXX */
    944 	bp->b_resid = 0;
    945 
    946 	return raiddoaccess(raidPtr, bp);
    947 }
    948 
    949 void
    950 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    951 {
    952 	struct raid_softc *rs;
    953 	struct dk_softc *dksc;
    954 
    955 	rs = raidPtr->softc;
    956 	dksc = &rs->sc_dksc;
    957 
    958 	dk_done(dksc, bp);
    959 
    960 	rf_lock_mutex2(raidPtr->mutex);
    961 	raidPtr->openings++;
    962 	rf_unlock_mutex2(raidPtr->mutex);
    963 
    964 	/* schedule more IO */
    965 	raid_wakeup(raidPtr);
    966 }
    967 
    968 /* ARGSUSED */
    969 static int
    970 raidread(dev_t dev, struct uio *uio, int flags)
    971 {
    972 	int     unit = raidunit(dev);
    973 	struct raid_softc *rs;
    974 
    975 	if ((rs = raidget(unit, false)) == NULL)
    976 		return ENXIO;
    977 
    978 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    979 		return ENXIO;
    980 
    981 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
    982 
    983 }
    984 
    985 /* ARGSUSED */
    986 static int
    987 raidwrite(dev_t dev, struct uio *uio, int flags)
    988 {
    989 	int     unit = raidunit(dev);
    990 	struct raid_softc *rs;
    991 
    992 	if ((rs = raidget(unit, false)) == NULL)
    993 		return ENXIO;
    994 
    995 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    996 		return ENXIO;
    997 
    998 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
    999 
   1000 }
   1001 
/*
 * Tear down a RAID unit.  Caller must hold the softc lock (hence
 * "unlocked": this routine itself takes no locks).  Returns EBUSY if
 * the unit is open or a background operation is still running,
 * otherwise shuts RAIDframe down and detaches the disk.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open, or while recon/parity-rewrite/copyback
	 * threads are active. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Not configured: nothing to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1039 
   1040 static bool
   1041 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1042 {
   1043 	switch (cmd) {
   1044 	case RAIDFRAME_ADD_HOT_SPARE:
   1045 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1046 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1047 	case RAIDFRAME_CHECK_PARITY:
   1048 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1049 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1050 	case RAIDFRAME_CHECK_RECON_STATUS:
   1051 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1052 	case RAIDFRAME_COPYBACK:
   1053 	case RAIDFRAME_DELETE_COMPONENT:
   1054 	case RAIDFRAME_FAIL_DISK:
   1055 	case RAIDFRAME_GET_ACCTOTALS:
   1056 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1057 	case RAIDFRAME_GET_INFO:
   1058 	case RAIDFRAME_GET_SIZE:
   1059 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1060 	case RAIDFRAME_INIT_LABELS:
   1061 	case RAIDFRAME_KEEP_ACCTOTALS:
   1062 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1063 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1064 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1065 	case RAIDFRAME_PARITYMAP_STATUS:
   1066 	case RAIDFRAME_REBUILD_IN_PLACE:
   1067 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1068 	case RAIDFRAME_RESET_ACCTOTALS:
   1069 	case RAIDFRAME_REWRITEPARITY:
   1070 	case RAIDFRAME_SET_AUTOCONFIG:
   1071 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1072 	case RAIDFRAME_SET_ROOT:
   1073 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1074 	}
   1075 	return false;
   1076 }
   1077 
/*
 * Administratively fail a component (RAIDFRAME_FAIL_DISK).  Validates
 * the request under the raid mutex, then copies it into kernel memory
 * and spawns the reconstruction thread.  Returns EINVAL if the fail is
 * not permitted, ENOMEM on allocation failure, otherwise the thread-
 * creation status.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* The status checks below must be made atomically with respect
	 * to other state changes, hence the mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* rrint is owned (and freed) by the recon thread from here on. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Rejected under the lock: release it and report EINVAL. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1126 
   1127 static int
   1128 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1129 {
   1130 	/* allocate a buffer for the layout-specific data, and copy it in */
   1131 	if (k_cfg->layoutSpecificSize == 0)
   1132 		return 0;
   1133 
   1134 	if (k_cfg->layoutSpecificSize > 10000) {
   1135 	    /* sanity check */
   1136 	    return EINVAL;
   1137 	}
   1138 
   1139 	u_char *specific_buf;
   1140 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1141 	if (specific_buf == NULL)
   1142 		return ENOMEM;
   1143 
   1144 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1145 	    k_cfg->layoutSpecificSize);
   1146 	if (retcode) {
   1147 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1148 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1149 		return retcode;
   1150 	}
   1151 
   1152 	k_cfg->layoutSpecific = specific_buf;
   1153 	return 0;
   1154 }
   1155 
   1156 static int
   1157 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1158 {
   1159 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1160 
   1161 	if (rs->sc_r.valid) {
   1162 		/* There is a valid RAID set running on this unit! */
   1163 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1164 		return EINVAL;
   1165 	}
   1166 
   1167 	/* copy-in the configuration information */
   1168 	/* data points to a pointer to the configuration structure */
   1169 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1170 	if (*k_cfg == NULL) {
   1171 		return ENOMEM;
   1172 	}
   1173 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1174 	if (retcode == 0)
   1175 		return 0;
   1176 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1177 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1178 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1179 	return retcode;
   1180 }
   1181 
/*
 * Build a RAID set from an already-copied-in configuration (k_cfg is
 * consumed: both it and its layout-specific buffer are freed here).
 * On success the unit is initialized and marked dirty; on failure the
 * unit is flagged RAIDF_SHUTDOWN so it detaches on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		/* parity is unknown until rewritten, so mark dirty */
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1234 
#if RF_DISABLED
/*
 * RAIDFRAME_SET_COMPONENT_LABEL handler (currently compiled out).
 * Validates the column, then copies the user-supplied label over the
 * in-core component label and flushes it to disk.  Deliberately
 * neutered: see the XXX notes below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1273 
   1274 static int
   1275 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1276 {
   1277 	/*
   1278 	   we only want the serial number from
   1279 	   the above.  We get all the rest of the information
   1280 	   from the config that was used to create this RAID
   1281 	   set.
   1282 	   */
   1283 
   1284 	raidPtr->serial_number = clabel->serial_number;
   1285 
   1286 	for (int column = 0; column < raidPtr->numCol; column++) {
   1287 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1288 		if (RF_DEAD_DISK(diskPtr->status))
   1289 			continue;
   1290 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1291 		    raidPtr, column);
   1292 		/* Zeroing this is important. */
   1293 		memset(ci_label, 0, sizeof(*ci_label));
   1294 		raid_init_component_label(raidPtr, ci_label);
   1295 		ci_label->serial_number = raidPtr->serial_number;
   1296 		ci_label->row = 0; /* we dont' pretend to support more */
   1297 		rf_component_label_set_partitionsize(ci_label,
   1298 		    diskPtr->partitionSize);
   1299 		ci_label->column = column;
   1300 		raidflush_component_label(raidPtr, column);
   1301 		/* XXXjld what about the spares? */
   1302 	}
   1303 
   1304 	return 0;
   1305 }
   1306 
/*
 * RAIDFRAME_REBUILD_IN_PLACE handler: rebuild a single component onto
 * itself (same physical slot).  Validates the target under the raid
 * mutex, then hands a kernel copy of the request to the in-place
 * reconstruction thread.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Copy the request so we never touch the user's buffer again. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Status checks below must be atomic w.r.t. other state
	 * changes, hence the mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* rrint is owned (and freed) by the recon thread from here on. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1374 
   1375 static int
   1376 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1377 {
   1378 	/*
   1379 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1380 	 * so tell the user it's done.
   1381 	 */
   1382 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1383 	    raidPtr->status != rf_rs_reconstructing) {
   1384 		*data = 100;
   1385 		return 0;
   1386 	}
   1387 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1388 		*data = 0;
   1389 		return 0;
   1390 	}
   1391 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1392 	    / raidPtr->reconControl->numRUsTotal);
   1393 	return 0;
   1394 }
   1395 
   1396 static int
   1397 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1398 {
   1399 	int     unit = raidunit(dev);
   1400 	int     part, pmask;
   1401 	struct raid_softc *rs;
   1402 	struct dk_softc *dksc;
   1403 	RF_Config_t *k_cfg;
   1404 	RF_Raid_t *raidPtr;
   1405 	RF_AccTotals_t *totals;
   1406 	RF_SingleComponent_t component;
   1407 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1408 	int retcode = 0;
   1409 	int column;
   1410 	RF_ComponentLabel_t *clabel;
   1411 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1412 	int d;
   1413 
   1414 	if ((rs = raidget(unit, false)) == NULL)
   1415 		return ENXIO;
   1416 
   1417 	dksc = &rs->sc_dksc;
   1418 	raidPtr = &rs->sc_r;
   1419 
   1420 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1421 	    (int) DISKPART(dev), (int) unit, cmd));
   1422 
   1423 	/* Must be initialized for these... */
   1424 	if (rf_must_be_initialized(rs, cmd))
   1425 		return ENXIO;
   1426 
   1427 	switch (cmd) {
   1428 		/* configure the system */
   1429 	case RAIDFRAME_CONFIGURE:
   1430 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1431 			return retcode;
   1432 		return rf_construct(rs, k_cfg);
   1433 
   1434 		/* shutdown the system */
   1435 	case RAIDFRAME_SHUTDOWN:
   1436 
   1437 		part = DISKPART(dev);
   1438 		pmask = (1 << part);
   1439 
   1440 		if ((retcode = raidlock(rs)) != 0)
   1441 			return retcode;
   1442 
   1443 		if (DK_BUSY(dksc, pmask) ||
   1444 		    raidPtr->recon_in_progress != 0 ||
   1445 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1446 		    raidPtr->copyback_in_progress != 0)
   1447 			retcode = EBUSY;
   1448 		else {
   1449 			/* detach and free on close */
   1450 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1451 			retcode = 0;
   1452 		}
   1453 
   1454 		raidunlock(rs);
   1455 
   1456 		return retcode;
   1457 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1458 		return rf_get_component_label(raidPtr, data);
   1459 
   1460 #if RF_DISABLED
   1461 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1462 		return rf_set_component_label(raidPtr, data);
   1463 #endif
   1464 
   1465 	case RAIDFRAME_INIT_LABELS:
   1466 		return rf_init_component_label(raidPtr, data);
   1467 
   1468 	case RAIDFRAME_SET_AUTOCONFIG:
   1469 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1470 		printf("raid%d: New autoconfig value is: %d\n",
   1471 		       raidPtr->raidid, d);
   1472 		*(int *) data = d;
   1473 		return retcode;
   1474 
   1475 	case RAIDFRAME_SET_ROOT:
   1476 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1477 		printf("raid%d: New rootpartition value is: %d\n",
   1478 		       raidPtr->raidid, d);
   1479 		*(int *) data = d;
   1480 		return retcode;
   1481 
   1482 		/* initialize all parity */
   1483 	case RAIDFRAME_REWRITEPARITY:
   1484 
   1485 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1486 			/* Parity for RAID 0 is trivially correct */
   1487 			raidPtr->parity_good = RF_RAID_CLEAN;
   1488 			return 0;
   1489 		}
   1490 
   1491 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1492 			/* Re-write is already in progress! */
   1493 			return EINVAL;
   1494 		}
   1495 
   1496 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1497 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1498 
   1499 	case RAIDFRAME_ADD_HOT_SPARE:
   1500 		sparePtr = (RF_SingleComponent_t *) data;
   1501 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1502 		return rf_add_hot_spare(raidPtr, &component);
   1503 
   1504 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1505 		return retcode;
   1506 
   1507 	case RAIDFRAME_DELETE_COMPONENT:
   1508 		componentPtr = (RF_SingleComponent_t *)data;
   1509 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1510 		return rf_delete_component(raidPtr, &component);
   1511 
   1512 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1513 		componentPtr = (RF_SingleComponent_t *)data;
   1514 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1515 		return rf_incorporate_hot_spare(raidPtr, &component);
   1516 
   1517 	case RAIDFRAME_REBUILD_IN_PLACE:
   1518 		return rf_rebuild_in_place(raidPtr, data);
   1519 
   1520 	case RAIDFRAME_GET_INFO:
   1521 		ucfgp = *(RF_DeviceConfig_t **)data;
   1522 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1523 		if (d_cfg == NULL)
   1524 			return ENOMEM;
   1525 		retcode = rf_get_info(raidPtr, d_cfg);
   1526 		if (retcode == 0) {
   1527 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1528 		}
   1529 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1530 		return retcode;
   1531 
   1532 	case RAIDFRAME_CHECK_PARITY:
   1533 		*(int *) data = raidPtr->parity_good;
   1534 		return 0;
   1535 
   1536 	case RAIDFRAME_PARITYMAP_STATUS:
   1537 		if (rf_paritymap_ineligible(raidPtr))
   1538 			return EINVAL;
   1539 		rf_paritymap_status(raidPtr->parity_map, data);
   1540 		return 0;
   1541 
   1542 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1543 		if (rf_paritymap_ineligible(raidPtr))
   1544 			return EINVAL;
   1545 		if (raidPtr->parity_map == NULL)
   1546 			return ENOENT; /* ??? */
   1547 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1548 			return EINVAL;
   1549 		return 0;
   1550 
   1551 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1552 		if (rf_paritymap_ineligible(raidPtr))
   1553 			return EINVAL;
   1554 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1555 		return 0;
   1556 
   1557 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1558 		if (rf_paritymap_ineligible(raidPtr))
   1559 			return EINVAL;
   1560 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1561 		/* XXX should errors be passed up? */
   1562 		return 0;
   1563 
   1564 	case RAIDFRAME_RESET_ACCTOTALS:
   1565 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1566 		return 0;
   1567 
   1568 	case RAIDFRAME_GET_ACCTOTALS:
   1569 		totals = (RF_AccTotals_t *) data;
   1570 		*totals = raidPtr->acc_totals;
   1571 		return 0;
   1572 
   1573 	case RAIDFRAME_KEEP_ACCTOTALS:
   1574 		raidPtr->keep_acc_totals = *(int *)data;
   1575 		return 0;
   1576 
   1577 	case RAIDFRAME_GET_SIZE:
   1578 		*(int *) data = raidPtr->totalSectors;
   1579 		return 0;
   1580 
   1581 	case RAIDFRAME_FAIL_DISK:
   1582 		return rf_fail_disk(raidPtr, data);
   1583 
   1584 		/* invoke a copyback operation after recon on whatever disk
   1585 		 * needs it, if any */
   1586 	case RAIDFRAME_COPYBACK:
   1587 
   1588 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1589 			/* This makes no sense on a RAID 0!! */
   1590 			return EINVAL;
   1591 		}
   1592 
   1593 		if (raidPtr->copyback_in_progress == 1) {
   1594 			/* Copyback is already in progress! */
   1595 			return EINVAL;
   1596 		}
   1597 
   1598 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1599 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1600 
   1601 		/* return the percentage completion of reconstruction */
   1602 	case RAIDFRAME_CHECK_RECON_STATUS:
   1603 		return rf_check_recon_status(raidPtr, data);
   1604 
   1605 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1606 		rf_check_recon_status_ext(raidPtr, data);
   1607 		return 0;
   1608 
   1609 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1610 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1611 			/* This makes no sense on a RAID 0, so tell the
   1612 			   user it's done. */
   1613 			*(int *) data = 100;
   1614 			return 0;
   1615 		}
   1616 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1617 			*(int *) data = 100 *
   1618 				raidPtr->parity_rewrite_stripes_done /
   1619 				raidPtr->Layout.numStripe;
   1620 		} else {
   1621 			*(int *) data = 100;
   1622 		}
   1623 		return 0;
   1624 
   1625 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1626 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1627 		return 0;
   1628 
   1629 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1630 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1631 			/* This makes no sense on a RAID 0 */
   1632 			*(int *) data = 100;
   1633 			return 0;
   1634 		}
   1635 		if (raidPtr->copyback_in_progress == 1) {
   1636 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1637 				raidPtr->Layout.numStripe;
   1638 		} else {
   1639 			*(int *) data = 100;
   1640 		}
   1641 		return 0;
   1642 
   1643 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1644 		rf_check_copyback_status_ext(raidPtr, data);
   1645 		return 0;
   1646 
   1647 	case RAIDFRAME_SET_LAST_UNIT:
   1648 		for (column = 0; column < raidPtr->numCol; column++)
   1649 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1650 				return EBUSY;
   1651 
   1652 		for (column = 0; column < raidPtr->numCol; column++) {
   1653 			clabel = raidget_component_label(raidPtr, column);
   1654 			clabel->last_unit = *(int *)data;
   1655 			raidflush_component_label(raidPtr, column);
   1656 		}
   1657 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1658 		return 0;
   1659 
   1660 		/* the sparetable daemon calls this to wait for the kernel to
   1661 		 * need a spare table. this ioctl does not return until a
   1662 		 * spare table is needed. XXX -- calling mpsleep here in the
   1663 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1664 		 * -- I should either compute the spare table in the kernel,
   1665 		 * or have a different -- XXX XXX -- interface (a different
   1666 		 * character device) for delivering the table     -- XXX */
   1667 #if RF_DISABLED
   1668 	case RAIDFRAME_SPARET_WAIT:
   1669 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1670 		while (!rf_sparet_wait_queue)
   1671 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1672 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1673 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1674 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1675 
   1676 		/* structure assignment */
   1677 		*((RF_SparetWait_t *) data) = *waitreq;
   1678 
   1679 		RF_Free(waitreq, sizeof(*waitreq));
   1680 		return 0;
   1681 
   1682 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1683 		 * code in it that will cause the dameon to exit */
   1684 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1685 		waitreq = RF_Malloc(sizeof(*waitreq));
   1686 		waitreq->fcol = -1;
   1687 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1688 		waitreq->next = rf_sparet_wait_queue;
   1689 		rf_sparet_wait_queue = waitreq;
   1690 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1691 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1692 		return 0;
   1693 
   1694 		/* used by the spare table daemon to deliver a spare table
   1695 		 * into the kernel */
   1696 	case RAIDFRAME_SEND_SPARET:
   1697 
   1698 		/* install the spare table */
   1699 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1700 
   1701 		/* respond to the requestor.  the return status of the spare
   1702 		 * table installation is passed in the "fcol" field */
   1703 		waitred = RF_Malloc(sizeof(*waitreq));
   1704 		waitreq->fcol = retcode;
   1705 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1706 		waitreq->next = rf_sparet_resp_queue;
   1707 		rf_sparet_resp_queue = waitreq;
   1708 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1709 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1710 
   1711 		return retcode;
   1712 #endif
   1713 	default:
   1714 		/*
   1715 		 * Don't bother trying to load compat modules
   1716 		 * if it is not our ioctl. This is more efficient
   1717 		 * and makes rump tests not depend on compat code
   1718 		 */
   1719 		if (IOCGROUP(cmd) != 'r')
   1720 			break;
   1721 #ifdef _LP64
   1722 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1723 			module_autoload("compat_netbsd32_raid",
   1724 			    MODULE_CLASS_EXEC);
   1725 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1726 			    (rs, cmd, data), enosys(), retcode);
   1727 			if (retcode != EPASSTHROUGH)
   1728 				return retcode;
   1729 		}
   1730 #endif
   1731 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1732 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1733 		    (rs, cmd, data), enosys(), retcode);
   1734 		if (retcode != EPASSTHROUGH)
   1735 			return retcode;
   1736 
   1737 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1738 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1739 		    (rs, cmd, data), enosys(), retcode);
   1740 		if (retcode != EPASSTHROUGH)
   1741 			return retcode;
   1742 		break; /* fall through to the os-specific code below */
   1743 
   1744 	}
   1745 
   1746 	if (!raidPtr->valid)
   1747 		return EINVAL;
   1748 
   1749 	/*
   1750 	 * Add support for "regular" device ioctls here.
   1751 	 */
   1752 
   1753 	switch (cmd) {
   1754 	case DIOCGCACHE:
   1755 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1756 		break;
   1757 
   1758 	case DIOCCACHESYNC:
   1759 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1760 		break;
   1761 
   1762 	default:
   1763 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1764 		break;
   1765 	}
   1766 
   1767 	return retcode;
   1768 
   1769 }
   1770 
   1771 
   1772 /* raidinit -- complete the rest of the initialization for the
   1773    RAIDframe device.  */
   1774 
   1775 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: free the cfdata and leave the unit
		 * uninitialized (RAIDF_INITED never gets set). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* First-come-first-served queue; freed in raid_detach_unlocked. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usuable */
	rs->sc_flags |= RAIDF_INITED;

	/* Wedge discovery comes last, once the unit is fully usable. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1831 
   1832 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1833 /* wake up the daemon & tell it to get us a spare table
   1834  * XXX
   1835  * the entries in the queues should be tagged with the raidPtr
   1836  * so that in the extremely rare case that two recons happen at once,
   1837  * we know for which device were requesting a spare table
   1838  * XXX
   1839  *
   1840  * XXX This code is not currently used. GO
   1841  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue the request for the daemon and wake it up. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Sleep until the daemon posts a response on the resp queue. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response; note that 'req' now points at the response
	 * entry, not the request we queued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The response's fcol field carries the result code. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1865 #endif
   1866 
   1867 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1868  * bp & passes it down.
   1869  * any calls originating in the kernel must use non-blocking I/O
   1870  * do some extra sanity checking to return "appropriate" error values for
   1871  * certain conditions (to make some standard utilities work)
   1872  *
   1873  * Formerly known as: rf_DoAccessKernel
   1874  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the raid mutex while updating the component
		 * labels, then re-take it to decrement the counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to issue I/O until the unit is fully configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk layer to start processing queued buffers. */
	dk_start(dksc, NULL);
}
   1901 
   1902 static int
   1903 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1904 {
   1905 	RF_SectorCount_t num_blocks, pb, sum;
   1906 	RF_RaidAddr_t raid_addr;
   1907 	daddr_t blocknum;
   1908 	int     do_async;
   1909 	int rc;
   1910 
   1911 	rf_lock_mutex2(raidPtr->mutex);
   1912 	if (raidPtr->openings == 0) {
   1913 		rf_unlock_mutex2(raidPtr->mutex);
   1914 		return EAGAIN;
   1915 	}
   1916 	rf_unlock_mutex2(raidPtr->mutex);
   1917 
   1918 	blocknum = bp->b_rawblkno;
   1919 
   1920 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1921 		    (int) blocknum));
   1922 
   1923 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1924 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1925 
   1926 	/* *THIS* is where we adjust what block we're going to...
   1927 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1928 	raid_addr = blocknum;
   1929 
   1930 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1931 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1932 	sum = raid_addr + num_blocks + pb;
   1933 	if (1 || rf_debugKernelAccess) {
   1934 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1935 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1936 			    (int) pb, (int) bp->b_resid));
   1937 	}
   1938 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1939 	    || (sum < num_blocks) || (sum < pb)) {
   1940 		rc = ENOSPC;
   1941 		goto done;
   1942 	}
   1943 	/*
   1944 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1945 	 */
   1946 
   1947 	if (bp->b_bcount & raidPtr->sectorMask) {
   1948 		rc = ENOSPC;
   1949 		goto done;
   1950 	}
   1951 	db1_printf(("Calling DoAccess..\n"));
   1952 
   1953 
   1954 	rf_lock_mutex2(raidPtr->mutex);
   1955 	raidPtr->openings--;
   1956 	rf_unlock_mutex2(raidPtr->mutex);
   1957 
   1958 	/*
   1959 	 * Everything is async.
   1960 	 */
   1961 	do_async = 1;
   1962 
   1963 	/* don't ever condition on bp->b_flags & B_WRITE.
   1964 	 * always condition on B_READ instead */
   1965 
   1966 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1967 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1968 			 do_async, raid_addr, num_blocks,
   1969 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1970 
   1971 done:
   1972 	return rc;
   1973 }
   1974 
   1975 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1976 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal I/O
		 * completion handler. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf; InitBP() installs KernelWakeupFunc as
		 * the buf's completion callback (b_iodone). */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2050 /* this is the callback function associated with a I/O invoked from
   2051    kernel code.
   2052  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private when the
	 * buf was dispatched (see InitBP/rf_DispatchKernelIO). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2119 
   2120 
   2121 /*
   2122  * initialize a buf structure for doing an I/O in the kernel.
   2123  */
   2124 static void
   2125 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2126        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2127        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2128 {
   2129 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2130 	bp->b_oflags = 0;
   2131 	bp->b_cflags = 0;
   2132 	bp->b_bcount = numSect << logBytesPerSector;
   2133 	bp->b_bufsize = bp->b_bcount;
   2134 	bp->b_error = 0;
   2135 	bp->b_dev = dev;
   2136 	bp->b_data = bf;
   2137 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2138 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2139 	if (bp->b_bcount == 0) {
   2140 		panic("bp->b_bcount is zero in InitBP!!");
   2141 	}
   2142 	bp->b_iodone = cbFunc;
   2143 	bp->b_private = cbArg;
   2144 }
   2145 
   2146 /*
   2147  * Wait interruptibly for an exclusive lock.
   2148  *
   2149  * XXX
   2150  * Several drivers do this; it should be abstracted and made MP-safe.
   2151  * (Hmm... where have we seen this warning before :->  GO )
   2152  */
   2153 static int
   2154 raidlock(struct raid_softc *rs)
   2155 {
   2156 	int     error;
   2157 
   2158 	error = 0;
   2159 	mutex_enter(&rs->sc_mutex);
   2160 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2161 		rs->sc_flags |= RAIDF_WANTED;
   2162 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2163 		if (error != 0)
   2164 			goto done;
   2165 	}
   2166 	rs->sc_flags |= RAIDF_LOCKED;
   2167 done:
   2168 	mutex_exit(&rs->sc_mutex);
   2169 	return error;
   2170 }
   2171 /*
   2172  * Unlock and wake up any waiters.
   2173  */
   2174 static void
   2175 raidunlock(struct raid_softc *rs)
   2176 {
   2177 
   2178 	mutex_enter(&rs->sc_mutex);
   2179 	rs->sc_flags &= ~RAIDF_LOCKED;
   2180 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2181 		rs->sc_flags &= ~RAIDF_WANTED;
   2182 		cv_broadcast(&rs->sc_cv);
   2183 	}
   2184 	mutex_exit(&rs->sc_mutex);
   2185 }
   2186 
   2187 
   2188 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2189 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2190 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2191 
static daddr_t
rf_component_info_offset(void)
{

	/* Byte offset of the component label area from the start of
	 * the component (see RF_COMPONENT_INFO_OFFSET above). */
	return RF_COMPONENT_INFO_OFFSET;
}
   2198 
   2199 static daddr_t
   2200 rf_component_info_size(unsigned secsize)
   2201 {
   2202 	daddr_t info_size;
   2203 
   2204 	KASSERT(secsize);
   2205 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2206 		info_size = secsize;
   2207 	else
   2208 		info_size = RF_COMPONENT_INFO_SIZE;
   2209 
   2210 	return info_size;
   2211 }
   2212 
   2213 static daddr_t
   2214 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2215 {
   2216 	daddr_t map_offset;
   2217 
   2218 	KASSERT(raidPtr->bytesPerSector);
   2219 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2220 		map_offset = raidPtr->bytesPerSector;
   2221 	else
   2222 		map_offset = RF_COMPONENT_INFO_SIZE;
   2223 	map_offset += rf_component_info_offset();
   2224 
   2225 	return map_offset;
   2226 }
   2227 
   2228 static daddr_t
   2229 rf_parity_map_size(RF_Raid_t *raidPtr)
   2230 {
   2231 	daddr_t map_size;
   2232 
   2233 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2234 		map_size = raidPtr->bytesPerSector;
   2235 	else
   2236 		map_size = RF_PARITY_MAP_SIZE;
   2237 
   2238 	return map_size;
   2239 }
   2240 
   2241 int
   2242 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2243 {
   2244 	RF_ComponentLabel_t *clabel;
   2245 
   2246 	clabel = raidget_component_label(raidPtr, col);
   2247 	clabel->clean = RF_RAID_CLEAN;
   2248 	raidflush_component_label(raidPtr, col);
   2249 	return(0);
   2250 }
   2251 
   2252 
   2253 int
   2254 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2255 {
   2256 	RF_ComponentLabel_t *clabel;
   2257 
   2258 	clabel = raidget_component_label(raidPtr, col);
   2259 	clabel->clean = RF_RAID_DIRTY;
   2260 	raidflush_component_label(raidPtr, col);
   2261 	return(0);
   2262 }
   2263 
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Read the on-disk component label for column 'col' into the
	 * in-core copy kept in raid_cinfo[col].ci_label. */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2273 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return the in-core component label for 'col'; callers modify
	 * it and then write it out with raidflush_component_label(). */
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2279 
   2280 int
   2281 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2282 {
   2283 	RF_ComponentLabel_t *label;
   2284 
   2285 	label = &raidPtr->raid_cinfo[col].ci_label;
   2286 	label->mod_counter = raidPtr->mod_counter;
   2287 #ifndef RF_NO_PARITY_MAP
   2288 	label->parity_map_modcount = label->mod_counter;
   2289 #endif
   2290 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2291 	    raidPtr->Disks[col].dev,
   2292 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2293 }
   2294 
   2295 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Read the component label from its reserved area on 'dev'.
	 * Returns 0 on success or an errno on failure. */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2305 
   2306 /* ARGSUSED */
/*
 * Read 'dsize' bytes at byte offset 'offset' from the raw component and
 * copy the first 'msize' bytes into 'data'.  Synchronous.  Returns 0 on
 * success, EINVAL if the component has no vnode, or the error from
 * biowait().
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read and wait for it to complete. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Copy out only the 'msize' bytes the caller asked for. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2343 
   2344 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Write 'clabel' to the component's reserved label area,
	 * synchronously (asyncp == 0).  Returns 0 or an errno. */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2354 
   2355 /* ARGSUSED */
/*
 * Write 'msize' bytes from 'data' into a zero-padded area of 'dsize'
 * bytes at byte offset 'offset' on the raw component.  If 'asyncp' is
 * set the write is issued B_ASYNC and this returns immediately with 0;
 * otherwise it waits for completion and returns the biowait() error.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-pad so the write covers the full 'dsize' area even when
	 * 'msize' is smaller. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): the async path returns without biowait()
		 * or brelse(); presumably the B_ASYNC completion path
		 * releases the buffer -- confirm. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2389 
   2390 void
   2391 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2392 {
   2393 	int c;
   2394 
   2395 	for (c = 0; c < raidPtr->numCol; c++) {
   2396 		/* Skip dead disks. */
   2397 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2398 			continue;
   2399 		/* XXXjld: what if an error occurs here? */
   2400 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2401 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2402 		    RF_PARITYMAP_NBYTE,
   2403 		    rf_parity_map_offset(raidPtr),
   2404 		    rf_parity_map_size(raidPtr), 0);
   2405 	}
   2406 }
   2407 
   2408 void
   2409 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2410 {
   2411 	struct rf_paritymap_ondisk tmp;
   2412 	int c,first;
   2413 
   2414 	first=1;
   2415 	for (c = 0; c < raidPtr->numCol; c++) {
   2416 		/* Skip dead disks. */
   2417 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2418 			continue;
   2419 		raidread_component_area(raidPtr->Disks[c].dev,
   2420 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2421 		    RF_PARITYMAP_NBYTE,
   2422 		    rf_parity_map_offset(raidPtr),
   2423 		    rf_parity_map_size(raidPtr));
   2424 		if (first) {
   2425 			memcpy(map, &tmp, sizeof(*map));
   2426 			first = 0;
   2427 		} else {
   2428 			rf_paritymap_merge(map, &tmp);
   2429 		}
   2430 	}
   2431 }
   2432 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the label modification counter; it is stamped into each
	 * label by raidflush_component_label(). */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Now do the same for any spares that are in use. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column maps to this spare,
			   scol remains -1 (or a stale value) here --
			   confirm that cannot happen for a disk in
			   rf_ds_used_spare state. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2492 
   2493 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* Bump the label modification counter; it is stamped into each
	   label by raidflush_component_label(). */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only set the clean bit on the final update, and
			   only when parity is known good. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2571 
   2572 void
   2573 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2574 {
   2575 
   2576 	if (vp != NULL) {
   2577 		if (auto_configured == 1) {
   2578 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2579 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2580 			vput(vp);
   2581 
   2582 		} else {
   2583 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2584 		}
   2585 	}
   2586 }
   2587 
   2588 
   2589 void
   2590 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2591 {
   2592 	int r,c;
   2593 	struct vnode *vp;
   2594 	int acd;
   2595 
   2596 
   2597 	/* We take this opportunity to close the vnodes like we should.. */
   2598 
   2599 	for (c = 0; c < raidPtr->numCol; c++) {
   2600 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2601 		acd = raidPtr->Disks[c].auto_configured;
   2602 		rf_close_component(raidPtr, vp, acd);
   2603 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2604 		raidPtr->Disks[c].auto_configured = 0;
   2605 	}
   2606 
   2607 	for (r = 0; r < raidPtr->numSpare; r++) {
   2608 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2609 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2610 		rf_close_component(raidPtr, vp, acd);
   2611 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2612 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2613 	}
   2614 }
   2615 
   2616 
/* Kernel thread body: fail the requested component (optionally kicking
 * off reconstruction, per RF_FDFLAGS_RECON) and exit. */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* The second argument selects whether reconstruction is
	 * requested -- see rf_FailDisk(). */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was allocated by our creator; we own it now. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2638 
/* Kernel thread body: rewrite the array's parity via rf_RewriteParity()
 * and mark parity good on success, then exit. */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2671 
   2672 
/* Kernel thread body: run rf_CopybackReconstructedData() with
 * copyback_in_progress set around it, then exit. */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2687 
   2688 
/* Kernel thread body: reconstruct the named column in place via
 * rf_ReconstructInPlace(), free the request, and exit. */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* The request was allocated by our creator; we own it now. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2706 
   2707 static RF_AutoConfig_t *
   2708 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2709     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2710     unsigned secsize)
   2711 {
   2712 	int good_one = 0;
   2713 	RF_ComponentLabel_t *clabel;
   2714 	RF_AutoConfig_t *ac;
   2715 
   2716 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
   2717 
   2718 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2719 		/* Got the label.  Does it look reasonable? */
   2720 		if (rf_reasonable_label(clabel, numsecs) &&
   2721 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2722 #ifdef DEBUG
   2723 			printf("Component on: %s: %llu\n",
   2724 				cname, (unsigned long long)size);
   2725 			rf_print_component_label(clabel);
   2726 #endif
   2727 			/* if it's reasonable, add it, else ignore it. */
   2728 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2729 				M_WAITOK);
   2730 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2731 			ac->dev = dev;
   2732 			ac->vp = vp;
   2733 			ac->clabel = clabel;
   2734 			ac->next = ac_list;
   2735 			ac_list = ac;
   2736 			good_one = 1;
   2737 		}
   2738 	}
   2739 	if (!good_one) {
   2740 		/* cleanup */
   2741 		free(clabel, M_RAIDFRAME);
   2742 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2743 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2744 		vput(vp);
   2745 	}
   2746 	return ac_list;
   2747 }
   2748 
   2749 RF_AutoConfig_t *
   2750 rf_find_raid_components(void)
   2751 {
   2752 	struct vnode *vp;
   2753 	struct disklabel label;
   2754 	device_t dv;
   2755 	deviter_t di;
   2756 	dev_t dev;
   2757 	int bmajor, bminor, wedge, rf_part_found;
   2758 	int error;
   2759 	int i;
   2760 	RF_AutoConfig_t *ac_list;
   2761 	uint64_t numsecs;
   2762 	unsigned secsize;
   2763 	int dowedges;
   2764 
   2765 	/* initialize the AutoConfig list */
   2766 	ac_list = NULL;
   2767 
   2768 	/*
   2769 	 * we begin by trolling through *all* the devices on the system *twice*
   2770 	 * first we scan for wedges, second for other devices. This avoids
   2771 	 * using a raw partition instead of a wedge that covers the whole disk
   2772 	 */
   2773 
   2774 	for (dowedges=1; dowedges>=0; --dowedges) {
   2775 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2776 		     dv = deviter_next(&di)) {
   2777 
   2778 			/* we are only interested in disks... */
   2779 			if (device_class(dv) != DV_DISK)
   2780 				continue;
   2781 
   2782 			/* we don't care about floppies... */
   2783 			if (device_is_a(dv, "fd")) {
   2784 				continue;
   2785 			}
   2786 
   2787 			/* we don't care about CD's... */
   2788 			if (device_is_a(dv, "cd")) {
   2789 				continue;
   2790 			}
   2791 
   2792 			/* we don't care about md's... */
   2793 			if (device_is_a(dv, "md")) {
   2794 				continue;
   2795 			}
   2796 
   2797 			/* hdfd is the Atari/Hades floppy driver */
   2798 			if (device_is_a(dv, "hdfd")) {
   2799 				continue;
   2800 			}
   2801 
   2802 			/* fdisa is the Atari/Milan floppy driver */
   2803 			if (device_is_a(dv, "fdisa")) {
   2804 				continue;
   2805 			}
   2806 
   2807 			/* are we in the wedges pass ? */
   2808 			wedge = device_is_a(dv, "dk");
   2809 			if (wedge != dowedges) {
   2810 				continue;
   2811 			}
   2812 
   2813 			/* need to find the device_name_to_block_device_major stuff */
   2814 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2815 
   2816 			rf_part_found = 0; /*No raid partition as yet*/
   2817 
   2818 			/* get a vnode for the raw partition of this disk */
   2819 			bminor = minor(device_unit(dv));
   2820 			dev = wedge ? makedev(bmajor, bminor) :
   2821 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2822 			if (bdevvp(dev, &vp))
   2823 				panic("RAID can't alloc vnode");
   2824 
   2825 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2826 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2827 
   2828 			if (error) {
   2829 				/* "Who cares."  Continue looking
   2830 				   for something that exists*/
   2831 				vput(vp);
   2832 				continue;
   2833 			}
   2834 
   2835 			error = getdisksize(vp, &numsecs, &secsize);
   2836 			if (error) {
   2837 				/*
   2838 				 * Pseudo devices like vnd and cgd can be
   2839 				 * opened but may still need some configuration.
   2840 				 * Ignore these quietly.
   2841 				 */
   2842 				if (error != ENXIO)
   2843 					printf("RAIDframe: can't get disk size"
   2844 					    " for dev %s (%d)\n",
   2845 					    device_xname(dv), error);
   2846 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2847 				vput(vp);
   2848 				continue;
   2849 			}
   2850 			if (wedge) {
   2851 				struct dkwedge_info dkw;
   2852 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2853 				    NOCRED);
   2854 				if (error) {
   2855 					printf("RAIDframe: can't get wedge info for "
   2856 					    "dev %s (%d)\n", device_xname(dv), error);
   2857 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2858 					vput(vp);
   2859 					continue;
   2860 				}
   2861 
   2862 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2863 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2864 					vput(vp);
   2865 					continue;
   2866 				}
   2867 
   2868 				VOP_UNLOCK(vp);
   2869 				ac_list = rf_get_component(ac_list, dev, vp,
   2870 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2871 				rf_part_found = 1; /*There is a raid component on this disk*/
   2872 				continue;
   2873 			}
   2874 
   2875 			/* Ok, the disk exists.  Go get the disklabel. */
   2876 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2877 			if (error) {
   2878 				/*
   2879 				 * XXX can't happen - open() would
   2880 				 * have errored out (or faked up one)
   2881 				 */
   2882 				if (error != ENOTTY)
   2883 					printf("RAIDframe: can't get label for dev "
   2884 					    "%s (%d)\n", device_xname(dv), error);
   2885 			}
   2886 
   2887 			/* don't need this any more.  We'll allocate it again
   2888 			   a little later if we really do... */
   2889 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2890 			vput(vp);
   2891 
   2892 			if (error)
   2893 				continue;
   2894 
   2895 			rf_part_found = 0; /*No raid partitions yet*/
   2896 			for (i = 0; i < label.d_npartitions; i++) {
   2897 				char cname[sizeof(ac_list->devname)];
   2898 
   2899 				/* We only support partitions marked as RAID */
   2900 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2901 					continue;
   2902 
   2903 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2904 				if (bdevvp(dev, &vp))
   2905 					panic("RAID can't alloc vnode");
   2906 
   2907 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2908 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2909 				if (error) {
   2910 					/* Whatever... */
   2911 					vput(vp);
   2912 					continue;
   2913 				}
   2914 				VOP_UNLOCK(vp);
   2915 				snprintf(cname, sizeof(cname), "%s%c",
   2916 				    device_xname(dv), 'a' + i);
   2917 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2918 					label.d_partitions[i].p_size, numsecs, secsize);
   2919 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2920 			}
   2921 
   2922 			/*
   2923 			 *If there is no raid component on this disk, either in a
   2924 			 *disklabel or inside a wedge, check the raw partition as well,
   2925 			 *as it is possible to configure raid components on raw disk
   2926 			 *devices.
   2927 			 */
   2928 
   2929 			if (!rf_part_found) {
   2930 				char cname[sizeof(ac_list->devname)];
   2931 
   2932 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2933 				if (bdevvp(dev, &vp))
   2934 					panic("RAID can't alloc vnode");
   2935 
   2936 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2937 
   2938 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2939 				if (error) {
   2940 					/* Whatever... */
   2941 					vput(vp);
   2942 					continue;
   2943 				}
   2944 				VOP_UNLOCK(vp);
   2945 				snprintf(cname, sizeof(cname), "%s%c",
   2946 				    device_xname(dv), 'a' + RAW_PART);
   2947 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2948 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2949 			}
   2950 		}
   2951 		deviter_release(&di);
   2952 	}
   2953 	return ac_list;
   2954 }
   2955 
   2956 
   2957 int
   2958 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2959 {
   2960 
   2961 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2962 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2963 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2964 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2965 	    clabel->row >=0 &&
   2966 	    clabel->column >= 0 &&
   2967 	    clabel->num_rows > 0 &&
   2968 	    clabel->num_columns > 0 &&
   2969 	    clabel->row < clabel->num_rows &&
   2970 	    clabel->column < clabel->num_columns &&
   2971 	    clabel->blockSize > 0 &&
   2972 	    /*
   2973 	     * numBlocksHi may contain garbage, but it is ok since
   2974 	     * the type is unsigned.  If it is really garbage,
   2975 	     * rf_fix_old_label_size() will fix it.
   2976 	     */
   2977 	    rf_component_label_numblocks(clabel) > 0) {
   2978 		/*
   2979 		 * label looks reasonable enough...
   2980 		 * let's make sure it has no old garbage.
   2981 		 */
   2982 		if (numsecs)
   2983 			rf_fix_old_label_size(clabel, numsecs);
   2984 		return(1);
   2985 	}
   2986 	return(0);
   2987 }
   2988 
   2989 
   2990 /*
   2991  * For reasons yet unknown, some old component labels have garbage in
   2992  * the newer numBlocksHi region, and this causes lossage.  Since those
   2993  * disks will also have numsecs set to less than 32 bits of sectors,
   2994  * we can determine when this corruption has occurred, and fix it.
   2995  *
   2996  * The exact same problem, with the same unknown reason, happens to
   2997  * the partitionSizeHi member as well.
   2998  */
   2999 static void
   3000 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3001 {
   3002 
   3003 	if (numsecs < ((uint64_t)1 << 32)) {
   3004 		if (clabel->numBlocksHi) {
   3005 			printf("WARNING: total sectors < 32 bits, yet "
   3006 			       "numBlocksHi set\n"
   3007 			       "WARNING: resetting numBlocksHi to zero.\n");
   3008 			clabel->numBlocksHi = 0;
   3009 		}
   3010 
   3011 		if (clabel->partitionSizeHi) {
   3012 			printf("WARNING: total sectors < 32 bits, yet "
   3013 			       "partitionSizeHi set\n"
   3014 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3015 			clabel->partitionSizeHi = 0;
   3016 		}
   3017 	}
   3018 }
   3019 
   3020 
#ifdef DEBUG
/*
 * Dump the fields of a component label to the console.
 * Debug-only helper, compiled in only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* indexed by root_partition, masked to 2 bits below */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3054 
   3055 RF_ConfigSet_t *
   3056 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3057 {
   3058 	RF_AutoConfig_t *ac;
   3059 	RF_ConfigSet_t *config_sets;
   3060 	RF_ConfigSet_t *cset;
   3061 	RF_AutoConfig_t *ac_next;
   3062 
   3063 
   3064 	config_sets = NULL;
   3065 
   3066 	/* Go through the AutoConfig list, and figure out which components
   3067 	   belong to what sets.  */
   3068 	ac = ac_list;
   3069 	while(ac!=NULL) {
   3070 		/* we're going to putz with ac->next, so save it here
   3071 		   for use at the end of the loop */
   3072 		ac_next = ac->next;
   3073 
   3074 		if (config_sets == NULL) {
   3075 			/* will need at least this one... */
   3076 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3077 				       M_RAIDFRAME, M_WAITOK);
   3078 			/* this one is easy :) */
   3079 			config_sets->ac = ac;
   3080 			config_sets->next = NULL;
   3081 			config_sets->rootable = 0;
   3082 			ac->next = NULL;
   3083 		} else {
   3084 			/* which set does this component fit into? */
   3085 			cset = config_sets;
   3086 			while(cset!=NULL) {
   3087 				if (rf_does_it_fit(cset, ac)) {
   3088 					/* looks like it matches... */
   3089 					ac->next = cset->ac;
   3090 					cset->ac = ac;
   3091 					break;
   3092 				}
   3093 				cset = cset->next;
   3094 			}
   3095 			if (cset==NULL) {
   3096 				/* didn't find a match above... new set..*/
   3097 				cset = malloc(sizeof(RF_ConfigSet_t),
   3098 					       M_RAIDFRAME, M_WAITOK);
   3099 				cset->ac = ac;
   3100 				ac->next = NULL;
   3101 				cset->next = config_sets;
   3102 				cset->rootable = 0;
   3103 				config_sets = cset;
   3104 			}
   3105 		}
   3106 		ac = ac_next;
   3107 	}
   3108 
   3109 
   3110 	return(config_sets);
   3111 }
   3112 
   3113 static int
   3114 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3115 {
   3116 	RF_ComponentLabel_t *clabel1, *clabel2;
   3117 
   3118 	/* If this one matches the *first* one in the set, that's good
   3119 	   enough, since the other members of the set would have been
   3120 	   through here too... */
   3121 	/* note that we are not checking partitionSize here..
   3122 
   3123 	   Note that we are also not checking the mod_counters here.
   3124 	   If everything else matches except the mod_counter, that's
   3125 	   good enough for this test.  We will deal with the mod_counters
   3126 	   a little later in the autoconfiguration process.
   3127 
   3128 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3129 
   3130 	   The reason we don't check for this is that failed disks
   3131 	   will have lower modification counts.  If those disks are
   3132 	   not added to the set they used to belong to, then they will
   3133 	   form their own set, which may result in 2 different sets,
   3134 	   for example, competing to be configured at raid0, and
   3135 	   perhaps competing to be the root filesystem set.  If the
   3136 	   wrong ones get configured, or both attempt to become /,
   3137 	   weird behaviour and or serious lossage will occur.  Thus we
   3138 	   need to bring them into the fold here, and kick them out at
   3139 	   a later point.
   3140 
   3141 	*/
   3142 
   3143 	clabel1 = cset->ac->clabel;
   3144 	clabel2 = ac->clabel;
   3145 	if ((clabel1->version == clabel2->version) &&
   3146 	    (clabel1->serial_number == clabel2->serial_number) &&
   3147 	    (clabel1->num_rows == clabel2->num_rows) &&
   3148 	    (clabel1->num_columns == clabel2->num_columns) &&
   3149 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3150 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3151 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3152 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3153 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3154 	    (clabel1->blockSize == clabel2->blockSize) &&
   3155 	    rf_component_label_numblocks(clabel1) ==
   3156 	    rf_component_label_numblocks(clabel2) &&
   3157 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3158 	    (clabel1->root_partition == clabel2->root_partition) &&
   3159 	    (clabel1->last_unit == clabel2->last_unit) &&
   3160 	    (clabel1->config_order == clabel2->config_order)) {
   3161 		/* if it get's here, it almost *has* to be a match */
   3162 	} else {
   3163 		/* it's not consistent with somebody in the set..
   3164 		   punt */
   3165 		return(0);
   3166 	}
   3167 	/* all was fine.. it must fit... */
   3168 	return(1);
   3169 }
   3170 
/*
 * Decide whether a config set has enough live components to be worth
 * configuring.  "Live" means: present, and carrying the highest
 * mod_counter seen in the set (stale components don't count).
 * Returns 1 if the set can be configured, 0 if too much is missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum over all members: failed components lag behind) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* Walk every column; for each, look for a component at that
	   column with the winning mod_counter. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did the odd component of a
				   mirror pair, and we didn't bail..
				   reset the even_pair_failed flag,
				   and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   at most one.  (RAID 1 was fully handled pairwise above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3273 
   3274 void
   3275 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3276 			RF_Raid_t *raidPtr)
   3277 {
   3278 	RF_ComponentLabel_t *clabel;
   3279 	int i;
   3280 
   3281 	clabel = ac->clabel;
   3282 
   3283 	/* 1. Fill in the common stuff */
   3284 	config->numCol = clabel->num_columns;
   3285 	config->numSpare = 0; /* XXX should this be set here? */
   3286 	config->sectPerSU = clabel->sectPerSU;
   3287 	config->SUsPerPU = clabel->SUsPerPU;
   3288 	config->SUsPerRU = clabel->SUsPerRU;
   3289 	config->parityConfig = clabel->parityConfig;
   3290 	/* XXX... */
   3291 	strcpy(config->diskQueueType,"fifo");
   3292 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3293 	config->layoutSpecificSize = 0; /* XXX ?? */
   3294 
   3295 	while(ac!=NULL) {
   3296 		/* row/col values will be in range due to the checks
   3297 		   in reasonable_label() */
   3298 		strcpy(config->devnames[0][ac->clabel->column],
   3299 		       ac->devname);
   3300 		ac = ac->next;
   3301 	}
   3302 
   3303 	for(i=0;i<RF_MAXDBGV;i++) {
   3304 		config->debugVars[i][0] = 0;
   3305 	}
   3306 }
   3307 
   3308 int
   3309 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3310 {
   3311 	RF_ComponentLabel_t *clabel;
   3312 	int column;
   3313 	int sparecol;
   3314 
   3315 	raidPtr->autoconfigure = new_value;
   3316 
   3317 	for(column=0; column<raidPtr->numCol; column++) {
   3318 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3319 			clabel = raidget_component_label(raidPtr, column);
   3320 			clabel->autoconfigure = new_value;
   3321 			raidflush_component_label(raidPtr, column);
   3322 		}
   3323 	}
   3324 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3325 		sparecol = raidPtr->numCol + column;
   3326 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3327 			clabel = raidget_component_label(raidPtr, sparecol);
   3328 			clabel->autoconfigure = new_value;
   3329 			raidflush_component_label(raidPtr, sparecol);
   3330 		}
   3331 	}
   3332 	return(new_value);
   3333 }
   3334 
   3335 int
   3336 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3337 {
   3338 	RF_ComponentLabel_t *clabel;
   3339 	int column;
   3340 	int sparecol;
   3341 
   3342 	raidPtr->root_partition = new_value;
   3343 	for(column=0; column<raidPtr->numCol; column++) {
   3344 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3345 			clabel = raidget_component_label(raidPtr, column);
   3346 			clabel->root_partition = new_value;
   3347 			raidflush_component_label(raidPtr, column);
   3348 		}
   3349 	}
   3350 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3351 		sparecol = raidPtr->numCol + column;
   3352 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3353 			clabel = raidget_component_label(raidPtr, sparecol);
   3354 			clabel->root_partition = new_value;
   3355 			raidflush_component_label(raidPtr, sparecol);
   3356 		}
   3357 	}
   3358 	return(new_value);
   3359 }
   3360 
/*
 * Release the vnode of every component in a config set.  Each vp was
 * opened during the autoconfig probe; VOP_CLOSE() requires the vnode
 * locked, and vput() then unlocks it and drops the reference.
 */
void
rf_release_all_vps(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;

	ac = cset->ac;
	while(ac!=NULL) {
		/* Close the vp, and give it back */
		if (ac->vp) {
			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
			vput(ac->vp);
			/* clear so we can't close/release it twice */
			ac->vp = NULL;
		}
		ac = ac->next;
	}
}
   3378 
   3379 
   3380 void
   3381 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3382 {
   3383 	RF_AutoConfig_t *ac;
   3384 	RF_AutoConfig_t *next_ac;
   3385 
   3386 	ac = cset->ac;
   3387 	while(ac!=NULL) {
   3388 		next_ac = ac->next;
   3389 		/* nuke the label */
   3390 		free(ac->clabel, M_RAIDFRAME);
   3391 		/* cleanup the config structure */
   3392 		free(ac, M_RAIDFRAME);
   3393 		/* "next.." */
   3394 		ac = next_ac;
   3395 	}
   3396 	/* and, finally, nuke the config set */
   3397 	free(cset, M_RAIDFRAME);
   3398 }
   3399 
   3400 
/*
 * Populate a component label from the current state of the RAID set.
 * Caller supplies the label storage; column/row of the particular
 * component are filled in elsewhere.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* modern sets are always a single row of columns */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3433 
/*
 * Autoconfigure one config set: pick a raid unit number (preferring
 * the one it was last configured as), build an RF_Config_t from the
 * component labels, and configure the set.  Returns the softc on
 * success, NULL on failure (the softc reference is dropped then).
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* scan upward from last_unit until a free (invalid) unit is
	   found, or raidget() returns NULL */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3505 
/*
 * Initialize a pool(9) for fixed-size RAIDframe objects.  w_chan is
 * the pool name / wait channel; xmin items are pre-allocated into the
 * pool and xmax sets the high-water mark.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3515 
   3516 /*
   3517  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3518  * to see if there is IO pending and if that IO could possibly be done
   3519  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3520  * otherwise.
   3521  *
   3522  */
   3523 int
   3524 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3525 {
   3526 	struct raid_softc *rs;
   3527 	struct dk_softc *dksc;
   3528 
   3529 	rs = raidPtr->softc;
   3530 	dksc = &rs->sc_dksc;
   3531 
   3532 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3533 		return 1;
   3534 
   3535 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3536 		/* there is work to do */
   3537 		return 0;
   3538 	}
   3539 	/* default is nothing to do */
   3540 	return 1;
   3541 }
   3542 
   3543 int
   3544 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3545 {
   3546 	uint64_t numsecs;
   3547 	unsigned secsize;
   3548 	int error;
   3549 
   3550 	error = getdisksize(vp, &numsecs, &secsize);
   3551 	if (error == 0) {
   3552 		diskPtr->blockSize = secsize;
   3553 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3554 		diskPtr->partitionSize = numsecs;
   3555 		return 0;
   3556 	}
   3557 	return error;
   3558 }
   3559 
/*
 * Autoconfiguration match routine: raid(4) is a pseudo-device, so it
 * always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3565 
/*
 * Autoconfiguration attach routine.  All real setup happens when a
 * unit is actually configured, so there is nothing to do here.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3570 
   3571 
/*
 * Autoconfiguration detach routine: tear the unit down under the
 * softc lock, then drop the softc reference on success.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	/* no softc: nothing was ever set up for this device */
	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return error;

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3598 
/*
 * Fill in a synthetic disk geometry for the RAID set and register it
 * with the disk(9) layer.  Sector size and total size are real; the
 * sectors-per-track / tracks split is fabricated from the layout.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* arbitrary choice: 4 tracks per column */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3614 
   3615 /*
   3616  * Get cache info for all the components (including spares).
   3617  * Returns intersection of all the cache flags of all disks, or first
   3618  * error if any encountered.
   3619  * XXXfua feature flags can change as spares are added - lock down somehow
   3620  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* walk data components and spares alike */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl) is returned
				   but not logged */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* combine flags across components */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3658 
   3659 /*
   3660  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3661  * We end up returning whatever error was returned by the first cache flush
   3662  * that fails.
   3663  */
   3664 
/*
 * Issue DIOCCACHESYNC to one component, retrying up to 5 times on
 * failure.  Returns 0 on success, ENODEV immediately (component has
 * no cache-sync ioctl), or the last error once retries are exhausted.
 */
static int
rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
{
	int e = 0;
	for (int i = 0; i < 5; i++) {
		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
		    &force, FWRITE, NOCRED);
		if (!e || e == ENODEV)
			return e;
		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
	}
	return e;
}
   3679 
   3680 int
   3681 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3682 {
   3683 	int c, error;
   3684 
   3685 	error = 0;
   3686 	for (c = 0; c < raidPtr->numCol; c++) {
   3687 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3688 			int e = rf_sync_component_cache(raidPtr, c, force);
   3689 			if (e && !error)
   3690 				error = e;
   3691 		}
   3692 	}
   3693 
   3694 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3695 		int sparecol = raidPtr->numCol + c;
   3696 		/* Need to ensure that the reconstruct actually completed! */
   3697 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3698 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3699 			    force);
   3700 			if (e && !error)
   3701 				error = e;
   3702 		}
   3703 	}
   3704 	return error;
   3705 }
   3706 
   3707 /* Fill in info with the current status */
   3708 void
   3709 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3710 {
   3711 
   3712 	if (raidPtr->status != rf_rs_reconstructing) {
   3713 		info->total = 100;
   3714 		info->completed = 100;
   3715 	} else {
   3716 		info->total = raidPtr->reconControl->numRUsTotal;
   3717 		info->completed = raidPtr->reconControl->numRUsComplete;
   3718 	}
   3719 	info->remaining = info->total - info->completed;
   3720 }
   3721 
   3722 /* Fill in info with the current status */
   3723 void
   3724 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3725 {
   3726 
   3727 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3728 		info->total = raidPtr->Layout.numStripe;
   3729 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3730 	} else {
   3731 		info->completed = 100;
   3732 		info->total = 100;
   3733 	}
   3734 	info->remaining = info->total - info->completed;
   3735 }
   3736 
   3737 /* Fill in info with the current status */
   3738 void
   3739 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3740 {
   3741 
   3742 	if (raidPtr->copyback_in_progress == 1) {
   3743 		info->total = raidPtr->Layout.numStripe;
   3744 		info->completed = raidPtr->copyback_stripes_done;
   3745 		info->remaining = info->total - info->completed;
   3746 	} else {
   3747 		info->remaining = 0;
   3748 		info->completed = 100;
   3749 		info->total = 100;
   3750 	}
   3751 }
   3752 
   3753 /* Fill in config with the current info */
   3754 int
   3755 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3756 {
   3757 	int	d, i, j;
   3758 
   3759 	if (!raidPtr->valid)
   3760 		return ENODEV;
   3761 	config->cols = raidPtr->numCol;
   3762 	config->ndevs = raidPtr->numCol;
   3763 	if (config->ndevs >= RF_MAX_DISKS)
   3764 		return ENOMEM;
   3765 	config->nspares = raidPtr->numSpare;
   3766 	if (config->nspares >= RF_MAX_DISKS)
   3767 		return ENOMEM;
   3768 	config->maxqdepth = raidPtr->maxQueueDepth;
   3769 	d = 0;
   3770 	for (j = 0; j < config->cols; j++) {
   3771 		config->devs[d] = raidPtr->Disks[j];
   3772 		d++;
   3773 	}
   3774 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3775 		config->spares[i] = raidPtr->Disks[j];
   3776 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3777 			/* XXX: raidctl(8) expects to see this as a used spare */
   3778 			config->spares[i].status = rf_ds_used_spare;
   3779 		}
   3780 	}
   3781 	return 0;
   3782 }
   3783 
   3784 int
   3785 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3786 {
   3787 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3788 	RF_ComponentLabel_t *raid_clabel;
   3789 	int column = clabel->column;
   3790 
   3791 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3792 		return EINVAL;
   3793 	raid_clabel = raidget_component_label(raidPtr, column);
   3794 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3795 
   3796 	return 0;
   3797 }
   3798 
   3799 /*
   3800  * Module interface
   3801  */
   3802 
/* Driver-class module; requires the dk_subr and bufq_fcfs modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a loadable module, supply the cfdriver ourselves. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module control entry point and its load/unload helpers. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3812 
   3813 static int
   3814 raid_modcmd(modcmd_t cmd, void *data)
   3815 {
   3816 	int error;
   3817 
   3818 	error = 0;
   3819 	switch (cmd) {
   3820 	case MODULE_CMD_INIT:
   3821 		error = raid_modcmd_init();
   3822 		break;
   3823 	case MODULE_CMD_FINI:
   3824 		error = raid_modcmd_fini();
   3825 		break;
   3826 	default:
   3827 		error = ENOTTY;
   3828 		break;
   3829 	}
   3830 	return error;
   3831 }
   3832 
/*
 * Module load: create the global raid_lock, attach the devsw entries
 * and autoconf glue, boot the RAIDframe core, and register a finalizer
 * that auto-configures RAID sets after device discovery.  On any
 * attach failure, previously attached pieces are rolled back in
 * reverse order before returning.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table wait/response queues. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 majors ask devsw_attach to look up/allocate the numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated — presumably the devsw is already attached
	 * (e.g. driver also built in); TODO confirm. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver and devsw attaches, in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error may still be EEXIST here (tolerated from
	 * devsw_attach above), in which case the RAIDframe core is NOT
	 * booted — confirm that is the intended behavior.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: auto-configuration just won't happen. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3903 
/*
 * Module unload: undo raid_modcmd_init() in reverse order.  Refuses to
 * unload while any raid device exists.  If a detach step fails, the
 * steps already undone are re-attached so the module stays in a
 * consistent, loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Re-attach the cfattach detached above before bailing. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach cfdriver and cfattach before bailing. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Tear down the spare-table synchronization state. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3953