/* rf_netbsdkintf.c, revision 1.395 (source-browser navigation header removed) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.395 2021/07/23 00:54:45 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.395 2021/07/23 00:54:45 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
    218 const struct bdevsw raid_bdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_strategy = raidstrategy,
    222 	.d_ioctl = raidioctl,
    223 	.d_dump = raiddump,
    224 	.d_psize = raidsize,
    225 	.d_discard = nodiscard,
    226 	.d_flag = D_DISK
    227 };
    228 
    229 const struct cdevsw raid_cdevsw = {
    230 	.d_open = raidopen,
    231 	.d_close = raidclose,
    232 	.d_read = raidread,
    233 	.d_write = raidwrite,
    234 	.d_ioctl = raidioctl,
    235 	.d_stop = nostop,
    236 	.d_tty = notty,
    237 	.d_poll = nopoll,
    238 	.d_mmap = nommap,
    239 	.d_kqfilter = nokqfilter,
    240 	.d_discard = nodiscard,
    241 	.d_flag = D_DISK
    242 };
    243 
    244 static struct dkdriver rf_dkdriver = {
    245 	.d_open = raidopen,
    246 	.d_close = raidclose,
    247 	.d_strategy = raidstrategy,
    248 	.d_diskstart = raid_diskstart,
    249 	.d_dumpblocks = raid_dumpblocks,
    250 	.d_lastclose = raid_lastclose,
    251 	.d_minphys = minphys
    252 };
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /* Internal representation of a rf_recon_req */
    263 struct rf_recon_req_internal {
    264 	RF_RowCol_t col;
    265 	RF_ReconReqFlags_t flags;
    266 	void   *raidPtr;
    267 };
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
    278  * Even it if is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 static void rf_ReconThread(struct rf_recon_req_internal *);
    304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 static int rf_autoconfig(device_t);
    308 static void rf_buildroothack(RF_ConfigSet_t *);
    309 
    310 static RF_AutoConfig_t *rf_find_raid_components(void);
    311 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    313 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    314 static int rf_set_autoconfig(RF_Raid_t *, int);
    315 static int rf_set_rootpartition(RF_Raid_t *, int);
    316 static void rf_release_all_vps(RF_ConfigSet_t *);
    317 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    318 static int rf_have_enough_components(RF_ConfigSet_t *);
    319 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    320 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    321 
    322 /*
    323  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    324  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    325  * in the kernel config file.
    326  */
    327 #ifdef RAID_AUTOCONFIG
    328 int raidautoconfig = 1;
    329 #else
    330 int raidautoconfig = 0;
    331 #endif
    332 static bool raidautoconfigdone = false;
    333 
    334 struct pool rf_alloclist_pool;   /* AllocList */
    335 
    336 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    337 static kmutex_t raid_lock;
    338 
    339 static struct raid_softc *
    340 raidcreate(int unit) {
    341 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    342 	sc->sc_unit = unit;
    343 	cv_init(&sc->sc_cv, "raidunit");
    344 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    345 	return sc;
    346 }
    347 
    348 static void
    349 raiddestroy(struct raid_softc *sc) {
    350 	cv_destroy(&sc->sc_cv);
    351 	mutex_destroy(&sc->sc_mutex);
    352 	kmem_free(sc, sizeof(*sc));
    353 }
    354 
    355 static struct raid_softc *
    356 raidget(int unit, bool create) {
    357 	struct raid_softc *sc;
    358 	if (unit < 0) {
    359 #ifdef DIAGNOSTIC
    360 		panic("%s: unit %d!", __func__, unit);
    361 #endif
    362 		return NULL;
    363 	}
    364 	mutex_enter(&raid_lock);
    365 	LIST_FOREACH(sc, &raids, sc_link) {
    366 		if (sc->sc_unit == unit) {
    367 			mutex_exit(&raid_lock);
    368 			return sc;
    369 		}
    370 	}
    371 	mutex_exit(&raid_lock);
    372 	if (!create)
    373 		return NULL;
    374 	sc = raidcreate(unit);
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
/*
 * Legacy pseudo-device attach entry point; the "num" argument
 * (the configured number of units) is ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
    399 static int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return 0;
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
    442 RF_Raid_t *
    443 rf_get_raid(struct raid_softc *rs) {
    444 	return &rs->sc_r;
    445 }
    446 
    447 int
    448 rf_get_unit(const struct raid_softc *rs) {
    449 	return rs->sc_unit;
    450 }
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname;
    455 	size_t len;
    456 
    457 	/* if bdv is NULL, the set can't contain it. exit early. */
    458 	if (bdv == NULL)
    459 		return 0;
    460 
    461 	bootname = device_xname(bdv);
    462 	len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
/*
 * Walk the list of discovered configuration sets, auto-configure the
 * eligible ones, and then decide whether one of the configured sets
 * should provide the root file system (setting booted_device /
 * booted_method / booted_partition, or RB_ASKNAME when ambiguous).
 * Called from rf_autoconfig(); every set in "config_sets" is consumed
 * here (configured or released, then cleaned up).
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;			/* count of rootable configured sets */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* save the link now; cset is destroyed below */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		/* exactly one rootable set: pick its wedge or device */
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/*
		 * Re-count, keeping only rootable sets that actually
		 * contain the device we booted from.
		 */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    627 
    628 static int
    629 raidsize(dev_t dev)
    630 {
    631 	struct raid_softc *rs;
    632 	struct dk_softc *dksc;
    633 	unsigned int unit;
    634 
    635 	unit = raidunit(dev);
    636 	if ((rs = raidget(unit, false)) == NULL)
    637 		return -1;
    638 	dksc = &rs->sc_dksc;
    639 
    640 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    641 		return -1;
    642 
    643 	return dk_size(dksc, dev);
    644 }
    645 
    646 static int
    647 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    648 {
    649 	unsigned int unit;
    650 	struct raid_softc *rs;
    651 	struct dk_softc *dksc;
    652 
    653 	unit = raidunit(dev);
    654 	if ((rs = raidget(unit, false)) == NULL)
    655 		return ENXIO;
    656 	dksc = &rs->sc_dksc;
    657 
    658 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    659 		return ENODEV;
    660 
    661         /*
    662            Note that blkno is relative to this particular partition.
    663            By adding adding RF_PROTECTED_SECTORS, we get a value that
    664 	   is relative to the partition used for the underlying component.
    665         */
    666 	blkno += RF_PROTECTED_SECTORS;
    667 
    668 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    669 }
    670 
/*
 * Dump "nblk" blocks starting at "blkno" directly onto a live
 * component of a RAID 1 set, bypassing the normal RAIDframe I/O path
 * (this runs during a kernel crash dump).
 *
 * Returns 0 on success; EINVAL when the set is not RAID 1 or no
 * usable component exists; ENXIO when the chosen component has no
 * block device entry; otherwise the error from raidlock() or the
 * component's own d_dump routine.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare replaces */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand off directly to the component's block-device dump routine */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    776 
    777 /* ARGSUSED */
    778 static int
    779 raidopen(dev_t dev, int flags, int fmt,
    780     struct lwp *l)
    781 {
    782 	int     unit = raidunit(dev);
    783 	struct raid_softc *rs;
    784 	struct dk_softc *dksc;
    785 	int     error = 0;
    786 	int     part, pmask;
    787 
    788 	if ((rs = raidget(unit, true)) == NULL)
    789 		return ENXIO;
    790 	if ((error = raidlock(rs)) != 0)
    791 		return error;
    792 
    793 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    794 		error = EBUSY;
    795 		goto bad;
    796 	}
    797 
    798 	dksc = &rs->sc_dksc;
    799 
    800 	part = DISKPART(dev);
    801 	pmask = (1 << part);
    802 
    803 	if (!DK_BUSY(dksc, pmask) &&
    804 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    805 		/* First one... mark things as dirty... Note that we *MUST*
    806 		 have done a configure before this.  I DO NOT WANT TO BE
    807 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    808 		 THAT THEY BELONG TOGETHER!!!!! */
    809 		/* XXX should check to see if we're only open for reading
    810 		   here... If so, we needn't do this, but then need some
    811 		   other way of keeping track of what's happened.. */
    812 
    813 		rf_markalldirty(&rs->sc_r);
    814 	}
    815 
    816 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    817 		error = dk_open(dksc, dev, flags, fmt, l);
    818 
    819 bad:
    820 	raidunlock(rs);
    821 
    822 	return error;
    823 
    824 
    825 }
    826 
    827 static int
    828 raid_lastclose(device_t self)
    829 {
    830 	struct raid_softc *rs = raidsoftc(self);
    831 
    832 	/* Last one... device is not unconfigured yet.
    833 	   Device shutdown has taken care of setting the
    834 	   clean bits if RAIDF_INITED is not set
    835 	   mark things as clean... */
    836 
    837 	rf_update_component_labels(&rs->sc_r,
    838 	    RF_FINAL_COMPONENT_UPDATE);
    839 
    840 	/* pass to unlocked code */
    841 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    842 		rs->sc_flags |= RAIDF_DETACH;
    843 
    844 	return 0;
    845 }
    846 
    847 /* ARGSUSED */
    848 static int
    849 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    850 {
    851 	int     unit = raidunit(dev);
    852 	struct raid_softc *rs;
    853 	struct dk_softc *dksc;
    854 	cfdata_t cf;
    855 	int     error = 0, do_detach = 0, do_put = 0;
    856 
    857 	if ((rs = raidget(unit, false)) == NULL)
    858 		return ENXIO;
    859 	dksc = &rs->sc_dksc;
    860 
    861 	if ((error = raidlock(rs)) != 0)
    862 		return error;
    863 
    864 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    865 		error = dk_close(dksc, dev, flags, fmt, l);
    866 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    867 			do_detach = 1;
    868 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    869 		do_put = 1;
    870 
    871 	raidunlock(rs);
    872 
    873 	if (do_detach) {
    874 		/* free the pseudo device attach bits */
    875 		cf = device_cfdata(dksc->sc_dev);
    876 		error = config_detach(dksc->sc_dev, 0);
    877 		if (error == 0)
    878 			free(cf, M_RAIDFRAME);
    879 	} else if (do_put) {
    880 		raidput(rs);
    881 	}
    882 
    883 	return error;
    884 
    885 }
    886 
    887 static void
    888 raid_wakeup(RF_Raid_t *raidPtr)
    889 {
    890 	rf_lock_mutex2(raidPtr->iodone_lock);
    891 	rf_signal_cond2(raidPtr->iodone_cv);
    892 	rf_unlock_mutex2(raidPtr->iodone_lock);
    893 }
    894 
    895 static void
    896 raidstrategy(struct buf *bp)
    897 {
    898 	unsigned int unit;
    899 	struct raid_softc *rs;
    900 	struct dk_softc *dksc;
    901 	RF_Raid_t *raidPtr;
    902 
    903 	unit = raidunit(bp->b_dev);
    904 	if ((rs = raidget(unit, false)) == NULL) {
    905 		bp->b_error = ENXIO;
    906 		goto fail;
    907 	}
    908 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    909 		bp->b_error = ENXIO;
    910 		goto fail;
    911 	}
    912 	dksc = &rs->sc_dksc;
    913 	raidPtr = &rs->sc_r;
    914 
    915 	/* Queue IO only */
    916 	if (dk_strategy_defer(dksc, bp))
    917 		goto done;
    918 
    919 	/* schedule the IO to happen at the next convenient time */
    920 	raid_wakeup(raidPtr);
    921 
    922 done:
    923 	return;
    924 
    925 fail:
    926 	bp->b_resid = bp->b_bcount;
    927 	biodone(bp);
    928 }
    929 
    930 static int
    931 raid_diskstart(device_t dev, struct buf *bp)
    932 {
    933 	struct raid_softc *rs = raidsoftc(dev);
    934 	RF_Raid_t *raidPtr;
    935 
    936 	raidPtr = &rs->sc_r;
    937 	if (!raidPtr->valid) {
    938 		db1_printf(("raid is not valid..\n"));
    939 		return ENODEV;
    940 	}
    941 
    942 	/* XXX */
    943 	bp->b_resid = 0;
    944 
    945 	return raiddoaccess(raidPtr, bp);
    946 }
    947 
    948 void
    949 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    950 {
    951 	struct raid_softc *rs;
    952 	struct dk_softc *dksc;
    953 
    954 	rs = raidPtr->softc;
    955 	dksc = &rs->sc_dksc;
    956 
    957 	dk_done(dksc, bp);
    958 
    959 	rf_lock_mutex2(raidPtr->mutex);
    960 	raidPtr->openings++;
    961 	rf_unlock_mutex2(raidPtr->mutex);
    962 
    963 	/* schedule more IO */
    964 	raid_wakeup(raidPtr);
    965 }
    966 
    967 /* ARGSUSED */
    968 static int
    969 raidread(dev_t dev, struct uio *uio, int flags)
    970 {
    971 	int     unit = raidunit(dev);
    972 	struct raid_softc *rs;
    973 
    974 	if ((rs = raidget(unit, false)) == NULL)
    975 		return ENXIO;
    976 
    977 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    978 		return ENXIO;
    979 
    980 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
    981 
    982 }
    983 
    984 /* ARGSUSED */
    985 static int
    986 raidwrite(dev_t dev, struct uio *uio, int flags)
    987 {
    988 	int     unit = raidunit(dev);
    989 	struct raid_softc *rs;
    990 
    991 	if ((rs = raidget(unit, false)) == NULL)
    992 		return ENXIO;
    993 
    994 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    995 		return ENXIO;
    996 
    997 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
    998 
    999 }
   1000 
/*
 * Tear down a configured RAID set: shut RAIDframe down, drain and
 * free the buffer queue, and detach the dk/disk state.
 * Returns EBUSY while the unit is open or a background operation
 * (reconstruction, parity rewrite, copyback) is running.
 * NOTE(review): "unlocked" presumably means the caller already holds
 * the softc lock — confirm at the call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or any background operation is active. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1038 
   1039 static bool
   1040 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1041 {
   1042 	switch (cmd) {
   1043 	case RAIDFRAME_ADD_HOT_SPARE:
   1044 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1045 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1046 	case RAIDFRAME_CHECK_PARITY:
   1047 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1048 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1049 	case RAIDFRAME_CHECK_RECON_STATUS:
   1050 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1051 	case RAIDFRAME_COPYBACK:
   1052 	case RAIDFRAME_DELETE_COMPONENT:
   1053 	case RAIDFRAME_FAIL_DISK:
   1054 	case RAIDFRAME_GET_ACCTOTALS:
   1055 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1056 	case RAIDFRAME_GET_INFO:
   1057 	case RAIDFRAME_GET_SIZE:
   1058 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1059 	case RAIDFRAME_INIT_LABELS:
   1060 	case RAIDFRAME_KEEP_ACCTOTALS:
   1061 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1062 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1063 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1064 	case RAIDFRAME_PARITYMAP_STATUS:
   1065 	case RAIDFRAME_REBUILD_IN_PLACE:
   1066 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1067 	case RAIDFRAME_RESET_ACCTOTALS:
   1068 	case RAIDFRAME_REWRITEPARITY:
   1069 	case RAIDFRAME_SET_AUTOCONFIG:
   1070 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1071 	case RAIDFRAME_SET_ROOT:
   1072 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1073 	}
   1074 	return false;
   1075 }
   1076 
/*
 * Administratively fail a component and kick off a reconstruction
 * thread.  Returns EINVAL for RAID 0 sets, out-of-range columns, or
 * states in which failing the disk would be unsafe (already
 * reconstructing, already degraded, or the target is spared).
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint ownership passes to the recon thread —
	   presumably freed there; confirm in rf_ReconThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1125 
   1126 static int
   1127 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1128 {
   1129 	/* allocate a buffer for the layout-specific data, and copy it in */
   1130 	if (k_cfg->layoutSpecificSize == 0)
   1131 		return 0;
   1132 
   1133 	if (k_cfg->layoutSpecificSize > 10000) {
   1134 	    /* sanity check */
   1135 	    return EINVAL;
   1136 	}
   1137 
   1138 	u_char *specific_buf;
   1139 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1140 	if (specific_buf == NULL)
   1141 		return ENOMEM;
   1142 
   1143 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1144 	    k_cfg->layoutSpecificSize);
   1145 	if (retcode) {
   1146 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1147 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1148 		return retcode;
   1149 	}
   1150 
   1151 	k_cfg->layoutSpecific = specific_buf;
   1152 	return 0;
   1153 }
   1154 
   1155 static int
   1156 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1157 {
   1158 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1159 
   1160 	if (rs->sc_r.valid) {
   1161 		/* There is a valid RAID set running on this unit! */
   1162 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1163 		return EINVAL;
   1164 	}
   1165 
   1166 	/* copy-in the configuration information */
   1167 	/* data points to a pointer to the configuration structure */
   1168 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1169 	if (*k_cfg == NULL) {
   1170 		return ENOMEM;
   1171 	}
   1172 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1173 	if (retcode == 0)
   1174 		return 0;
   1175 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1176 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1177 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1178 	return retcode;
   1179 }
   1180 
/*
 * Configure a RAID set from a kernel copy of the configuration
 * (ownership of k_cfg and its layoutSpecific buffer is taken here;
 * both are freed before returning).  On success the unit is attached
 * and marked dirty; on failure it is flagged for detach-on-close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull in the layout-specific blob before configuring. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1233 
#if RF_DISABLED
/*
 * Overwrite a component label from user-supplied data.  Compiled out
 * (RF_DISABLED); the intended operation is intentionally inert — see
 * the XXX notes below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1272 
   1273 static int
   1274 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1275 {
   1276 	/*
   1277 	   we only want the serial number from
   1278 	   the above.  We get all the rest of the information
   1279 	   from the config that was used to create this RAID
   1280 	   set.
   1281 	   */
   1282 
   1283 	raidPtr->serial_number = clabel->serial_number;
   1284 
   1285 	for (int column = 0; column < raidPtr->numCol; column++) {
   1286 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1287 		if (RF_DEAD_DISK(diskPtr->status))
   1288 			continue;
   1289 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1290 		    raidPtr, column);
   1291 		/* Zeroing this is important. */
   1292 		memset(ci_label, 0, sizeof(*ci_label));
   1293 		raid_init_component_label(raidPtr, ci_label);
   1294 		ci_label->serial_number = raidPtr->serial_number;
   1295 		ci_label->row = 0; /* we dont' pretend to support more */
   1296 		rf_component_label_set_partitionsize(ci_label,
   1297 		    diskPtr->partitionSize);
   1298 		ci_label->column = column;
   1299 		raidflush_component_label(raidPtr, column);
   1300 		/* XXXjld what about the spares? */
   1301 	}
   1302 
   1303 	return 0;
   1304 }
   1305 
/*
 * Rebuild a failed component onto the same disk slot, in place.
 * Validates the column and current disk states under the raid mutex,
 * then hands the request to a dedicated reconstruction thread.
 * Returns EINVAL for RAID 0, out-of-range columns, or unsafe states;
 * ENOMEM if the request copy cannot be allocated.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Copy the request so we don't depend on the caller's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint ownership passes to the thread —
	   presumably freed there; confirm in rf_ReconstructInPlaceThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1373 
   1374 static int
   1375 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1376 {
   1377 	/*
   1378 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1379 	 * so tell the user it's done.
   1380 	 */
   1381 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1382 	    raidPtr->status != rf_rs_reconstructing) {
   1383 		*data = 100;
   1384 		return 0;
   1385 	}
   1386 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1387 		*data = 0;
   1388 		return 0;
   1389 	}
   1390 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1391 	    / raidPtr->reconControl->numRUsTotal);
   1392 	return 0;
   1393 }
   1394 
   1395 static int
   1396 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1397 {
   1398 	int     unit = raidunit(dev);
   1399 	int     part, pmask;
   1400 	struct raid_softc *rs;
   1401 	struct dk_softc *dksc;
   1402 	RF_Config_t *k_cfg;
   1403 	RF_Raid_t *raidPtr;
   1404 	RF_AccTotals_t *totals;
   1405 	RF_SingleComponent_t component;
   1406 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1407 	int retcode = 0;
   1408 	int column;
   1409 	RF_ComponentLabel_t *clabel;
   1410 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1411 	int d;
   1412 
   1413 	if ((rs = raidget(unit, false)) == NULL)
   1414 		return ENXIO;
   1415 
   1416 	dksc = &rs->sc_dksc;
   1417 	raidPtr = &rs->sc_r;
   1418 
   1419 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1420 	    (int) DISKPART(dev), (int) unit, cmd));
   1421 
   1422 	/* Must be initialized for these... */
   1423 	if (rf_must_be_initialized(rs, cmd))
   1424 		return ENXIO;
   1425 
   1426 	switch (cmd) {
   1427 		/* configure the system */
   1428 	case RAIDFRAME_CONFIGURE:
   1429 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1430 			return retcode;
   1431 		return rf_construct(rs, k_cfg);
   1432 
   1433 		/* shutdown the system */
   1434 	case RAIDFRAME_SHUTDOWN:
   1435 
   1436 		part = DISKPART(dev);
   1437 		pmask = (1 << part);
   1438 
   1439 		if ((retcode = raidlock(rs)) != 0)
   1440 			return retcode;
   1441 
   1442 		if (DK_BUSY(dksc, pmask) ||
   1443 		    raidPtr->recon_in_progress != 0 ||
   1444 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1445 		    raidPtr->copyback_in_progress != 0)
   1446 			retcode = EBUSY;
   1447 		else {
   1448 			/* detach and free on close */
   1449 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1450 			retcode = 0;
   1451 		}
   1452 
   1453 		raidunlock(rs);
   1454 
   1455 		return retcode;
   1456 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1457 		return rf_get_component_label(raidPtr, data);
   1458 
   1459 #if RF_DISABLED
   1460 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1461 		return rf_set_component_label(raidPtr, data);
   1462 #endif
   1463 
   1464 	case RAIDFRAME_INIT_LABELS:
   1465 		return rf_init_component_label(raidPtr, data);
   1466 
   1467 	case RAIDFRAME_SET_AUTOCONFIG:
   1468 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1469 		printf("raid%d: New autoconfig value is: %d\n",
   1470 		       raidPtr->raidid, d);
   1471 		*(int *) data = d;
   1472 		return retcode;
   1473 
   1474 	case RAIDFRAME_SET_ROOT:
   1475 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1476 		printf("raid%d: New rootpartition value is: %d\n",
   1477 		       raidPtr->raidid, d);
   1478 		*(int *) data = d;
   1479 		return retcode;
   1480 
   1481 		/* initialize all parity */
   1482 	case RAIDFRAME_REWRITEPARITY:
   1483 
   1484 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1485 			/* Parity for RAID 0 is trivially correct */
   1486 			raidPtr->parity_good = RF_RAID_CLEAN;
   1487 			return 0;
   1488 		}
   1489 
   1490 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1491 			/* Re-write is already in progress! */
   1492 			return EINVAL;
   1493 		}
   1494 
   1495 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1496 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1497 
   1498 	case RAIDFRAME_ADD_HOT_SPARE:
   1499 		sparePtr = (RF_SingleComponent_t *) data;
   1500 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1501 		return rf_add_hot_spare(raidPtr, &component);
   1502 
   1503 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1504 		return retcode;
   1505 
   1506 	case RAIDFRAME_DELETE_COMPONENT:
   1507 		componentPtr = (RF_SingleComponent_t *)data;
   1508 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1509 		return rf_delete_component(raidPtr, &component);
   1510 
   1511 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1512 		componentPtr = (RF_SingleComponent_t *)data;
   1513 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1514 		return rf_incorporate_hot_spare(raidPtr, &component);
   1515 
   1516 	case RAIDFRAME_REBUILD_IN_PLACE:
   1517 		return rf_rebuild_in_place(raidPtr, data);
   1518 
   1519 	case RAIDFRAME_GET_INFO:
   1520 		ucfgp = *(RF_DeviceConfig_t **)data;
   1521 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1522 		if (d_cfg == NULL)
   1523 			return ENOMEM;
   1524 		retcode = rf_get_info(raidPtr, d_cfg);
   1525 		if (retcode == 0) {
   1526 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1527 		}
   1528 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1529 		return retcode;
   1530 
   1531 	case RAIDFRAME_CHECK_PARITY:
   1532 		*(int *) data = raidPtr->parity_good;
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_STATUS:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		rf_paritymap_status(raidPtr->parity_map, data);
   1539 		return 0;
   1540 
   1541 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1542 		if (rf_paritymap_ineligible(raidPtr))
   1543 			return EINVAL;
   1544 		if (raidPtr->parity_map == NULL)
   1545 			return ENOENT; /* ??? */
   1546 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1547 			return EINVAL;
   1548 		return 0;
   1549 
   1550 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1551 		if (rf_paritymap_ineligible(raidPtr))
   1552 			return EINVAL;
   1553 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1554 		return 0;
   1555 
   1556 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1557 		if (rf_paritymap_ineligible(raidPtr))
   1558 			return EINVAL;
   1559 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1560 		/* XXX should errors be passed up? */
   1561 		return 0;
   1562 
   1563 	case RAIDFRAME_RESET_ACCTOTALS:
   1564 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1565 		return 0;
   1566 
   1567 	case RAIDFRAME_GET_ACCTOTALS:
   1568 		totals = (RF_AccTotals_t *) data;
   1569 		*totals = raidPtr->acc_totals;
   1570 		return 0;
   1571 
   1572 	case RAIDFRAME_KEEP_ACCTOTALS:
   1573 		raidPtr->keep_acc_totals = *(int *)data;
   1574 		return 0;
   1575 
   1576 	case RAIDFRAME_GET_SIZE:
   1577 		*(int *) data = raidPtr->totalSectors;
   1578 		return 0;
   1579 
   1580 	case RAIDFRAME_FAIL_DISK:
   1581 		return rf_fail_disk(raidPtr, data);
   1582 
   1583 		/* invoke a copyback operation after recon on whatever disk
   1584 		 * needs it, if any */
   1585 	case RAIDFRAME_COPYBACK:
   1586 
   1587 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1588 			/* This makes no sense on a RAID 0!! */
   1589 			return EINVAL;
   1590 		}
   1591 
   1592 		if (raidPtr->copyback_in_progress == 1) {
   1593 			/* Copyback is already in progress! */
   1594 			return EINVAL;
   1595 		}
   1596 
   1597 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1598 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1599 
   1600 		/* return the percentage completion of reconstruction */
   1601 	case RAIDFRAME_CHECK_RECON_STATUS:
   1602 		return rf_check_recon_status(raidPtr, data);
   1603 
   1604 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1605 		rf_check_recon_status_ext(raidPtr, data);
   1606 		return 0;
   1607 
   1608 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1609 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1610 			/* This makes no sense on a RAID 0, so tell the
   1611 			   user it's done. */
   1612 			*(int *) data = 100;
   1613 			return 0;
   1614 		}
   1615 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1616 			*(int *) data = 100 *
   1617 				raidPtr->parity_rewrite_stripes_done /
   1618 				raidPtr->Layout.numStripe;
   1619 		} else {
   1620 			*(int *) data = 100;
   1621 		}
   1622 		return 0;
   1623 
   1624 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1625 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1626 		return 0;
   1627 
   1628 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1629 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1630 			/* This makes no sense on a RAID 0 */
   1631 			*(int *) data = 100;
   1632 			return 0;
   1633 		}
   1634 		if (raidPtr->copyback_in_progress == 1) {
   1635 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1636 				raidPtr->Layout.numStripe;
   1637 		} else {
   1638 			*(int *) data = 100;
   1639 		}
   1640 		return 0;
   1641 
   1642 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1643 		rf_check_copyback_status_ext(raidPtr, data);
   1644 		return 0;
   1645 
   1646 	case RAIDFRAME_SET_LAST_UNIT:
   1647 		for (column = 0; column < raidPtr->numCol; column++)
   1648 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1649 				return EBUSY;
   1650 
   1651 		for (column = 0; column < raidPtr->numCol; column++) {
   1652 			clabel = raidget_component_label(raidPtr, column);
   1653 			clabel->last_unit = *(int *)data;
   1654 			raidflush_component_label(raidPtr, column);
   1655 		}
   1656 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1657 		return 0;
   1658 
   1659 		/* the sparetable daemon calls this to wait for the kernel to
   1660 		 * need a spare table. this ioctl does not return until a
   1661 		 * spare table is needed. XXX -- calling mpsleep here in the
   1662 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1663 		 * -- I should either compute the spare table in the kernel,
   1664 		 * or have a different -- XXX XXX -- interface (a different
   1665 		 * character device) for delivering the table     -- XXX */
   1666 #if RF_DISABLED
   1667 	case RAIDFRAME_SPARET_WAIT:
   1668 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1669 		while (!rf_sparet_wait_queue)
   1670 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1671 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1672 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1673 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1674 
   1675 		/* structure assignment */
   1676 		*((RF_SparetWait_t *) data) = *waitreq;
   1677 
   1678 		RF_Free(waitreq, sizeof(*waitreq));
   1679 		return 0;
   1680 
   1681 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1682 		 * code in it that will cause the dameon to exit */
   1683 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1684 		waitreq = RF_Malloc(sizeof(*waitreq));
   1685 		waitreq->fcol = -1;
   1686 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1687 		waitreq->next = rf_sparet_wait_queue;
   1688 		rf_sparet_wait_queue = waitreq;
   1689 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1690 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1691 		return 0;
   1692 
   1693 		/* used by the spare table daemon to deliver a spare table
   1694 		 * into the kernel */
   1695 	case RAIDFRAME_SEND_SPARET:
   1696 
   1697 		/* install the spare table */
   1698 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1699 
   1700 		/* respond to the requestor.  the return status of the spare
   1701 		 * table installation is passed in the "fcol" field */
   1702 		waitred = RF_Malloc(sizeof(*waitreq));
   1703 		waitreq->fcol = retcode;
   1704 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1705 		waitreq->next = rf_sparet_resp_queue;
   1706 		rf_sparet_resp_queue = waitreq;
   1707 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1708 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1709 
   1710 		return retcode;
   1711 #endif
   1712 	default:
   1713 		/*
   1714 		 * Don't bother trying to load compat modules
   1715 		 * if it is not our ioctl. This is more efficient
   1716 		 * and makes rump tests not depend on compat code
   1717 		 */
   1718 		if (IOCGROUP(cmd) != 'r')
   1719 			break;
   1720 #ifdef _LP64
   1721 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1722 			module_autoload("compat_netbsd32_raid",
   1723 			    MODULE_CLASS_EXEC);
   1724 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1725 			    (rs, cmd, data), enosys(), retcode);
   1726 			if (retcode != EPASSTHROUGH)
   1727 				return retcode;
   1728 		}
   1729 #endif
   1730 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1731 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1732 		    (rs, cmd, data), enosys(), retcode);
   1733 		if (retcode != EPASSTHROUGH)
   1734 			return retcode;
   1735 
   1736 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1737 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1738 		    (rs, cmd, data), enosys(), retcode);
   1739 		if (retcode != EPASSTHROUGH)
   1740 			return retcode;
   1741 		break; /* fall through to the os-specific code below */
   1742 
   1743 	}
   1744 
   1745 	if (!raidPtr->valid)
   1746 		return EINVAL;
   1747 
   1748 	/*
   1749 	 * Add support for "regular" device ioctls here.
   1750 	 */
   1751 
   1752 	switch (cmd) {
   1753 	case DIOCGCACHE:
   1754 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1755 		break;
   1756 
   1757 	case DIOCCACHESYNC:
   1758 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1759 		break;
   1760 
   1761 	default:
   1762 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1763 		break;
   1764 	}
   1765 
   1766 	return retcode;
   1767 
   1768 }
   1769 
   1770 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, initialize the dk and
   disk layers, allocate the buffer queue, mark the unit INITED, and
   discover wedges.  Called from rf_construct() after a successful
   rf_Configure(). */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: the unit stays unconfigured (INITED
		   is never set below). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1830 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response off the response queue.  Note the
	   response entry is a different allocation than the request we
	   posted above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon reports its status in the "fcol" field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
   1865 
   1866 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1867  * bp & passes it down.
   1868  * any calls originating in the kernel must use non-blocking I/O
   1869  * do some extra sanity checking to return "appropriate" error values for
   1870  * certain conditions (to make some standard utilities work)
   1871  *
   1872  * Formerly known as: rf_DoAccessKernel
   1873  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* the label update is done with the raid mutex dropped,
		 * then the mutex is retaken to adjust the counter */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* nothing to dispatch if the unit is not fully configured */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand any queued buffers to the dk(4) layer for dispatch */
	dk_start(dksc, NULL);
}
   1900 
   1901 static int
   1902 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1903 {
   1904 	RF_SectorCount_t num_blocks, pb, sum;
   1905 	RF_RaidAddr_t raid_addr;
   1906 	daddr_t blocknum;
   1907 	int     do_async;
   1908 	int rc;
   1909 
   1910 	rf_lock_mutex2(raidPtr->mutex);
   1911 	if (raidPtr->openings == 0) {
   1912 		rf_unlock_mutex2(raidPtr->mutex);
   1913 		return EAGAIN;
   1914 	}
   1915 	rf_unlock_mutex2(raidPtr->mutex);
   1916 
   1917 	blocknum = bp->b_rawblkno;
   1918 
   1919 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1920 		    (int) blocknum));
   1921 
   1922 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1923 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1924 
   1925 	/* *THIS* is where we adjust what block we're going to...
   1926 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1927 	raid_addr = blocknum;
   1928 
   1929 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1930 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1931 	sum = raid_addr + num_blocks + pb;
   1932 	if (1 || rf_debugKernelAccess) {
   1933 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1934 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1935 			    (int) pb, (int) bp->b_resid));
   1936 	}
   1937 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1938 	    || (sum < num_blocks) || (sum < pb)) {
   1939 		rc = ENOSPC;
   1940 		goto done;
   1941 	}
   1942 	/*
   1943 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1944 	 */
   1945 
   1946 	if (bp->b_bcount & raidPtr->sectorMask) {
   1947 		rc = ENOSPC;
   1948 		goto done;
   1949 	}
   1950 	db1_printf(("Calling DoAccess..\n"));
   1951 
   1952 
   1953 	rf_lock_mutex2(raidPtr->mutex);
   1954 	raidPtr->openings--;
   1955 	rf_unlock_mutex2(raidPtr->mutex);
   1956 
   1957 	/*
   1958 	 * Everything is async.
   1959 	 */
   1960 	do_async = 1;
   1961 
   1962 	/* don't ever condition on bp->b_flags & B_WRITE.
   1963 	 * always condition on B_READ instead */
   1964 
   1965 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1966 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1967 			 do_async, raid_addr, num_blocks,
   1968 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1969 
   1970 done:
   1971 	return rc;
   1972 }
   1973 
   1974 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1975 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately through the normal
		 * I/O completion callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp for the transfer; KernelWakeupFunc will run
		 * at completion time with req stashed in b_private */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2049 /* this is the callback function associated with a I/O invoked from
   2050    kernel code.
   2051  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the originating request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the elapsed physical I/O time in the trace record */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2118 
   2119 
   2120 /*
   2121  * initialize a buf structure for doing an I/O in the kernel.
   2122  */
   2123 static void
   2124 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2125        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2126        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2127 {
   2128 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2129 	bp->b_oflags = 0;
   2130 	bp->b_cflags = 0;
   2131 	bp->b_bcount = numSect << logBytesPerSector;
   2132 	bp->b_bufsize = bp->b_bcount;
   2133 	bp->b_error = 0;
   2134 	bp->b_dev = dev;
   2135 	bp->b_data = bf;
   2136 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2137 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2138 	if (bp->b_bcount == 0) {
   2139 		panic("bp->b_bcount is zero in InitBP!!");
   2140 	}
   2141 	bp->b_iodone = cbFunc;
   2142 	bp->b_private = cbArg;
   2143 }
   2144 
   2145 /*
   2146  * Wait interruptibly for an exclusive lock.
   2147  *
   2148  * XXX
   2149  * Several drivers do this; it should be abstracted and made MP-safe.
   2150  * (Hmm... where have we seen this warning before :->  GO )
   2151  */
   2152 static int
   2153 raidlock(struct raid_softc *rs)
   2154 {
   2155 	int     error;
   2156 
   2157 	error = 0;
   2158 	mutex_enter(&rs->sc_mutex);
   2159 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2160 		rs->sc_flags |= RAIDF_WANTED;
   2161 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2162 		if (error != 0)
   2163 			goto done;
   2164 	}
   2165 	rs->sc_flags |= RAIDF_LOCKED;
   2166 done:
   2167 	mutex_exit(&rs->sc_mutex);
   2168 	return error;
   2169 }
   2170 /*
   2171  * Unlock and wake up any waiters.
   2172  */
   2173 static void
   2174 raidunlock(struct raid_softc *rs)
   2175 {
   2176 
   2177 	mutex_enter(&rs->sc_mutex);
   2178 	rs->sc_flags &= ~RAIDF_LOCKED;
   2179 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2180 		rs->sc_flags &= ~RAIDF_WANTED;
   2181 		cv_broadcast(&rs->sc_cv);
   2182 	}
   2183 	mutex_exit(&rs->sc_mutex);
   2184 }
   2185 
   2186 
   2187 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2188 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2189 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2190 
   2191 static daddr_t
   2192 rf_component_info_offset(void)
   2193 {
   2194 
   2195 	return RF_COMPONENT_INFO_OFFSET;
   2196 }
   2197 
   2198 static daddr_t
   2199 rf_component_info_size(unsigned secsize)
   2200 {
   2201 	daddr_t info_size;
   2202 
   2203 	KASSERT(secsize);
   2204 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2205 		info_size = secsize;
   2206 	else
   2207 		info_size = RF_COMPONENT_INFO_SIZE;
   2208 
   2209 	return info_size;
   2210 }
   2211 
   2212 static daddr_t
   2213 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2214 {
   2215 	daddr_t map_offset;
   2216 
   2217 	KASSERT(raidPtr->bytesPerSector);
   2218 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2219 		map_offset = raidPtr->bytesPerSector;
   2220 	else
   2221 		map_offset = RF_COMPONENT_INFO_SIZE;
   2222 	map_offset += rf_component_info_offset();
   2223 
   2224 	return map_offset;
   2225 }
   2226 
   2227 static daddr_t
   2228 rf_parity_map_size(RF_Raid_t *raidPtr)
   2229 {
   2230 	daddr_t map_size;
   2231 
   2232 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2233 		map_size = raidPtr->bytesPerSector;
   2234 	else
   2235 		map_size = RF_PARITY_MAP_SIZE;
   2236 
   2237 	return map_size;
   2238 }
   2239 
   2240 int
   2241 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2242 {
   2243 	RF_ComponentLabel_t *clabel;
   2244 
   2245 	clabel = raidget_component_label(raidPtr, col);
   2246 	clabel->clean = RF_RAID_CLEAN;
   2247 	raidflush_component_label(raidPtr, col);
   2248 	return(0);
   2249 }
   2250 
   2251 
   2252 int
   2253 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2254 {
   2255 	RF_ComponentLabel_t *clabel;
   2256 
   2257 	clabel = raidget_component_label(raidPtr, col);
   2258 	clabel->clean = RF_RAID_DIRTY;
   2259 	raidflush_component_label(raidPtr, col);
   2260 	return(0);
   2261 }
   2262 
   2263 int
   2264 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2265 {
   2266 	KASSERT(raidPtr->bytesPerSector);
   2267 
   2268 	return raidread_component_label(raidPtr->bytesPerSector,
   2269 	    raidPtr->Disks[col].dev,
   2270 	    raidPtr->raid_cinfo[col].ci_vp,
   2271 	    &raidPtr->raid_cinfo[col].ci_label);
   2272 }
   2273 
   2274 RF_ComponentLabel_t *
   2275 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2276 {
   2277 	return &raidPtr->raid_cinfo[col].ci_label;
   2278 }
   2279 
   2280 int
   2281 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2282 {
   2283 	RF_ComponentLabel_t *label;
   2284 
   2285 	label = &raidPtr->raid_cinfo[col].ci_label;
   2286 	label->mod_counter = raidPtr->mod_counter;
   2287 #ifndef RF_NO_PARITY_MAP
   2288 	label->parity_map_modcount = label->mod_counter;
   2289 #endif
   2290 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2291 	    raidPtr->Disks[col].dev,
   2292 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2293 }
   2294 
   2295 /*
   2296  * Swap the label endianness.
   2297  *
   2298  * Everything in the component label is 4-byte-swapped except the version,
   2299  * which is kept in the byte-swapped version at all times, and indicates
   2300  * for the writer that a swap is necessary.
   2301  *
   2302  * For reads it is expected that out_label == clabel, but writes expect
   2303  * separate labels so only the re-swapped label is written out to disk,
   2304  * leaving the swapped-except-version internally.
   2305  *
   2306  * Only support swapping label version 2.
   2307  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* only byte-swapped version-2 labels are supported here */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* 4-byte-swap every word from serial_number up to (but not
	 * including) future_use2[42]; assumes the label is laid out as
	 * consecutive 32-bit fields between those members -- see the
	 * RF_ComponentLabel_t definition */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2325 
   2326 static int
   2327 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2328     RF_ComponentLabel_t *clabel)
   2329 {
   2330 	int error;
   2331 
   2332 	error = raidread_component_area(dev, b_vp, clabel,
   2333 	    sizeof(RF_ComponentLabel_t),
   2334 	    rf_component_info_offset(),
   2335 	    rf_component_info_size(secsize));
   2336 
   2337 	if (error == 0 &&
   2338 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2339 		rf_swap_label(clabel, clabel);
   2340 	}
   2341 
   2342 	return error;
   2343 }
   2344 
   2345 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* Synchronously read dsize bytes at byte offset `offset' on `dev'
	 * into a scratch buffer, then copy the first msize bytes out to
	 * `data'. */

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* fire off the read and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2382 
   2383 static int
   2384 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2385     RF_ComponentLabel_t *clabel)
   2386 {
   2387 	RF_ComponentLabel_t *clabel_write = clabel;
   2388 	RF_ComponentLabel_t lclabel;
   2389 	int error;
   2390 
   2391 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2392 		clabel_write = &lclabel;
   2393 		rf_swap_label(clabel, clabel_write);
   2394 	}
   2395 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2396 	    sizeof(RF_ComponentLabel_t),
   2397 	    rf_component_info_offset(),
   2398 	    rf_component_info_size(secsize), 0);
   2399 
   2400 	return error;
   2401 }
   2402 
   2403 /* ARGSUSED */
   2404 static int
   2405 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2406     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2407 {
   2408 	struct buf *bp;
   2409 	int error;
   2410 
   2411 	/* get a block of the appropriate size... */
   2412 	bp = geteblk((int)dsize);
   2413 	bp->b_dev = dev;
   2414 
   2415 	/* get our ducks in a row for the write */
   2416 	bp->b_blkno = offset / DEV_BSIZE;
   2417 	bp->b_bcount = dsize;
   2418 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2419  	bp->b_resid = dsize;
   2420 
   2421 	memset(bp->b_data, 0, dsize);
   2422 	memcpy(bp->b_data, data, msize);
   2423 
   2424 	bdev_strategy(bp);
   2425 	if (asyncp)
   2426 		return 0;
   2427 	error = biowait(bp);
   2428 	brelse(bp, 0);
   2429 	if (error) {
   2430 #if 1
   2431 		printf("Failed to write RAID component info!\n");
   2432 #endif
   2433 	}
   2434 
   2435 	return(error);
   2436 }
   2437 
   2438 void
   2439 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2440 {
   2441 	int c;
   2442 
   2443 	for (c = 0; c < raidPtr->numCol; c++) {
   2444 		/* Skip dead disks. */
   2445 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2446 			continue;
   2447 		/* XXXjld: what if an error occurs here? */
   2448 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2449 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2450 		    RF_PARITYMAP_NBYTE,
   2451 		    rf_parity_map_offset(raidPtr),
   2452 		    rf_parity_map_size(raidPtr), 0);
   2453 	}
   2454 }
   2455 
   2456 void
   2457 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2458 {
   2459 	struct rf_paritymap_ondisk tmp;
   2460 	int c,first;
   2461 
   2462 	first=1;
   2463 	for (c = 0; c < raidPtr->numCol; c++) {
   2464 		/* Skip dead disks. */
   2465 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2466 			continue;
   2467 		raidread_component_area(raidPtr->Disks[c].dev,
   2468 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2469 		    RF_PARITYMAP_NBYTE,
   2470 		    rf_parity_map_offset(raidPtr),
   2471 		    rf_parity_map_size(raidPtr));
   2472 		if (first) {
   2473 			memcpy(map, &tmp, sizeof(*map));
   2474 			first = 0;
   2475 		} else {
   2476 			rf_paritymap_merge(map, &tmp);
   2477 		}
   2478 	}
   2479 }
   2480 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the label mod counter and mark every usable component
	 * (and every in-use spare) dirty on disk. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* spares occupy the columns directly after the data columns */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for;
			 * NOTE(review): scol stays -1 if no column
			 * matches -- confirm that cannot happen for a
			 * used spare */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2540 
   2541 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	/* refresh and flush the label of every optimal component; on a
	 * final update of a clean array, also set the clean bit */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* same treatment for in-use spares, which sit directly after
	 * the data columns in the Disks array */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2619 
   2620 void
   2621 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2622 {
   2623 
   2624 	if (vp != NULL) {
   2625 		if (auto_configured == 1) {
   2626 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2627 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2628 			vput(vp);
   2629 
   2630 		} else {
   2631 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2632 		}
   2633 	}
   2634 }
   2635 
   2636 
   2637 void
   2638 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2639 {
   2640 	int r,c;
   2641 	struct vnode *vp;
   2642 	int acd;
   2643 
   2644 
   2645 	/* We take this opportunity to close the vnodes like we should.. */
   2646 
   2647 	for (c = 0; c < raidPtr->numCol; c++) {
   2648 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2649 		acd = raidPtr->Disks[c].auto_configured;
   2650 		rf_close_component(raidPtr, vp, acd);
   2651 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2652 		raidPtr->Disks[c].auto_configured = 0;
   2653 	}
   2654 
   2655 	for (r = 0; r < raidPtr->numSpare; r++) {
   2656 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2657 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2658 		rf_close_component(raidPtr, vp, acd);
   2659 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2660 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2661 	}
   2662 }
   2663 
   2664 
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: fail the indicated component (optionally
	 * reconstructing to a spare) and exit.  Consumes req. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON set => also reconstruct onto a spare */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2686 
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* Kernel thread body: rewrite all parity; on success mark the
	 * in-core parity state clean, then exit. */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2719 
   2720 
   2721 static void
   2722 rf_CopybackThread(RF_Raid_t *raidPtr)
   2723 {
   2724 	int s;
   2725 
   2726 	raidPtr->copyback_in_progress = 1;
   2727 	s = splbio();
   2728 	rf_CopybackReconstructedData(raidPtr);
   2729 	splx(s);
   2730 	raidPtr->copyback_in_progress = 0;
   2731 
   2732 	/* That's all... */
   2733 	kthread_exit(0);	/* does not return */
   2734 }
   2735 
   2736 
   2737 static void
   2738 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2739 {
   2740 	int s;
   2741 	RF_Raid_t *raidPtr;
   2742 
   2743 	s = splbio();
   2744 	raidPtr = req->raidPtr;
   2745 	raidPtr->recon_in_progress = 1;
   2746 	rf_ReconstructInPlace(raidPtr, req->col);
   2747 	RF_Free(req, sizeof(*req));
   2748 	raidPtr->recon_in_progress = 0;
   2749 	splx(s);
   2750 
   2751 	/* That's all... */
   2752 	kthread_exit(0);	/* does not return */
   2753 }
   2754 
   2755 static RF_AutoConfig_t *
   2756 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2757     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2758     unsigned secsize)
   2759 {
   2760 	int good_one = 0;
   2761 	RF_ComponentLabel_t *clabel;
   2762 	RF_AutoConfig_t *ac;
   2763 
   2764 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
   2765 
   2766 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2767 		/* Got the label.  Does it look reasonable? */
   2768 		if (rf_reasonable_label(clabel, numsecs) &&
   2769 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2770 #ifdef DEBUG
   2771 			printf("Component on: %s: %llu\n",
   2772 				cname, (unsigned long long)size);
   2773 			rf_print_component_label(clabel);
   2774 #endif
   2775 			/* if it's reasonable, add it, else ignore it. */
   2776 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2777 				M_WAITOK);
   2778 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2779 			ac->dev = dev;
   2780 			ac->vp = vp;
   2781 			ac->clabel = clabel;
   2782 			ac->next = ac_list;
   2783 			ac_list = ac;
   2784 			good_one = 1;
   2785 		}
   2786 	}
   2787 	if (!good_one) {
   2788 		/* cleanup */
   2789 		free(clabel, M_RAIDFRAME);
   2790 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2791 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2792 		vput(vp);
   2793 	}
   2794 	return ac_list;
   2795 }
   2796 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a linked list of RF_AutoConfig_t entries, one per
 * component label found (NULL if none).  The scan is made in two
 * passes: wedges ("dk") first, then everything else, so that a wedge
 * covering a whole disk is preferred over that disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges have no partitions, so no RAW_PART there */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges explicitly typed RAID qualify */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over vp (unlocked) */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* rf_get_component() takes over vp (unlocked) */
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* rf_get_component() takes over vp (unlocked) */
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3008 
   3009 int
   3010 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3011 {
   3012 
   3013 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3014 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3015 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3016 	    (clabel->clean == RF_RAID_CLEAN ||
   3017 	     clabel->clean == RF_RAID_DIRTY) &&
   3018 	    clabel->row >=0 &&
   3019 	    clabel->column >= 0 &&
   3020 	    clabel->num_rows > 0 &&
   3021 	    clabel->num_columns > 0 &&
   3022 	    clabel->row < clabel->num_rows &&
   3023 	    clabel->column < clabel->num_columns &&
   3024 	    clabel->blockSize > 0 &&
   3025 	    /*
   3026 	     * numBlocksHi may contain garbage, but it is ok since
   3027 	     * the type is unsigned.  If it is really garbage,
   3028 	     * rf_fix_old_label_size() will fix it.
   3029 	     */
   3030 	    rf_component_label_numblocks(clabel) > 0) {
   3031 		/*
   3032 		 * label looks reasonable enough...
   3033 		 * let's make sure it has no old garbage.
   3034 		 */
   3035 		if (numsecs)
   3036 			rf_fix_old_label_size(clabel, numsecs);
   3037 		return(1);
   3038 	}
   3039 	return(0);
   3040 }
   3041 
   3042 
   3043 /*
   3044  * For reasons yet unknown, some old component labels have garbage in
   3045  * the newer numBlocksHi region, and this causes lossage.  Since those
   3046  * disks will also have numsecs set to less than 32 bits of sectors,
   3047  * we can determine when this corruption has occurred, and fix it.
   3048  *
   3049  * The exact same problem, with the same unknown reason, happens to
   3050  * the partitionSizeHi member as well.
   3051  */
   3052 static void
   3053 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3054 {
   3055 
   3056 	if (numsecs < ((uint64_t)1 << 32)) {
   3057 		if (clabel->numBlocksHi) {
   3058 			printf("WARNING: total sectors < 32 bits, yet "
   3059 			       "numBlocksHi set\n"
   3060 			       "WARNING: resetting numBlocksHi to zero.\n");
   3061 			clabel->numBlocksHi = 0;
   3062 		}
   3063 
   3064 		if (clabel->partitionSizeHi) {
   3065 			printf("WARNING: total sectors < 32 bits, yet "
   3066 			       "partitionSizeHi set\n"
   3067 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3068 			clabel->partitionSizeHi = 0;
   3069 		}
   3070 	}
   3071 }
   3072 
   3073 
#ifdef DEBUG
/* Dump the interesting fields of a component label to the console. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, nblocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so an out-of-range value prints "*invalid*" */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3107 
   3108 static RF_ConfigSet_t *
   3109 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3110 {
   3111 	RF_AutoConfig_t *ac;
   3112 	RF_ConfigSet_t *config_sets;
   3113 	RF_ConfigSet_t *cset;
   3114 	RF_AutoConfig_t *ac_next;
   3115 
   3116 
   3117 	config_sets = NULL;
   3118 
   3119 	/* Go through the AutoConfig list, and figure out which components
   3120 	   belong to what sets.  */
   3121 	ac = ac_list;
   3122 	while(ac!=NULL) {
   3123 		/* we're going to putz with ac->next, so save it here
   3124 		   for use at the end of the loop */
   3125 		ac_next = ac->next;
   3126 
   3127 		if (config_sets == NULL) {
   3128 			/* will need at least this one... */
   3129 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3130 				       M_RAIDFRAME, M_WAITOK);
   3131 			/* this one is easy :) */
   3132 			config_sets->ac = ac;
   3133 			config_sets->next = NULL;
   3134 			config_sets->rootable = 0;
   3135 			ac->next = NULL;
   3136 		} else {
   3137 			/* which set does this component fit into? */
   3138 			cset = config_sets;
   3139 			while(cset!=NULL) {
   3140 				if (rf_does_it_fit(cset, ac)) {
   3141 					/* looks like it matches... */
   3142 					ac->next = cset->ac;
   3143 					cset->ac = ac;
   3144 					break;
   3145 				}
   3146 				cset = cset->next;
   3147 			}
   3148 			if (cset==NULL) {
   3149 				/* didn't find a match above... new set..*/
   3150 				cset = malloc(sizeof(RF_ConfigSet_t),
   3151 					       M_RAIDFRAME, M_WAITOK);
   3152 				cset->ac = ac;
   3153 				ac->next = NULL;
   3154 				cset->next = config_sets;
   3155 				cset->rootable = 0;
   3156 				config_sets = cset;
   3157 			}
   3158 		}
   3159 		ac = ac_next;
   3160 	}
   3161 
   3162 
   3163 	return(config_sets);
   3164 }
   3165 
   3166 static int
   3167 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3168 {
   3169 	RF_ComponentLabel_t *clabel1, *clabel2;
   3170 
   3171 	/* If this one matches the *first* one in the set, that's good
   3172 	   enough, since the other members of the set would have been
   3173 	   through here too... */
   3174 	/* note that we are not checking partitionSize here..
   3175 
   3176 	   Note that we are also not checking the mod_counters here.
   3177 	   If everything else matches except the mod_counter, that's
   3178 	   good enough for this test.  We will deal with the mod_counters
   3179 	   a little later in the autoconfiguration process.
   3180 
   3181 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3182 
   3183 	   The reason we don't check for this is that failed disks
   3184 	   will have lower modification counts.  If those disks are
   3185 	   not added to the set they used to belong to, then they will
   3186 	   form their own set, which may result in 2 different sets,
   3187 	   for example, competing to be configured at raid0, and
   3188 	   perhaps competing to be the root filesystem set.  If the
   3189 	   wrong ones get configured, or both attempt to become /,
   3190 	   weird behaviour and or serious lossage will occur.  Thus we
   3191 	   need to bring them into the fold here, and kick them out at
   3192 	   a later point.
   3193 
   3194 	*/
   3195 
   3196 	clabel1 = cset->ac->clabel;
   3197 	clabel2 = ac->clabel;
   3198 	if ((clabel1->version == clabel2->version) &&
   3199 	    (clabel1->serial_number == clabel2->serial_number) &&
   3200 	    (clabel1->num_rows == clabel2->num_rows) &&
   3201 	    (clabel1->num_columns == clabel2->num_columns) &&
   3202 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3203 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3204 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3205 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3206 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3207 	    (clabel1->blockSize == clabel2->blockSize) &&
   3208 	    rf_component_label_numblocks(clabel1) ==
   3209 	    rf_component_label_numblocks(clabel2) &&
   3210 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3211 	    (clabel1->root_partition == clabel2->root_partition) &&
   3212 	    (clabel1->last_unit == clabel2->last_unit) &&
   3213 	    (clabel1->config_order == clabel2->config_order)) {
   3214 		/* if it get's here, it almost *has* to be a match */
   3215 	} else {
   3216 		/* it's not consistent with somebody in the set..
   3217 		   punt */
   3218 		return(0);
   3219 	}
   3220 	/* all was fine.. it must fit... */
   3221 	return(1);
   3222 }
   3223 
/*
 * Decide whether config set 'cset' has enough live components to be
 * worth autoconfiguring.  Returns 1 if so, 0 otherwise.
 *
 * A component only counts as present for a column if its mod_counter
 * matches the highest mod_counter seen in the set (stale components
 * are treated as missing).  RAID 1 gets special pair-wise accounting;
 * other levels use a simple missing-component count.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (i.e. the maximum over all components in the set) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for an up-to-date component filling it */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished a mirror pair (this was
				   the odd component) and we didn't bail..
				   reset the even_pair_failed flag and go
				   on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3326 
   3327 static void
   3328 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3329 			RF_Raid_t *raidPtr)
   3330 {
   3331 	RF_ComponentLabel_t *clabel;
   3332 	int i;
   3333 
   3334 	clabel = ac->clabel;
   3335 
   3336 	/* 1. Fill in the common stuff */
   3337 	config->numCol = clabel->num_columns;
   3338 	config->numSpare = 0; /* XXX should this be set here? */
   3339 	config->sectPerSU = clabel->sectPerSU;
   3340 	config->SUsPerPU = clabel->SUsPerPU;
   3341 	config->SUsPerRU = clabel->SUsPerRU;
   3342 	config->parityConfig = clabel->parityConfig;
   3343 	/* XXX... */
   3344 	strcpy(config->diskQueueType,"fifo");
   3345 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3346 	config->layoutSpecificSize = 0; /* XXX ?? */
   3347 
   3348 	while(ac!=NULL) {
   3349 		/* row/col values will be in range due to the checks
   3350 		   in reasonable_label() */
   3351 		strcpy(config->devnames[0][ac->clabel->column],
   3352 		       ac->devname);
   3353 		ac = ac->next;
   3354 	}
   3355 
   3356 	for(i=0;i<RF_MAXDBGV;i++) {
   3357 		config->debugVars[i][0] = 0;
   3358 	}
   3359 }
   3360 
   3361 static int
   3362 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3363 {
   3364 	RF_ComponentLabel_t *clabel;
   3365 	int column;
   3366 	int sparecol;
   3367 
   3368 	raidPtr->autoconfigure = new_value;
   3369 
   3370 	for(column=0; column<raidPtr->numCol; column++) {
   3371 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3372 			clabel = raidget_component_label(raidPtr, column);
   3373 			clabel->autoconfigure = new_value;
   3374 			raidflush_component_label(raidPtr, column);
   3375 		}
   3376 	}
   3377 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3378 		sparecol = raidPtr->numCol + column;
   3379 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3380 			clabel = raidget_component_label(raidPtr, sparecol);
   3381 			clabel->autoconfigure = new_value;
   3382 			raidflush_component_label(raidPtr, sparecol);
   3383 		}
   3384 	}
   3385 	return(new_value);
   3386 }
   3387 
   3388 static int
   3389 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3390 {
   3391 	RF_ComponentLabel_t *clabel;
   3392 	int column;
   3393 	int sparecol;
   3394 
   3395 	raidPtr->root_partition = new_value;
   3396 	for(column=0; column<raidPtr->numCol; column++) {
   3397 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3398 			clabel = raidget_component_label(raidPtr, column);
   3399 			clabel->root_partition = new_value;
   3400 			raidflush_component_label(raidPtr, column);
   3401 		}
   3402 	}
   3403 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3404 		sparecol = raidPtr->numCol + column;
   3405 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3406 			clabel = raidget_component_label(raidPtr, sparecol);
   3407 			clabel->root_partition = new_value;
   3408 			raidflush_component_label(raidPtr, sparecol);
   3409 		}
   3410 	}
   3411 	return(new_value);
   3412 }
   3413 
   3414 static void
   3415 rf_release_all_vps(RF_ConfigSet_t *cset)
   3416 {
   3417 	RF_AutoConfig_t *ac;
   3418 
   3419 	ac = cset->ac;
   3420 	while(ac!=NULL) {
   3421 		/* Close the vp, and give it back */
   3422 		if (ac->vp) {
   3423 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3424 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3425 			vput(ac->vp);
   3426 			ac->vp = NULL;
   3427 		}
   3428 		ac = ac->next;
   3429 	}
   3430 }
   3431 
   3432 
   3433 static void
   3434 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3435 {
   3436 	RF_AutoConfig_t *ac;
   3437 	RF_AutoConfig_t *next_ac;
   3438 
   3439 	ac = cset->ac;
   3440 	while(ac!=NULL) {
   3441 		next_ac = ac->next;
   3442 		/* nuke the label */
   3443 		free(ac->clabel, M_RAIDFRAME);
   3444 		/* cleanup the config structure */
   3445 		free(ac, M_RAIDFRAME);
   3446 		/* "next.." */
   3447 		ac = next_ac;
   3448 	}
   3449 	/* and, finally, nuke the config set */
   3450 	free(cset, M_RAIDFRAME);
   3451 }
   3452 
   3453 
/*
 * Initialize a component label from the current state of the RAID
 * set: serial/mod counters, geometry, layout parameters and the
 * autoconfig/root settings.  Per-component fields (row/column) are
 * filled in by the caller.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3487 
/*
 * Autoconfigure a validated config set: pick a raid unit (preferring
 * the unit the set was last configured on), build an RF_Config_t from
 * the component labels, and configure the set.  Returns the softc on
 * success, NULL on failure (the softc is released via raidput()).
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk forward from last_unit until we find an unconfigured unit */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3559 
/*
 * Initialize a per-raid-set pool.  The wait-channel name is formatted
 * into w_chan (caller-supplied, at least RF_MAX_POOLNAMELEN bytes),
 * the pool is primed with xmin items and capped at xmax.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3572 
   3573 
   3574 /*
   3575  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3576  * to see if there is IO pending and if that IO could possibly be done
   3577  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3578  * otherwise.
   3579  *
   3580  */
   3581 int
   3582 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3583 {
   3584 	struct raid_softc *rs;
   3585 	struct dk_softc *dksc;
   3586 
   3587 	rs = raidPtr->softc;
   3588 	dksc = &rs->sc_dksc;
   3589 
   3590 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3591 		return 1;
   3592 
   3593 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3594 		/* there is work to do */
   3595 		return 0;
   3596 	}
   3597 	/* default is nothing to do */
   3598 	return 1;
   3599 }
   3600 
   3601 int
   3602 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3603 {
   3604 	uint64_t numsecs;
   3605 	unsigned secsize;
   3606 	int error;
   3607 
   3608 	error = getdisksize(vp, &numsecs, &secsize);
   3609 	if (error == 0) {
   3610 		diskPtr->blockSize = secsize;
   3611 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3612 		diskPtr->partitionSize = numsecs;
   3613 		return 0;
   3614 	}
   3615 	return error;
   3616 }
   3617 
/* autoconf match: the raid pseudo-device always attaches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3623 
/* autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3628 
   3629 
   3630 static int
   3631 raid_detach(device_t self, int flags)
   3632 {
   3633 	int error;
   3634 	struct raid_softc *rs = raidsoftc(self);
   3635 
   3636 	if (rs == NULL)
   3637 		return ENXIO;
   3638 
   3639 	if ((error = raidlock(rs)) != 0)
   3640 		return error;
   3641 
   3642 	error = raid_detach_unlocked(rs);
   3643 
   3644 	raidunlock(rs);
   3645 
   3646 	/* XXX raid can be referenced here */
   3647 
   3648 	if (error)
   3649 		return error;
   3650 
   3651 	/* Free the softc */
   3652 	raidput(rs);
   3653 
   3654 	return 0;
   3655 }
   3656 
/*
 * Derive a synthetic disk geometry for the RAID set from its layout
 * and publish it via disk_set_info().  The track/sector split is a
 * fabrication; only dg_secperunit and dg_secsize are meaningful.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3672 
   3673 /*
   3674  * Get cache info for all the components (including spares).
   3675  * Returns intersection of all the cache flags of all disks, or first
   3676  * error if any encountered.
   3677  * XXXfua feature flags can change as spares are added - lock down somehow
   3678  */
   3679 static int
   3680 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3681 {
   3682 	int c;
   3683 	int error;
   3684 	int dkwhole = 0, dkpart;
   3685 
   3686 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3687 		/*
   3688 		 * Check any non-dead disk, even when currently being
   3689 		 * reconstructed.
   3690 		 */
   3691 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3692 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3693 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3694 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3695 			if (error) {
   3696 				if (error != ENODEV) {
   3697 					printf("raid%d: get cache for component %s failed\n",
   3698 					    raidPtr->raidid,
   3699 					    raidPtr->Disks[c].devname);
   3700 				}
   3701 
   3702 				return error;
   3703 			}
   3704 
   3705 			if (c == 0)
   3706 				dkwhole = dkpart;
   3707 			else
   3708 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3709 		}
   3710 	}
   3711 
   3712 	*data = dkwhole;
   3713 
   3714 	return 0;
   3715 }
   3716 
   3717 /*
   3718  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3719  * We end up returning whatever error was returned by the first cache flush
   3720  * that fails.
   3721  */
   3722 
   3723 static int
   3724 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3725 {
   3726 	int e = 0;
   3727 	for (int i = 0; i < 5; i++) {
   3728 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3729 		    &force, FWRITE, NOCRED);
   3730 		if (!e || e == ENODEV)
   3731 			return e;
   3732 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3733 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3734 	}
   3735 	return e;
   3736 }
   3737 
   3738 int
   3739 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3740 {
   3741 	int c, error;
   3742 
   3743 	error = 0;
   3744 	for (c = 0; c < raidPtr->numCol; c++) {
   3745 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3746 			int e = rf_sync_component_cache(raidPtr, c, force);
   3747 			if (e && !error)
   3748 				error = e;
   3749 		}
   3750 	}
   3751 
   3752 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3753 		int sparecol = raidPtr->numCol + c;
   3754 		/* Need to ensure that the reconstruct actually completed! */
   3755 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3756 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3757 			    force);
   3758 			if (e && !error)
   3759 				error = e;
   3760 		}
   3761 	}
   3762 	return error;
   3763 }
   3764 
   3765 /* Fill in info with the current status */
   3766 void
   3767 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3768 {
   3769 
   3770 	if (raidPtr->status != rf_rs_reconstructing) {
   3771 		info->total = 100;
   3772 		info->completed = 100;
   3773 	} else {
   3774 		info->total = raidPtr->reconControl->numRUsTotal;
   3775 		info->completed = raidPtr->reconControl->numRUsComplete;
   3776 	}
   3777 	info->remaining = info->total - info->completed;
   3778 }
   3779 
   3780 /* Fill in info with the current status */
   3781 void
   3782 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3783 {
   3784 
   3785 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3786 		info->total = raidPtr->Layout.numStripe;
   3787 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3788 	} else {
   3789 		info->completed = 100;
   3790 		info->total = 100;
   3791 	}
   3792 	info->remaining = info->total - info->completed;
   3793 }
   3794 
   3795 /* Fill in info with the current status */
   3796 void
   3797 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3798 {
   3799 
   3800 	if (raidPtr->copyback_in_progress == 1) {
   3801 		info->total = raidPtr->Layout.numStripe;
   3802 		info->completed = raidPtr->copyback_stripes_done;
   3803 		info->remaining = info->total - info->completed;
   3804 	} else {
   3805 		info->remaining = 0;
   3806 		info->completed = 100;
   3807 		info->total = 100;
   3808 	}
   3809 }
   3810 
   3811 /* Fill in config with the current info */
   3812 int
   3813 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3814 {
   3815 	int	d, i, j;
   3816 
   3817 	if (!raidPtr->valid)
   3818 		return ENODEV;
   3819 	config->cols = raidPtr->numCol;
   3820 	config->ndevs = raidPtr->numCol;
   3821 	if (config->ndevs >= RF_MAX_DISKS)
   3822 		return ENOMEM;
   3823 	config->nspares = raidPtr->numSpare;
   3824 	if (config->nspares >= RF_MAX_DISKS)
   3825 		return ENOMEM;
   3826 	config->maxqdepth = raidPtr->maxQueueDepth;
   3827 	d = 0;
   3828 	for (j = 0; j < config->cols; j++) {
   3829 		config->devs[d] = raidPtr->Disks[j];
   3830 		d++;
   3831 	}
   3832 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3833 		config->spares[i] = raidPtr->Disks[j];
   3834 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3835 			/* XXX: raidctl(8) expects to see this as a used spare */
   3836 			config->spares[i].status = rf_ds_used_spare;
   3837 		}
   3838 	}
   3839 	return 0;
   3840 }
   3841 
   3842 int
   3843 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3844 {
   3845 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3846 	RF_ComponentLabel_t *raid_clabel;
   3847 	int column = clabel->column;
   3848 
   3849 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3850 		return EINVAL;
   3851 	raid_clabel = raidget_component_label(raidPtr, column);
   3852 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3853 	/* Fix-up for userland. */
   3854 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3855 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3856 
   3857 	return 0;
   3858 }
   3859 
   3860 /*
   3861  * Module interface
   3862  */
   3863 
   3864 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3865 
   3866 #ifdef _MODULE
   3867 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3868 #endif
   3869 
   3870 static int raid_modcmd(modcmd_t, void *);
   3871 static int raid_modcmd_init(void);
   3872 static int raid_modcmd_fini(void);
   3873 
   3874 static int
   3875 raid_modcmd(modcmd_t cmd, void *data)
   3876 {
   3877 	int error;
   3878 
   3879 	error = 0;
   3880 	switch (cmd) {
   3881 	case MODULE_CMD_INIT:
   3882 		error = raid_modcmd_init();
   3883 		break;
   3884 	case MODULE_CMD_FINI:
   3885 		error = raid_modcmd_fini();
   3886 		break;
   3887 	default:
   3888 		error = ENOTTY;
   3889 		break;
   3890 	}
   3891 	return error;
   3892 }
   3893 
/*
 * Module initialization: register the raid devsw, cfdriver and cfattach,
 * boot the RAIDframe core, and schedule auto-configuration of RAID sets.
 * On any registration failure, everything registered so far is rolled
 * back (in reverse order) before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	/* Global driver lock; held across the whole registration sequence. */
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to pick the major numbers dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in); tolerate it. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw registration. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver (module case) and devsw registration. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	/* Auto-configuration has not run yet; rf_autoconfig will set this. */
	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here: all failing paths above returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3964 
/*
 * Module teardown: undo everything raid_modcmd_init() registered, in
 * reverse order (cfattach, cfdriver, devsw), then shut down the
 * RAIDframe core.  Refuses with EBUSY while any raid device exists.
 * If a later detach step fails, the earlier steps are re-registered so
 * the module is left in a consistent, usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Re-attach the cfattach we just detached. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach cfdriver (module case) and cfattach. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All registrations undone; shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4014