Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.396
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.396 2021/07/23 02:35:14 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.396 2021/07/23 02:35:14 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
    218 const struct bdevsw raid_bdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_strategy = raidstrategy,
    222 	.d_ioctl = raidioctl,
    223 	.d_dump = raiddump,
    224 	.d_psize = raidsize,
    225 	.d_discard = nodiscard,
    226 	.d_flag = D_DISK
    227 };
    228 
    229 const struct cdevsw raid_cdevsw = {
    230 	.d_open = raidopen,
    231 	.d_close = raidclose,
    232 	.d_read = raidread,
    233 	.d_write = raidwrite,
    234 	.d_ioctl = raidioctl,
    235 	.d_stop = nostop,
    236 	.d_tty = notty,
    237 	.d_poll = nopoll,
    238 	.d_mmap = nommap,
    239 	.d_kqfilter = nokqfilter,
    240 	.d_discard = nodiscard,
    241 	.d_flag = D_DISK
    242 };
    243 
    244 static struct dkdriver rf_dkdriver = {
    245 	.d_open = raidopen,
    246 	.d_close = raidclose,
    247 	.d_strategy = raidstrategy,
    248 	.d_diskstart = raid_diskstart,
    249 	.d_dumpblocks = raid_dumpblocks,
    250 	.d_lastclose = raid_lastclose,
    251 	.d_minphys = minphys
    252 };
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /* Internal representation of a rf_recon_req */
    263 struct rf_recon_req_internal {
    264 	RF_RowCol_t col;
    265 	RF_ReconReqFlags_t flags;
    266 	void   *raidPtr;
    267 };
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
    278  * Even it if is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 static void rf_ReconThread(struct rf_recon_req_internal *);
    304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 static int rf_autoconfig(device_t);
    308 static void rf_buildroothack(RF_ConfigSet_t *);
    309 
    310 static RF_AutoConfig_t *rf_find_raid_components(void);
    311 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    313 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    314 static int rf_set_autoconfig(RF_Raid_t *, int);
    315 static int rf_set_rootpartition(RF_Raid_t *, int);
    316 static void rf_release_all_vps(RF_ConfigSet_t *);
    317 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    318 static int rf_have_enough_components(RF_ConfigSet_t *);
    319 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    320 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    321 
    322 /*
    323  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    324  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    325  * in the kernel config file.
    326  */
    327 #ifdef RAID_AUTOCONFIG
    328 int raidautoconfig = 1;
    329 #else
    330 int raidautoconfig = 0;
    331 #endif
    332 static bool raidautoconfigdone = false;
    333 
    334 struct pool rf_alloclist_pool;   /* AllocList */
    335 
    336 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    337 static kmutex_t raid_lock;
    338 
    339 static struct raid_softc *
    340 raidcreate(int unit) {
    341 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    342 	sc->sc_unit = unit;
    343 	cv_init(&sc->sc_cv, "raidunit");
    344 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    345 	return sc;
    346 }
    347 
    348 static void
    349 raiddestroy(struct raid_softc *sc) {
    350 	cv_destroy(&sc->sc_cv);
    351 	mutex_destroy(&sc->sc_mutex);
    352 	kmem_free(sc, sizeof(*sc));
    353 }
    354 
    355 static struct raid_softc *
    356 raidget(int unit, bool create) {
    357 	struct raid_softc *sc;
    358 	if (unit < 0) {
    359 #ifdef DIAGNOSTIC
    360 		panic("%s: unit %d!", __func__, unit);
    361 #endif
    362 		return NULL;
    363 	}
    364 	mutex_enter(&raid_lock);
    365 	LIST_FOREACH(sc, &raids, sc_link) {
    366 		if (sc->sc_unit == unit) {
    367 			mutex_exit(&raid_lock);
    368 			return sc;
    369 		}
    370 	}
    371 	mutex_exit(&raid_lock);
    372 	if (!create)
    373 		return NULL;
    374 	sc = raidcreate(unit);
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
/*
 * Legacy pseudo-device attach hook (num = instance count from the
 * kernel config).  Intentionally empty: see the comment below.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
    399 static int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return 0;
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
    442 RF_Raid_t *
    443 rf_get_raid(struct raid_softc *rs) {
    444 	return &rs->sc_r;
    445 }
    446 
    447 int
    448 rf_get_unit(const struct raid_softc *rs) {
    449 	return rs->sc_unit;
    450 }
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname;
    455 	size_t len;
    456 
    457 	/* if bdv is NULL, the set can't contain it. exit early. */
    458 	if (bdv == NULL)
    459 		return 0;
    460 
    461 	bootname = device_xname(bdv);
    462 	len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
/*
 * Walk the autoconfigured sets: configure every eligible one, then --
 * unless the user hardwired a root via `rootspec' -- try to point
 * booted_device at a rootable RAID set (its 'a' wedge/partition).
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				/* remember the last rootable set seen */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		/* wedges in use: look the candidate up by wedge name */
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			/* override root with the RAID set */
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* narrow the candidates to sets containing the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    627 
    628 static int
    629 raidsize(dev_t dev)
    630 {
    631 	struct raid_softc *rs;
    632 	struct dk_softc *dksc;
    633 	unsigned int unit;
    634 
    635 	unit = raidunit(dev);
    636 	if ((rs = raidget(unit, false)) == NULL)
    637 		return -1;
    638 	dksc = &rs->sc_dksc;
    639 
    640 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    641 		return -1;
    642 
    643 	return dk_size(dksc, dev);
    644 }
    645 
    646 static int
    647 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    648 {
    649 	unsigned int unit;
    650 	struct raid_softc *rs;
    651 	struct dk_softc *dksc;
    652 
    653 	unit = raidunit(dev);
    654 	if ((rs = raidget(unit, false)) == NULL)
    655 		return ENXIO;
    656 	dksc = &rs->sc_dksc;
    657 
    658 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    659 		return ENODEV;
    660 
    661         /*
    662            Note that blkno is relative to this particular partition.
    663            By adding adding RF_PROTECTED_SECTORS, we get a value that
    664 	   is relative to the partition used for the underlying component.
    665         */
    666 	blkno += RF_PROTECTED_SECTORS;
    667 
    668 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    669 }
    670 
/*
 * Crash-dump `nblk' blocks from `va' at `blkno' directly to a live
 * component of this set.  Only RAID 1 layouts (one data, one parity
 * column) are supported.  Returns 0 on success, or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump straight through the component's block device */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    776 
    777 /* ARGSUSED */
    778 static int
    779 raidopen(dev_t dev, int flags, int fmt,
    780     struct lwp *l)
    781 {
    782 	int     unit = raidunit(dev);
    783 	struct raid_softc *rs;
    784 	struct dk_softc *dksc;
    785 	int     error = 0;
    786 	int     part, pmask;
    787 
    788 	if ((rs = raidget(unit, true)) == NULL)
    789 		return ENXIO;
    790 	if ((error = raidlock(rs)) != 0)
    791 		return error;
    792 
    793 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    794 		error = EBUSY;
    795 		goto bad;
    796 	}
    797 
    798 	dksc = &rs->sc_dksc;
    799 
    800 	part = DISKPART(dev);
    801 	pmask = (1 << part);
    802 
    803 	if (!DK_BUSY(dksc, pmask) &&
    804 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    805 		/* First one... mark things as dirty... Note that we *MUST*
    806 		 have done a configure before this.  I DO NOT WANT TO BE
    807 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    808 		 THAT THEY BELONG TOGETHER!!!!! */
    809 		/* XXX should check to see if we're only open for reading
    810 		   here... If so, we needn't do this, but then need some
    811 		   other way of keeping track of what's happened.. */
    812 
    813 		rf_markalldirty(&rs->sc_r);
    814 	}
    815 
    816 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    817 		error = dk_open(dksc, dev, flags, fmt, l);
    818 
    819 bad:
    820 	raidunlock(rs);
    821 
    822 	return error;
    823 
    824 
    825 }
    826 
    827 static int
    828 raid_lastclose(device_t self)
    829 {
    830 	struct raid_softc *rs = raidsoftc(self);
    831 
    832 	/* Last one... device is not unconfigured yet.
    833 	   Device shutdown has taken care of setting the
    834 	   clean bits if RAIDF_INITED is not set
    835 	   mark things as clean... */
    836 
    837 	rf_update_component_labels(&rs->sc_r,
    838 	    RF_FINAL_COMPONENT_UPDATE);
    839 
    840 	/* pass to unlocked code */
    841 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    842 		rs->sc_flags |= RAIDF_DETACH;
    843 
    844 	return 0;
    845 }
    846 
/*
 * Close entry point.  Hands the close to the disk framework, and on
 * the last close of a unit marked for shutdown either detaches the
 * pseudo-device (configured set) or frees the softc (never-configured
 * set).  The detach/free happens after dropping the unit lock.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() sets RAIDF_DETACH on final close */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;

}
    886 
/*
 * Wake the RAIDframe I/O thread: signal the iodone condition variable
 * (under its lock) so queued work is (re)examined.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    894 
    895 static void
    896 raidstrategy(struct buf *bp)
    897 {
    898 	unsigned int unit;
    899 	struct raid_softc *rs;
    900 	struct dk_softc *dksc;
    901 	RF_Raid_t *raidPtr;
    902 
    903 	unit = raidunit(bp->b_dev);
    904 	if ((rs = raidget(unit, false)) == NULL) {
    905 		bp->b_error = ENXIO;
    906 		goto fail;
    907 	}
    908 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    909 		bp->b_error = ENXIO;
    910 		goto fail;
    911 	}
    912 	dksc = &rs->sc_dksc;
    913 	raidPtr = &rs->sc_r;
    914 
    915 	/* Queue IO only */
    916 	if (dk_strategy_defer(dksc, bp))
    917 		goto done;
    918 
    919 	/* schedule the IO to happen at the next convenient time */
    920 	raid_wakeup(raidPtr);
    921 
    922 done:
    923 	return;
    924 
    925 fail:
    926 	bp->b_resid = bp->b_bcount;
    927 	biodone(bp);
    928 }
    929 
    930 static int
    931 raid_diskstart(device_t dev, struct buf *bp)
    932 {
    933 	struct raid_softc *rs = raidsoftc(dev);
    934 	RF_Raid_t *raidPtr;
    935 
    936 	raidPtr = &rs->sc_r;
    937 	if (!raidPtr->valid) {
    938 		db1_printf(("raid is not valid..\n"));
    939 		return ENODEV;
    940 	}
    941 
    942 	/* XXX */
    943 	bp->b_resid = 0;
    944 
    945 	return raiddoaccess(raidPtr, bp);
    946 }
    947 
/*
 * Completion callback for a buffer RAIDframe has finished: report
 * completion to the dk(4) layer, return the I/O slot ("opening") to
 * the pool, and wake the RAIDframe thread so it can start more work.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* openings is protected by raidPtr->mutex */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    966 
    967 /* ARGSUSED */
    968 static int
    969 raidread(dev_t dev, struct uio *uio, int flags)
    970 {
    971 	int     unit = raidunit(dev);
    972 	struct raid_softc *rs;
    973 
    974 	if ((rs = raidget(unit, false)) == NULL)
    975 		return ENXIO;
    976 
    977 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    978 		return ENXIO;
    979 
    980 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
    981 
    982 }
    983 
    984 /* ARGSUSED */
    985 static int
    986 raidwrite(dev_t dev, struct uio *uio, int flags)
    987 {
    988 	int     unit = raidunit(dev);
    989 	struct raid_softc *rs;
    990 
    991 	if ((rs = raidget(unit, false)) == NULL)
    992 		return ENXIO;
    993 
    994 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    995 		return ENXIO;
    996 
    997 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
    998 
    999 }
   1000 
/*
 * Tear down a configured RAID set.  "unlocked" means this routine
 * takes no unit lock itself; the caller is responsible for any
 * serialization.  Fails with EBUSY while the device is open or any
 * background operation (recon, parity rewrite, copyback) is running.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to shut down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* The pending shutdown request is being honoured now. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1038 
/*
 * Return true when `cmd' requires a configured (RAIDF_INITED) set but
 * this unit is not configured, i.e. the ioctl must be rejected.
 * Commands not listed here are allowed regardless of state.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
   1076 
/*
 * Administratively fail the component named in `rr' and kick off a
 * reconstruction thread.  Returns EINVAL for RAID 0 sets, bad column
 * numbers, or states in which failing a disk is unsafe (recon already
 * running, another component already failed, or the disk is spared).
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint ownership presumably passes to the recon
	   thread; on thread-creation failure it would appear to leak —
	   confirm RF_CREATE_THREAD semantics. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1125 
   1126 static int
   1127 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1128 {
   1129 	/* allocate a buffer for the layout-specific data, and copy it in */
   1130 	if (k_cfg->layoutSpecificSize == 0)
   1131 		return 0;
   1132 
   1133 	if (k_cfg->layoutSpecificSize > 10000) {
   1134 	    /* sanity check */
   1135 	    return EINVAL;
   1136 	}
   1137 
   1138 	u_char *specific_buf;
   1139 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1140 	if (specific_buf == NULL)
   1141 		return ENOMEM;
   1142 
   1143 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1144 	    k_cfg->layoutSpecificSize);
   1145 	if (retcode) {
   1146 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1147 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1148 		return retcode;
   1149 	}
   1150 
   1151 	k_cfg->layoutSpecific = specific_buf;
   1152 	return 0;
   1153 }
   1154 
   1155 static int
   1156 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1157 {
   1158 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1159 
   1160 	if (rs->sc_r.valid) {
   1161 		/* There is a valid RAID set running on this unit! */
   1162 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1163 		return EINVAL;
   1164 	}
   1165 
   1166 	/* copy-in the configuration information */
   1167 	/* data points to a pointer to the configuration structure */
   1168 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1169 	if (*k_cfg == NULL) {
   1170 		return ENOMEM;
   1171 	}
   1172 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1173 	if (retcode == 0)
   1174 		return 0;
   1175 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1176 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1177 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1178 	return retcode;
   1179 }
   1180 
/*
 * Configure a RAID set from the kernel copy of the configuration in
 * k_cfg (as produced by rf_getConfiguration()).  Consumes k_cfg and
 * its layout-specific buffer in all cases.  On failure RAIDF_SHUTDOWN
 * is set so the unit is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1233 
   1234 #if RF_DISABLED
/*
 * (Disabled code.)  Install a user-supplied component label for one
 * column.  Currently only copies the label in and flushes it; most
 * fields are not validated — see the XXX notes below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1271 #endif
   1272 
/*
 * Initialize the component labels of every live column.  Only the
 * serial number is taken from the user-supplied label; everything
 * else is regenerated from the running configuration.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* Skip failed/absent components. */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
   1305 
/*
 * Rebuild a component "in place" (onto the same disk), by spawning a
 * reconstruct-in-place thread.  Rejects RAID 0 sets, a bad column,
 * and states where a rebuild is unsafe (recon running, another
 * component failed, or the target already reconstructing/spared).
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Copy the request so we don't rely on the user's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1373 
   1374 static int
   1375 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1376 {
   1377 	/*
   1378 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1379 	 * so tell the user it's done.
   1380 	 */
   1381 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1382 	    raidPtr->status != rf_rs_reconstructing) {
   1383 		*data = 100;
   1384 		return 0;
   1385 	}
   1386 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1387 		*data = 0;
   1388 		return 0;
   1389 	}
   1390 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1391 	    / raidPtr->reconControl->numRUsTotal);
   1392 	return 0;
   1393 }
   1394 
   1395 static int
   1396 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1397 {
   1398 	int     unit = raidunit(dev);
   1399 	int     part, pmask;
   1400 	struct raid_softc *rs;
   1401 	struct dk_softc *dksc;
   1402 	RF_Config_t *k_cfg;
   1403 	RF_Raid_t *raidPtr;
   1404 	RF_AccTotals_t *totals;
   1405 	RF_SingleComponent_t component;
   1406 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1407 	int retcode = 0;
   1408 	int column;
   1409 	RF_ComponentLabel_t *clabel;
   1410 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1411 	int d;
   1412 
   1413 	if ((rs = raidget(unit, false)) == NULL)
   1414 		return ENXIO;
   1415 
   1416 	dksc = &rs->sc_dksc;
   1417 	raidPtr = &rs->sc_r;
   1418 
   1419 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1420 	    (int) DISKPART(dev), (int) unit, cmd));
   1421 
   1422 	/* Must be initialized for these... */
   1423 	if (rf_must_be_initialized(rs, cmd))
   1424 		return ENXIO;
   1425 
   1426 	switch (cmd) {
   1427 		/* configure the system */
   1428 	case RAIDFRAME_CONFIGURE:
   1429 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1430 			return retcode;
   1431 		return rf_construct(rs, k_cfg);
   1432 
   1433 		/* shutdown the system */
   1434 	case RAIDFRAME_SHUTDOWN:
   1435 
   1436 		part = DISKPART(dev);
   1437 		pmask = (1 << part);
   1438 
   1439 		if ((retcode = raidlock(rs)) != 0)
   1440 			return retcode;
   1441 
   1442 		if (DK_BUSY(dksc, pmask) ||
   1443 		    raidPtr->recon_in_progress != 0 ||
   1444 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1445 		    raidPtr->copyback_in_progress != 0)
   1446 			retcode = EBUSY;
   1447 		else {
   1448 			/* detach and free on close */
   1449 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1450 			retcode = 0;
   1451 		}
   1452 
   1453 		raidunlock(rs);
   1454 
   1455 		return retcode;
   1456 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1457 		return rf_get_component_label(raidPtr, data);
   1458 
   1459 #if RF_DISABLED
   1460 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1461 		return rf_set_component_label(raidPtr, data);
   1462 #endif
   1463 
   1464 	case RAIDFRAME_INIT_LABELS:
   1465 		return rf_init_component_label(raidPtr, data);
   1466 
   1467 	case RAIDFRAME_SET_AUTOCONFIG:
   1468 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1469 		printf("raid%d: New autoconfig value is: %d\n",
   1470 		       raidPtr->raidid, d);
   1471 		*(int *) data = d;
   1472 		return retcode;
   1473 
   1474 	case RAIDFRAME_SET_ROOT:
   1475 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1476 		printf("raid%d: New rootpartition value is: %d\n",
   1477 		       raidPtr->raidid, d);
   1478 		*(int *) data = d;
   1479 		return retcode;
   1480 
   1481 		/* initialize all parity */
   1482 	case RAIDFRAME_REWRITEPARITY:
   1483 
   1484 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1485 			/* Parity for RAID 0 is trivially correct */
   1486 			raidPtr->parity_good = RF_RAID_CLEAN;
   1487 			return 0;
   1488 		}
   1489 
   1490 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1491 			/* Re-write is already in progress! */
   1492 			return EINVAL;
   1493 		}
   1494 
   1495 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1496 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1497 
   1498 	case RAIDFRAME_ADD_HOT_SPARE:
   1499 		sparePtr = (RF_SingleComponent_t *) data;
   1500 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1501 		return rf_add_hot_spare(raidPtr, &component);
   1502 
   1503 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1504 		return retcode;
   1505 
   1506 	case RAIDFRAME_DELETE_COMPONENT:
   1507 		componentPtr = (RF_SingleComponent_t *)data;
   1508 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1509 		return rf_delete_component(raidPtr, &component);
   1510 
   1511 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1512 		componentPtr = (RF_SingleComponent_t *)data;
   1513 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1514 		return rf_incorporate_hot_spare(raidPtr, &component);
   1515 
   1516 	case RAIDFRAME_REBUILD_IN_PLACE:
   1517 		return rf_rebuild_in_place(raidPtr, data);
   1518 
   1519 	case RAIDFRAME_GET_INFO:
   1520 		ucfgp = *(RF_DeviceConfig_t **)data;
   1521 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1522 		if (d_cfg == NULL)
   1523 			return ENOMEM;
   1524 		retcode = rf_get_info(raidPtr, d_cfg);
   1525 		if (retcode == 0) {
   1526 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1527 		}
   1528 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1529 		return retcode;
   1530 
   1531 	case RAIDFRAME_CHECK_PARITY:
   1532 		*(int *) data = raidPtr->parity_good;
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_STATUS:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		rf_paritymap_status(raidPtr->parity_map, data);
   1539 		return 0;
   1540 
   1541 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1542 		if (rf_paritymap_ineligible(raidPtr))
   1543 			return EINVAL;
   1544 		if (raidPtr->parity_map == NULL)
   1545 			return ENOENT; /* ??? */
   1546 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1547 			return EINVAL;
   1548 		return 0;
   1549 
   1550 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1551 		if (rf_paritymap_ineligible(raidPtr))
   1552 			return EINVAL;
   1553 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1554 		return 0;
   1555 
   1556 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1557 		if (rf_paritymap_ineligible(raidPtr))
   1558 			return EINVAL;
   1559 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1560 		/* XXX should errors be passed up? */
   1561 		return 0;
   1562 
   1563 	case RAIDFRAME_RESET_ACCTOTALS:
   1564 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1565 		return 0;
   1566 
   1567 	case RAIDFRAME_GET_ACCTOTALS:
   1568 		totals = (RF_AccTotals_t *) data;
   1569 		*totals = raidPtr->acc_totals;
   1570 		return 0;
   1571 
   1572 	case RAIDFRAME_KEEP_ACCTOTALS:
   1573 		raidPtr->keep_acc_totals = *(int *)data;
   1574 		return 0;
   1575 
   1576 	case RAIDFRAME_GET_SIZE:
   1577 		*(int *) data = raidPtr->totalSectors;
   1578 		return 0;
   1579 
   1580 	case RAIDFRAME_FAIL_DISK:
   1581 		return rf_fail_disk(raidPtr, data);
   1582 
   1583 		/* invoke a copyback operation after recon on whatever disk
   1584 		 * needs it, if any */
   1585 	case RAIDFRAME_COPYBACK:
   1586 
   1587 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1588 			/* This makes no sense on a RAID 0!! */
   1589 			return EINVAL;
   1590 		}
   1591 
   1592 		if (raidPtr->copyback_in_progress == 1) {
   1593 			/* Copyback is already in progress! */
   1594 			return EINVAL;
   1595 		}
   1596 
   1597 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1598 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1599 
   1600 		/* return the percentage completion of reconstruction */
   1601 	case RAIDFRAME_CHECK_RECON_STATUS:
   1602 		return rf_check_recon_status(raidPtr, data);
   1603 
   1604 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1605 		rf_check_recon_status_ext(raidPtr, data);
   1606 		return 0;
   1607 
   1608 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1609 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1610 			/* This makes no sense on a RAID 0, so tell the
   1611 			   user it's done. */
   1612 			*(int *) data = 100;
   1613 			return 0;
   1614 		}
   1615 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1616 			*(int *) data = 100 *
   1617 				raidPtr->parity_rewrite_stripes_done /
   1618 				raidPtr->Layout.numStripe;
   1619 		} else {
   1620 			*(int *) data = 100;
   1621 		}
   1622 		return 0;
   1623 
   1624 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1625 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1626 		return 0;
   1627 
   1628 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1629 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1630 			/* This makes no sense on a RAID 0 */
   1631 			*(int *) data = 100;
   1632 			return 0;
   1633 		}
   1634 		if (raidPtr->copyback_in_progress == 1) {
   1635 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1636 				raidPtr->Layout.numStripe;
   1637 		} else {
   1638 			*(int *) data = 100;
   1639 		}
   1640 		return 0;
   1641 
   1642 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1643 		rf_check_copyback_status_ext(raidPtr, data);
   1644 		return 0;
   1645 
   1646 	case RAIDFRAME_SET_LAST_UNIT:
   1647 		for (column = 0; column < raidPtr->numCol; column++)
   1648 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1649 				return EBUSY;
   1650 
   1651 		for (column = 0; column < raidPtr->numCol; column++) {
   1652 			clabel = raidget_component_label(raidPtr, column);
   1653 			clabel->last_unit = *(int *)data;
   1654 			raidflush_component_label(raidPtr, column);
   1655 		}
   1656 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1657 		return 0;
   1658 
   1659 		/* the sparetable daemon calls this to wait for the kernel to
   1660 		 * need a spare table. this ioctl does not return until a
   1661 		 * spare table is needed. XXX -- calling mpsleep here in the
   1662 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1663 		 * -- I should either compute the spare table in the kernel,
   1664 		 * or have a different -- XXX XXX -- interface (a different
   1665 		 * character device) for delivering the table     -- XXX */
   1666 #if RF_DISABLED
   1667 	case RAIDFRAME_SPARET_WAIT:
   1668 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1669 		while (!rf_sparet_wait_queue)
   1670 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1671 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1672 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1673 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1674 
   1675 		/* structure assignment */
   1676 		*((RF_SparetWait_t *) data) = *waitreq;
   1677 
   1678 		RF_Free(waitreq, sizeof(*waitreq));
   1679 		return 0;
   1680 
   1681 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1682 		 * code in it that will cause the dameon to exit */
   1683 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1684 		waitreq = RF_Malloc(sizeof(*waitreq));
   1685 		waitreq->fcol = -1;
   1686 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1687 		waitreq->next = rf_sparet_wait_queue;
   1688 		rf_sparet_wait_queue = waitreq;
   1689 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1690 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1691 		return 0;
   1692 
   1693 		/* used by the spare table daemon to deliver a spare table
   1694 		 * into the kernel */
   1695 	case RAIDFRAME_SEND_SPARET:
   1696 
   1697 		/* install the spare table */
   1698 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1699 
   1700 		/* respond to the requestor.  the return status of the spare
   1701 		 * table installation is passed in the "fcol" field */
   1702 		waitred = RF_Malloc(sizeof(*waitreq));
   1703 		waitreq->fcol = retcode;
   1704 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1705 		waitreq->next = rf_sparet_resp_queue;
   1706 		rf_sparet_resp_queue = waitreq;
   1707 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1708 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1709 
   1710 		return retcode;
   1711 #endif
   1712 	default:
   1713 		/*
   1714 		 * Don't bother trying to load compat modules
   1715 		 * if it is not our ioctl. This is more efficient
   1716 		 * and makes rump tests not depend on compat code
   1717 		 */
   1718 		if (IOCGROUP(cmd) != 'r')
   1719 			break;
   1720 #ifdef _LP64
   1721 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1722 			module_autoload("compat_netbsd32_raid",
   1723 			    MODULE_CLASS_EXEC);
   1724 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1725 			    (rs, cmd, data), enosys(), retcode);
   1726 			if (retcode != EPASSTHROUGH)
   1727 				return retcode;
   1728 		}
   1729 #endif
   1730 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1731 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1732 		    (rs, cmd, data), enosys(), retcode);
   1733 		if (retcode != EPASSTHROUGH)
   1734 			return retcode;
   1735 
   1736 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1737 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1738 		    (rs, cmd, data), enosys(), retcode);
   1739 		if (retcode != EPASSTHROUGH)
   1740 			return retcode;
   1741 		break; /* fall through to the os-specific code below */
   1742 
   1743 	}
   1744 
   1745 	if (!raidPtr->valid)
   1746 		return EINVAL;
   1747 
   1748 	/*
   1749 	 * Add support for "regular" device ioctls here.
   1750 	 */
   1751 
   1752 	switch (cmd) {
   1753 	case DIOCGCACHE:
   1754 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1755 		break;
   1756 
   1757 	case DIOCCACHESYNC:
   1758 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1759 		break;
   1760 
   1761 	default:
   1762 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1763 		break;
   1764 	}
   1765 
   1766 	return retcode;
   1767 
   1768 }
   1769 
   1770 
   1771 /* raidinit -- complete the rest of the initialization for the
   1772    RAIDframe device.  */
   1773 
   1774 
/*
 * Complete initialization of a freshly configured RAID set: attach
 * the pseudo-device, wire up the dk(4)/disk(9) layers, mark the unit
 * RAIDF_INITED, and kick off wedge discovery.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: unit stays unconfigured (no RAIDF_INITED). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1830 
   1831 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* enqueue our request and wake the daemon that services it */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* pop the response off the queue; note that 'req' is reused here
	 * and now points at the response entry, not the request we queued
	 * above */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the response carries the failed column in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1864 #endif
   1865 
   1866 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1867  * bp & passes it down.
   1868  * any calls originating in the kernel must use non-blocking I/O
   1869  * do some extra sanity checking to return "appropriate" error values for
   1870  * certain conditions (to make some standard utilities work)
   1871  *
   1872  * Formerly known as: rf_DoAccessKernel
   1873  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the raid mutex: the label update path takes its
		 * own locks */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't issue I/O until the unit is fully configured */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand the queued buffers to the dk(4) layer for dispatch */
	dk_start(dksc, NULL);
}
   1900 
   1901 static int
   1902 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1903 {
   1904 	RF_SectorCount_t num_blocks, pb, sum;
   1905 	RF_RaidAddr_t raid_addr;
   1906 	daddr_t blocknum;
   1907 	int rc;
   1908 
   1909 	rf_lock_mutex2(raidPtr->mutex);
   1910 	if (raidPtr->openings == 0) {
   1911 		rf_unlock_mutex2(raidPtr->mutex);
   1912 		return EAGAIN;
   1913 	}
   1914 	rf_unlock_mutex2(raidPtr->mutex);
   1915 
   1916 	blocknum = bp->b_rawblkno;
   1917 
   1918 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1919 		    (int) blocknum));
   1920 
   1921 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1922 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1923 
   1924 	/* *THIS* is where we adjust what block we're going to...
   1925 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1926 	raid_addr = blocknum;
   1927 
   1928 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1929 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1930 	sum = raid_addr + num_blocks + pb;
   1931 	if (1 || rf_debugKernelAccess) {
   1932 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1933 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1934 			    (int) pb, (int) bp->b_resid));
   1935 	}
   1936 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1937 	    || (sum < num_blocks) || (sum < pb)) {
   1938 		rc = ENOSPC;
   1939 		goto done;
   1940 	}
   1941 	/*
   1942 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1943 	 */
   1944 
   1945 	if (bp->b_bcount & raidPtr->sectorMask) {
   1946 		rc = ENOSPC;
   1947 		goto done;
   1948 	}
   1949 	db1_printf(("Calling DoAccess..\n"));
   1950 
   1951 
   1952 	rf_lock_mutex2(raidPtr->mutex);
   1953 	raidPtr->openings--;
   1954 	rf_unlock_mutex2(raidPtr->mutex);
   1955 
   1956 	/* don't ever condition on bp->b_flags & B_WRITE.
   1957 	 * always condition on B_READ instead */
   1958 
   1959 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1960 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1961 			 raid_addr, num_blocks,
   1962 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1963 
   1964 done:
   1965 	return rc;
   1966 }
   1967 
   1968 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1969 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* no real I/O: fake an immediate completion */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component I/O; KernelWakeupFunc
		 * will be called from biodone when it completes */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by rf_DispatchKernelIO */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update on the
			 * next raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2112 
   2113 
   2114 /*
   2115  * initialize a buf structure for doing an I/O in the kernel.
   2116  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* preserve any rf_b_pass bits already set on the buf */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert sector address to DEV_BSIZE units for b_blkno */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	/* cbFunc (KernelWakeupFunc) runs at biodone time with cbArg in
	 * b_private */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2138 
   2139 /*
   2140  * Wait interruptibly for an exclusive lock.
   2141  *
   2142  * XXX
   2143  * Several drivers do this; it should be abstracted and made MP-safe.
   2144  * (Hmm... where have we seen this warning before :->  GO )
   2145  */
   2146 static int
   2147 raidlock(struct raid_softc *rs)
   2148 {
   2149 	int     error;
   2150 
   2151 	error = 0;
   2152 	mutex_enter(&rs->sc_mutex);
   2153 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2154 		rs->sc_flags |= RAIDF_WANTED;
   2155 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2156 		if (error != 0)
   2157 			goto done;
   2158 	}
   2159 	rs->sc_flags |= RAIDF_LOCKED;
   2160 done:
   2161 	mutex_exit(&rs->sc_mutex);
   2162 	return error;
   2163 }
   2164 /*
   2165  * Unlock and wake up any waiters.
   2166  */
   2167 static void
   2168 raidunlock(struct raid_softc *rs)
   2169 {
   2170 
   2171 	mutex_enter(&rs->sc_mutex);
   2172 	rs->sc_flags &= ~RAIDF_LOCKED;
   2173 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2174 		rs->sc_flags &= ~RAIDF_WANTED;
   2175 		cv_broadcast(&rs->sc_cv);
   2176 	}
   2177 	mutex_exit(&rs->sc_mutex);
   2178 }
   2179 
   2180 
   2181 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2182 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2183 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2184 
static daddr_t
rf_component_info_offset(void)
{
	/* byte offset of the component label area from the start of the
	 * component; a function so the policy lives in one place */

	return RF_COMPONENT_INFO_OFFSET;
}
   2191 
   2192 static daddr_t
   2193 rf_component_info_size(unsigned secsize)
   2194 {
   2195 	daddr_t info_size;
   2196 
   2197 	KASSERT(secsize);
   2198 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2199 		info_size = secsize;
   2200 	else
   2201 		info_size = RF_COMPONENT_INFO_SIZE;
   2202 
   2203 	return info_size;
   2204 }
   2205 
   2206 static daddr_t
   2207 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2208 {
   2209 	daddr_t map_offset;
   2210 
   2211 	KASSERT(raidPtr->bytesPerSector);
   2212 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2213 		map_offset = raidPtr->bytesPerSector;
   2214 	else
   2215 		map_offset = RF_COMPONENT_INFO_SIZE;
   2216 	map_offset += rf_component_info_offset();
   2217 
   2218 	return map_offset;
   2219 }
   2220 
   2221 static daddr_t
   2222 rf_parity_map_size(RF_Raid_t *raidPtr)
   2223 {
   2224 	daddr_t map_size;
   2225 
   2226 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2227 		map_size = raidPtr->bytesPerSector;
   2228 	else
   2229 		map_size = RF_PARITY_MAP_SIZE;
   2230 
   2231 	return map_size;
   2232 }
   2233 
   2234 int
   2235 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2236 {
   2237 	RF_ComponentLabel_t *clabel;
   2238 
   2239 	clabel = raidget_component_label(raidPtr, col);
   2240 	clabel->clean = RF_RAID_CLEAN;
   2241 	raidflush_component_label(raidPtr, col);
   2242 	return(0);
   2243 }
   2244 
   2245 
   2246 int
   2247 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2248 {
   2249 	RF_ComponentLabel_t *clabel;
   2250 
   2251 	clabel = raidget_component_label(raidPtr, col);
   2252 	clabel->clean = RF_RAID_DIRTY;
   2253 	raidflush_component_label(raidPtr, col);
   2254 	return(0);
   2255 }
   2256 
/*
 * Read the on-disk component label for column `col' into the in-core
 * copy (raid_cinfo[col].ci_label).  Returns 0 or an error from the
 * underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2267 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2273 
/*
 * Write the in-core component label for column `col' back to disk,
 * stamping it with the set's current mod_counter first.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod count in step with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2288 
   2289 /*
   2290  * Swap the label endianness.
   2291  *
   2292  * Everything in the component label is 4-byte-swapped except the version,
   2293  * which is kept in the byte-swapped version at all times, and indicates
   2294  * for the writer that a swap is necessary.
   2295  *
   2296  * For reads it is expected that out_label == clabel, but writes expect
   2297  * separate labels so only the re-swapped label is written out to disk,
   2298  * leaving the swapped-except-version internally.
   2299  *
   2300  * Only support swapping label version 2.
   2301  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* only version-2 labels may be byte-swapped */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* walk the label as an array of 32-bit words, from serial_number
	 * through the end of future_use2; the &future_use2[42] bound
	 * presumably marks one past the last swappable word -- keep in
	 * sync with the RF_ComponentLabel_t layout */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2319 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	int error;

	error = raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));

	/* a byte-swapped version field means the label was written on a
	 * machine of the other endianness: swap it in place (the version
	 * field itself stays swapped, flagging a re-swap on write) */
	if (error == 0 &&
	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		rf_swap_label(clabel, clabel);
	}

	return error;
}
   2338 
   2339 /* ARGSUSED */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read of the full dsize-byte area */
	bdev_strategy(bp);
	error = biowait(bp);

	/* only msize bytes are copied back to the caller's buffer */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2376 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	RF_ComponentLabel_t *clabel_write = clabel;
	RF_ComponentLabel_t lclabel;
	int error;

	/* if the in-core label is flagged as byte-swapped, re-swap a
	 * local copy for the write so the in-core copy stays in its
	 * swapped-except-version form */
	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		clabel_write = &lclabel;
		rf_swap_label(clabel, clabel_write);
	}
	/* final 0: synchronous write */
	error = raidwrite_component_area(dev, b_vp, clabel_write,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);

	return error;
}
   2396 
   2397 /* ARGSUSED */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad the area past the caller's msize bytes */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* async: no wait, no error reported; the buf is
		 * presumably released at biodone time -- B_ASYNC path */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2431 
/*
 * Write the on-disk parity map `map' to every live component of the
 * set (asynchronously; write errors are not reported -- see XXXjld).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2449 
/*
 * Read the parity map from every live component and merge them into
 * `map', so a region dirty on any component is dirty in the result.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		/* first live component seeds the result; the rest are
		 * merged in */
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2474 
/*
 * Bump the mod counter and mark the component labels of all live
 * components (and in-use spares) dirty, so an unclean shutdown can be
 * detected at the next configuration.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2534 
   2535 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares, bumping the mod counter.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark the
 * components clean (normal shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2613 
   2614 void
   2615 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2616 {
   2617 
   2618 	if (vp != NULL) {
   2619 		if (auto_configured == 1) {
   2620 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2621 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2622 			vput(vp);
   2623 
   2624 		} else {
   2625 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2626 		}
   2627 	}
   2628 }
   2629 
   2630 
   2631 void
   2632 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2633 {
   2634 	int r,c;
   2635 	struct vnode *vp;
   2636 	int acd;
   2637 
   2638 
   2639 	/* We take this opportunity to close the vnodes like we should.. */
   2640 
   2641 	for (c = 0; c < raidPtr->numCol; c++) {
   2642 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2643 		acd = raidPtr->Disks[c].auto_configured;
   2644 		rf_close_component(raidPtr, vp, acd);
   2645 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2646 		raidPtr->Disks[c].auto_configured = 0;
   2647 	}
   2648 
   2649 	for (r = 0; r < raidPtr->numSpare; r++) {
   2650 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2651 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2652 		rf_close_component(raidPtr, vp, acd);
   2653 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2654 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2655 	}
   2656 }
   2657 
   2658 
/*
 * Kernel thread body: fail component `req->col' and (optionally, per
 * RF_FDFLAGS_RECON) reconstruct it onto a spare.  Frees `req' and
 * exits the thread when done.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2680 
/*
 * Kernel thread body: rewrite all parity for the set.  On success the
 * set is marked parity-clean; anyone blocked in shutdown waiting on
 * parity_rewrite_cv is woken before the thread exits.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2713 
   2714 
/*
 * Kernel thread body: copy reconstructed data back from spares onto
 * replaced components, then exit.
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2729 
   2730 
/*
 * Kernel thread body: reconstruct component `req->col' in place (onto
 * the same device).  Frees `req' and exits the thread when done.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2748 
/*
 * Try to read a component label from device `dev'/`vp'.  If the label
 * looks reasonable, prepend a new RF_AutoConfig_t (which takes
 * ownership of the label and the open vnode) to `ac_list'; otherwise
 * free the label and close the vnode.  Returns the (possibly updated)
 * list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: the label and vnode have no owner now */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2790 
/*
 * Scan every disk device in the system for RAIDframe components.
 *
 * The device tree is walked twice: wedges ("dk") first, then all other
 * disks, so that a wedge covering a whole disk is preferred over the
 * raw partition of that same disk.  For each candidate wedge/partition
 * the component label is read via rf_get_component(); valid components
 * are accumulated on the RF_AutoConfig_t list that is returned.
 * Returns NULL when no components are found.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges of type "raidframe" qualify */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3002 
   3003 int
   3004 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3005 {
   3006 
   3007 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3008 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3009 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3010 	    (clabel->clean == RF_RAID_CLEAN ||
   3011 	     clabel->clean == RF_RAID_DIRTY) &&
   3012 	    clabel->row >=0 &&
   3013 	    clabel->column >= 0 &&
   3014 	    clabel->num_rows > 0 &&
   3015 	    clabel->num_columns > 0 &&
   3016 	    clabel->row < clabel->num_rows &&
   3017 	    clabel->column < clabel->num_columns &&
   3018 	    clabel->blockSize > 0 &&
   3019 	    /*
   3020 	     * numBlocksHi may contain garbage, but it is ok since
   3021 	     * the type is unsigned.  If it is really garbage,
   3022 	     * rf_fix_old_label_size() will fix it.
   3023 	     */
   3024 	    rf_component_label_numblocks(clabel) > 0) {
   3025 		/*
   3026 		 * label looks reasonable enough...
   3027 		 * let's make sure it has no old garbage.
   3028 		 */
   3029 		if (numsecs)
   3030 			rf_fix_old_label_size(clabel, numsecs);
   3031 		return(1);
   3032 	}
   3033 	return(0);
   3034 }
   3035 
   3036 
   3037 /*
   3038  * For reasons yet unknown, some old component labels have garbage in
   3039  * the newer numBlocksHi region, and this causes lossage.  Since those
   3040  * disks will also have numsecs set to less than 32 bits of sectors,
   3041  * we can determine when this corruption has occurred, and fix it.
   3042  *
   3043  * The exact same problem, with the same unknown reason, happens to
   3044  * the partitionSizeHi member as well.
   3045  */
   3046 static void
   3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3048 {
   3049 
   3050 	if (numsecs < ((uint64_t)1 << 32)) {
   3051 		if (clabel->numBlocksHi) {
   3052 			printf("WARNING: total sectors < 32 bits, yet "
   3053 			       "numBlocksHi set\n"
   3054 			       "WARNING: resetting numBlocksHi to zero.\n");
   3055 			clabel->numBlocksHi = 0;
   3056 		}
   3057 
   3058 		if (clabel->partitionSizeHi) {
   3059 			printf("WARNING: total sectors < 32 bits, yet "
   3060 			       "partitionSizeHi set\n"
   3061 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3062 			clabel->partitionSizeHi = 0;
   3063 		}
   3064 	}
   3065 }
   3066 
   3067 
#ifdef DEBUG
/* Dump a component label to the console in human-readable form. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition decoded via the low two bits (see & 3 below) */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3101 
   3102 static RF_ConfigSet_t *
   3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3104 {
   3105 	RF_AutoConfig_t *ac;
   3106 	RF_ConfigSet_t *config_sets;
   3107 	RF_ConfigSet_t *cset;
   3108 	RF_AutoConfig_t *ac_next;
   3109 
   3110 
   3111 	config_sets = NULL;
   3112 
   3113 	/* Go through the AutoConfig list, and figure out which components
   3114 	   belong to what sets.  */
   3115 	ac = ac_list;
   3116 	while(ac!=NULL) {
   3117 		/* we're going to putz with ac->next, so save it here
   3118 		   for use at the end of the loop */
   3119 		ac_next = ac->next;
   3120 
   3121 		if (config_sets == NULL) {
   3122 			/* will need at least this one... */
   3123 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3124 				       M_RAIDFRAME, M_WAITOK);
   3125 			/* this one is easy :) */
   3126 			config_sets->ac = ac;
   3127 			config_sets->next = NULL;
   3128 			config_sets->rootable = 0;
   3129 			ac->next = NULL;
   3130 		} else {
   3131 			/* which set does this component fit into? */
   3132 			cset = config_sets;
   3133 			while(cset!=NULL) {
   3134 				if (rf_does_it_fit(cset, ac)) {
   3135 					/* looks like it matches... */
   3136 					ac->next = cset->ac;
   3137 					cset->ac = ac;
   3138 					break;
   3139 				}
   3140 				cset = cset->next;
   3141 			}
   3142 			if (cset==NULL) {
   3143 				/* didn't find a match above... new set..*/
   3144 				cset = malloc(sizeof(RF_ConfigSet_t),
   3145 					       M_RAIDFRAME, M_WAITOK);
   3146 				cset->ac = ac;
   3147 				ac->next = NULL;
   3148 				cset->next = config_sets;
   3149 				cset->rootable = 0;
   3150 				config_sets = cset;
   3151 			}
   3152 		}
   3153 		ac = ac_next;
   3154 	}
   3155 
   3156 
   3157 	return(config_sets);
   3158 }
   3159 
   3160 static int
   3161 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3162 {
   3163 	RF_ComponentLabel_t *clabel1, *clabel2;
   3164 
   3165 	/* If this one matches the *first* one in the set, that's good
   3166 	   enough, since the other members of the set would have been
   3167 	   through here too... */
   3168 	/* note that we are not checking partitionSize here..
   3169 
   3170 	   Note that we are also not checking the mod_counters here.
   3171 	   If everything else matches except the mod_counter, that's
   3172 	   good enough for this test.  We will deal with the mod_counters
   3173 	   a little later in the autoconfiguration process.
   3174 
   3175 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3176 
   3177 	   The reason we don't check for this is that failed disks
   3178 	   will have lower modification counts.  If those disks are
   3179 	   not added to the set they used to belong to, then they will
   3180 	   form their own set, which may result in 2 different sets,
   3181 	   for example, competing to be configured at raid0, and
   3182 	   perhaps competing to be the root filesystem set.  If the
   3183 	   wrong ones get configured, or both attempt to become /,
   3184 	   weird behaviour and or serious lossage will occur.  Thus we
   3185 	   need to bring them into the fold here, and kick them out at
   3186 	   a later point.
   3187 
   3188 	*/
   3189 
   3190 	clabel1 = cset->ac->clabel;
   3191 	clabel2 = ac->clabel;
   3192 	if ((clabel1->version == clabel2->version) &&
   3193 	    (clabel1->serial_number == clabel2->serial_number) &&
   3194 	    (clabel1->num_rows == clabel2->num_rows) &&
   3195 	    (clabel1->num_columns == clabel2->num_columns) &&
   3196 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3197 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3198 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3199 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3200 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3201 	    (clabel1->blockSize == clabel2->blockSize) &&
   3202 	    rf_component_label_numblocks(clabel1) ==
   3203 	    rf_component_label_numblocks(clabel2) &&
   3204 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3205 	    (clabel1->root_partition == clabel2->root_partition) &&
   3206 	    (clabel1->last_unit == clabel2->last_unit) &&
   3207 	    (clabel1->config_order == clabel2->config_order)) {
   3208 		/* if it get's here, it almost *has* to be a match */
   3209 	} else {
   3210 		/* it's not consistent with somebody in the set..
   3211 		   punt */
   3212 		return(0);
   3213 	}
   3214 	/* all was fine.. it must fit... */
   3215 	return(1);
   3216 }
   3217 
/*
 * Decide whether a configuration set has enough live components (at
 * the newest mod_counter seen in the set) to be configured.  Returns
 * 1 when the set can be brought up, 0 when too many are missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the highest mod_counter of any member is authoritative) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   occupying column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair, and we didn't bail..
				   reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3320 
   3321 static void
   3322 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3323 			RF_Raid_t *raidPtr)
   3324 {
   3325 	RF_ComponentLabel_t *clabel;
   3326 	int i;
   3327 
   3328 	clabel = ac->clabel;
   3329 
   3330 	/* 1. Fill in the common stuff */
   3331 	config->numCol = clabel->num_columns;
   3332 	config->numSpare = 0; /* XXX should this be set here? */
   3333 	config->sectPerSU = clabel->sectPerSU;
   3334 	config->SUsPerPU = clabel->SUsPerPU;
   3335 	config->SUsPerRU = clabel->SUsPerRU;
   3336 	config->parityConfig = clabel->parityConfig;
   3337 	/* XXX... */
   3338 	strcpy(config->diskQueueType,"fifo");
   3339 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3340 	config->layoutSpecificSize = 0; /* XXX ?? */
   3341 
   3342 	while(ac!=NULL) {
   3343 		/* row/col values will be in range due to the checks
   3344 		   in reasonable_label() */
   3345 		strcpy(config->devnames[0][ac->clabel->column],
   3346 		       ac->devname);
   3347 		ac = ac->next;
   3348 	}
   3349 
   3350 	for(i=0;i<RF_MAXDBGV;i++) {
   3351 		config->debugVars[i][0] = 0;
   3352 	}
   3353 }
   3354 
   3355 static int
   3356 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3357 {
   3358 	RF_ComponentLabel_t *clabel;
   3359 	int column;
   3360 	int sparecol;
   3361 
   3362 	raidPtr->autoconfigure = new_value;
   3363 
   3364 	for(column=0; column<raidPtr->numCol; column++) {
   3365 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3366 			clabel = raidget_component_label(raidPtr, column);
   3367 			clabel->autoconfigure = new_value;
   3368 			raidflush_component_label(raidPtr, column);
   3369 		}
   3370 	}
   3371 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3372 		sparecol = raidPtr->numCol + column;
   3373 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3374 			clabel = raidget_component_label(raidPtr, sparecol);
   3375 			clabel->autoconfigure = new_value;
   3376 			raidflush_component_label(raidPtr, sparecol);
   3377 		}
   3378 	}
   3379 	return(new_value);
   3380 }
   3381 
   3382 static int
   3383 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3384 {
   3385 	RF_ComponentLabel_t *clabel;
   3386 	int column;
   3387 	int sparecol;
   3388 
   3389 	raidPtr->root_partition = new_value;
   3390 	for(column=0; column<raidPtr->numCol; column++) {
   3391 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3392 			clabel = raidget_component_label(raidPtr, column);
   3393 			clabel->root_partition = new_value;
   3394 			raidflush_component_label(raidPtr, column);
   3395 		}
   3396 	}
   3397 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3398 		sparecol = raidPtr->numCol + column;
   3399 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3400 			clabel = raidget_component_label(raidPtr, sparecol);
   3401 			clabel->root_partition = new_value;
   3402 			raidflush_component_label(raidPtr, sparecol);
   3403 		}
   3404 	}
   3405 	return(new_value);
   3406 }
   3407 
   3408 static void
   3409 rf_release_all_vps(RF_ConfigSet_t *cset)
   3410 {
   3411 	RF_AutoConfig_t *ac;
   3412 
   3413 	ac = cset->ac;
   3414 	while(ac!=NULL) {
   3415 		/* Close the vp, and give it back */
   3416 		if (ac->vp) {
   3417 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3418 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3419 			vput(ac->vp);
   3420 			ac->vp = NULL;
   3421 		}
   3422 		ac = ac->next;
   3423 	}
   3424 }
   3425 
   3426 
   3427 static void
   3428 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3429 {
   3430 	RF_AutoConfig_t *ac;
   3431 	RF_AutoConfig_t *next_ac;
   3432 
   3433 	ac = cset->ac;
   3434 	while(ac!=NULL) {
   3435 		next_ac = ac->next;
   3436 		/* nuke the label */
   3437 		free(ac->clabel, M_RAIDFRAME);
   3438 		/* cleanup the config structure */
   3439 		free(ac, M_RAIDFRAME);
   3440 		/* "next.." */
   3441 		ac = next_ac;
   3442 	}
   3443 	/* and, finally, nuke the config set */
   3444 	free(cset, M_RAIDFRAME);
   3445 }
   3446 
   3447 
/*
 * Initialize clabel from the current configuration and layout of
 * raidPtr, so it can be written out as this set's component label.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3481 
/*
 * Configure a set that rf_have_enough_components() approved: pick a
 * unit number (preferring the one recorded in the component labels,
 * falling back to the next free unit), build an RF_Config_t from the
 * labels, and configure the RAID set.  Returns the softc on success,
 * NULL on failure.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* walk forward until a free (or non-existent) unit is found */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3553 
/*
 * Initialize a per-set resource pool.  The pool name, which doubles as
 * the wait channel and is returned via w_chan, is "raid<unit>_<pool_name>".
 * The pool is primed with xmin items and capped with a high-water mark
 * of xmax.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3566 
   3567 
   3568 /*
   3569  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3570  * to see if there is IO pending and if that IO could possibly be done
   3571  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3572  * otherwise.
   3573  *
   3574  */
   3575 int
   3576 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3577 {
   3578 	struct raid_softc *rs;
   3579 	struct dk_softc *dksc;
   3580 
   3581 	rs = raidPtr->softc;
   3582 	dksc = &rs->sc_dksc;
   3583 
   3584 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3585 		return 1;
   3586 
   3587 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3588 		/* there is work to do */
   3589 		return 0;
   3590 	}
   3591 	/* default is nothing to do */
   3592 	return 1;
   3593 }
   3594 
   3595 int
   3596 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3597 {
   3598 	uint64_t numsecs;
   3599 	unsigned secsize;
   3600 	int error;
   3601 
   3602 	error = getdisksize(vp, &numsecs, &secsize);
   3603 	if (error == 0) {
   3604 		diskPtr->blockSize = secsize;
   3605 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3606 		diskPtr->partitionSize = numsecs;
   3607 		return 0;
   3608 	}
   3609 	return error;
   3610 }
   3611 
/*
 * Autoconf match function: raid(4) is a pseudo-device, so every probe
 * succeeds.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3617 
/*
 * Autoconf attach function: intentionally a no-op; the softc is set up
 * elsewhere when a set is actually configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3622 
   3623 
   3624 static int
   3625 raid_detach(device_t self, int flags)
   3626 {
   3627 	int error;
   3628 	struct raid_softc *rs = raidsoftc(self);
   3629 
   3630 	if (rs == NULL)
   3631 		return ENXIO;
   3632 
   3633 	if ((error = raidlock(rs)) != 0)
   3634 		return error;
   3635 
   3636 	error = raid_detach_unlocked(rs);
   3637 
   3638 	raidunlock(rs);
   3639 
   3640 	/* XXX raid can be referenced here */
   3641 
   3642 	if (error)
   3643 		return error;
   3644 
   3645 	/* Free the softc */
   3646 	raidput(rs);
   3647 
   3648 	return 0;
   3649 }
   3650 
/*
 * Publish the RAID set's geometry to the disk(9) layer.  dg_nsectors
 * and dg_ntracks are synthetic values derived from the layout; only
 * the total size and sector size reflect real device properties.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3666 
   3667 /*
   3668  * Get cache info for all the components (including spares).
   3669  * Returns intersection of all the cache flags of all disks, or first
   3670  * error if any encountered.
   3671  * XXXfua feature flags can change as spares are added - lock down somehow
   3672  */
   3673 static int
   3674 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3675 {
   3676 	int c;
   3677 	int error;
   3678 	int dkwhole = 0, dkpart;
   3679 
   3680 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3681 		/*
   3682 		 * Check any non-dead disk, even when currently being
   3683 		 * reconstructed.
   3684 		 */
   3685 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3686 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3687 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3688 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3689 			if (error) {
   3690 				if (error != ENODEV) {
   3691 					printf("raid%d: get cache for component %s failed\n",
   3692 					    raidPtr->raidid,
   3693 					    raidPtr->Disks[c].devname);
   3694 				}
   3695 
   3696 				return error;
   3697 			}
   3698 
   3699 			if (c == 0)
   3700 				dkwhole = dkpart;
   3701 			else
   3702 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3703 		}
   3704 	}
   3705 
   3706 	*data = dkwhole;
   3707 
   3708 	return 0;
   3709 }
   3710 
   3711 /*
   3712  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3713  * We end up returning whatever error was returned by the first cache flush
   3714  * that fails.
   3715  */
   3716 
   3717 static int
   3718 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3719 {
   3720 	int e = 0;
   3721 	for (int i = 0; i < 5; i++) {
   3722 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3723 		    &force, FWRITE, NOCRED);
   3724 		if (!e || e == ENODEV)
   3725 			return e;
   3726 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3727 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3728 	}
   3729 	return e;
   3730 }
   3731 
   3732 int
   3733 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3734 {
   3735 	int c, error;
   3736 
   3737 	error = 0;
   3738 	for (c = 0; c < raidPtr->numCol; c++) {
   3739 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3740 			int e = rf_sync_component_cache(raidPtr, c, force);
   3741 			if (e && !error)
   3742 				error = e;
   3743 		}
   3744 	}
   3745 
   3746 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3747 		int sparecol = raidPtr->numCol + c;
   3748 		/* Need to ensure that the reconstruct actually completed! */
   3749 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3750 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3751 			    force);
   3752 			if (e && !error)
   3753 				error = e;
   3754 		}
   3755 	}
   3756 	return error;
   3757 }
   3758 
   3759 /* Fill in info with the current status */
   3760 void
   3761 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3762 {
   3763 
   3764 	if (raidPtr->status != rf_rs_reconstructing) {
   3765 		info->total = 100;
   3766 		info->completed = 100;
   3767 	} else {
   3768 		info->total = raidPtr->reconControl->numRUsTotal;
   3769 		info->completed = raidPtr->reconControl->numRUsComplete;
   3770 	}
   3771 	info->remaining = info->total - info->completed;
   3772 }
   3773 
   3774 /* Fill in info with the current status */
   3775 void
   3776 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3777 {
   3778 
   3779 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3780 		info->total = raidPtr->Layout.numStripe;
   3781 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3782 	} else {
   3783 		info->completed = 100;
   3784 		info->total = 100;
   3785 	}
   3786 	info->remaining = info->total - info->completed;
   3787 }
   3788 
   3789 /* Fill in info with the current status */
   3790 void
   3791 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3792 {
   3793 
   3794 	if (raidPtr->copyback_in_progress == 1) {
   3795 		info->total = raidPtr->Layout.numStripe;
   3796 		info->completed = raidPtr->copyback_stripes_done;
   3797 		info->remaining = info->total - info->completed;
   3798 	} else {
   3799 		info->remaining = 0;
   3800 		info->completed = 100;
   3801 		info->total = 100;
   3802 	}
   3803 }
   3804 
   3805 /* Fill in config with the current info */
   3806 int
   3807 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3808 {
   3809 	int	d, i, j;
   3810 
   3811 	if (!raidPtr->valid)
   3812 		return ENODEV;
   3813 	config->cols = raidPtr->numCol;
   3814 	config->ndevs = raidPtr->numCol;
   3815 	if (config->ndevs >= RF_MAX_DISKS)
   3816 		return ENOMEM;
   3817 	config->nspares = raidPtr->numSpare;
   3818 	if (config->nspares >= RF_MAX_DISKS)
   3819 		return ENOMEM;
   3820 	config->maxqdepth = raidPtr->maxQueueDepth;
   3821 	d = 0;
   3822 	for (j = 0; j < config->cols; j++) {
   3823 		config->devs[d] = raidPtr->Disks[j];
   3824 		d++;
   3825 	}
   3826 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3827 		config->spares[i] = raidPtr->Disks[j];
   3828 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3829 			/* XXX: raidctl(8) expects to see this as a used spare */
   3830 			config->spares[i].status = rf_ds_used_spare;
   3831 		}
   3832 	}
   3833 	return 0;
   3834 }
   3835 
   3836 int
   3837 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3838 {
   3839 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3840 	RF_ComponentLabel_t *raid_clabel;
   3841 	int column = clabel->column;
   3842 
   3843 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3844 		return EINVAL;
   3845 	raid_clabel = raidget_component_label(raidPtr, column);
   3846 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3847 	/* Fix-up for userland. */
   3848 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3849 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3850 
   3851 	return 0;
   3852 }
   3853 
/*
 * Module interface
 */

/* Driver-class module; depends on the dk_subr and bufq_fcfs modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* Autoconfiguration driver record, needed only when built as a module. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module control entry point and its init/fini helpers (defined below). */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3867 
   3868 static int
   3869 raid_modcmd(modcmd_t cmd, void *data)
   3870 {
   3871 	int error;
   3872 
   3873 	error = 0;
   3874 	switch (cmd) {
   3875 	case MODULE_CMD_INIT:
   3876 		error = raid_modcmd_init();
   3877 		break;
   3878 	case MODULE_CMD_FINI:
   3879 		error = raid_modcmd_fini();
   3880 		break;
   3881 	default:
   3882 		error = ENOTTY;
   3883 		break;
   3884 	}
   3885 	return error;
   3886 }
   3887 
/*
 * One-time module initialization: create the driver lock, attach the
 * block/character device switch, register the autoconf driver and
 * attachment, boot the RAIDframe core, and register a finalizer that
 * auto-configures RAID sets once device discovery has completed.  On
 * failure of any step, the steps already performed are rolled back in
 * reverse order and the error is returned.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table requests. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the major numbers for us. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in); carry on. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back the cfdriver and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* Here error is 0 (all non-zero paths returned above). */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Not fatal: the module still works without auto-config. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3958 
/*
 * Module teardown: refuse to unload while any raid device exists, then
 * detach the cfattach, cfdriver and devsw in the reverse of the order
 * raid_modcmd_init() attached them, shut down the RAIDframe core, and
 * destroy the driver lock.  If a detach step fails, the already-detached
 * pieces are re-attached so the module remains in a consistent state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Restore the cfattach we just detached. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Restore the cfdriver and cfattach detached above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (mirror of rf_BootRaidframe(true)). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4008