/* rf_netbsdkintf.c, revision 1.397 (raidframe) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.397 2021/07/26 22:50:36 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.397 2021/07/26 22:50:36 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
/*
 * Block device switch for /dev/raid*: open/close/strategy/ioctl plus
 * crash-dump and partition-size entry points.  Discard is not
 * supported; D_DISK marks this as a disk-type device.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    228 
/*
 * Character (raw) device switch for /dev/rraid*.  Entry points that do
 * not apply to a disk device use the standard no-op stubs (nostop,
 * notty, nopoll, ...).
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    243 
/*
 * Callbacks used by the common dk(9) disk layer to drive RAIDframe:
 * I/O start, crash-dump block writes, and last-close processing.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
/* Internal representation of a rf_recon_req (kernel-side copy of the
 * userland reconstruction request). */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column to reconstruct */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* the RF_Raid_t this applies to */
};
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
    278  * Even it if is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 static void rf_ReconThread(struct rf_recon_req_internal *);
    304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 static int rf_autoconfig(device_t);
    308 static void rf_buildroothack(RF_ConfigSet_t *);
    309 
    310 static RF_AutoConfig_t *rf_find_raid_components(void);
    311 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    313 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    314 static int rf_set_autoconfig(RF_Raid_t *, int);
    315 static int rf_set_rootpartition(RF_Raid_t *, int);
    316 static void rf_release_all_vps(RF_ConfigSet_t *);
    317 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    318 static int rf_have_enough_components(RF_ConfigSet_t *);
    319 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    320 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    321 
    322 /*
    323  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    324  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    325  * in the kernel config file.
    326  */
    327 #ifdef RAID_AUTOCONFIG
    328 int raidautoconfig = 1;
    329 #else
    330 int raidautoconfig = 0;
    331 #endif
    332 static bool raidautoconfigdone = false;
    333 
    334 struct pool rf_alloclist_pool;   /* AllocList */
    335 
    336 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    337 static kmutex_t raid_lock;
    338 
    339 static struct raid_softc *
    340 raidcreate(int unit) {
    341 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    342 	sc->sc_unit = unit;
    343 	cv_init(&sc->sc_cv, "raidunit");
    344 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    345 	return sc;
    346 }
    347 
    348 static void
    349 raiddestroy(struct raid_softc *sc) {
    350 	cv_destroy(&sc->sc_cv);
    351 	mutex_destroy(&sc->sc_mutex);
    352 	kmem_free(sc, sizeof(*sc));
    353 }
    354 
    355 static struct raid_softc *
    356 raidget(int unit, bool create) {
    357 	struct raid_softc *sc;
    358 	if (unit < 0) {
    359 #ifdef DIAGNOSTIC
    360 		panic("%s: unit %d!", __func__, unit);
    361 #endif
    362 		return NULL;
    363 	}
    364 	mutex_enter(&raid_lock);
    365 	LIST_FOREACH(sc, &raids, sc_link) {
    366 		if (sc->sc_unit == unit) {
    367 			mutex_exit(&raid_lock);
    368 			return sc;
    369 		}
    370 	}
    371 	mutex_exit(&raid_lock);
    372 	if (!create)
    373 		return NULL;
    374 	sc = raidcreate(unit);
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
/*
 * Unlink a softc from the global raid list and destroy it.
 * Caller must guarantee no other references remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    388 
/*
 * Historical pseudo-device attach entry point (config(9) glue).
 * Intentionally empty: device attachment and associated
 * initialization now occurs as part of the module initialization.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
    399 static int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return 0;
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
/* Accessor: the RF_Raid_t embedded in this unit's softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    446 
/* Accessor: the unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname;
    455 	size_t len;
    456 
    457 	/* if bdv is NULL, the set can't contain it. exit early. */
    458 	if (bdv == NULL)
    459 		return 0;
    460 
    461 	bootname = device_xname(bdv);
    462 	len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
/*
 * Configure every eligible autoconfig set and, unless the user
 * hardwired a root device (rootspec), try to point booted_device at a
 * rootable RAID set.  Sets are configured repeatedly until no new set
 * appears, which handles RAID-on-RAID (recursive) stacking.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable sets configured */
	int raid_added;		/* sets configured in the current pass */
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* rf_cleanup_config_set() frees cset; save next. */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		/* NOTE(review): format string below lacks a trailing \n. */
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Disambiguate: count only sets that contain the boot dev. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    641 
    642 static int
    643 raidsize(dev_t dev)
    644 {
    645 	struct raid_softc *rs;
    646 	struct dk_softc *dksc;
    647 	unsigned int unit;
    648 
    649 	unit = raidunit(dev);
    650 	if ((rs = raidget(unit, false)) == NULL)
    651 		return -1;
    652 	dksc = &rs->sc_dksc;
    653 
    654 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    655 		return -1;
    656 
    657 	return dk_size(dksc, dev);
    658 }
    659 
    660 static int
    661 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    662 {
    663 	unsigned int unit;
    664 	struct raid_softc *rs;
    665 	struct dk_softc *dksc;
    666 
    667 	unit = raidunit(dev);
    668 	if ((rs = raidget(unit, false)) == NULL)
    669 		return ENXIO;
    670 	dksc = &rs->sc_dksc;
    671 
    672 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    673 		return ENODEV;
    674 
    675         /*
    676            Note that blkno is relative to this particular partition.
    677            By adding adding RF_PROTECTED_SECTORS, we get a value that
    678 	   is relative to the partition used for the underlying component.
    679         */
    680 	blkno += RF_PROTECTED_SECTORS;
    681 
    682 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    683 }
    684 
/*
 * dkdriver d_dumpblocks hook: write `nblk' blocks at `blkno' directly
 * to one live component of the set during a crash dump.  Only RAID 1
 * sets are supported (one data + one parity column), since only there
 * does a single component hold a full copy of the data.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if any)
			   this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    790 
    791 /* ARGSUSED */
    792 static int
    793 raidopen(dev_t dev, int flags, int fmt,
    794     struct lwp *l)
    795 {
    796 	int     unit = raidunit(dev);
    797 	struct raid_softc *rs;
    798 	struct dk_softc *dksc;
    799 	int     error = 0;
    800 	int     part, pmask;
    801 
    802 	if ((rs = raidget(unit, true)) == NULL)
    803 		return ENXIO;
    804 	if ((error = raidlock(rs)) != 0)
    805 		return error;
    806 
    807 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    808 		error = EBUSY;
    809 		goto bad;
    810 	}
    811 
    812 	dksc = &rs->sc_dksc;
    813 
    814 	part = DISKPART(dev);
    815 	pmask = (1 << part);
    816 
    817 	if (!DK_BUSY(dksc, pmask) &&
    818 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    819 		/* First one... mark things as dirty... Note that we *MUST*
    820 		 have done a configure before this.  I DO NOT WANT TO BE
    821 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    822 		 THAT THEY BELONG TOGETHER!!!!! */
    823 		/* XXX should check to see if we're only open for reading
    824 		   here... If so, we needn't do this, but then need some
    825 		   other way of keeping track of what's happened.. */
    826 
    827 		rf_markalldirty(&rs->sc_r);
    828 	}
    829 
    830 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    831 		error = dk_open(dksc, dev, flags, fmt, l);
    832 
    833 bad:
    834 	raidunlock(rs);
    835 
    836 	return error;
    837 
    838 
    839 }
    840 
    841 static int
    842 raid_lastclose(device_t self)
    843 {
    844 	struct raid_softc *rs = raidsoftc(self);
    845 
    846 	/* Last one... device is not unconfigured yet.
    847 	   Device shutdown has taken care of setting the
    848 	   clean bits if RAIDF_INITED is not set
    849 	   mark things as clean... */
    850 
    851 	rf_update_component_labels(&rs->sc_r,
    852 	    RF_FINAL_COMPONENT_UPDATE);
    853 
    854 	/* pass to unlocked code */
    855 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    856 		rs->sc_flags |= RAIDF_DETACH;
    857 
    858 	return 0;
    859 }
    860 
    861 /* ARGSUSED */
    862 static int
    863 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    864 {
    865 	int     unit = raidunit(dev);
    866 	struct raid_softc *rs;
    867 	struct dk_softc *dksc;
    868 	cfdata_t cf;
    869 	int     error = 0, do_detach = 0, do_put = 0;
    870 
    871 	if ((rs = raidget(unit, false)) == NULL)
    872 		return ENXIO;
    873 	dksc = &rs->sc_dksc;
    874 
    875 	if ((error = raidlock(rs)) != 0)
    876 		return error;
    877 
    878 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    879 		error = dk_close(dksc, dev, flags, fmt, l);
    880 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    881 			do_detach = 1;
    882 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    883 		do_put = 1;
    884 
    885 	raidunlock(rs);
    886 
    887 	if (do_detach) {
    888 		/* free the pseudo device attach bits */
    889 		cf = device_cfdata(dksc->sc_dev);
    890 		error = config_detach(dksc->sc_dev, 0);
    891 		if (error == 0)
    892 			free(cf, M_RAIDFRAME);
    893 	} else if (do_put) {
    894 		raidput(rs);
    895 	}
    896 
    897 	return error;
    898 
    899 }
    900 
/*
 * Signal the iodone condition variable (under its lock) so that
 * whoever waits on it re-examines the RAID set's queued work.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    908 
    909 static void
    910 raidstrategy(struct buf *bp)
    911 {
    912 	unsigned int unit;
    913 	struct raid_softc *rs;
    914 	struct dk_softc *dksc;
    915 	RF_Raid_t *raidPtr;
    916 
    917 	unit = raidunit(bp->b_dev);
    918 	if ((rs = raidget(unit, false)) == NULL) {
    919 		bp->b_error = ENXIO;
    920 		goto fail;
    921 	}
    922 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    923 		bp->b_error = ENXIO;
    924 		goto fail;
    925 	}
    926 	dksc = &rs->sc_dksc;
    927 	raidPtr = &rs->sc_r;
    928 
    929 	/* Queue IO only */
    930 	if (dk_strategy_defer(dksc, bp))
    931 		goto done;
    932 
    933 	/* schedule the IO to happen at the next convenient time */
    934 	raid_wakeup(raidPtr);
    935 
    936 done:
    937 	return;
    938 
    939 fail:
    940 	bp->b_resid = bp->b_bcount;
    941 	biodone(bp);
    942 }
    943 
    944 static int
    945 raid_diskstart(device_t dev, struct buf *bp)
    946 {
    947 	struct raid_softc *rs = raidsoftc(dev);
    948 	RF_Raid_t *raidPtr;
    949 
    950 	raidPtr = &rs->sc_r;
    951 	if (!raidPtr->valid) {
    952 		db1_printf(("raid is not valid..\n"));
    953 		return ENODEV;
    954 	}
    955 
    956 	/* XXX */
    957 	bp->b_resid = 0;
    958 
    959 	return raiddoaccess(raidPtr, bp);
    960 }
    961 
    962 void
    963 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    964 {
    965 	struct raid_softc *rs;
    966 	struct dk_softc *dksc;
    967 
    968 	rs = raidPtr->softc;
    969 	dksc = &rs->sc_dksc;
    970 
    971 	dk_done(dksc, bp);
    972 
    973 	rf_lock_mutex2(raidPtr->mutex);
    974 	raidPtr->openings++;
    975 	rf_unlock_mutex2(raidPtr->mutex);
    976 
    977 	/* schedule more IO */
    978 	raid_wakeup(raidPtr);
    979 }
    980 
    981 /* ARGSUSED */
    982 static int
    983 raidread(dev_t dev, struct uio *uio, int flags)
    984 {
    985 	int     unit = raidunit(dev);
    986 	struct raid_softc *rs;
    987 
    988 	if ((rs = raidget(unit, false)) == NULL)
    989 		return ENXIO;
    990 
    991 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    992 		return ENXIO;
    993 
    994 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
    995 
    996 }
    997 
    998 /* ARGSUSED */
    999 static int
   1000 raidwrite(dev_t dev, struct uio *uio, int flags)
   1001 {
   1002 	int     unit = raidunit(dev);
   1003 	struct raid_softc *rs;
   1004 
   1005 	if ((rs = raidget(unit, false)) == NULL)
   1006 		return ENXIO;
   1007 
   1008 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1009 		return ENXIO;
   1010 
   1011 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1012 
   1013 }
   1014 
/*
 * Tear down a configured RAID set: shut down RAIDframe, drain and
 * free the buffer queue, and detach the disk from the dk(9)/disk(9)
 * layers.  Returns EBUSY if the device is open or a reconstruction,
 * parity rewrite, or copyback is in progress; 0 if nothing was
 * configured or on success; otherwise the error from rf_Shutdown().
 *
 * NOTE(review): the "_unlocked" suffix suggests the caller is
 * expected to hold the unit lock (raidlock()) — confirm at call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations are running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1052 
   1053 static bool
   1054 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1055 {
   1056 	switch (cmd) {
   1057 	case RAIDFRAME_ADD_HOT_SPARE:
   1058 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1059 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1060 	case RAIDFRAME_CHECK_PARITY:
   1061 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1062 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1063 	case RAIDFRAME_CHECK_RECON_STATUS:
   1064 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1065 	case RAIDFRAME_COPYBACK:
   1066 	case RAIDFRAME_DELETE_COMPONENT:
   1067 	case RAIDFRAME_FAIL_DISK:
   1068 	case RAIDFRAME_GET_ACCTOTALS:
   1069 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1070 	case RAIDFRAME_GET_INFO:
   1071 	case RAIDFRAME_GET_SIZE:
   1072 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1073 	case RAIDFRAME_INIT_LABELS:
   1074 	case RAIDFRAME_KEEP_ACCTOTALS:
   1075 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1076 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1077 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1078 	case RAIDFRAME_PARITYMAP_STATUS:
   1079 	case RAIDFRAME_REBUILD_IN_PLACE:
   1080 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1081 	case RAIDFRAME_RESET_ACCTOTALS:
   1082 	case RAIDFRAME_REWRITEPARITY:
   1083 	case RAIDFRAME_SET_AUTOCONFIG:
   1084 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1085 	case RAIDFRAME_SET_ROOT:
   1086 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1087 	}
   1088 	return false;
   1089 }
   1090 
/*
 * Fail the component in column rr->col and start a reconstruction
 * thread (rf_ReconThread) for it.  Returns EINVAL when the set has
 * no redundancy (RAID 0), the column is out of range, or the set's
 * current state does not allow failing the disk; ENOMEM when the
 * request copy cannot be allocated; otherwise the result of thread
 * creation.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* All state checks below happen under the raid mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): ownership of rrint presumably passes to
	   rf_ReconThread, which should free it — confirm. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1139 
   1140 static int
   1141 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1142 {
   1143 	/* allocate a buffer for the layout-specific data, and copy it in */
   1144 	if (k_cfg->layoutSpecificSize == 0)
   1145 		return 0;
   1146 
   1147 	if (k_cfg->layoutSpecificSize > 10000) {
   1148 	    /* sanity check */
   1149 	    return EINVAL;
   1150 	}
   1151 
   1152 	u_char *specific_buf;
   1153 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1154 	if (specific_buf == NULL)
   1155 		return ENOMEM;
   1156 
   1157 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1158 	    k_cfg->layoutSpecificSize);
   1159 	if (retcode) {
   1160 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1161 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1162 		return retcode;
   1163 	}
   1164 
   1165 	k_cfg->layoutSpecific = specific_buf;
   1166 	return 0;
   1167 }
   1168 
   1169 static int
   1170 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1171 {
   1172 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1173 
   1174 	if (rs->sc_r.valid) {
   1175 		/* There is a valid RAID set running on this unit! */
   1176 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1177 		return EINVAL;
   1178 	}
   1179 
   1180 	/* copy-in the configuration information */
   1181 	/* data points to a pointer to the configuration structure */
   1182 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1183 	if (*k_cfg == NULL) {
   1184 		return ENOMEM;
   1185 	}
   1186 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1187 	if (retcode == 0)
   1188 		return 0;
   1189 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1190 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1191 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1192 	return retcode;
   1193 }
   1194 
   1195 int
   1196 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1197 {
   1198 	int retcode;
   1199 	RF_Raid_t *raidPtr = &rs->sc_r;
   1200 
   1201 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1202 
   1203 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1204 		goto out;
   1205 
   1206 	/* should do some kind of sanity check on the configuration.
   1207 	 * Store the sum of all the bytes in the last byte? */
   1208 
   1209 	/* configure the system */
   1210 
   1211 	/*
   1212 	 * Clear the entire RAID descriptor, just to make sure
   1213 	 *  there is no stale data left in the case of a
   1214 	 *  reconfiguration
   1215 	 */
   1216 	memset(raidPtr, 0, sizeof(*raidPtr));
   1217 	raidPtr->softc = rs;
   1218 	raidPtr->raidid = rs->sc_unit;
   1219 
   1220 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1221 
   1222 	if (retcode == 0) {
   1223 		/* allow this many simultaneous IO's to
   1224 		   this RAID device */
   1225 		raidPtr->openings = RAIDOUTSTANDING;
   1226 
   1227 		raidinit(rs);
   1228 		raid_wakeup(raidPtr);
   1229 		rf_markalldirty(raidPtr);
   1230 	}
   1231 
   1232 	/* free the buffers.  No return code here. */
   1233 	if (k_cfg->layoutSpecificSize) {
   1234 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1235 	}
   1236 out:
   1237 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1238 	if (retcode) {
   1239 		/*
   1240 		 * If configuration failed, set sc_flags so that we
   1241 		 * will detach the device when we close it.
   1242 		 */
   1243 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1244 	}
   1245 	return retcode;
   1246 }
   1247 
#if RF_DISABLED
/*
 * Overwrite the in-core and on-disk component label of one column
 * with a user-supplied label.  Currently compiled out.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* only row 0 is supported */
	clabel->row = 0;

	int column = clabel->column;
	if (column < 0 || column >= raidPtr->numCol)
		return EINVAL;

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);

	return 0;
}
#endif
   1286 
   1287 static int
   1288 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1289 {
   1290 	/*
   1291 	   we only want the serial number from
   1292 	   the above.  We get all the rest of the information
   1293 	   from the config that was used to create this RAID
   1294 	   set.
   1295 	   */
   1296 
   1297 	raidPtr->serial_number = clabel->serial_number;
   1298 
   1299 	for (int column = 0; column < raidPtr->numCol; column++) {
   1300 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1301 		if (RF_DEAD_DISK(diskPtr->status))
   1302 			continue;
   1303 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1304 		    raidPtr, column);
   1305 		/* Zeroing this is important. */
   1306 		memset(ci_label, 0, sizeof(*ci_label));
   1307 		raid_init_component_label(raidPtr, ci_label);
   1308 		ci_label->serial_number = raidPtr->serial_number;
   1309 		ci_label->row = 0; /* we dont' pretend to support more */
   1310 		rf_component_label_set_partitionsize(ci_label,
   1311 		    diskPtr->partitionSize);
   1312 		ci_label->column = column;
   1313 		raidflush_component_label(raidPtr, column);
   1314 		/* XXXjld what about the spares? */
   1315 	}
   1316 
   1317 	return 0;
   1318 }
   1319 
/*
 * Rebuild ("reconstruct in place") the given component back onto its
 * own disk by spawning rf_ReconstructInPlaceThread.  Returns EINVAL
 * for RAID 0 sets, when a reconstruction is already running, for
 * out-of-range columns, or when the component's state forbids the
 * rebuild; ENOMEM if the request structure cannot be allocated;
 * otherwise the result of thread creation.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy so the caller's buffer is not trusted. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Status checks below are done under the raid mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* can't rebuild a spared component in place */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* NOTE(review): rrint is presumably freed by the spawned
	   thread — confirm ownership. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1387 
   1388 static int
   1389 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1390 {
   1391 	/*
   1392 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1393 	 * so tell the user it's done.
   1394 	 */
   1395 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1396 	    raidPtr->status != rf_rs_reconstructing) {
   1397 		*data = 100;
   1398 		return 0;
   1399 	}
   1400 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1401 		*data = 0;
   1402 		return 0;
   1403 	}
   1404 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1405 	    / raidPtr->reconControl->numRUsTotal);
   1406 	return 0;
   1407 }
   1408 
   1409 static int
   1410 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1411 {
   1412 	int     unit = raidunit(dev);
   1413 	int     part, pmask;
   1414 	struct raid_softc *rs;
   1415 	struct dk_softc *dksc;
   1416 	RF_Config_t *k_cfg;
   1417 	RF_Raid_t *raidPtr;
   1418 	RF_AccTotals_t *totals;
   1419 	RF_SingleComponent_t component;
   1420 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1421 	int retcode = 0;
   1422 	int column;
   1423 	RF_ComponentLabel_t *clabel;
   1424 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1425 	int d;
   1426 
   1427 	if ((rs = raidget(unit, false)) == NULL)
   1428 		return ENXIO;
   1429 
   1430 	dksc = &rs->sc_dksc;
   1431 	raidPtr = &rs->sc_r;
   1432 
   1433 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1434 	    (int) DISKPART(dev), (int) unit, cmd));
   1435 
   1436 	/* Must be initialized for these... */
   1437 	if (rf_must_be_initialized(rs, cmd))
   1438 		return ENXIO;
   1439 
   1440 	switch (cmd) {
   1441 		/* configure the system */
   1442 	case RAIDFRAME_CONFIGURE:
   1443 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1444 			return retcode;
   1445 		return rf_construct(rs, k_cfg);
   1446 
   1447 		/* shutdown the system */
   1448 	case RAIDFRAME_SHUTDOWN:
   1449 
   1450 		part = DISKPART(dev);
   1451 		pmask = (1 << part);
   1452 
   1453 		if ((retcode = raidlock(rs)) != 0)
   1454 			return retcode;
   1455 
   1456 		if (DK_BUSY(dksc, pmask) ||
   1457 		    raidPtr->recon_in_progress != 0 ||
   1458 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1459 		    raidPtr->copyback_in_progress != 0)
   1460 			retcode = EBUSY;
   1461 		else {
   1462 			/* detach and free on close */
   1463 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1464 			retcode = 0;
   1465 		}
   1466 
   1467 		raidunlock(rs);
   1468 
   1469 		return retcode;
   1470 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1471 		return rf_get_component_label(raidPtr, data);
   1472 
   1473 #if RF_DISABLED
   1474 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1475 		return rf_set_component_label(raidPtr, data);
   1476 #endif
   1477 
   1478 	case RAIDFRAME_INIT_LABELS:
   1479 		return rf_init_component_label(raidPtr, data);
   1480 
   1481 	case RAIDFRAME_SET_AUTOCONFIG:
   1482 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1483 		printf("raid%d: New autoconfig value is: %d\n",
   1484 		       raidPtr->raidid, d);
   1485 		*(int *) data = d;
   1486 		return retcode;
   1487 
   1488 	case RAIDFRAME_SET_ROOT:
   1489 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1490 		printf("raid%d: New rootpartition value is: %d\n",
   1491 		       raidPtr->raidid, d);
   1492 		*(int *) data = d;
   1493 		return retcode;
   1494 
   1495 		/* initialize all parity */
   1496 	case RAIDFRAME_REWRITEPARITY:
   1497 
   1498 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1499 			/* Parity for RAID 0 is trivially correct */
   1500 			raidPtr->parity_good = RF_RAID_CLEAN;
   1501 			return 0;
   1502 		}
   1503 
   1504 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1505 			/* Re-write is already in progress! */
   1506 			return EINVAL;
   1507 		}
   1508 
   1509 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1510 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1511 
   1512 	case RAIDFRAME_ADD_HOT_SPARE:
   1513 		sparePtr = (RF_SingleComponent_t *) data;
   1514 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1515 		return rf_add_hot_spare(raidPtr, &component);
   1516 
   1517 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1518 		return retcode;
   1519 
   1520 	case RAIDFRAME_DELETE_COMPONENT:
   1521 		componentPtr = (RF_SingleComponent_t *)data;
   1522 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1523 		return rf_delete_component(raidPtr, &component);
   1524 
   1525 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1526 		componentPtr = (RF_SingleComponent_t *)data;
   1527 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1528 		return rf_incorporate_hot_spare(raidPtr, &component);
   1529 
   1530 	case RAIDFRAME_REBUILD_IN_PLACE:
   1531 		return rf_rebuild_in_place(raidPtr, data);
   1532 
   1533 	case RAIDFRAME_GET_INFO:
   1534 		ucfgp = *(RF_DeviceConfig_t **)data;
   1535 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1536 		if (d_cfg == NULL)
   1537 			return ENOMEM;
   1538 		retcode = rf_get_info(raidPtr, d_cfg);
   1539 		if (retcode == 0) {
   1540 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1541 		}
   1542 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1543 		return retcode;
   1544 
   1545 	case RAIDFRAME_CHECK_PARITY:
   1546 		*(int *) data = raidPtr->parity_good;
   1547 		return 0;
   1548 
   1549 	case RAIDFRAME_PARITYMAP_STATUS:
   1550 		if (rf_paritymap_ineligible(raidPtr))
   1551 			return EINVAL;
   1552 		rf_paritymap_status(raidPtr->parity_map, data);
   1553 		return 0;
   1554 
   1555 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1556 		if (rf_paritymap_ineligible(raidPtr))
   1557 			return EINVAL;
   1558 		if (raidPtr->parity_map == NULL)
   1559 			return ENOENT; /* ??? */
   1560 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1561 			return EINVAL;
   1562 		return 0;
   1563 
   1564 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1565 		if (rf_paritymap_ineligible(raidPtr))
   1566 			return EINVAL;
   1567 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1568 		return 0;
   1569 
   1570 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1571 		if (rf_paritymap_ineligible(raidPtr))
   1572 			return EINVAL;
   1573 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1574 		/* XXX should errors be passed up? */
   1575 		return 0;
   1576 
   1577 	case RAIDFRAME_RESET_ACCTOTALS:
   1578 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1579 		return 0;
   1580 
   1581 	case RAIDFRAME_GET_ACCTOTALS:
   1582 		totals = (RF_AccTotals_t *) data;
   1583 		*totals = raidPtr->acc_totals;
   1584 		return 0;
   1585 
   1586 	case RAIDFRAME_KEEP_ACCTOTALS:
   1587 		raidPtr->keep_acc_totals = *(int *)data;
   1588 		return 0;
   1589 
   1590 	case RAIDFRAME_GET_SIZE:
   1591 		*(int *) data = raidPtr->totalSectors;
   1592 		return 0;
   1593 
   1594 	case RAIDFRAME_FAIL_DISK:
   1595 		return rf_fail_disk(raidPtr, data);
   1596 
   1597 		/* invoke a copyback operation after recon on whatever disk
   1598 		 * needs it, if any */
   1599 	case RAIDFRAME_COPYBACK:
   1600 
   1601 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1602 			/* This makes no sense on a RAID 0!! */
   1603 			return EINVAL;
   1604 		}
   1605 
   1606 		if (raidPtr->copyback_in_progress == 1) {
   1607 			/* Copyback is already in progress! */
   1608 			return EINVAL;
   1609 		}
   1610 
   1611 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1612 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1613 
   1614 		/* return the percentage completion of reconstruction */
   1615 	case RAIDFRAME_CHECK_RECON_STATUS:
   1616 		return rf_check_recon_status(raidPtr, data);
   1617 
   1618 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1619 		rf_check_recon_status_ext(raidPtr, data);
   1620 		return 0;
   1621 
   1622 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1623 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1624 			/* This makes no sense on a RAID 0, so tell the
   1625 			   user it's done. */
   1626 			*(int *) data = 100;
   1627 			return 0;
   1628 		}
   1629 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1630 			*(int *) data = 100 *
   1631 				raidPtr->parity_rewrite_stripes_done /
   1632 				raidPtr->Layout.numStripe;
   1633 		} else {
   1634 			*(int *) data = 100;
   1635 		}
   1636 		return 0;
   1637 
   1638 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1639 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1640 		return 0;
   1641 
   1642 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1643 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1644 			/* This makes no sense on a RAID 0 */
   1645 			*(int *) data = 100;
   1646 			return 0;
   1647 		}
   1648 		if (raidPtr->copyback_in_progress == 1) {
   1649 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1650 				raidPtr->Layout.numStripe;
   1651 		} else {
   1652 			*(int *) data = 100;
   1653 		}
   1654 		return 0;
   1655 
   1656 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1657 		rf_check_copyback_status_ext(raidPtr, data);
   1658 		return 0;
   1659 
   1660 	case RAIDFRAME_SET_LAST_UNIT:
   1661 		for (column = 0; column < raidPtr->numCol; column++)
   1662 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1663 				return EBUSY;
   1664 
   1665 		for (column = 0; column < raidPtr->numCol; column++) {
   1666 			clabel = raidget_component_label(raidPtr, column);
   1667 			clabel->last_unit = *(int *)data;
   1668 			raidflush_component_label(raidPtr, column);
   1669 		}
   1670 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1671 		return 0;
   1672 
   1673 		/* the sparetable daemon calls this to wait for the kernel to
   1674 		 * need a spare table. this ioctl does not return until a
   1675 		 * spare table is needed. XXX -- calling mpsleep here in the
   1676 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1677 		 * -- I should either compute the spare table in the kernel,
   1678 		 * or have a different -- XXX XXX -- interface (a different
   1679 		 * character device) for delivering the table     -- XXX */
   1680 #if RF_DISABLED
   1681 	case RAIDFRAME_SPARET_WAIT:
   1682 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1683 		while (!rf_sparet_wait_queue)
   1684 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1685 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1686 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1687 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1688 
   1689 		/* structure assignment */
   1690 		*((RF_SparetWait_t *) data) = *waitreq;
   1691 
   1692 		RF_Free(waitreq, sizeof(*waitreq));
   1693 		return 0;
   1694 
   1695 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1696 		 * code in it that will cause the dameon to exit */
   1697 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1698 		waitreq = RF_Malloc(sizeof(*waitreq));
   1699 		waitreq->fcol = -1;
   1700 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1701 		waitreq->next = rf_sparet_wait_queue;
   1702 		rf_sparet_wait_queue = waitreq;
   1703 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1704 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1705 		return 0;
   1706 
   1707 		/* used by the spare table daemon to deliver a spare table
   1708 		 * into the kernel */
   1709 	case RAIDFRAME_SEND_SPARET:
   1710 
   1711 		/* install the spare table */
   1712 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1713 
   1714 		/* respond to the requestor.  the return status of the spare
   1715 		 * table installation is passed in the "fcol" field */
   1716 		waitred = RF_Malloc(sizeof(*waitreq));
   1717 		waitreq->fcol = retcode;
   1718 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1719 		waitreq->next = rf_sparet_resp_queue;
   1720 		rf_sparet_resp_queue = waitreq;
   1721 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1722 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1723 
   1724 		return retcode;
   1725 #endif
   1726 	default:
   1727 		/*
   1728 		 * Don't bother trying to load compat modules
   1729 		 * if it is not our ioctl. This is more efficient
   1730 		 * and makes rump tests not depend on compat code
   1731 		 */
   1732 		if (IOCGROUP(cmd) != 'r')
   1733 			break;
   1734 #ifdef _LP64
   1735 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1736 			module_autoload("compat_netbsd32_raid",
   1737 			    MODULE_CLASS_EXEC);
   1738 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1739 			    (rs, cmd, data), enosys(), retcode);
   1740 			if (retcode != EPASSTHROUGH)
   1741 				return retcode;
   1742 		}
   1743 #endif
   1744 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1745 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1746 		    (rs, cmd, data), enosys(), retcode);
   1747 		if (retcode != EPASSTHROUGH)
   1748 			return retcode;
   1749 
   1750 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1751 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1752 		    (rs, cmd, data), enosys(), retcode);
   1753 		if (retcode != EPASSTHROUGH)
   1754 			return retcode;
   1755 		break; /* fall through to the os-specific code below */
   1756 
   1757 	}
   1758 
   1759 	if (!raidPtr->valid)
   1760 		return EINVAL;
   1761 
   1762 	/*
   1763 	 * Add support for "regular" device ioctls here.
   1764 	 */
   1765 
   1766 	switch (cmd) {
   1767 	case DIOCGCACHE:
   1768 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1769 		break;
   1770 
   1771 	case DIOCCACHESYNC:
   1772 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1773 		break;
   1774 
   1775 	default:
   1776 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1777 		break;
   1778 	}
   1779 
   1780 	return retcode;
   1781 
   1782 }
   1783 
   1784 
   1785 /* raidinit -- complete the rest of the initialization for the
   1786    RAIDframe device.  */
   1787 
   1788 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device: build the cfdata by hand so
	   config_attach_pseudo() can instantiate raid<unit> */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* attach failed: release the cfdata and leave the unit
		   without RAIDF_INITED set */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* discover wedges (partitions) on the new disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1844 
   1845 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1846 /* wake up the daemon & tell it to get us a spare table
   1847  * XXX
   1848  * the entries in the queues should be tagged with the raidPtr
   1849  * so that in the extremely rare case that two recons happen at once,
   1850  * we know for which device were requesting a spare table
   1851  * XXX
   1852  *
   1853  * XXX This code is not currently used. GO
   1854  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue the request and wake any daemon waiting on the spare-table
	 * condvar; then block until a response appears on the response
	 * queue.  Both queues are protected by rf_sparet_wait_mutex. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the head of the response queue; note this may not be the
	 * response to *our* request (see the XXX in the header comment). */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The failed column is the return value carried in the response. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1878 #endif
   1879 
   1880 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1881  * bp & passes it down.
   1882  * any calls originating in the kernel must use non-blocking I/O
   1883  * do some extra sanity checking to return "appropriate" error values for
   1884  * certain conditions (to make some standard utilities work)
   1885  *
   1886  * Formerly known as: rf_DoAccessKernel
   1887  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex while updating labels -- the update takes
		 * other locks; retake it before decrementing the counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O on a unit that never finished raidinit(). */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Hand off to the dk layer, which will pull buffers off the bufq
	 * and feed them to raiddoaccess() via the dk driver hooks. */
	dk_start(dksc, NULL);
}
   1914 
/*
 * Translate one struct buf into a RAIDframe access and submit it with
 * rf_DoAccess() in non-blocking mode.  Returns 0 on successful submission,
 * EAGAIN if no openings are currently available (caller should retry),
 * or ENOSPC for requests that fall outside the array or are not a whole
 * number of sectors.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	/* pb is 1 iff the byte count is not a whole number of sectors. */
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" forces this debug branch on regardless
	 * of rf_debugKernelAccess; presumably db1_printf is a no-op unless
	 * debugging is compiled in -- confirm before removing. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject accesses past the end of the array; the "sum < x"
	 * comparisons also catch unsigned wraparound in the addition. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Partial-sector requests are rejected outright. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this in-flight access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1981 
   1982 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1983 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* Fake a completed I/O: hand the buf straight to the
		 * completion callback without touching the hardware. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component device; KernelWakeupFunc
		 * will run as b_iodone with req as b_private. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2057 /* this is the callback function associated with a I/O invoked from
   2058    kernel code.
   2059  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP()
	 * (or by the NOP path in rf_DispatchKernelIO()). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* All completion bookkeeping happens under iodone_lock. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers the component
			 * label update in raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2126 
   2127 
   2128 /*
   2129  * initialize a buf structure for doing an I/O in the kernel.
   2130  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* Preserve only the rf_b_pass bits from the old flags; everything
	 * else is replaced by the read/write flag. */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* sectors -> bytes -> DEV_BSIZE blocks */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	/* cbFunc (KernelWakeupFunc) runs at biodone time with cbArg
	 * available via b_private. */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2152 
   2153 /*
   2154  * Wait interruptibly for an exclusive lock.
   2155  *
   2156  * XXX
   2157  * Several drivers do this; it should be abstracted and made MP-safe.
   2158  * (Hmm... where have we seen this warning before :->  GO )
   2159  */
   2160 static int
   2161 raidlock(struct raid_softc *rs)
   2162 {
   2163 	int     error;
   2164 
   2165 	error = 0;
   2166 	mutex_enter(&rs->sc_mutex);
   2167 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2168 		rs->sc_flags |= RAIDF_WANTED;
   2169 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2170 		if (error != 0)
   2171 			goto done;
   2172 	}
   2173 	rs->sc_flags |= RAIDF_LOCKED;
   2174 done:
   2175 	mutex_exit(&rs->sc_mutex);
   2176 	return error;
   2177 }
   2178 /*
   2179  * Unlock and wake up any waiters.
   2180  */
   2181 static void
   2182 raidunlock(struct raid_softc *rs)
   2183 {
   2184 
   2185 	mutex_enter(&rs->sc_mutex);
   2186 	rs->sc_flags &= ~RAIDF_LOCKED;
   2187 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2188 		rs->sc_flags &= ~RAIDF_WANTED;
   2189 		cv_broadcast(&rs->sc_cv);
   2190 	}
   2191 	mutex_exit(&rs->sc_mutex);
   2192 }
   2193 
   2194 
   2195 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2196 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2197 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2198 
/* Byte offset on each component at which the component label lives. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2205 
   2206 static daddr_t
   2207 rf_component_info_size(unsigned secsize)
   2208 {
   2209 	daddr_t info_size;
   2210 
   2211 	KASSERT(secsize);
   2212 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2213 		info_size = secsize;
   2214 	else
   2215 		info_size = RF_COMPONENT_INFO_SIZE;
   2216 
   2217 	return info_size;
   2218 }
   2219 
   2220 static daddr_t
   2221 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2222 {
   2223 	daddr_t map_offset;
   2224 
   2225 	KASSERT(raidPtr->bytesPerSector);
   2226 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2227 		map_offset = raidPtr->bytesPerSector;
   2228 	else
   2229 		map_offset = RF_COMPONENT_INFO_SIZE;
   2230 	map_offset += rf_component_info_offset();
   2231 
   2232 	return map_offset;
   2233 }
   2234 
   2235 static daddr_t
   2236 rf_parity_map_size(RF_Raid_t *raidPtr)
   2237 {
   2238 	daddr_t map_size;
   2239 
   2240 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2241 		map_size = raidPtr->bytesPerSector;
   2242 	else
   2243 		map_size = RF_PARITY_MAP_SIZE;
   2244 
   2245 	return map_size;
   2246 }
   2247 
/* Mark component `col' as clean in its label and flush the label to disk.
 * Always returns 0. */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}
   2258 
   2259 
/* Mark component `col' as dirty in its label and flush the label to disk.
 * Always returns 0. */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}
   2270 
/* Read component `col's label from disk into the in-core copy kept in
 * raid_cinfo[col].ci_label.  Returns the error from the read, if any. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2281 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2287 
/* Write the in-core component label for column `col' out to disk,
 * stamping it with the set's current mod_counter first. */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity-map generation in lockstep with the label */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2302 
   2303 /*
   2304  * Swap the label endianness.
   2305  *
   2306  * Everything in the component label is 4-byte-swapped except the version,
   2307  * which is kept in the byte-swapped version at all times, and indicates
   2308  * for the writer that a swap is necessary.
   2309  *
   2310  * For reads it is expected that out_label == clabel, but writes expect
   2311  * separate labels so only the re-swapped label is written out to disk,
   2312  * leaving the swapped-except-version internally.
   2313  *
   2314  * Only support swapping label version 2.
   2315  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as an array of 32-bit words, from serial_number
	 * up to (but not including) future_use2[42], byte-swapping each.
	 * NOTE(review): the [42] bound is tied to the struct layout of
	 * label version 2 -- confirm against the RF_ComponentLabel_t
	 * definition before changing either. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2333 
/* Read a component label from `dev'/`b_vp' into *clabel, un-swapping it
 * in place if it was written by a host of the opposite endianness. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	int error;

	error = raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));

	/* A byte-swapped version field marks an other-endian label;
	 * swap it into native order (version stays swapped -- see
	 * rf_swap_label()). */
	if (error == 0 &&
	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		rf_swap_label(clabel, clabel);
	}

	return error;
}
   2352 
   2353 /* ARGSUSED */
/*
 * Read `msize' bytes of component metadata at byte `offset' on `dev'
 * into `data', using a scratch buffer of `dsize' (>= msize) bytes.
 * Returns 0 on success or an errno; EINVAL if the component has no vnode.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Synchronous read: submit, then wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Only copy out the caller's portion on success. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2390 
/* Write *clabel to `dev'/`b_vp'.  If the in-core label is marked as
 * other-endian (byte-swapped version field), write a re-swapped copy
 * so the on-disk bytes keep their original endianness; the in-core
 * label itself is left untouched. */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	RF_ComponentLabel_t *clabel_write = clabel;
	RF_ComponentLabel_t lclabel;
	int error;

	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		/* swap into a local copy; write that instead */
		clabel_write = &lclabel;
		rf_swap_label(clabel, clabel_write);
	}
	error = raidwrite_component_area(dev, b_vp, clabel_write,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);

	return error;
}
   2410 
   2411 /* ARGSUSED */
/*
 * Write `msize' bytes of component metadata from `data' at byte `offset'
 * on `dev', zero-padding the I/O out to `dsize' bytes.  If `asyncp' is
 * set the write is fired asynchronously and 0 is returned immediately.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad, then copy in the caller's data */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	/* NOTE(review): on the async path the buffer is not brelse'd here;
	 * presumably B_ASYNC completion releases it -- confirm. */
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2445 
/* Write the in-core parity map `map' to the parity-map area of every
 * live component of the set. */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2463 
/* Read the parity map from every live component and merge them into
 * *map, so a region dirty on any component is dirty in the result. */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			/* first live component seeds the result... */
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			/* ...subsequent ones are merged in */
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2488 
/* Bump the set's mod_counter and mark every live component (including
 * in-use spares) dirty on disk. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			 * scol stays -1 (or its previous value) and is
			 * recorded below -- presumably a used spare always
			 * has a mapping; confirm. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2548 
   2549 
/*
 * Bump mod_counter and rewrite the component labels of all optimal
 * components and in-use spares.  When `final' is RF_FINAL_COMPONENT_UPDATE
 * and parity is known good, the components are also marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced.
			 * NOTE(review): if no column matches, scol keeps
			 * its previous value -- presumably a used spare
			 * always has a mapping; confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2627 
   2628 void
   2629 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2630 {
   2631 
   2632 	if (vp != NULL) {
   2633 		if (auto_configured == 1) {
   2634 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2635 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2636 			vput(vp);
   2637 
   2638 		} else {
   2639 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2640 		}
   2641 	}
   2642 }
   2643 
   2644 
   2645 void
   2646 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2647 {
   2648 	int r,c;
   2649 	struct vnode *vp;
   2650 	int acd;
   2651 
   2652 
   2653 	/* We take this opportunity to close the vnodes like we should.. */
   2654 
   2655 	for (c = 0; c < raidPtr->numCol; c++) {
   2656 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2657 		acd = raidPtr->Disks[c].auto_configured;
   2658 		rf_close_component(raidPtr, vp, acd);
   2659 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2660 		raidPtr->Disks[c].auto_configured = 0;
   2661 	}
   2662 
   2663 	for (r = 0; r < raidPtr->numSpare; r++) {
   2664 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2665 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2666 		rf_close_component(raidPtr, vp, acd);
   2667 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2668 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2669 	}
   2670 }
   2671 
   2672 
/* Kernel-thread body: fail component `req->col' and (optionally, per
 * RF_FDFLAGS_RECON) reconstruct it onto a spare; exits when done. */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	/* recon_in_progress gates concurrent reconstruction attempts */
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* this thread owns req and frees it */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2694 
/* Kernel-thread body: rewrite all parity for the set.  On success the
 * set is marked RF_RAID_CLEAN; waiters blocked on shutdown are notified
 * via parity_rewrite_cv.  Exits when done. */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2727 
   2728 
/* Kernel-thread body: copy reconstructed data back from spares to the
 * replaced components; exits when done. */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2743 
   2744 
/* Kernel-thread body: rebuild component `req->col' in place (onto the
 * same device); frees req and exits when done. */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* this thread owns req and frees it */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2762 
/*
 * Probe one candidate device for a RAIDframe component label.  If a
 * plausible label is found, prepend a new RF_AutoConfig_t (which takes
 * ownership of vp and the label) to ac_list; otherwise close vp and
 * free the label.  Returns the (possibly updated) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: we keep neither the label nor the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2804 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t records (one per component,
 * built by rf_get_component()).
 *
 * The device tree is walked twice: once for wedges ("dk") and once for
 * everything else, so that a wedge covering a disk is found before the
 * raw partition of the same disk.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges of type "raidframe" qualify */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/*
				 * NOTE(review): vp is handed to
				 * rf_get_component() unlocked and still
				 * open; rf_get_component() closes it if
				 * the component label is not usable.
				 */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3016 
   3017 int
   3018 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3019 {
   3020 
   3021 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3022 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3023 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3024 	    (clabel->clean == RF_RAID_CLEAN ||
   3025 	     clabel->clean == RF_RAID_DIRTY) &&
   3026 	    clabel->row >=0 &&
   3027 	    clabel->column >= 0 &&
   3028 	    clabel->num_rows > 0 &&
   3029 	    clabel->num_columns > 0 &&
   3030 	    clabel->row < clabel->num_rows &&
   3031 	    clabel->column < clabel->num_columns &&
   3032 	    clabel->blockSize > 0 &&
   3033 	    /*
   3034 	     * numBlocksHi may contain garbage, but it is ok since
   3035 	     * the type is unsigned.  If it is really garbage,
   3036 	     * rf_fix_old_label_size() will fix it.
   3037 	     */
   3038 	    rf_component_label_numblocks(clabel) > 0) {
   3039 		/*
   3040 		 * label looks reasonable enough...
   3041 		 * let's make sure it has no old garbage.
   3042 		 */
   3043 		if (numsecs)
   3044 			rf_fix_old_label_size(clabel, numsecs);
   3045 		return(1);
   3046 	}
   3047 	return(0);
   3048 }
   3049 
   3050 
   3051 /*
   3052  * For reasons yet unknown, some old component labels have garbage in
   3053  * the newer numBlocksHi region, and this causes lossage.  Since those
   3054  * disks will also have numsecs set to less than 32 bits of sectors,
   3055  * we can determine when this corruption has occurred, and fix it.
   3056  *
   3057  * The exact same problem, with the same unknown reason, happens to
   3058  * the partitionSizeHi member as well.
   3059  */
   3060 static void
   3061 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3062 {
   3063 
   3064 	if (numsecs < ((uint64_t)1 << 32)) {
   3065 		if (clabel->numBlocksHi) {
   3066 			printf("WARNING: total sectors < 32 bits, yet "
   3067 			       "numBlocksHi set\n"
   3068 			       "WARNING: resetting numBlocksHi to zero.\n");
   3069 			clabel->numBlocksHi = 0;
   3070 		}
   3071 
   3072 		if (clabel->partitionSizeHi) {
   3073 			printf("WARNING: total sectors < 32 bits, yet "
   3074 			       "partitionSizeHi set\n"
   3075 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3076 			clabel->partitionSizeHi = 0;
   3077 		}
   3078 	}
   3079 }
   3080 
   3081 
   3082 #ifdef DEBUG
/*
 * DEBUG helper: dump the interesting fields of a component label to
 * the console in human-readable form.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Text decode of the two-bit root_partition field. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3114 #endif
   3115 
   3116 static RF_ConfigSet_t *
   3117 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3118 {
   3119 	RF_AutoConfig_t *ac;
   3120 	RF_ConfigSet_t *config_sets;
   3121 	RF_ConfigSet_t *cset;
   3122 	RF_AutoConfig_t *ac_next;
   3123 
   3124 
   3125 	config_sets = NULL;
   3126 
   3127 	/* Go through the AutoConfig list, and figure out which components
   3128 	   belong to what sets.  */
   3129 	ac = ac_list;
   3130 	while(ac!=NULL) {
   3131 		/* we're going to putz with ac->next, so save it here
   3132 		   for use at the end of the loop */
   3133 		ac_next = ac->next;
   3134 
   3135 		if (config_sets == NULL) {
   3136 			/* will need at least this one... */
   3137 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3138 				       M_RAIDFRAME, M_WAITOK);
   3139 			/* this one is easy :) */
   3140 			config_sets->ac = ac;
   3141 			config_sets->next = NULL;
   3142 			config_sets->rootable = 0;
   3143 			ac->next = NULL;
   3144 		} else {
   3145 			/* which set does this component fit into? */
   3146 			cset = config_sets;
   3147 			while(cset!=NULL) {
   3148 				if (rf_does_it_fit(cset, ac)) {
   3149 					/* looks like it matches... */
   3150 					ac->next = cset->ac;
   3151 					cset->ac = ac;
   3152 					break;
   3153 				}
   3154 				cset = cset->next;
   3155 			}
   3156 			if (cset==NULL) {
   3157 				/* didn't find a match above... new set..*/
   3158 				cset = malloc(sizeof(RF_ConfigSet_t),
   3159 					       M_RAIDFRAME, M_WAITOK);
   3160 				cset->ac = ac;
   3161 				ac->next = NULL;
   3162 				cset->next = config_sets;
   3163 				cset->rootable = 0;
   3164 				config_sets = cset;
   3165 			}
   3166 		}
   3167 		ac = ac_next;
   3168 	}
   3169 
   3170 
   3171 	return(config_sets);
   3172 }
   3173 
   3174 static int
   3175 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3176 {
   3177 	RF_ComponentLabel_t *clabel1, *clabel2;
   3178 
   3179 	/* If this one matches the *first* one in the set, that's good
   3180 	   enough, since the other members of the set would have been
   3181 	   through here too... */
   3182 	/* note that we are not checking partitionSize here..
   3183 
   3184 	   Note that we are also not checking the mod_counters here.
   3185 	   If everything else matches except the mod_counter, that's
   3186 	   good enough for this test.  We will deal with the mod_counters
   3187 	   a little later in the autoconfiguration process.
   3188 
   3189 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3190 
   3191 	   The reason we don't check for this is that failed disks
   3192 	   will have lower modification counts.  If those disks are
   3193 	   not added to the set they used to belong to, then they will
   3194 	   form their own set, which may result in 2 different sets,
   3195 	   for example, competing to be configured at raid0, and
   3196 	   perhaps competing to be the root filesystem set.  If the
   3197 	   wrong ones get configured, or both attempt to become /,
   3198 	   weird behaviour and or serious lossage will occur.  Thus we
   3199 	   need to bring them into the fold here, and kick them out at
   3200 	   a later point.
   3201 
   3202 	*/
   3203 
   3204 	clabel1 = cset->ac->clabel;
   3205 	clabel2 = ac->clabel;
   3206 	if ((clabel1->version == clabel2->version) &&
   3207 	    (clabel1->serial_number == clabel2->serial_number) &&
   3208 	    (clabel1->num_rows == clabel2->num_rows) &&
   3209 	    (clabel1->num_columns == clabel2->num_columns) &&
   3210 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3211 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3212 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3213 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3214 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3215 	    (clabel1->blockSize == clabel2->blockSize) &&
   3216 	    rf_component_label_numblocks(clabel1) ==
   3217 	    rf_component_label_numblocks(clabel2) &&
   3218 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3219 	    (clabel1->root_partition == clabel2->root_partition) &&
   3220 	    (clabel1->last_unit == clabel2->last_unit) &&
   3221 	    (clabel1->config_order == clabel2->config_order)) {
   3222 		/* if it get's here, it almost *has* to be a match */
   3223 	} else {
   3224 		/* it's not consistent with somebody in the set..
   3225 		   punt */
   3226 		return(0);
   3227 	}
   3228 	/* all was fine.. it must fit... */
   3229 	return(1);
   3230 }
   3231 
/*
 * Decide whether configuration set 'cset' has enough live components
 * to be configured.  Returns 1 if so, 0 if too many components are
 * missing for the set's parity type.
 *
 * A component is counted as present only if its mod_counter equals the
 * highest mod_counter seen in the set (stale components are treated as
 * failed here).
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, search the set for a current component. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* both halves of a RAID 1
						   mirror pair are gone. */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3334 
   3335 static void
   3336 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3337 			RF_Raid_t *raidPtr)
   3338 {
   3339 	RF_ComponentLabel_t *clabel;
   3340 	int i;
   3341 
   3342 	clabel = ac->clabel;
   3343 
   3344 	/* 1. Fill in the common stuff */
   3345 	config->numCol = clabel->num_columns;
   3346 	config->numSpare = 0; /* XXX should this be set here? */
   3347 	config->sectPerSU = clabel->sectPerSU;
   3348 	config->SUsPerPU = clabel->SUsPerPU;
   3349 	config->SUsPerRU = clabel->SUsPerRU;
   3350 	config->parityConfig = clabel->parityConfig;
   3351 	/* XXX... */
   3352 	strcpy(config->diskQueueType,"fifo");
   3353 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3354 	config->layoutSpecificSize = 0; /* XXX ?? */
   3355 
   3356 	while(ac!=NULL) {
   3357 		/* row/col values will be in range due to the checks
   3358 		   in reasonable_label() */
   3359 		strcpy(config->devnames[0][ac->clabel->column],
   3360 		       ac->devname);
   3361 		ac = ac->next;
   3362 	}
   3363 
   3364 	for(i=0;i<RF_MAXDBGV;i++) {
   3365 		config->debugVars[i][0] = 0;
   3366 	}
   3367 }
   3368 
   3369 static int
   3370 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3371 {
   3372 	RF_ComponentLabel_t *clabel;
   3373 	int column;
   3374 	int sparecol;
   3375 
   3376 	raidPtr->autoconfigure = new_value;
   3377 
   3378 	for(column=0; column<raidPtr->numCol; column++) {
   3379 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3380 			clabel = raidget_component_label(raidPtr, column);
   3381 			clabel->autoconfigure = new_value;
   3382 			raidflush_component_label(raidPtr, column);
   3383 		}
   3384 	}
   3385 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3386 		sparecol = raidPtr->numCol + column;
   3387 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3388 			clabel = raidget_component_label(raidPtr, sparecol);
   3389 			clabel->autoconfigure = new_value;
   3390 			raidflush_component_label(raidPtr, sparecol);
   3391 		}
   3392 	}
   3393 	return(new_value);
   3394 }
   3395 
   3396 static int
   3397 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3398 {
   3399 	RF_ComponentLabel_t *clabel;
   3400 	int column;
   3401 	int sparecol;
   3402 
   3403 	raidPtr->root_partition = new_value;
   3404 	for(column=0; column<raidPtr->numCol; column++) {
   3405 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3406 			clabel = raidget_component_label(raidPtr, column);
   3407 			clabel->root_partition = new_value;
   3408 			raidflush_component_label(raidPtr, column);
   3409 		}
   3410 	}
   3411 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3412 		sparecol = raidPtr->numCol + column;
   3413 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3414 			clabel = raidget_component_label(raidPtr, sparecol);
   3415 			clabel->root_partition = new_value;
   3416 			raidflush_component_label(raidPtr, sparecol);
   3417 		}
   3418 	}
   3419 	return(new_value);
   3420 }
   3421 
   3422 static void
   3423 rf_release_all_vps(RF_ConfigSet_t *cset)
   3424 {
   3425 	RF_AutoConfig_t *ac;
   3426 
   3427 	ac = cset->ac;
   3428 	while(ac!=NULL) {
   3429 		/* Close the vp, and give it back */
   3430 		if (ac->vp) {
   3431 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3432 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3433 			vput(ac->vp);
   3434 			ac->vp = NULL;
   3435 		}
   3436 		ac = ac->next;
   3437 	}
   3438 }
   3439 
   3440 
   3441 static void
   3442 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3443 {
   3444 	RF_AutoConfig_t *ac;
   3445 	RF_AutoConfig_t *next_ac;
   3446 
   3447 	ac = cset->ac;
   3448 	while(ac!=NULL) {
   3449 		next_ac = ac->next;
   3450 		/* nuke the label */
   3451 		free(ac->clabel, M_RAIDFRAME);
   3452 		/* cleanup the config structure */
   3453 		free(ac, M_RAIDFRAME);
   3454 		/* "next.." */
   3455 		ac = next_ac;
   3456 	}
   3457 	/* and, finally, nuke the config set */
   3458 	free(cset, M_RAIDFRAME);
   3459 }
   3460 
   3461 
/*
 * Initialize a component label from the current state of the raid set.
 * Fields that identify the component's position (row/column) are NOT
 * touched here; the caller fills those in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3495 
/*
 * Autoconfigure one configuration set: pick a raid unit number
 * (preferring the unit it was last configured as), build an
 * RF_Config_t from the component labels, and configure the set.
 * Returns the softc on success, or NULL if rf_Configure() failed.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from the preferred unit until we find one that is
	   either unused or not yet valid. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the softc we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3567 
/*
 * Initialize a per-raid-set resource pool.  The pool's wait channel
 * name is formatted into w_chan (which must be at least
 * RF_MAX_POOLNAMELEN bytes) as "raid<unit>_<pool_name>".  xmin items
 * are pre-allocated and xmax is set as the high-water mark.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3580 
   3581 
   3582 /*
   3583  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3584  * to see if there is IO pending and if that IO could possibly be done
   3585  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3586  * otherwise.
   3587  *
   3588  */
   3589 int
   3590 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3591 {
   3592 	struct raid_softc *rs;
   3593 	struct dk_softc *dksc;
   3594 
   3595 	rs = raidPtr->softc;
   3596 	dksc = &rs->sc_dksc;
   3597 
   3598 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3599 		return 1;
   3600 
   3601 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3602 		/* there is work to do */
   3603 		return 0;
   3604 	}
   3605 	/* default is nothing to do */
   3606 	return 1;
   3607 }
   3608 
   3609 int
   3610 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3611 {
   3612 	uint64_t numsecs;
   3613 	unsigned secsize;
   3614 	int error;
   3615 
   3616 	error = getdisksize(vp, &numsecs, &secsize);
   3617 	if (error == 0) {
   3618 		diskPtr->blockSize = secsize;
   3619 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3620 		diskPtr->partitionSize = numsecs;
   3621 		return 0;
   3622 	}
   3623 	return error;
   3624 }
   3625 
/* Pseudo-device: the autoconf match always succeeds. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3631 
/* Pseudo-device attach: nothing to do; real setup happens in raidinit(). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3636 
   3637 
   3638 static int
   3639 raid_detach(device_t self, int flags)
   3640 {
   3641 	int error;
   3642 	struct raid_softc *rs = raidsoftc(self);
   3643 
   3644 	if (rs == NULL)
   3645 		return ENXIO;
   3646 
   3647 	if ((error = raidlock(rs)) != 0)
   3648 		return error;
   3649 
   3650 	error = raid_detach_unlocked(rs);
   3651 
   3652 	raidunlock(rs);
   3653 
   3654 	/* XXX raid can be referenced here */
   3655 
   3656 	if (error)
   3657 		return error;
   3658 
   3659 	/* Free the softc */
   3660 	raidput(rs);
   3661 
   3662 	return 0;
   3663 }
   3664 
/*
 * Publish a (fake) disk geometry for the raid set so that upper
 * layers see a consistent sector count and sector size.  The
 * tracks/sectors values are synthetic; only secperunit and secsize
 * reflect real raid parameters.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3680 
   3681 /*
   3682  * Get cache info for all the components (including spares).
   3683  * Returns intersection of all the cache flags of all disks, or first
   3684  * error if any encountered.
   3685  * XXXfua feature flags can change as spares are added - lock down somehow
   3686  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* Iterate data columns and spares alike. */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl) is expected
				   for some components; stay quiet. */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* Accumulate the intersection of all flags. */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3724 
   3725 /*
   3726  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3727  * We end up returning whatever error was returned by the first cache flush
   3728  * that fails.
   3729  */
   3730 
   3731 static int
   3732 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3733 {
   3734 	int e = 0;
   3735 	for (int i = 0; i < 5; i++) {
   3736 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3737 		    &force, FWRITE, NOCRED);
   3738 		if (!e || e == ENODEV)
   3739 			return e;
   3740 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3741 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3742 	}
   3743 	return e;
   3744 }
   3745 
   3746 int
   3747 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3748 {
   3749 	int c, error;
   3750 
   3751 	error = 0;
   3752 	for (c = 0; c < raidPtr->numCol; c++) {
   3753 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3754 			int e = rf_sync_component_cache(raidPtr, c, force);
   3755 			if (e && !error)
   3756 				error = e;
   3757 		}
   3758 	}
   3759 
   3760 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3761 		int sparecol = raidPtr->numCol + c;
   3762 		/* Need to ensure that the reconstruct actually completed! */
   3763 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3764 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3765 			    force);
   3766 			if (e && !error)
   3767 				error = e;
   3768 		}
   3769 	}
   3770 	return error;
   3771 }
   3772 
   3773 /* Fill in info with the current status */
   3774 void
   3775 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3776 {
   3777 
   3778 	if (raidPtr->status != rf_rs_reconstructing) {
   3779 		info->total = 100;
   3780 		info->completed = 100;
   3781 	} else {
   3782 		info->total = raidPtr->reconControl->numRUsTotal;
   3783 		info->completed = raidPtr->reconControl->numRUsComplete;
   3784 	}
   3785 	info->remaining = info->total - info->completed;
   3786 }
   3787 
   3788 /* Fill in info with the current status */
   3789 void
   3790 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3791 {
   3792 
   3793 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3794 		info->total = raidPtr->Layout.numStripe;
   3795 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3796 	} else {
   3797 		info->completed = 100;
   3798 		info->total = 100;
   3799 	}
   3800 	info->remaining = info->total - info->completed;
   3801 }
   3802 
   3803 /* Fill in info with the current status */
   3804 void
   3805 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3806 {
   3807 
   3808 	if (raidPtr->copyback_in_progress == 1) {
   3809 		info->total = raidPtr->Layout.numStripe;
   3810 		info->completed = raidPtr->copyback_stripes_done;
   3811 		info->remaining = info->total - info->completed;
   3812 	} else {
   3813 		info->remaining = 0;
   3814 		info->completed = 100;
   3815 		info->total = 100;
   3816 	}
   3817 }
   3818 
   3819 /* Fill in config with the current info */
   3820 int
   3821 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3822 {
   3823 	int	d, i, j;
   3824 
   3825 	if (!raidPtr->valid)
   3826 		return ENODEV;
   3827 	config->cols = raidPtr->numCol;
   3828 	config->ndevs = raidPtr->numCol;
   3829 	if (config->ndevs >= RF_MAX_DISKS)
   3830 		return ENOMEM;
   3831 	config->nspares = raidPtr->numSpare;
   3832 	if (config->nspares >= RF_MAX_DISKS)
   3833 		return ENOMEM;
   3834 	config->maxqdepth = raidPtr->maxQueueDepth;
   3835 	d = 0;
   3836 	for (j = 0; j < config->cols; j++) {
   3837 		config->devs[d] = raidPtr->Disks[j];
   3838 		d++;
   3839 	}
   3840 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3841 		config->spares[i] = raidPtr->Disks[j];
   3842 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3843 			/* XXX: raidctl(8) expects to see this as a used spare */
   3844 			config->spares[i].status = rf_ds_used_spare;
   3845 		}
   3846 	}
   3847 	return 0;
   3848 }
   3849 
   3850 int
   3851 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3852 {
   3853 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3854 	RF_ComponentLabel_t *raid_clabel;
   3855 	int column = clabel->column;
   3856 
   3857 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3858 		return EINVAL;
   3859 	raid_clabel = raidget_component_label(raidPtr, column);
   3860 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3861 	/* Fix-up for userland. */
   3862 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3863 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3864 
   3865 	return 0;
   3866 }
   3867 
   3868 /*
   3869  * Module interface
   3870  */
   3871 
   3872 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3873 
   3874 #ifdef _MODULE
   3875 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3876 #endif
   3877 
   3878 static int raid_modcmd(modcmd_t, void *);
   3879 static int raid_modcmd_init(void);
   3880 static int raid_modcmd_fini(void);
   3881 
   3882 static int
   3883 raid_modcmd(modcmd_t cmd, void *data)
   3884 {
   3885 	int error;
   3886 
   3887 	error = 0;
   3888 	switch (cmd) {
   3889 	case MODULE_CMD_INIT:
   3890 		error = raid_modcmd_init();
   3891 		break;
   3892 	case MODULE_CMD_FINI:
   3893 		error = raid_modcmd_fini();
   3894 		break;
   3895 	default:
   3896 		error = ENOTTY;
   3897 		break;
   3898 	}
   3899 	return error;
   3900 }
   3901 
/*
 * Module load: attach the raid block/character device switches, the
 * cfdriver (module builds only) and cfattach, boot the RAIDframe core,
 * and register a finalizer that auto-configures RAID sets once all
 * real hardware devices have been found.  Each failing step undoes
 * the steps before it and returns the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for spare-table requests (declustered parity). */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to pick the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (e.g. built in);
	 * that is fine, carry on. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Undo the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Undo the earlier attachments, in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 on every path that reaches here (failures above
	 * returned early), so the RAIDframe core is always booted. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Not fatal: the sets just won't auto-configure. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3972 
/*
 * Module unload: refuse while any raid device exists, otherwise tear
 * down everything raid_modcmd_init() set up, in reverse order.  If a
 * detach step fails part-way, the previously-detached pieces are
 * re-attached so the module is left in a usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detaches. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Destroy the spare-table synchronization primitives. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4022