/* rf_netbsdkintf.c, revision 1.409 (NetBSD sys/dev/raidframe) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.409 2022/08/28 00:26:04 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.409 2022/08/28 00:26:04 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
/* Block device switch: entry points for the raid block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    222 
/* Character device switch: entry points for the raw raid device nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    237 
/* Callbacks handed to the generic dk(4) disk framework. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column the request applies to */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* the RF_Raid_t this request targets */
};
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
    272  * Even it if is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 static void rf_ReconThread(struct rf_recon_req_internal *);
    298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 static int rf_autoconfig(device_t);
    302 static int rf_rescan(void);
    303 static void rf_buildroothack(RF_ConfigSet_t *);
    304 
    305 static RF_AutoConfig_t *rf_find_raid_components(void);
    306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 static int rf_set_autoconfig(RF_Raid_t *, int);
    310 static int rf_set_rootpartition(RF_Raid_t *, int);
    311 static void rf_release_all_vps(RF_ConfigSet_t *);
    312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 static int rf_have_enough_components(RF_ConfigSet_t *);
    314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct pool rf_alloclist_pool;   /* AllocList */
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
    343 static void
    344 raiddestroy(struct raid_softc *sc) {
    345 	cv_destroy(&sc->sc_cv);
    346 	mutex_destroy(&sc->sc_mutex);
    347 	kmem_free(sc, sizeof(*sc));
    348 }
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
/* Unlink "sc" from the global raid list and free it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    383 
/*
 * Historical pseudo-device attach hook; kept for interface
 * compatibility but intentionally empty (see comment below).
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    393 
/*
 * One-shot autoconfiguration of RAID sets at boot: locate all RAID
 * components, group them into sets, and configure the valid ones
 * (via rf_buildroothack(), which may also select the root device).
 * Returns 1 when a scan was performed, 0 when autoconfig is disabled
 * or has already run.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    431 
    432 int
    433 rf_inited(const struct raid_softc *rs) {
    434 	return (rs->sc_flags & RAIDF_INITED) != 0;
    435 }
    436 
/* Return a pointer to the RF_Raid_t embedded in the softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    441 
/* Return the unit number of this RAID set. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    446 
    447 static int
    448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    449 	const char *bootname;
    450 	size_t len;
    451 
    452 	/* if bdv is NULL, the set can't contain it. exit early. */
    453 	if (bdv == NULL)
    454 		return 0;
    455 
    456 	bootname = device_xname(bdv);
    457 	len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
/*
 * Rescan the system for RAID components and autoconfigure any set
 * that is complete and marked for autoconfiguration.  The scan loop
 * repeats as long as it configured at least one new set, so that RAID
 * sets layered on top of other RAID sets ("recursive RAID") are also
 * picked up.  Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
    526 
    527 
/*
 * Autoconfigure every eligible set in "config_sets" (repeating the
 * scan after each successful pass so RAID-on-RAID stacks configure
 * completely), then -- unless the user hardwired a root device via
 * rootspec -- try to point booted_device at a configured set that is
 * marked rootable and/or contains the original boot device.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable sets found */
	int raid_added;		/* sets configured in the current pass */
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			/* Narrow the candidates to sets holding the boot device. */
			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    686 
    687 static int
    688 raidsize(dev_t dev)
    689 {
    690 	struct raid_softc *rs;
    691 	struct dk_softc *dksc;
    692 	unsigned int unit;
    693 
    694 	unit = raidunit(dev);
    695 	if ((rs = raidget(unit, false)) == NULL)
    696 		return -1;
    697 	dksc = &rs->sc_dksc;
    698 
    699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    700 		return -1;
    701 
    702 	return dk_size(dksc, dev);
    703 }
    704 
    705 static int
    706 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    707 {
    708 	unsigned int unit;
    709 	struct raid_softc *rs;
    710 	struct dk_softc *dksc;
    711 
    712 	unit = raidunit(dev);
    713 	if ((rs = raidget(unit, false)) == NULL)
    714 		return ENXIO;
    715 	dksc = &rs->sc_dksc;
    716 
    717 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    718 		return ENODEV;
    719 
    720         /*
    721            Note that blkno is relative to this particular partition.
    722            By adding adding RF_PROTECTED_SECTORS, we get a value that
    723 	   is relative to the partition used for the underlying component.
    724         */
    725 	blkno += RF_PROTECTED_SECTORS;
    726 
    727 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    728 }
    729 
/*
 * Dump "nblk" blocks from "va" starting at "blkno" directly to a live
 * component of this set.  Only RAID 1 sets are supported.  Returns 0
 * on success or an errno (EINVAL when no suitable component exists).
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    835 
    836 /* ARGSUSED */
    837 static int
    838 raidopen(dev_t dev, int flags, int fmt,
    839     struct lwp *l)
    840 {
    841 	int     unit = raidunit(dev);
    842 	struct raid_softc *rs;
    843 	struct dk_softc *dksc;
    844 	int     error = 0;
    845 	int     part, pmask;
    846 
    847 	if ((rs = raidget(unit, true)) == NULL)
    848 		return ENXIO;
    849 	if ((error = raidlock(rs)) != 0)
    850 		return error;
    851 
    852 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    853 		error = EBUSY;
    854 		goto bad;
    855 	}
    856 
    857 	dksc = &rs->sc_dksc;
    858 
    859 	part = DISKPART(dev);
    860 	pmask = (1 << part);
    861 
    862 	if (!DK_BUSY(dksc, pmask) &&
    863 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    864 		/* First one... mark things as dirty... Note that we *MUST*
    865 		 have done a configure before this.  I DO NOT WANT TO BE
    866 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    867 		 THAT THEY BELONG TOGETHER!!!!! */
    868 		/* XXX should check to see if we're only open for reading
    869 		   here... If so, we needn't do this, but then need some
    870 		   other way of keeping track of what's happened.. */
    871 
    872 		rf_markalldirty(&rs->sc_r);
    873 	}
    874 
    875 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    876 		error = dk_open(dksc, dev, flags, fmt, l);
    877 
    878 bad:
    879 	raidunlock(rs);
    880 
    881 	return error;
    882 
    883 
    884 }
    885 
    886 static int
    887 raid_lastclose(device_t self)
    888 {
    889 	struct raid_softc *rs = raidsoftc(self);
    890 
    891 	/* Last one... device is not unconfigured yet.
    892 	   Device shutdown has taken care of setting the
    893 	   clean bits if RAIDF_INITED is not set
    894 	   mark things as clean... */
    895 
    896 	rf_update_component_labels(&rs->sc_r,
    897 	    RF_FINAL_COMPONENT_UPDATE);
    898 
    899 	/* pass to unlocked code */
    900 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    901 		rs->sc_flags |= RAIDF_DETACH;
    902 
    903 	return 0;
    904 }
    905 
    906 /* ARGSUSED */
    907 static int
    908 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    909 {
    910 	int     unit = raidunit(dev);
    911 	struct raid_softc *rs;
    912 	struct dk_softc *dksc;
    913 	cfdata_t cf;
    914 	int     error = 0, do_detach = 0, do_put = 0;
    915 
    916 	if ((rs = raidget(unit, false)) == NULL)
    917 		return ENXIO;
    918 	dksc = &rs->sc_dksc;
    919 
    920 	if ((error = raidlock(rs)) != 0)
    921 		return error;
    922 
    923 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    924 		error = dk_close(dksc, dev, flags, fmt, l);
    925 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    926 			do_detach = 1;
    927 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    928 		do_put = 1;
    929 
    930 	raidunlock(rs);
    931 
    932 	if (do_detach) {
    933 		/* free the pseudo device attach bits */
    934 		cf = device_cfdata(dksc->sc_dev);
    935 		error = config_detach(dksc->sc_dev, 0);
    936 		if (error == 0)
    937 			free(cf, M_RAIDFRAME);
    938 	} else if (do_put) {
    939 		raidput(rs);
    940 	}
    941 
    942 	return error;
    943 
    944 }
    945 
    946 static void
    947 raid_wakeup(RF_Raid_t *raidPtr)
    948 {
    949 	rf_lock_mutex2(raidPtr->iodone_lock);
    950 	rf_signal_cond2(raidPtr->iodone_cv);
    951 	rf_unlock_mutex2(raidPtr->iodone_lock);
    952 }
    953 
    954 static void
    955 raidstrategy(struct buf *bp)
    956 {
    957 	unsigned int unit;
    958 	struct raid_softc *rs;
    959 	struct dk_softc *dksc;
    960 	RF_Raid_t *raidPtr;
    961 
    962 	unit = raidunit(bp->b_dev);
    963 	if ((rs = raidget(unit, false)) == NULL) {
    964 		bp->b_error = ENXIO;
    965 		goto fail;
    966 	}
    967 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    968 		bp->b_error = ENXIO;
    969 		goto fail;
    970 	}
    971 	dksc = &rs->sc_dksc;
    972 	raidPtr = &rs->sc_r;
    973 
    974 	/* Queue IO only */
    975 	if (dk_strategy_defer(dksc, bp))
    976 		goto done;
    977 
    978 	/* schedule the IO to happen at the next convenient time */
    979 	raid_wakeup(raidPtr);
    980 
    981 done:
    982 	return;
    983 
    984 fail:
    985 	bp->b_resid = bp->b_bcount;
    986 	biodone(bp);
    987 }
    988 
    989 static int
    990 raid_diskstart(device_t dev, struct buf *bp)
    991 {
    992 	struct raid_softc *rs = raidsoftc(dev);
    993 	RF_Raid_t *raidPtr;
    994 
    995 	raidPtr = &rs->sc_r;
    996 	if (!raidPtr->valid) {
    997 		db1_printf(("raid is not valid..\n"));
    998 		return ENODEV;
    999 	}
   1000 
   1001 	/* XXX */
   1002 	bp->b_resid = 0;
   1003 
   1004 	return raiddoaccess(raidPtr, bp);
   1005 }
   1006 
   1007 void
   1008 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
   1009 {
   1010 	struct raid_softc *rs;
   1011 	struct dk_softc *dksc;
   1012 
   1013 	rs = raidPtr->softc;
   1014 	dksc = &rs->sc_dksc;
   1015 
   1016 	dk_done(dksc, bp);
   1017 
   1018 	rf_lock_mutex2(raidPtr->mutex);
   1019 	raidPtr->openings++;
   1020 	rf_unlock_mutex2(raidPtr->mutex);
   1021 
   1022 	/* schedule more IO */
   1023 	raid_wakeup(raidPtr);
   1024 }
   1025 
   1026 /* ARGSUSED */
   1027 static int
   1028 raidread(dev_t dev, struct uio *uio, int flags)
   1029 {
   1030 	int     unit = raidunit(dev);
   1031 	struct raid_softc *rs;
   1032 
   1033 	if ((rs = raidget(unit, false)) == NULL)
   1034 		return ENXIO;
   1035 
   1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1037 		return ENXIO;
   1038 
   1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1040 
   1041 }
   1042 
   1043 /* ARGSUSED */
   1044 static int
   1045 raidwrite(dev_t dev, struct uio *uio, int flags)
   1046 {
   1047 	int     unit = raidunit(dev);
   1048 	struct raid_softc *rs;
   1049 
   1050 	if ((rs = raidget(unit, false)) == NULL)
   1051 		return ENXIO;
   1052 
   1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1054 		return ENXIO;
   1055 
   1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1057 
   1058 }
   1059 
/*
 * Tear down a configured RAID set and detach its disk(9) state.
 * Returns EBUSY while the device is open or any recovery thread is
 * still running, 0 on success or if nothing was configured, or the
 * error from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open, or while reconstruction, parity rewrite,
	   or copyback is in progress. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1097 
   1098 static bool
   1099 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1100 {
   1101 	switch (cmd) {
   1102 	case RAIDFRAME_ADD_HOT_SPARE:
   1103 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1104 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1105 	case RAIDFRAME_CHECK_PARITY:
   1106 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1107 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1108 	case RAIDFRAME_CHECK_RECON_STATUS:
   1109 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1110 	case RAIDFRAME_COPYBACK:
   1111 	case RAIDFRAME_DELETE_COMPONENT:
   1112 	case RAIDFRAME_FAIL_DISK:
   1113 	case RAIDFRAME_GET_ACCTOTALS:
   1114 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1115 	case RAIDFRAME_GET_INFO:
   1116 	case RAIDFRAME_GET_SIZE:
   1117 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1118 	case RAIDFRAME_INIT_LABELS:
   1119 	case RAIDFRAME_KEEP_ACCTOTALS:
   1120 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1121 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1122 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1123 	case RAIDFRAME_PARITYMAP_STATUS:
   1124 	case RAIDFRAME_REBUILD_IN_PLACE:
   1125 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1126 	case RAIDFRAME_RESET_ACCTOTALS:
   1127 	case RAIDFRAME_REWRITEPARITY:
   1128 	case RAIDFRAME_SET_AUTOCONFIG:
   1129 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1130 	case RAIDFRAME_SET_LAST_UNIT:
   1131 	case RAIDFRAME_SET_ROOT:
   1132 	case RAIDFRAME_SHUTDOWN:
   1133 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1134 	}
   1135 	return false;
   1136 }
   1137 
/*
 * Mark a component as failed and start a reconstruction thread.
 * Returns EINVAL when the set has no redundancy (RAID 0), the
 * column is out of range, or failing the disk now would be unsafe;
 * ENOMEM if the request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint is presumably freed by rf_ReconThread --
	   verify against that thread's implementation. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Rejected while holding the mutex; release it and fail. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1186 
   1187 static int
   1188 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1189 {
   1190 	/* allocate a buffer for the layout-specific data, and copy it in */
   1191 	if (k_cfg->layoutSpecificSize == 0)
   1192 		return 0;
   1193 
   1194 	if (k_cfg->layoutSpecificSize > 10000) {
   1195 	    /* sanity check */
   1196 	    return EINVAL;
   1197 	}
   1198 
   1199 	u_char *specific_buf;
   1200 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1201 	if (specific_buf == NULL)
   1202 		return ENOMEM;
   1203 
   1204 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1205 	    k_cfg->layoutSpecificSize);
   1206 	if (retcode) {
   1207 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1208 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1209 		return retcode;
   1210 	}
   1211 
   1212 	k_cfg->layoutSpecific = specific_buf;
   1213 	return 0;
   1214 }
   1215 
   1216 static int
   1217 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1218 {
   1219 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1220 
   1221 	if (rs->sc_r.valid) {
   1222 		/* There is a valid RAID set running on this unit! */
   1223 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1224 		return EINVAL;
   1225 	}
   1226 
   1227 	/* copy-in the configuration information */
   1228 	/* data points to a pointer to the configuration structure */
   1229 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1230 	if (*k_cfg == NULL) {
   1231 		return ENOMEM;
   1232 	}
   1233 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1234 	if (retcode == 0)
   1235 		return 0;
   1236 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1237 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1238 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1239 	return retcode;
   1240 }
   1241 
/*
 * Configure a RAID set from an already-copied-in RF_Config_t.
 * Always consumes k_cfg (and its layout-specific buffer); on any
 * failure, RAIDF_SHUTDOWN is set so the unit is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull any layout-specific data in from user space. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		/* only row 0 of devnames is terminated here */
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1317 
   1318 #if RF_DISABLED
/*
 * (Compiled out via RF_DISABLED.)  Copy a user-supplied component
 * label into place after minimal validation of the column number.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1355 #endif
   1356 
   1357 static int
   1358 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1359 {
   1360 	/*
   1361 	   we only want the serial number from
   1362 	   the above.  We get all the rest of the information
   1363 	   from the config that was used to create this RAID
   1364 	   set.
   1365 	   */
   1366 
   1367 	raidPtr->serial_number = clabel->serial_number;
   1368 
   1369 	for (int column = 0; column < raidPtr->numCol; column++) {
   1370 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1371 		if (RF_DEAD_DISK(diskPtr->status))
   1372 			continue;
   1373 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1374 		    raidPtr, column);
   1375 		/* Zeroing this is important. */
   1376 		memset(ci_label, 0, sizeof(*ci_label));
   1377 		raid_init_component_label(raidPtr, ci_label);
   1378 		ci_label->serial_number = raidPtr->serial_number;
   1379 		ci_label->row = 0; /* we dont' pretend to support more */
   1380 		rf_component_label_set_partitionsize(ci_label,
   1381 		    diskPtr->partitionSize);
   1382 		ci_label->column = column;
   1383 		raidflush_component_label(raidPtr, column);
   1384 		/* XXXjld what about the spares? */
   1385 	}
   1386 
   1387 	return 0;
   1388 }
   1389 
/*
 * Rebuild a failed component in place (onto the same disk) by
 * spawning a reconstruction thread.  Returns EINVAL when the set is
 * RAID 0, a rebuild is already running, the column is bad, or the
 * set's state makes a rebuild unsafe; ENOMEM on allocation failure.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy so the caller's buffer is not trusted. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Can't rebuild a component that has been spared. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint is presumably freed by the thread --
	   verify against rf_ReconstructInPlaceThread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1457 
   1458 static int
   1459 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1460 {
   1461 	/*
   1462 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1463 	 * so tell the user it's done.
   1464 	 */
   1465 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1466 	    raidPtr->status != rf_rs_reconstructing) {
   1467 		*data = 100;
   1468 		return 0;
   1469 	}
   1470 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1471 		*data = 0;
   1472 		return 0;
   1473 	}
   1474 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1475 	    / raidPtr->reconControl->numRUsTotal);
   1476 	return 0;
   1477 }
   1478 
   1479 /*
   1480  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1481  * on the component_name[] array.
   1482  */
   1483 static void
   1484 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1485 {
   1486 
   1487 	memcpy(component, data, sizeof *component);
   1488 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1489 }
   1490 
   1491 static int
   1492 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1493 {
   1494 	int     unit = raidunit(dev);
   1495 	int     part, pmask;
   1496 	struct raid_softc *rs;
   1497 	struct dk_softc *dksc;
   1498 	RF_Config_t *k_cfg;
   1499 	RF_Raid_t *raidPtr;
   1500 	RF_AccTotals_t *totals;
   1501 	RF_SingleComponent_t component;
   1502 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1503 	int retcode = 0;
   1504 	int column;
   1505 	RF_ComponentLabel_t *clabel;
   1506 	int d;
   1507 
   1508 	if ((rs = raidget(unit, false)) == NULL)
   1509 		return ENXIO;
   1510 
   1511 	dksc = &rs->sc_dksc;
   1512 	raidPtr = &rs->sc_r;
   1513 
   1514 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1515 	    (int) DISKPART(dev), (int) unit, cmd));
   1516 
   1517 	/* Must be initialized for these... */
   1518 	if (rf_must_be_initialized(rs, cmd))
   1519 		return ENXIO;
   1520 
   1521 	switch (cmd) {
   1522 		/* configure the system */
   1523 	case RAIDFRAME_CONFIGURE:
   1524 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1525 			return retcode;
   1526 		return rf_construct(rs, k_cfg);
   1527 
   1528 		/* shutdown the system */
   1529 	case RAIDFRAME_SHUTDOWN:
   1530 
   1531 		part = DISKPART(dev);
   1532 		pmask = (1 << part);
   1533 
   1534 		if ((retcode = raidlock(rs)) != 0)
   1535 			return retcode;
   1536 
   1537 		if (DK_BUSY(dksc, pmask) ||
   1538 		    raidPtr->recon_in_progress != 0 ||
   1539 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1540 		    raidPtr->copyback_in_progress != 0)
   1541 			retcode = EBUSY;
   1542 		else {
   1543 			/* detach and free on close */
   1544 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1545 			retcode = 0;
   1546 		}
   1547 
   1548 		raidunlock(rs);
   1549 
   1550 		return retcode;
   1551 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1552 		return rf_get_component_label(raidPtr, data);
   1553 
   1554 #if RF_DISABLED
   1555 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1556 		return rf_set_component_label(raidPtr, data);
   1557 #endif
   1558 
   1559 	case RAIDFRAME_INIT_LABELS:
   1560 		return rf_init_component_label(raidPtr, data);
   1561 
   1562 	case RAIDFRAME_SET_AUTOCONFIG:
   1563 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1564 		printf("raid%d: New autoconfig value is: %d\n",
   1565 		       raidPtr->raidid, d);
   1566 		*(int *) data = d;
   1567 		return retcode;
   1568 
   1569 	case RAIDFRAME_SET_ROOT:
   1570 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1571 		printf("raid%d: New rootpartition value is: %d\n",
   1572 		       raidPtr->raidid, d);
   1573 		*(int *) data = d;
   1574 		return retcode;
   1575 
   1576 		/* initialize all parity */
   1577 	case RAIDFRAME_REWRITEPARITY:
   1578 
   1579 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1580 			/* Parity for RAID 0 is trivially correct */
   1581 			raidPtr->parity_good = RF_RAID_CLEAN;
   1582 			return 0;
   1583 		}
   1584 
   1585 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1586 			/* Re-write is already in progress! */
   1587 			return EINVAL;
   1588 		}
   1589 
   1590 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1591 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1592 
   1593 	case RAIDFRAME_ADD_HOT_SPARE:
   1594 		rf_copy_single_component(&component, data);
   1595 		return rf_add_hot_spare(raidPtr, &component);
   1596 
   1597 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1598 		return retcode;
   1599 
   1600 	case RAIDFRAME_DELETE_COMPONENT:
   1601 		rf_copy_single_component(&component, data);
   1602 		return rf_delete_component(raidPtr, &component);
   1603 
   1604 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1605 		rf_copy_single_component(&component, data);
   1606 		return rf_incorporate_hot_spare(raidPtr, &component);
   1607 
   1608 	case RAIDFRAME_REBUILD_IN_PLACE:
   1609 		return rf_rebuild_in_place(raidPtr, data);
   1610 
   1611 	case RAIDFRAME_GET_INFO:
   1612 		ucfgp = *(RF_DeviceConfig_t **)data;
   1613 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1614 		if (d_cfg == NULL)
   1615 			return ENOMEM;
   1616 		retcode = rf_get_info(raidPtr, d_cfg);
   1617 		if (retcode == 0) {
   1618 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1619 		}
   1620 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1621 		return retcode;
   1622 
   1623 	case RAIDFRAME_CHECK_PARITY:
   1624 		*(int *) data = raidPtr->parity_good;
   1625 		return 0;
   1626 
   1627 	case RAIDFRAME_PARITYMAP_STATUS:
   1628 		if (rf_paritymap_ineligible(raidPtr))
   1629 			return EINVAL;
   1630 		rf_paritymap_status(raidPtr->parity_map, data);
   1631 		return 0;
   1632 
   1633 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1634 		if (rf_paritymap_ineligible(raidPtr))
   1635 			return EINVAL;
   1636 		if (raidPtr->parity_map == NULL)
   1637 			return ENOENT; /* ??? */
   1638 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1639 			return EINVAL;
   1640 		return 0;
   1641 
   1642 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1643 		if (rf_paritymap_ineligible(raidPtr))
   1644 			return EINVAL;
   1645 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1646 		return 0;
   1647 
   1648 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1649 		if (rf_paritymap_ineligible(raidPtr))
   1650 			return EINVAL;
   1651 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1652 		/* XXX should errors be passed up? */
   1653 		return 0;
   1654 
   1655 	case RAIDFRAME_RESCAN:
   1656 		return rf_rescan();
   1657 
   1658 	case RAIDFRAME_RESET_ACCTOTALS:
   1659 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1660 		return 0;
   1661 
   1662 	case RAIDFRAME_GET_ACCTOTALS:
   1663 		totals = (RF_AccTotals_t *) data;
   1664 		*totals = raidPtr->acc_totals;
   1665 		return 0;
   1666 
   1667 	case RAIDFRAME_KEEP_ACCTOTALS:
   1668 		raidPtr->keep_acc_totals = *(int *)data;
   1669 		return 0;
   1670 
   1671 	case RAIDFRAME_GET_SIZE:
   1672 		*(int *) data = raidPtr->totalSectors;
   1673 		return 0;
   1674 
   1675 	case RAIDFRAME_FAIL_DISK:
   1676 		return rf_fail_disk(raidPtr, data);
   1677 
   1678 		/* invoke a copyback operation after recon on whatever disk
   1679 		 * needs it, if any */
   1680 	case RAIDFRAME_COPYBACK:
   1681 
   1682 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1683 			/* This makes no sense on a RAID 0!! */
   1684 			return EINVAL;
   1685 		}
   1686 
   1687 		if (raidPtr->copyback_in_progress == 1) {
   1688 			/* Copyback is already in progress! */
   1689 			return EINVAL;
   1690 		}
   1691 
   1692 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1693 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1694 
   1695 		/* return the percentage completion of reconstruction */
   1696 	case RAIDFRAME_CHECK_RECON_STATUS:
   1697 		return rf_check_recon_status(raidPtr, data);
   1698 
   1699 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1700 		rf_check_recon_status_ext(raidPtr, data);
   1701 		return 0;
   1702 
   1703 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1704 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1705 			/* This makes no sense on a RAID 0, so tell the
   1706 			   user it's done. */
   1707 			*(int *) data = 100;
   1708 			return 0;
   1709 		}
   1710 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1711 			*(int *) data = 100 *
   1712 				raidPtr->parity_rewrite_stripes_done /
   1713 				raidPtr->Layout.numStripe;
   1714 		} else {
   1715 			*(int *) data = 100;
   1716 		}
   1717 		return 0;
   1718 
   1719 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1720 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1721 		return 0;
   1722 
   1723 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1724 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1725 			/* This makes no sense on a RAID 0 */
   1726 			*(int *) data = 100;
   1727 			return 0;
   1728 		}
   1729 		if (raidPtr->copyback_in_progress == 1) {
   1730 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1731 				raidPtr->Layout.numStripe;
   1732 		} else {
   1733 			*(int *) data = 100;
   1734 		}
   1735 		return 0;
   1736 
   1737 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1738 		rf_check_copyback_status_ext(raidPtr, data);
   1739 		return 0;
   1740 
   1741 	case RAIDFRAME_SET_LAST_UNIT:
   1742 		for (column = 0; column < raidPtr->numCol; column++)
   1743 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1744 				return EBUSY;
   1745 
   1746 		for (column = 0; column < raidPtr->numCol; column++) {
   1747 			clabel = raidget_component_label(raidPtr, column);
   1748 			clabel->last_unit = *(int *)data;
   1749 			raidflush_component_label(raidPtr, column);
   1750 		}
   1751 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1752 		return 0;
   1753 
   1754 		/* the sparetable daemon calls this to wait for the kernel to
   1755 		 * need a spare table. this ioctl does not return until a
   1756 		 * spare table is needed. XXX -- calling mpsleep here in the
   1757 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1758 		 * -- I should either compute the spare table in the kernel,
   1759 		 * or have a different -- XXX XXX -- interface (a different
   1760 		 * character device) for delivering the table     -- XXX */
   1761 #if RF_DISABLED
   1762 	case RAIDFRAME_SPARET_WAIT:
   1763 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1764 		while (!rf_sparet_wait_queue)
   1765 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1766 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1767 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1768 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1769 
   1770 		/* structure assignment */
   1771 		*((RF_SparetWait_t *) data) = *waitreq;
   1772 
   1773 		RF_Free(waitreq, sizeof(*waitreq));
   1774 		return 0;
   1775 
   1776 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1777 		 * code in it that will cause the dameon to exit */
   1778 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1779 		waitreq = RF_Malloc(sizeof(*waitreq));
   1780 		waitreq->fcol = -1;
   1781 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1782 		waitreq->next = rf_sparet_wait_queue;
   1783 		rf_sparet_wait_queue = waitreq;
   1784 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1785 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1786 		return 0;
   1787 
   1788 		/* used by the spare table daemon to deliver a spare table
   1789 		 * into the kernel */
   1790 	case RAIDFRAME_SEND_SPARET:
   1791 
   1792 		/* install the spare table */
   1793 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1794 
   1795 		/* respond to the requestor.  the return status of the spare
   1796 		 * table installation is passed in the "fcol" field */
   1797 		waitred = RF_Malloc(sizeof(*waitreq));
   1798 		waitreq->fcol = retcode;
   1799 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1800 		waitreq->next = rf_sparet_resp_queue;
   1801 		rf_sparet_resp_queue = waitreq;
   1802 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1803 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1804 
   1805 		return retcode;
   1806 #endif
   1807 	default:
   1808 		/*
   1809 		 * Don't bother trying to load compat modules
   1810 		 * if it is not our ioctl. This is more efficient
   1811 		 * and makes rump tests not depend on compat code
   1812 		 */
   1813 		if (IOCGROUP(cmd) != 'r')
   1814 			break;
   1815 #ifdef _LP64
   1816 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1817 			module_autoload("compat_netbsd32_raid",
   1818 			    MODULE_CLASS_EXEC);
   1819 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1820 			    (rs, cmd, data), enosys(), retcode);
   1821 			if (retcode != EPASSTHROUGH)
   1822 				return retcode;
   1823 		}
   1824 #endif
   1825 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1826 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1827 		    (rs, cmd, data), enosys(), retcode);
   1828 		if (retcode != EPASSTHROUGH)
   1829 			return retcode;
   1830 
   1831 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1832 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1833 		    (rs, cmd, data), enosys(), retcode);
   1834 		if (retcode != EPASSTHROUGH)
   1835 			return retcode;
   1836 		break; /* fall through to the os-specific code below */
   1837 
   1838 	}
   1839 
   1840 	if (!raidPtr->valid)
   1841 		return EINVAL;
   1842 
   1843 	/*
   1844 	 * Add support for "regular" device ioctls here.
   1845 	 */
   1846 
   1847 	switch (cmd) {
   1848 	case DIOCGCACHE:
   1849 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1850 		break;
   1851 
   1852 	case DIOCCACHESYNC:
   1853 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1854 		break;
   1855 
   1856 	default:
   1857 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1858 		break;
   1859 	}
   1860 
   1861 	return retcode;
   1862 
   1863 }
   1864 
   1865 
   1866 /* raidinit -- complete the rest of the initialization for the
   1867    RAIDframe device.  */
   1868 
   1869 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* snprintf bounds the copy; a unit number large enough to
	 * truncate sc_xname is not expected in practice. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device; cf is consumed by autoconf on
	 * success and must be freed by us on failure */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* FCFS queue: RAIDframe does its own sorting internally */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan for wedges (GPT/partitions) on the new unit */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1925 
   1926 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1927 /* wake up the daemon & tell it to get us a spare table
   1928  * XXX
   1929  * the entries in the queues should be tagged with the raidPtr
   1930  * so that in the extremely rare case that two recons happen at once,
   1931  * we know for which device were requesting a spare table
   1932  * XXX
   1933  *
   1934  * XXX This code is not currently used. GO
   1935  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Hand the request to the user-level spare-table daemon by
	 * pushing it on the wait queue, then sleep until a response
	 * appears on the response queue. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 presumably drops the mutex while asleep and
	 * retakes it on wakeup (condvar semantics) -- both queues are
	 * protected by rf_sparet_wait_mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon passes its status back in the fcol field */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1959 #endif
   1960 
   1961 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1962  * bp & passes it down.
   1963  * any calls originating in the kernel must use non-blocking I/O
   1964  * do some extra sanity checking to return "appropriate" error values for
   1965  * certain conditions (to make some standard utilities work)
   1966  *
   1967  * Formerly known as: rf_DoAccessKernel
   1968  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the lock: rf_update_component_labels takes its
		 * own locks and must not be called with ours held */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to issue I/O on a unit that never finished raidinit */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* kick the dk layer; it will call back into raiddoaccess */
	dk_start(dksc, NULL);
}
   1995 
/*
 * Translate a struct buf into a RAIDframe access and submit it
 * non-blocking.  Returns EAGAIN when no openings are available (the
 * dk layer will retry later), ENOSPC for out-of-range or misaligned
 * requests, or the result of rf_DoAccess.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	/* no openings -> tell the caller to come back later */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* partial-sector tail counts as one extra block (pb) */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* the "1 ||" forces this branch; db1_printf is a debug hook */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* the "< raid_addr" etc. comparisons catch wraparound in sum */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject requests that are not a whole number of sectors */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume an opening; returned elsewhere when the I/O completes */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2062 
   2063 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2064 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* fake an immediate completion through the normal
		 * iodone path */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up bp for the component device; KernelWakeupFunc
		 * will run as the biodone callback */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2138 /* this is the callback function associated with a I/O invoked from
   2139    kernel code.
   2140  */
/*
 * biodone callback for component I/O issued by rf_DispatchKernelIO.
 * Records the error (possibly failing the component), queues the
 * request on the raid set's iodone list, and pokes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* rf_DispatchKernelIO stashed the request in b_private */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers the label
			 * update in raidstart */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2207 
   2208 
   2209 /*
   2210  * initialize a buf structure for doing an I/O in the kernel.
   2211  */
/*
 * initialize a buf structure for doing an I/O in the kernel.
 * cbFunc/cbArg become b_iodone/b_private, so the callback can
 * recover the request when the I/O completes.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* preserve only the rf_b_pass bits from the old flags */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert sectors -> bytes -> DEV_BSIZE blocks */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2233 
   2234 /*
   2235  * Wait interruptibly for an exclusive lock.
   2236  *
   2237  * XXX
   2238  * Several drivers do this; it should be abstracted and made MP-safe.
   2239  * (Hmm... where have we seen this warning before :->  GO )
   2240  */
/*
 * Wait interruptibly for the exclusive per-unit lock.  Returns 0 with
 * RAIDF_LOCKED set, or the error from cv_wait_sig (e.g. EINTR) if the
 * sleep was interrupted by a signal, in which case the lock is NOT held.
 */
static int
raidlock(struct raid_softc *rs)
{
	int     error;

	error = 0;
	mutex_enter(&rs->sc_mutex);
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		/* record interest so raidunlock knows to broadcast */
		rs->sc_flags |= RAIDF_WANTED;
		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
		if (error != 0)
			goto done;
	}
	rs->sc_flags |= RAIDF_LOCKED;
done:
	mutex_exit(&rs->sc_mutex);
	return error;
}
   2259 /*
   2260  * Unlock and wake up any waiters.
   2261  */
   2262 static void
   2263 raidunlock(struct raid_softc *rs)
   2264 {
   2265 
   2266 	mutex_enter(&rs->sc_mutex);
   2267 	rs->sc_flags &= ~RAIDF_LOCKED;
   2268 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2269 		rs->sc_flags &= ~RAIDF_WANTED;
   2270 		cv_broadcast(&rs->sc_cv);
   2271 	}
   2272 	mutex_exit(&rs->sc_mutex);
   2273 }
   2274 
   2275 
   2276 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2277 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2278 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2279 
   2280 static daddr_t
   2281 rf_component_info_offset(void)
   2282 {
   2283 
   2284 	return RF_COMPONENT_INFO_OFFSET;
   2285 }
   2286 
   2287 static daddr_t
   2288 rf_component_info_size(unsigned secsize)
   2289 {
   2290 	daddr_t info_size;
   2291 
   2292 	KASSERT(secsize);
   2293 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2294 		info_size = secsize;
   2295 	else
   2296 		info_size = RF_COMPONENT_INFO_SIZE;
   2297 
   2298 	return info_size;
   2299 }
   2300 
   2301 static daddr_t
   2302 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2303 {
   2304 	daddr_t map_offset;
   2305 
   2306 	KASSERT(raidPtr->bytesPerSector);
   2307 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2308 		map_offset = raidPtr->bytesPerSector;
   2309 	else
   2310 		map_offset = RF_COMPONENT_INFO_SIZE;
   2311 	map_offset += rf_component_info_offset();
   2312 
   2313 	return map_offset;
   2314 }
   2315 
   2316 static daddr_t
   2317 rf_parity_map_size(RF_Raid_t *raidPtr)
   2318 {
   2319 	daddr_t map_size;
   2320 
   2321 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2322 		map_size = raidPtr->bytesPerSector;
   2323 	else
   2324 		map_size = RF_PARITY_MAP_SIZE;
   2325 
   2326 	return map_size;
   2327 }
   2328 
   2329 int
   2330 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2331 {
   2332 	RF_ComponentLabel_t *clabel;
   2333 
   2334 	clabel = raidget_component_label(raidPtr, col);
   2335 	clabel->clean = RF_RAID_CLEAN;
   2336 	raidflush_component_label(raidPtr, col);
   2337 	return(0);
   2338 }
   2339 
   2340 
   2341 int
   2342 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2343 {
   2344 	RF_ComponentLabel_t *clabel;
   2345 
   2346 	clabel = raidget_component_label(raidPtr, col);
   2347 	clabel->clean = RF_RAID_DIRTY;
   2348 	raidflush_component_label(raidPtr, col);
   2349 	return(0);
   2350 }
   2351 
   2352 int
   2353 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2354 {
   2355 	KASSERT(raidPtr->bytesPerSector);
   2356 
   2357 	return raidread_component_label(raidPtr->bytesPerSector,
   2358 	    raidPtr->Disks[col].dev,
   2359 	    raidPtr->raid_cinfo[col].ci_vp,
   2360 	    &raidPtr->raid_cinfo[col].ci_label);
   2361 }
   2362 
   2363 RF_ComponentLabel_t *
   2364 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2365 {
   2366 	return &raidPtr->raid_cinfo[col].ci_label;
   2367 }
   2368 
   2369 int
   2370 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2371 {
   2372 	RF_ComponentLabel_t *label;
   2373 
   2374 	label = &raidPtr->raid_cinfo[col].ci_label;
   2375 	label->mod_counter = raidPtr->mod_counter;
   2376 #ifndef RF_NO_PARITY_MAP
   2377 	label->parity_map_modcount = label->mod_counter;
   2378 #endif
   2379 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2380 	    raidPtr->Disks[col].dev,
   2381 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2382 }
   2383 
   2384 /*
   2385  * Swap the label endianness.
   2386  *
   2387  * Everything in the component label is 4-byte-swapped except the version,
   2388  * which is kept in the byte-swapped version at all times, and indicates
   2389  * for the writer that a swap is necessary.
   2390  *
   2391  * For reads it is expected that out_label == clabel, but writes expect
   2392  * separate labels so only the re-swapped label is written out to disk,
   2393  * leaving the swapped-except-version internally.
   2394  *
   2395  * Only support swapping label version 2.
   2396  */
   2397 static void
   2398 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
   2399 {
   2400 	int	*in, *out, *in_last;
   2401 
   2402 	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
   2403 
   2404 	/* Don't swap the label, but do copy it. */
   2405 	out_label->version = clabel->version;
   2406 
   2407 	in = &clabel->serial_number;
   2408 	in_last = &clabel->future_use2[42];
   2409 	out = &out_label->serial_number;
   2410 
   2411 	for (; in < in_last; in++, out++)
   2412 		*out = bswap32(*in);
   2413 }
   2414 
   2415 static int
   2416 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2417     RF_ComponentLabel_t *clabel)
   2418 {
   2419 	int error;
   2420 
   2421 	error = raidread_component_area(dev, b_vp, clabel,
   2422 	    sizeof(RF_ComponentLabel_t),
   2423 	    rf_component_info_offset(),
   2424 	    rf_component_info_size(secsize));
   2425 
   2426 	if (error == 0 &&
   2427 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2428 		rf_swap_label(clabel, clabel);
   2429 	}
   2430 
   2431 	return error;
   2432 }
   2433 
   2434 /* ARGSUSED */
/*
 * Read `msize` bytes of component metadata (label or parity map) into
 * `data`, issuing a `dsize`-byte read at byte `offset` on the raw
 * component device.  Returns 0 or an error from biowait.
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	bdev_strategy(bp);
	error = biowait(bp);

	/* copy out only the metadata portion of the sector-sized read */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2471 
   2472 static int
   2473 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2474     RF_ComponentLabel_t *clabel)
   2475 {
   2476 	RF_ComponentLabel_t *clabel_write = clabel;
   2477 	RF_ComponentLabel_t lclabel;
   2478 	int error;
   2479 
   2480 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2481 		clabel_write = &lclabel;
   2482 		rf_swap_label(clabel, clabel_write);
   2483 	}
   2484 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2485 	    sizeof(RF_ComponentLabel_t),
   2486 	    rf_component_info_offset(),
   2487 	    rf_component_info_size(secsize), 0);
   2488 
   2489 	return error;
   2490 }
   2491 
   2492 /* ARGSUSED */
/*
 * Write `msize` bytes of component metadata from `data`, zero-padded
 * to a `dsize`-byte write at byte `offset` on the raw component
 * device.  If `asyncp` is set the write is issued B_ASYNC and 0 is
 * returned immediately.
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* zero-fill so the tail of the area is deterministic */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	/* NOTE(review): in the async case the buffer is not waited on
	 * or brelse'd here; presumably released on I/O completion --
	 * confirm against the buffercache layer. */
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2526 
   2527 void
   2528 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2529 {
   2530 	int c;
   2531 
   2532 	for (c = 0; c < raidPtr->numCol; c++) {
   2533 		/* Skip dead disks. */
   2534 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2535 			continue;
   2536 		/* XXXjld: what if an error occurs here? */
   2537 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2538 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2539 		    RF_PARITYMAP_NBYTE,
   2540 		    rf_parity_map_offset(raidPtr),
   2541 		    rf_parity_map_size(raidPtr), 0);
   2542 	}
   2543 }
   2544 
   2545 void
   2546 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2547 {
   2548 	struct rf_paritymap_ondisk tmp;
   2549 	int c,first;
   2550 
   2551 	first=1;
   2552 	for (c = 0; c < raidPtr->numCol; c++) {
   2553 		/* Skip dead disks. */
   2554 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2555 			continue;
   2556 		raidread_component_area(raidPtr->Disks[c].dev,
   2557 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2558 		    RF_PARITYMAP_NBYTE,
   2559 		    rf_parity_map_offset(raidPtr),
   2560 		    rf_parity_map_size(raidPtr));
   2561 		if (first) {
   2562 			memcpy(map, &tmp, sizeof(*map));
   2563 			first = 0;
   2564 		} else {
   2565 			rf_paritymap_merge(map, &tmp);
   2566 		}
   2567 	}
   2568 }
   2569 
/*
 * Bump the modification counter and mark every live component (and
 * every in-use spare) dirty.  Called when the set goes into active
 * use so an unclean shutdown is detectable.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			/* NOTE(review): scol is not reset per spare; if no
			 * matching spareCol is found, the previous match
			 * (or -1) is reused -- confirm that's intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2629 
   2630 
/*
 * Re-stamp and flush the component labels of all optimal components
 * and in-use spares.  When `final` is RF_FINAL_COMPONENT_UPDATE and
 * parity is known clean, the labels are also marked clean (normal
 * shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			/* NOTE(review): scol is not reset per spare; a spare
			 * with no matching spareCol reuses the previous match
			 * (or -1) -- confirm intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2708 
   2709 void
   2710 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2711 {
   2712 
   2713 	if (vp != NULL) {
   2714 		if (auto_configured == 1) {
   2715 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2716 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2717 			vput(vp);
   2718 
   2719 		} else {
   2720 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2721 		}
   2722 	}
   2723 }
   2724 
   2725 
   2726 void
   2727 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2728 {
   2729 	int r,c;
   2730 	struct vnode *vp;
   2731 	int acd;
   2732 
   2733 
   2734 	/* We take this opportunity to close the vnodes like we should.. */
   2735 
   2736 	for (c = 0; c < raidPtr->numCol; c++) {
   2737 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2738 		acd = raidPtr->Disks[c].auto_configured;
   2739 		rf_close_component(raidPtr, vp, acd);
   2740 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2741 		raidPtr->Disks[c].auto_configured = 0;
   2742 	}
   2743 
   2744 	for (r = 0; r < raidPtr->numSpare; r++) {
   2745 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2746 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2747 		rf_close_component(raidPtr, vp, acd);
   2748 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2749 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2750 	}
   2751 }
   2752 
   2753 
/*
 * Kernel thread body: fail the requested column and (optionally)
 * reconstruct onto a spare.  Frees `req` and exits the thread when
 * done; never returns.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* honor a forced reconstruction request for the duration */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2783 
/*
 * Kernel thread body: rewrite all parity on the set.  On success the
 * set's parity is marked clean; anyone blocked in shutdown waiting on
 * parity_rewrite_cv is woken before the thread exits.  Never returns.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2816 
   2817 
/*
 * Kernel thread body: copy reconstructed data from the spare back to
 * a replaced component, tracking progress via copyback_in_progress.
 * Never returns.
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2832 
   2833 
   2834 static void
   2835 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2836 {
   2837 	int s;
   2838 	RF_Raid_t *raidPtr;
   2839 
   2840 	s = splbio();
   2841 	raidPtr = req->raidPtr;
   2842 	raidPtr->recon_in_progress = 1;
   2843 
   2844 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
   2845 		raidPtr->forceRecon = 1;
   2846 	}
   2847 
   2848 	rf_ReconstructInPlace(raidPtr, req->col);
   2849 
   2850 	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
   2851 		raidPtr->forceRecon = 0;
   2852 	}
   2853 
   2854 	RF_Free(req, sizeof(*req));
   2855 	raidPtr->recon_in_progress = 0;
   2856 	splx(s);
   2857 
   2858 	/* That's all... */
   2859 	kthread_exit(0);	/* does not return */
   2860 }
   2861 
/*
 * Read the RAIDframe component label from the device (dev/vp) and, if it
 * looks reasonable, prepend a new RF_AutoConfig_t for it to ac_list.
 * Returns the (possibly new) list head.
 *
 * Ownership: on success the new list entry takes ownership of both vp
 * and the allocated label.  On failure (read error or implausible
 * label) the label is freed and the vnode is closed and released here.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and drop our reference on the
		   component's vnode, since nobody will be using it. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2903 
/*
 * Scan all disk-class devices in the system for RAIDframe components,
 * returning a list of RF_AutoConfig_t records (one per component label
 * found).  Wedges are scanned before whole disks so that a wedge
 * covering a disk is preferred over the disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			VOP_UNLOCK(vp);
			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				/* NOTE(review): close uses FREAD | FWRITE
				   although the open above was FREAD only --
				   looks intentional/historic; confirm. */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only wedges whose partition
				   type is RAIDframe are of interest. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of vp
				   on success, or closes it on failure. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3131 
   3132 int
   3133 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3134 {
   3135 
   3136 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3137 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3138 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3139 	    (clabel->clean == RF_RAID_CLEAN ||
   3140 	     clabel->clean == RF_RAID_DIRTY) &&
   3141 	    clabel->row >=0 &&
   3142 	    clabel->column >= 0 &&
   3143 	    clabel->num_rows > 0 &&
   3144 	    clabel->num_columns > 0 &&
   3145 	    clabel->row < clabel->num_rows &&
   3146 	    clabel->column < clabel->num_columns &&
   3147 	    clabel->blockSize > 0 &&
   3148 	    /*
   3149 	     * numBlocksHi may contain garbage, but it is ok since
   3150 	     * the type is unsigned.  If it is really garbage,
   3151 	     * rf_fix_old_label_size() will fix it.
   3152 	     */
   3153 	    rf_component_label_numblocks(clabel) > 0) {
   3154 		/*
   3155 		 * label looks reasonable enough...
   3156 		 * let's make sure it has no old garbage.
   3157 		 */
   3158 		if (numsecs)
   3159 			rf_fix_old_label_size(clabel, numsecs);
   3160 		return(1);
   3161 	}
   3162 	return(0);
   3163 }
   3164 
   3165 
   3166 /*
   3167  * For reasons yet unknown, some old component labels have garbage in
   3168  * the newer numBlocksHi region, and this causes lossage.  Since those
   3169  * disks will also have numsecs set to less than 32 bits of sectors,
   3170  * we can determine when this corruption has occurred, and fix it.
   3171  *
   3172  * The exact same problem, with the same unknown reason, happens to
   3173  * the partitionSizeHi member as well.
   3174  */
   3175 static void
   3176 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3177 {
   3178 
   3179 	if (numsecs < ((uint64_t)1 << 32)) {
   3180 		if (clabel->numBlocksHi) {
   3181 			printf("WARNING: total sectors < 32 bits, yet "
   3182 			       "numBlocksHi set\n"
   3183 			       "WARNING: resetting numBlocksHi to zero.\n");
   3184 			clabel->numBlocksHi = 0;
   3185 		}
   3186 
   3187 		if (clabel->partitionSizeHi) {
   3188 			printf("WARNING: total sectors < 32 bits, yet "
   3189 			       "partitionSizeHi set\n"
   3190 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3191 			clabel->partitionSizeHi = 0;
   3192 		}
   3193 	}
   3194 }
   3195 
   3196 
#ifdef DEBUG
/*
 * Dump a component label's fields to the console (debug kernels only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Index corresponds to root_partition: 0=No, 1=Force, 2=Soft. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* & 3 clamps out-of-range values onto the "*invalid*" entry. */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3230 
   3231 static RF_ConfigSet_t *
   3232 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3233 {
   3234 	RF_AutoConfig_t *ac;
   3235 	RF_ConfigSet_t *config_sets;
   3236 	RF_ConfigSet_t *cset;
   3237 	RF_AutoConfig_t *ac_next;
   3238 
   3239 
   3240 	config_sets = NULL;
   3241 
   3242 	/* Go through the AutoConfig list, and figure out which components
   3243 	   belong to what sets.  */
   3244 	ac = ac_list;
   3245 	while(ac!=NULL) {
   3246 		/* we're going to putz with ac->next, so save it here
   3247 		   for use at the end of the loop */
   3248 		ac_next = ac->next;
   3249 
   3250 		if (config_sets == NULL) {
   3251 			/* will need at least this one... */
   3252 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3253 				       M_RAIDFRAME, M_WAITOK);
   3254 			/* this one is easy :) */
   3255 			config_sets->ac = ac;
   3256 			config_sets->next = NULL;
   3257 			config_sets->rootable = 0;
   3258 			ac->next = NULL;
   3259 		} else {
   3260 			/* which set does this component fit into? */
   3261 			cset = config_sets;
   3262 			while(cset!=NULL) {
   3263 				if (rf_does_it_fit(cset, ac)) {
   3264 					/* looks like it matches... */
   3265 					ac->next = cset->ac;
   3266 					cset->ac = ac;
   3267 					break;
   3268 				}
   3269 				cset = cset->next;
   3270 			}
   3271 			if (cset==NULL) {
   3272 				/* didn't find a match above... new set..*/
   3273 				cset = malloc(sizeof(RF_ConfigSet_t),
   3274 					       M_RAIDFRAME, M_WAITOK);
   3275 				cset->ac = ac;
   3276 				ac->next = NULL;
   3277 				cset->next = config_sets;
   3278 				cset->rootable = 0;
   3279 				config_sets = cset;
   3280 			}
   3281 		}
   3282 		ac = ac_next;
   3283 	}
   3284 
   3285 
   3286 	return(config_sets);
   3287 }
   3288 
   3289 static int
   3290 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3291 {
   3292 	RF_ComponentLabel_t *clabel1, *clabel2;
   3293 
   3294 	/* If this one matches the *first* one in the set, that's good
   3295 	   enough, since the other members of the set would have been
   3296 	   through here too... */
   3297 	/* note that we are not checking partitionSize here..
   3298 
   3299 	   Note that we are also not checking the mod_counters here.
   3300 	   If everything else matches except the mod_counter, that's
   3301 	   good enough for this test.  We will deal with the mod_counters
   3302 	   a little later in the autoconfiguration process.
   3303 
   3304 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3305 
   3306 	   The reason we don't check for this is that failed disks
   3307 	   will have lower modification counts.  If those disks are
   3308 	   not added to the set they used to belong to, then they will
   3309 	   form their own set, which may result in 2 different sets,
   3310 	   for example, competing to be configured at raid0, and
   3311 	   perhaps competing to be the root filesystem set.  If the
   3312 	   wrong ones get configured, or both attempt to become /,
   3313 	   weird behaviour and or serious lossage will occur.  Thus we
   3314 	   need to bring them into the fold here, and kick them out at
   3315 	   a later point.
   3316 
   3317 	*/
   3318 
   3319 	clabel1 = cset->ac->clabel;
   3320 	clabel2 = ac->clabel;
   3321 	if ((clabel1->version == clabel2->version) &&
   3322 	    (clabel1->serial_number == clabel2->serial_number) &&
   3323 	    (clabel1->num_rows == clabel2->num_rows) &&
   3324 	    (clabel1->num_columns == clabel2->num_columns) &&
   3325 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3326 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3327 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3328 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3329 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3330 	    (clabel1->blockSize == clabel2->blockSize) &&
   3331 	    rf_component_label_numblocks(clabel1) ==
   3332 	    rf_component_label_numblocks(clabel2) &&
   3333 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3334 	    (clabel1->root_partition == clabel2->root_partition) &&
   3335 	    (clabel1->last_unit == clabel2->last_unit) &&
   3336 	    (clabel1->config_order == clabel2->config_order)) {
   3337 		/* if it get's here, it almost *has* to be a match */
   3338 	} else {
   3339 		/* it's not consistent with somebody in the set..
   3340 		   punt */
   3341 		return(0);
   3342 	}
   3343 	/* all was fine.. it must fit... */
   3344 	return(1);
   3345 }
   3346 
/*
 * Decide whether config set cset has enough live components (those
 * carrying the set's highest mod_counter) to be configured.  Returns 1
 * if the set is viable, 0 if too many components are missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members.  Components with a lower counter
	   are stale (failed before the last config change). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a current (max mod_counter) member. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a mirror
				   pair without bailing.. reset the
				   even_pair_failed flag, and go on to
				   the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3449 
   3450 static void
   3451 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3452 			RF_Raid_t *raidPtr)
   3453 {
   3454 	RF_ComponentLabel_t *clabel;
   3455 	int i;
   3456 
   3457 	clabel = ac->clabel;
   3458 
   3459 	/* 1. Fill in the common stuff */
   3460 	config->numCol = clabel->num_columns;
   3461 	config->numSpare = 0; /* XXX should this be set here? */
   3462 	config->sectPerSU = clabel->sectPerSU;
   3463 	config->SUsPerPU = clabel->SUsPerPU;
   3464 	config->SUsPerRU = clabel->SUsPerRU;
   3465 	config->parityConfig = clabel->parityConfig;
   3466 	/* XXX... */
   3467 	strcpy(config->diskQueueType,"fifo");
   3468 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3469 	config->layoutSpecificSize = 0; /* XXX ?? */
   3470 
   3471 	while(ac!=NULL) {
   3472 		/* row/col values will be in range due to the checks
   3473 		   in reasonable_label() */
   3474 		strcpy(config->devnames[0][ac->clabel->column],
   3475 		       ac->devname);
   3476 		ac = ac->next;
   3477 	}
   3478 
   3479 	for(i=0;i<RF_MAXDBGV;i++) {
   3480 		config->debugVars[i][0] = 0;
   3481 	}
   3482 }
   3483 
   3484 static int
   3485 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3486 {
   3487 	RF_ComponentLabel_t *clabel;
   3488 	int column;
   3489 	int sparecol;
   3490 
   3491 	raidPtr->autoconfigure = new_value;
   3492 
   3493 	for(column=0; column<raidPtr->numCol; column++) {
   3494 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3495 			clabel = raidget_component_label(raidPtr, column);
   3496 			clabel->autoconfigure = new_value;
   3497 			raidflush_component_label(raidPtr, column);
   3498 		}
   3499 	}
   3500 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3501 		sparecol = raidPtr->numCol + column;
   3502 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3503 			clabel = raidget_component_label(raidPtr, sparecol);
   3504 			clabel->autoconfigure = new_value;
   3505 			raidflush_component_label(raidPtr, sparecol);
   3506 		}
   3507 	}
   3508 	return(new_value);
   3509 }
   3510 
   3511 static int
   3512 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3513 {
   3514 	RF_ComponentLabel_t *clabel;
   3515 	int column;
   3516 	int sparecol;
   3517 
   3518 	raidPtr->root_partition = new_value;
   3519 	for(column=0; column<raidPtr->numCol; column++) {
   3520 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3521 			clabel = raidget_component_label(raidPtr, column);
   3522 			clabel->root_partition = new_value;
   3523 			raidflush_component_label(raidPtr, column);
   3524 		}
   3525 	}
   3526 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3527 		sparecol = raidPtr->numCol + column;
   3528 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3529 			clabel = raidget_component_label(raidPtr, sparecol);
   3530 			clabel->root_partition = new_value;
   3531 			raidflush_component_label(raidPtr, sparecol);
   3532 		}
   3533 	}
   3534 	return(new_value);
   3535 }
   3536 
   3537 static void
   3538 rf_release_all_vps(RF_ConfigSet_t *cset)
   3539 {
   3540 	RF_AutoConfig_t *ac;
   3541 
   3542 	ac = cset->ac;
   3543 	while(ac!=NULL) {
   3544 		/* Close the vp, and give it back */
   3545 		if (ac->vp) {
   3546 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3547 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3548 			vput(ac->vp);
   3549 			ac->vp = NULL;
   3550 		}
   3551 		ac = ac->next;
   3552 	}
   3553 }
   3554 
   3555 
   3556 static void
   3557 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3558 {
   3559 	RF_AutoConfig_t *ac;
   3560 	RF_AutoConfig_t *next_ac;
   3561 
   3562 	ac = cset->ac;
   3563 	while(ac!=NULL) {
   3564 		next_ac = ac->next;
   3565 		/* nuke the label */
   3566 		free(ac->clabel, M_RAIDFRAME);
   3567 		/* cleanup the config structure */
   3568 		free(ac, M_RAIDFRAME);
   3569 		/* "next.." */
   3570 		ac = next_ac;
   3571 	}
   3572 	/* and, finally, nuke the config set */
   3573 	free(cset, M_RAIDFRAME);
   3574 }
   3575 
   3576 
/*
 * Fill in a component label from the current state of the RAID set
 * (geometry, serial/mod counters, autoconfig and root preferences).
 * The caller supplies which component it is for via row/column, which
 * are set elsewhere.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3610 
/*
 * Autoconfigure one config set: pick a raid unit (preferring the unit
 * it was last configured on), build an RF_Config_t from the labels, and
 * configure the set.  Returns the softc on success, NULL on failure.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from last_unit until we find a unit that is either
	   unallocated or not currently valid (configured). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give back the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3682 
/*
 * Initialize pool p for this raid set: formats the pool/wchan name as
 * "raid%d_<pool_name>" into w_chan (which must be at least
 * RF_MAX_POOLNAMELEN bytes and must outlive the pool), primes it with
 * xmin items, and sets the high-water mark to xmax.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3695 
   3696 
   3697 /*
   3698  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3699  * to see if there is IO pending and if that IO could possibly be done
   3700  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3701  * otherwise.
   3702  *
   3703  */
   3704 int
   3705 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3706 {
   3707 	struct raid_softc *rs;
   3708 	struct dk_softc *dksc;
   3709 
   3710 	rs = raidPtr->softc;
   3711 	dksc = &rs->sc_dksc;
   3712 
   3713 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3714 		return 1;
   3715 
   3716 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3717 		/* there is work to do */
   3718 		return 0;
   3719 	}
   3720 	/* default is nothing to do */
   3721 	return 1;
   3722 }
   3723 
   3724 int
   3725 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3726 {
   3727 	uint64_t numsecs;
   3728 	unsigned secsize;
   3729 	int error;
   3730 
   3731 	error = getdisksize(vp, &numsecs, &secsize);
   3732 	if (error == 0) {
   3733 		diskPtr->blockSize = secsize;
   3734 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3735 		diskPtr->partitionSize = numsecs;
   3736 		return 0;
   3737 	}
   3738 	return error;
   3739 }
   3740 
/* autoconf match: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3746 
/* autoconf attach: nothing to do here; real setup happens at
 * configuration time (raidinit()). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3751 
   3752 
/*
 * autoconf detach: tear down the raid unit under the softc lock and,
 * on success, release the softc.  Returns 0 or an errno.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return error;

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3779 
   3780 static void
   3781 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3782 {
   3783 	struct dk_softc *dksc = &rs->sc_dksc;
   3784 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3785 
   3786 	memset(dg, 0, sizeof(*dg));
   3787 
   3788 	dg->dg_secperunit = raidPtr->totalSectors;
   3789 	dg->dg_secsize = raidPtr->bytesPerSector;
   3790 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3791 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3792 
   3793 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3794 }
   3795 
   3796 /*
   3797  * Get cache info for all the components (including spares).
   3798  * Returns intersection of all the cache flags of all disks, or first
   3799  * error if any encountered.
   3800  * XXXfua feature flags can change as spares are added - lock down somehow
   3801  */
   3802 static int
   3803 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3804 {
   3805 	int c;
   3806 	int error;
   3807 	int dkwhole = 0, dkpart;
   3808 
   3809 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3810 		/*
   3811 		 * Check any non-dead disk, even when currently being
   3812 		 * reconstructed.
   3813 		 */
   3814 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3815 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3816 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3817 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3818 			if (error) {
   3819 				if (error != ENODEV) {
   3820 					printf("raid%d: get cache for component %s failed\n",
   3821 					    raidPtr->raidid,
   3822 					    raidPtr->Disks[c].devname);
   3823 				}
   3824 
   3825 				return error;
   3826 			}
   3827 
   3828 			if (c == 0)
   3829 				dkwhole = dkpart;
   3830 			else
   3831 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3832 		}
   3833 	}
   3834 
   3835 	*data = dkwhole;
   3836 
   3837 	return 0;
   3838 }
   3839 
   3840 /*
   3841  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3842  * We end up returning whatever error was returned by the first cache flush
   3843  * that fails.
   3844  */
   3845 
   3846 static int
   3847 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3848 {
   3849 	int e = 0;
   3850 	for (int i = 0; i < 5; i++) {
   3851 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3852 		    &force, FWRITE, NOCRED);
   3853 		if (!e || e == ENODEV)
   3854 			return e;
   3855 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3856 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3857 	}
   3858 	return e;
   3859 }
   3860 
   3861 int
   3862 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3863 {
   3864 	int c, error;
   3865 
   3866 	error = 0;
   3867 	for (c = 0; c < raidPtr->numCol; c++) {
   3868 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3869 			int e = rf_sync_component_cache(raidPtr, c, force);
   3870 			if (e && !error)
   3871 				error = e;
   3872 		}
   3873 	}
   3874 
   3875 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3876 		int sparecol = raidPtr->numCol + c;
   3877 		/* Need to ensure that the reconstruct actually completed! */
   3878 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3879 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3880 			    force);
   3881 			if (e && !error)
   3882 				error = e;
   3883 		}
   3884 	}
   3885 	return error;
   3886 }
   3887 
   3888 /* Fill in info with the current status */
   3889 void
   3890 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3891 {
   3892 
   3893 	memset(info, 0, sizeof(*info));
   3894 
   3895 	if (raidPtr->status != rf_rs_reconstructing) {
   3896 		info->total = 100;
   3897 		info->completed = 100;
   3898 	} else {
   3899 		info->total = raidPtr->reconControl->numRUsTotal;
   3900 		info->completed = raidPtr->reconControl->numRUsComplete;
   3901 	}
   3902 	info->remaining = info->total - info->completed;
   3903 }
   3904 
   3905 /* Fill in info with the current status */
   3906 void
   3907 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3908 {
   3909 
   3910 	memset(info, 0, sizeof(*info));
   3911 
   3912 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3913 		info->total = raidPtr->Layout.numStripe;
   3914 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3915 	} else {
   3916 		info->completed = 100;
   3917 		info->total = 100;
   3918 	}
   3919 	info->remaining = info->total - info->completed;
   3920 }
   3921 
   3922 /* Fill in info with the current status */
   3923 void
   3924 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3925 {
   3926 
   3927 	memset(info, 0, sizeof(*info));
   3928 
   3929 	if (raidPtr->copyback_in_progress == 1) {
   3930 		info->total = raidPtr->Layout.numStripe;
   3931 		info->completed = raidPtr->copyback_stripes_done;
   3932 		info->remaining = info->total - info->completed;
   3933 	} else {
   3934 		info->remaining = 0;
   3935 		info->completed = 100;
   3936 		info->total = 100;
   3937 	}
   3938 }
   3939 
   3940 /* Fill in config with the current info */
   3941 int
   3942 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3943 {
   3944 	int	d, i, j;
   3945 
   3946 	if (!raidPtr->valid)
   3947 		return ENODEV;
   3948 	config->cols = raidPtr->numCol;
   3949 	config->ndevs = raidPtr->numCol;
   3950 	if (config->ndevs >= RF_MAX_DISKS)
   3951 		return ENOMEM;
   3952 	config->nspares = raidPtr->numSpare;
   3953 	if (config->nspares >= RF_MAX_DISKS)
   3954 		return ENOMEM;
   3955 	config->maxqdepth = raidPtr->maxQueueDepth;
   3956 	d = 0;
   3957 	for (j = 0; j < config->cols; j++) {
   3958 		config->devs[d] = raidPtr->Disks[j];
   3959 		d++;
   3960 	}
   3961 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3962 		config->spares[i] = raidPtr->Disks[j];
   3963 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3964 			/* XXX: raidctl(8) expects to see this as a used spare */
   3965 			config->spares[i].status = rf_ds_used_spare;
   3966 		}
   3967 	}
   3968 	return 0;
   3969 }
   3970 
   3971 int
   3972 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3973 {
   3974 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3975 	RF_ComponentLabel_t *raid_clabel;
   3976 	int column = clabel->column;
   3977 
   3978 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3979 		return EINVAL;
   3980 	raid_clabel = raidget_component_label(raidPtr, column);
   3981 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3982 	/* Fix-up for userland. */
   3983 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3984 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3985 
   3986 	return 0;
   3987 }
   3988 
   3989 /*
   3990  * Module interface
   3991  */
   3992 
   3993 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3994 
   3995 #ifdef _MODULE
   3996 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3997 #endif
   3998 
   3999 static int raid_modcmd(modcmd_t, void *);
   4000 static int raid_modcmd_init(void);
   4001 static int raid_modcmd_fini(void);
   4002 
   4003 static int
   4004 raid_modcmd(modcmd_t cmd, void *data)
   4005 {
   4006 	int error;
   4007 
   4008 	error = 0;
   4009 	switch (cmd) {
   4010 	case MODULE_CMD_INIT:
   4011 		error = raid_modcmd_init();
   4012 		break;
   4013 	case MODULE_CMD_FINI:
   4014 		error = raid_modcmd_fini();
   4015 		break;
   4016 	default:
   4017 		error = ENOTTY;
   4018 		break;
   4019 	}
   4020 	return error;
   4021 }
   4022 
/*
 * Module initialization: set up locks, attach the devsw and autoconf
 * glue (unwinding on failure), boot RAIDframe, and register the
 * autoconfiguration finalizer.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization objects for spare-table requests. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach() to allocate the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST (devsw already present) is tolerated. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attach. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind both earlier attachments. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here: every failure above returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Non-fatal: continue without autoconfiguration. */
		error = 0;
	}

	return error;
}
   4093 
/*
 * Module teardown: refuse to unload while raid devices exist, then
 * detach the autoconf glue and devsw (rolling back on partial
 * failure), shut down RAIDframe, and destroy the locks.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back the cfattach detach so the module stays usable. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Destroy the spare-table synchronization objects. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4134