Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.414
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.414 2023/09/17 20:07:39 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.414 2023/09/17 20:07:39 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
/* Block-device switch: entry points for /dev/raidN block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    222 
/* Character-device switch: entry points for /dev/rraidN raw device nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    237 
/* dk(9) driver glue: hooks the common disk layer back into this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
/* Internal representation of a rf_recon_req; handed to the
   reconstruction kernel threads (rf_ReconThread and friends). */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column (component) to reconstruct */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* opaque; presumably the RF_Raid_t * of
					 * the target set -- confirm at callers */
};
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
    272  * Even it if is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 static void rf_ReconThread(struct rf_recon_req_internal *);
    298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 static int rf_autoconfig(device_t);
    302 static int rf_rescan(void);
    303 static void rf_buildroothack(RF_ConfigSet_t *);
    304 
    305 static RF_AutoConfig_t *rf_find_raid_components(void);
    306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 static int rf_set_autoconfig(RF_Raid_t *, int);
    310 static int rf_set_rootpartition(RF_Raid_t *, int);
    311 static void rf_release_all_vps(RF_ConfigSet_t *);
    312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 static int rf_have_enough_components(RF_ConfigSet_t *);
    314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct pool rf_alloclist_pool;   /* AllocList */
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
    343 static void
    344 raiddestroy(struct raid_softc *sc) {
    345 	cv_destroy(&sc->sc_cv);
    346 	mutex_destroy(&sc->sc_mutex);
    347 	kmem_free(sc, sizeof(*sc));
    348 }
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
/*
 * Remove a softc from the global list (under raid_lock) and free it.
 * Counterpart of raidget(..., true).
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    383 
/*
 * Legacy pseudo-device attach entry point.  Intentionally empty:
 * all real work moved into the module initialization path.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    393 
/*
 * One-shot RAIDframe autoconfiguration: locate all RAID components in
 * the system, group them into configuration sets, and configure the
 * valid ones (the configuration itself happens in rf_buildroothack()).
 *
 * Returns 1 when a scan was performed, 0 when autoconfiguration is
 * disabled (raidautoconfig == 0) or has already been done.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    431 
    432 int
    433 rf_inited(const struct raid_softc *rs) {
    434 	return (rs->sc_flags & RAIDF_INITED) != 0;
    435 }
    436 
/* Accessor: the RF_Raid_t embedded in this softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    441 
/* Accessor: the unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    446 
/*
 * Return 1 if RAID set 'r' contains device 'bdv' (typically the boot
 * device) among its components, 0 otherwise.
 *
 * Component names are compared against device_xname(bdv) by prefix,
 * after stripping the leading "/dev/"; wedge components ("dk*") are
 * first translated to their parent device's name.
 *
 * NOTE(review): the strncmp() prefix comparison means a bootname of
 * e.g. "wd1" would also match a component named "wd10" -- presumably
 * harmless for typical configurations, but worth confirming.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix of the component path */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge component: compare the parent disk instead */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    477 
/*
 * Re-scan the system for RAID components and autoconfigure every
 * complete set whose label requests autoconfiguration.  After each
 * pass that configured at least one set, scan again so that RAID sets
 * stacked on top of freshly-configured sets (recursive RAID) are also
 * found.  Sets that are not configured have their resources released.
 *
 * Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* rf_cleanup_config_set frees cset; grab next now */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
    526 
    527 
/*
 * Configure all autoconfigurable RAID sets (repeating the scan to
 * pick up recursive RAID, exactly as in rf_rescan()), then try to
 * determine whether one of them should become the root device.
 *
 * Root selection: if the user hardwired a root (rootspec), do nothing
 * further.  If exactly one rootable set was configured, pick a wedge
 * or disk device from it as booted_device.  If several are rootable,
 * narrow them down to the one containing the MD-reported boot device;
 * failing that, fall back to RB_ASKNAME so the user decides.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* rf_cleanup_config_set frees cset; grab next now */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						/* remember the last rootable set seen */
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* count only the rootable sets that contain the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    686 
    687 static int
    688 raidsize(dev_t dev)
    689 {
    690 	struct raid_softc *rs;
    691 	struct dk_softc *dksc;
    692 	unsigned int unit;
    693 
    694 	unit = raidunit(dev);
    695 	if ((rs = raidget(unit, false)) == NULL)
    696 		return -1;
    697 	dksc = &rs->sc_dksc;
    698 
    699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    700 		return -1;
    701 
    702 	return dk_size(dksc, dev);
    703 }
    704 
/*
 * bdevsw d_dump entry: dump kernel memory onto the raid device.
 * Offsets blkno past the RAIDframe-reserved sectors and delegates to
 * dk_dump(), which recurses into raid_dumpblocks() for component I/O.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
    729 
/*
 * dkdriver d_dumpblocks hook: write 'nblk' blocks of crash-dump data
 * from 'va' starting at block 'blkno' directly to one live component.
 * Only RAID 1 sets (one data column, one parity column) are
 * supported.  Returns 0 on success or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;

		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand the dump straight to the chosen component's driver */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    836 
/* ARGSUSED */
/*
 * Open entry point (block and character): look up -- creating if
 * necessary -- the softc for this unit, and on the first open of a
 * configured set mark all components dirty before delegating to
 * dk_open().  Returns 0, ENXIO for a bad unit, EBUSY while shutting
 * down, or an error from raidlock()/dk_open().
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	/* 'true': instantiate the softc if this unit is new */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}
    886 
    887 static int
    888 raid_lastclose(device_t self)
    889 {
    890 	struct raid_softc *rs = raidsoftc(self);
    891 
    892 	/* Last one... device is not unconfigured yet.
    893 	   Device shutdown has taken care of setting the
    894 	   clean bits if RAIDF_INITED is not set
    895 	   mark things as clean... */
    896 
    897 	rf_update_component_labels(&rs->sc_r,
    898 	    RF_FINAL_COMPONENT_UPDATE);
    899 
    900 	/* pass to unlocked code */
    901 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    902 		rs->sc_flags |= RAIDF_DETACH;
    903 
    904 	return 0;
    905 }
    906 
    907 /* ARGSUSED */
    908 static int
    909 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    910 {
    911 	int     unit = raidunit(dev);
    912 	struct raid_softc *rs;
    913 	struct dk_softc *dksc;
    914 	cfdata_t cf;
    915 	int     error = 0, do_detach = 0, do_put = 0;
    916 
    917 	if ((rs = raidget(unit, false)) == NULL)
    918 		return ENXIO;
    919 	dksc = &rs->sc_dksc;
    920 
    921 	if ((error = raidlock(rs)) != 0)
    922 		return error;
    923 
    924 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    925 		error = dk_close(dksc, dev, flags, fmt, l);
    926 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    927 			do_detach = 1;
    928 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    929 		do_put = 1;
    930 
    931 	raidunlock(rs);
    932 
    933 	if (do_detach) {
    934 		/* free the pseudo device attach bits */
    935 		cf = device_cfdata(dksc->sc_dev);
    936 		error = config_detach(dksc->sc_dev, 0);
    937 		if (error == 0)
    938 			free(cf, M_RAIDFRAME);
    939 	} else if (do_put) {
    940 		raidput(rs);
    941 	}
    942 
    943 	return error;
    944 
    945 }
    946 
/*
 * Signal the iodone condition variable (under its lock).  Callers use
 * this to kick the RAIDframe thread whenever there is new work queued
 * or an I/O slot has been freed.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    954 
    955 static void
    956 raidstrategy(struct buf *bp)
    957 {
    958 	unsigned int unit;
    959 	struct raid_softc *rs;
    960 	struct dk_softc *dksc;
    961 	RF_Raid_t *raidPtr;
    962 
    963 	unit = raidunit(bp->b_dev);
    964 	if ((rs = raidget(unit, false)) == NULL) {
    965 		bp->b_error = ENXIO;
    966 		goto fail;
    967 	}
    968 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    969 		bp->b_error = ENXIO;
    970 		goto fail;
    971 	}
    972 	dksc = &rs->sc_dksc;
    973 	raidPtr = &rs->sc_r;
    974 
    975 	/* Queue IO only */
    976 	if (dk_strategy_defer(dksc, bp))
    977 		goto done;
    978 
    979 	/* schedule the IO to happen at the next convenient time */
    980 	raid_wakeup(raidPtr);
    981 
    982 done:
    983 	return;
    984 
    985 fail:
    986 	bp->b_resid = bp->b_bcount;
    987 	biodone(bp);
    988 }
    989 
    990 static int
    991 raid_diskstart(device_t dev, struct buf *bp)
    992 {
    993 	struct raid_softc *rs = raidsoftc(dev);
    994 	RF_Raid_t *raidPtr;
    995 
    996 	raidPtr = &rs->sc_r;
    997 	if (!raidPtr->valid) {
    998 		db1_printf(("raid is not valid..\n"));
    999 		return ENODEV;
   1000 	}
   1001 
   1002 	/* XXX */
   1003 	bp->b_resid = 0;
   1004 
   1005 	return raiddoaccess(raidPtr, bp);
   1006 }
   1007 
/*
 * Completion handler for a RAIDframe access: hand the finished buffer
 * back to the dk(4) layer, return one I/O "opening" (presumably taken
 * when the access was started -- the decrement is not in this file's
 * view), and wake the RAIDframe thread so more queued I/O can run.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* Release the I/O slot under the raid mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
   1026 
   1027 /* ARGSUSED */
   1028 static int
   1029 raidread(dev_t dev, struct uio *uio, int flags)
   1030 {
   1031 	int     unit = raidunit(dev);
   1032 	struct raid_softc *rs;
   1033 
   1034 	if ((rs = raidget(unit, false)) == NULL)
   1035 		return ENXIO;
   1036 
   1037 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1038 		return ENXIO;
   1039 
   1040 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1041 
   1042 }
   1043 
   1044 /* ARGSUSED */
   1045 static int
   1046 raidwrite(dev_t dev, struct uio *uio, int flags)
   1047 {
   1048 	int     unit = raidunit(dev);
   1049 	struct raid_softc *rs;
   1050 
   1051 	if ((rs = raidget(unit, false)) == NULL)
   1052 		return ENXIO;
   1053 
   1054 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1055 		return ENXIO;
   1056 
   1057 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1058 
   1059 }
   1060 
/*
 * Tear down a configured RAID set.  Caller must hold the raid lock
 * (the "_unlocked" in the name refers to the dk layer, not raidlock).
 * Returns EBUSY while the unit is open or any background operation
 * (recon, parity rewrite, copyback) is running; 0 if there is nothing
 * to do or the teardown succeeded.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background work is in flight. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1098 
/*
 * Administratively fail component rr->col and start reconstruction in
 * a separate kernel thread.  Returns 0 when the recon thread was
 * created, EINVAL for RAID 0 / bad column / unsafe state, ENOMEM if
 * the request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* Ask any in-flight reconstruction of this column to stop. */
		raidPtr->abortRecon[rr->col] = 1;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		int spareCol = raidPtr->Disks[rr->col].spareCol;

		/* Valid spare columns live in [numCol, numCol + numSpare). */
		if (spareCol < raidPtr->numCol ||
		    spareCol >= raidPtr->numCol + raidPtr->numSpare)
			goto out;

		/*
		 * Fail the spare disk so that we can
		 * reconstruct on another one.
		 */
		raidPtr->Disks[spareCol].status = rf_ds_failed;

	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Error exits from the locked section land here. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1155 
   1156 static int
   1157 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1158 {
   1159 	/* allocate a buffer for the layout-specific data, and copy it in */
   1160 	if (k_cfg->layoutSpecificSize == 0)
   1161 		return 0;
   1162 
   1163 	if (k_cfg->layoutSpecificSize > 10000) {
   1164 	    /* sanity check */
   1165 	    return EINVAL;
   1166 	}
   1167 
   1168 	u_char *specific_buf;
   1169 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1170 	if (specific_buf == NULL)
   1171 		return ENOMEM;
   1172 
   1173 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1174 	    k_cfg->layoutSpecificSize);
   1175 	if (retcode) {
   1176 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1177 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1178 		return retcode;
   1179 	}
   1180 
   1181 	k_cfg->layoutSpecific = specific_buf;
   1182 	return 0;
   1183 }
   1184 
   1185 static int
   1186 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1187 {
   1188 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1189 
   1190 	if (rs->sc_r.valid) {
   1191 		/* There is a valid RAID set running on this unit! */
   1192 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1193 		return EINVAL;
   1194 	}
   1195 
   1196 	/* copy-in the configuration information */
   1197 	/* data points to a pointer to the configuration structure */
   1198 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1199 	if (*k_cfg == NULL) {
   1200 		return ENOMEM;
   1201 	}
   1202 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1203 	if (retcode == 0)
   1204 		return 0;
   1205 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1206 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1207 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1208 	return retcode;
   1209 }
   1210 
/*
 * Configure a RAID set from the kernel copy of the configuration.
 * Consumes k_cfg (and its layout-specific buffer) in all cases.  On
 * failure RAIDF_SHUTDOWN is set so the unit is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull in the layout-specific data before anything else. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		/* Attach the pseudo-device and start accepting I/O. */
		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1286 
#if RF_DISABLED
/*
 * Handler for RAIDFRAME_SET_COMPONENT_LABEL (compiled out).  Copies a
 * caller-supplied label over the in-core label for one column and
 * flushes it to disk.  Only minimal validation is done -- see the XXX
 * notes below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1325 
   1326 static int
   1327 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1328 {
   1329 	/*
   1330 	   we only want the serial number from
   1331 	   the above.  We get all the rest of the information
   1332 	   from the config that was used to create this RAID
   1333 	   set.
   1334 	   */
   1335 
   1336 	raidPtr->serial_number = clabel->serial_number;
   1337 
   1338 	for (int column = 0; column < raidPtr->numCol; column++) {
   1339 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1340 		if (RF_DEAD_DISK(diskPtr->status))
   1341 			continue;
   1342 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1343 		    raidPtr, column);
   1344 		/* Zeroing this is important. */
   1345 		memset(ci_label, 0, sizeof(*ci_label));
   1346 		raid_init_component_label(raidPtr, ci_label);
   1347 		ci_label->serial_number = raidPtr->serial_number;
   1348 		ci_label->row = 0; /* we dont' pretend to support more */
   1349 		rf_component_label_set_partitionsize(ci_label,
   1350 		    diskPtr->partitionSize);
   1351 		ci_label->column = column;
   1352 		raidflush_component_label(raidPtr, column);
   1353 		/* XXXjld what about the spares? */
   1354 	}
   1355 
   1356 	return 0;
   1357 }
   1358 
/*
 * Handler for RAIDFRAME_REBUILD_IN_PLACE: reconstruct a failed
 * component back onto the same disk, in a separate kernel thread.
 * Returns EINVAL when the request is not possible in the current
 * state, ENOMEM on allocation failure, otherwise the thread-creation
 * result.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a kernel copy of the request, not the user's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	/* A spared component cannot be rebuilt in place. */
	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1426 
   1427 static int
   1428 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1429 {
   1430 	/*
   1431 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1432 	 * so tell the user it's done.
   1433 	 */
   1434 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1435 	    raidPtr->status != rf_rs_reconstructing) {
   1436 		*data = 100;
   1437 		return 0;
   1438 	}
   1439 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1440 		*data = 0;
   1441 		return 0;
   1442 	}
   1443 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1444 	    / raidPtr->reconControl->numRUsTotal);
   1445 	return 0;
   1446 }
   1447 
   1448 /*
   1449  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1450  * on the component_name[] array.
   1451  */
   1452 static void
   1453 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1454 {
   1455 
   1456 	memcpy(component, data, sizeof *component);
   1457 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1458 }
   1459 
   1460 static int
   1461 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1462 {
   1463 	int     unit = raidunit(dev);
   1464 	int     part, pmask;
   1465 	struct raid_softc *rs;
   1466 	struct dk_softc *dksc;
   1467 	RF_Config_t *k_cfg;
   1468 	RF_Raid_t *raidPtr;
   1469 	RF_AccTotals_t *totals;
   1470 	RF_SingleComponent_t component;
   1471 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1472 	int retcode = 0;
   1473 	int column;
   1474 	RF_ComponentLabel_t *clabel;
   1475 	int d;
   1476 
   1477 	if ((rs = raidget(unit, false)) == NULL)
   1478 		return ENXIO;
   1479 
   1480 	dksc = &rs->sc_dksc;
   1481 	raidPtr = &rs->sc_r;
   1482 
   1483 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1484 	    (int) DISKPART(dev), (int) unit, cmd));
   1485 
   1486 	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
   1487 	switch (cmd) {
   1488 	case RAIDFRAME_CONFIGURE:
   1489 	case RAIDFRAME_RESCAN:
   1490 		break;
   1491 	default:
   1492 		if (!rf_inited(rs))
   1493 			return ENXIO;
   1494 	}
   1495 
   1496 	switch (cmd) {
   1497 		/* configure the system */
   1498 	case RAIDFRAME_CONFIGURE:
   1499 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1500 			return retcode;
   1501 		return rf_construct(rs, k_cfg);
   1502 
   1503 		/* shutdown the system */
   1504 	case RAIDFRAME_SHUTDOWN:
   1505 
   1506 		part = DISKPART(dev);
   1507 		pmask = (1 << part);
   1508 
   1509 		if ((retcode = raidlock(rs)) != 0)
   1510 			return retcode;
   1511 
   1512 		if (DK_BUSY(dksc, pmask) ||
   1513 		    raidPtr->recon_in_progress != 0 ||
   1514 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1515 		    raidPtr->copyback_in_progress != 0)
   1516 			retcode = EBUSY;
   1517 		else {
   1518 			/* detach and free on close */
   1519 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1520 			retcode = 0;
   1521 		}
   1522 
   1523 		raidunlock(rs);
   1524 
   1525 		return retcode;
   1526 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1527 		return rf_get_component_label(raidPtr, data);
   1528 
   1529 #if RF_DISABLED
   1530 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1531 		return rf_set_component_label(raidPtr, data);
   1532 #endif
   1533 
   1534 	case RAIDFRAME_INIT_LABELS:
   1535 		return rf_init_component_label(raidPtr, data);
   1536 
   1537 	case RAIDFRAME_SET_AUTOCONFIG:
   1538 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1539 		printf("raid%d: New autoconfig value is: %d\n",
   1540 		       raidPtr->raidid, d);
   1541 		*(int *) data = d;
   1542 		return retcode;
   1543 
   1544 	case RAIDFRAME_SET_ROOT:
   1545 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1546 		printf("raid%d: New rootpartition value is: %d\n",
   1547 		       raidPtr->raidid, d);
   1548 		*(int *) data = d;
   1549 		return retcode;
   1550 
   1551 		/* initialize all parity */
   1552 	case RAIDFRAME_REWRITEPARITY:
   1553 
   1554 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1555 			/* Parity for RAID 0 is trivially correct */
   1556 			raidPtr->parity_good = RF_RAID_CLEAN;
   1557 			return 0;
   1558 		}
   1559 
   1560 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1561 			/* Re-write is already in progress! */
   1562 			return EINVAL;
   1563 		}
   1564 
   1565 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1566 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1567 
   1568 	case RAIDFRAME_ADD_HOT_SPARE:
   1569 		rf_copy_single_component(&component, data);
   1570 		return rf_add_hot_spare(raidPtr, &component);
   1571 
   1572 	/* Remove a non hot-spare component, never implemented in userland */
   1573 	case RAIDFRAME_DELETE_COMPONENT:
   1574 		rf_copy_single_component(&component, data);
   1575 		return rf_delete_component(raidPtr, &component);
   1576 
   1577 	case RAIDFRAME_REMOVE_COMPONENT:
   1578 		rf_copy_single_component(&component, data);
   1579 		return rf_remove_component(raidPtr, &component);
   1580 
   1581 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1582 		rf_copy_single_component(&component, data);
   1583 		return rf_incorporate_hot_spare(raidPtr, &component);
   1584 
   1585 	case RAIDFRAME_REBUILD_IN_PLACE:
   1586 		return rf_rebuild_in_place(raidPtr, data);
   1587 
   1588 	case RAIDFRAME_GET_INFO:
   1589 		ucfgp = *(RF_DeviceConfig_t **)data;
   1590 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1591 		if (d_cfg == NULL)
   1592 			return ENOMEM;
   1593 		retcode = rf_get_info(raidPtr, d_cfg);
   1594 		if (retcode == 0) {
   1595 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1596 		}
   1597 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1598 		return retcode;
   1599 
   1600 	case RAIDFRAME_CHECK_PARITY:
   1601 		*(int *) data = raidPtr->parity_good;
   1602 		return 0;
   1603 
   1604 	case RAIDFRAME_PARITYMAP_STATUS:
   1605 		if (rf_paritymap_ineligible(raidPtr))
   1606 			return EINVAL;
   1607 		rf_paritymap_status(raidPtr->parity_map, data);
   1608 		return 0;
   1609 
   1610 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1611 		if (rf_paritymap_ineligible(raidPtr))
   1612 			return EINVAL;
   1613 		if (raidPtr->parity_map == NULL)
   1614 			return ENOENT; /* ??? */
   1615 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1616 			return EINVAL;
   1617 		return 0;
   1618 
   1619 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1620 		if (rf_paritymap_ineligible(raidPtr))
   1621 			return EINVAL;
   1622 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1623 		return 0;
   1624 
   1625 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1626 		if (rf_paritymap_ineligible(raidPtr))
   1627 			return EINVAL;
   1628 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1629 		/* XXX should errors be passed up? */
   1630 		return 0;
   1631 
   1632 	case RAIDFRAME_RESCAN:
   1633 		return rf_rescan();
   1634 
   1635 	case RAIDFRAME_RESET_ACCTOTALS:
   1636 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1637 		return 0;
   1638 
   1639 	case RAIDFRAME_GET_ACCTOTALS:
   1640 		totals = (RF_AccTotals_t *) data;
   1641 		*totals = raidPtr->acc_totals;
   1642 		return 0;
   1643 
   1644 	case RAIDFRAME_KEEP_ACCTOTALS:
   1645 		raidPtr->keep_acc_totals = *(int *)data;
   1646 		return 0;
   1647 
   1648 	case RAIDFRAME_GET_SIZE:
   1649 		*(int *) data = raidPtr->totalSectors;
   1650 		return 0;
   1651 
   1652 	case RAIDFRAME_FAIL_DISK:
   1653 		return rf_fail_disk(raidPtr, data);
   1654 
   1655 		/* invoke a copyback operation after recon on whatever disk
   1656 		 * needs it, if any */
   1657 	case RAIDFRAME_COPYBACK:
   1658 
   1659 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1660 			/* This makes no sense on a RAID 0!! */
   1661 			return EINVAL;
   1662 		}
   1663 
   1664 		if (raidPtr->copyback_in_progress == 1) {
   1665 			/* Copyback is already in progress! */
   1666 			return EINVAL;
   1667 		}
   1668 
   1669 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1670 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1671 
   1672 		/* return the percentage completion of reconstruction */
   1673 	case RAIDFRAME_CHECK_RECON_STATUS:
   1674 		return rf_check_recon_status(raidPtr, data);
   1675 
   1676 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1677 		rf_check_recon_status_ext(raidPtr, data);
   1678 		return 0;
   1679 
   1680 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1681 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1682 			/* This makes no sense on a RAID 0, so tell the
   1683 			   user it's done. */
   1684 			*(int *) data = 100;
   1685 			return 0;
   1686 		}
   1687 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1688 			*(int *) data = 100 *
   1689 				raidPtr->parity_rewrite_stripes_done /
   1690 				raidPtr->Layout.numStripe;
   1691 		} else {
   1692 			*(int *) data = 100;
   1693 		}
   1694 		return 0;
   1695 
   1696 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1697 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1698 		return 0;
   1699 
   1700 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1701 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1702 			/* This makes no sense on a RAID 0 */
   1703 			*(int *) data = 100;
   1704 			return 0;
   1705 		}
   1706 		if (raidPtr->copyback_in_progress == 1) {
   1707 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1708 				raidPtr->Layout.numStripe;
   1709 		} else {
   1710 			*(int *) data = 100;
   1711 		}
   1712 		return 0;
   1713 
   1714 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1715 		rf_check_copyback_status_ext(raidPtr, data);
   1716 		return 0;
   1717 
   1718 	case RAIDFRAME_SET_LAST_UNIT:
   1719 		for (column = 0; column < raidPtr->numCol; column++)
   1720 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1721 				return EBUSY;
   1722 
   1723 		for (column = 0; column < raidPtr->numCol; column++) {
   1724 			clabel = raidget_component_label(raidPtr, column);
   1725 			clabel->last_unit = *(int *)data;
   1726 			raidflush_component_label(raidPtr, column);
   1727 		}
   1728 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1729 		return 0;
   1730 
   1731 		/* the sparetable daemon calls this to wait for the kernel to
   1732 		 * need a spare table. this ioctl does not return until a
   1733 		 * spare table is needed. XXX -- calling mpsleep here in the
   1734 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1735 		 * -- I should either compute the spare table in the kernel,
   1736 		 * or have a different -- XXX XXX -- interface (a different
   1737 		 * character device) for delivering the table     -- XXX */
   1738 #if RF_DISABLED
   1739 	case RAIDFRAME_SPARET_WAIT:
   1740 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1741 		while (!rf_sparet_wait_queue)
   1742 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1743 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1744 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1745 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1746 
   1747 		/* structure assignment */
   1748 		*((RF_SparetWait_t *) data) = *waitreq;
   1749 
   1750 		RF_Free(waitreq, sizeof(*waitreq));
   1751 		return 0;
   1752 
   1753 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1754 		 * code in it that will cause the dameon to exit */
   1755 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1756 		waitreq = RF_Malloc(sizeof(*waitreq));
   1757 		waitreq->fcol = -1;
   1758 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1759 		waitreq->next = rf_sparet_wait_queue;
   1760 		rf_sparet_wait_queue = waitreq;
   1761 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1762 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1763 		return 0;
   1764 
   1765 		/* used by the spare table daemon to deliver a spare table
   1766 		 * into the kernel */
   1767 	case RAIDFRAME_SEND_SPARET:
   1768 
   1769 		/* install the spare table */
   1770 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1771 
   1772 		/* respond to the requestor.  the return status of the spare
   1773 		 * table installation is passed in the "fcol" field */
   1774 		waitred = RF_Malloc(sizeof(*waitreq));
   1775 		waitreq->fcol = retcode;
   1776 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1777 		waitreq->next = rf_sparet_resp_queue;
   1778 		rf_sparet_resp_queue = waitreq;
   1779 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1780 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1781 
   1782 		return retcode;
   1783 #endif
   1784 	default:
   1785 		/*
   1786 		 * Don't bother trying to load compat modules
   1787 		 * if it is not our ioctl. This is more efficient
   1788 		 * and makes rump tests not depend on compat code
   1789 		 */
   1790 		if (IOCGROUP(cmd) != 'r')
   1791 			break;
   1792 #ifdef _LP64
   1793 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1794 			module_autoload("compat_netbsd32_raid",
   1795 			    MODULE_CLASS_EXEC);
   1796 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1797 			    (rs, cmd, data), enosys(), retcode);
   1798 			if (retcode != EPASSTHROUGH)
   1799 				return retcode;
   1800 		}
   1801 #endif
   1802 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1803 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1804 		    (rs, cmd, data), enosys(), retcode);
   1805 		if (retcode != EPASSTHROUGH)
   1806 			return retcode;
   1807 
   1808 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1809 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1810 		    (rs, cmd, data), enosys(), retcode);
   1811 		if (retcode != EPASSTHROUGH)
   1812 			return retcode;
   1813 		break; /* fall through to the os-specific code below */
   1814 
   1815 	}
   1816 
   1817 	if (!raidPtr->valid)
   1818 		return EINVAL;
   1819 
   1820 	/*
   1821 	 * Add support for "regular" device ioctls here.
   1822 	 */
   1823 
   1824 	switch (cmd) {
   1825 	case DIOCGCACHE:
   1826 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1827 		break;
   1828 
   1829 	case DIOCCACHESYNC:
   1830 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1831 		break;
   1832 
   1833 	default:
   1834 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1835 		break;
   1836 	}
   1837 
   1838 	return retcode;
   1839 
   1840 }
   1841 
   1842 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, hook up the dk(4) and
   disk(9) layers, and mark the unit usable. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* look for wedges (GPT partitions etc.) on the new disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1902 
   1903 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1904 /* wake up the daemon & tell it to get us a spare table
   1905  * XXX
   1906  * the entries in the queues should be tagged with the raidPtr
   1907  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1909  * XXX
   1910  *
   1911  * XXX This code is not currently used. GO
   1912  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the global wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the response; note `req' now points at the response
	 * entry, not the request we queued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The failed column is the daemon's return value. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1936 #endif
   1937 
   1938 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1939  * bp & passes it down.
   1940  * any calls originating in the kernel must use non-blocking I/O
   1941  * do some extra sanity checking to return "appropriate" error values for
   1942  * certain conditions (to make some standard utilities work)
   1943  *
   1944  * Formerly known as: rf_DoAccessKernel
   1945  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the lock while writing component labels, as
		 * rf_update_component_labels() does component I/O. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to dispatch I/O before raidinit() has finished. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Hand any queued buffers to the dk layer for dispatch. */
	dk_start(dksc, NULL);
}
   1972 
   1973 static int
   1974 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1975 {
   1976 	RF_SectorCount_t num_blocks, pb, sum;
   1977 	RF_RaidAddr_t raid_addr;
   1978 	daddr_t blocknum;
   1979 	int rc;
   1980 
   1981 	rf_lock_mutex2(raidPtr->mutex);
   1982 	if (raidPtr->openings == 0) {
   1983 		rf_unlock_mutex2(raidPtr->mutex);
   1984 		return EAGAIN;
   1985 	}
   1986 	rf_unlock_mutex2(raidPtr->mutex);
   1987 
   1988 	blocknum = bp->b_rawblkno;
   1989 
   1990 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1991 		    (int) blocknum));
   1992 
   1993 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1994 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1995 
   1996 	/* *THIS* is where we adjust what block we're going to...
   1997 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1998 	raid_addr = blocknum;
   1999 
   2000 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2001 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2002 	sum = raid_addr + num_blocks + pb;
   2003 	if (1 || rf_debugKernelAccess) {
   2004 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2005 			    (int) raid_addr, (int) sum, (int) num_blocks,
   2006 			    (int) pb, (int) bp->b_resid));
   2007 	}
   2008 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2009 	    || (sum < num_blocks) || (sum < pb)) {
   2010 		rc = ENOSPC;
   2011 		goto done;
   2012 	}
   2013 	/*
   2014 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2015 	 */
   2016 
   2017 	if (bp->b_bcount & raidPtr->sectorMask) {
   2018 		rc = ENOSPC;
   2019 		goto done;
   2020 	}
   2021 	db1_printf(("Calling DoAccess..\n"));
   2022 
   2023 
   2024 	rf_lock_mutex2(raidPtr->mutex);
   2025 	raidPtr->openings--;
   2026 	rf_unlock_mutex2(raidPtr->mutex);
   2027 
   2028 	/* don't ever condition on bp->b_flags & B_WRITE.
   2029 	 * always condition on B_READ instead */
   2030 
   2031 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2032 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2033 			 raid_addr, num_blocks,
   2034 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2035 
   2036 done:
   2037 	return rc;
   2038 }
   2039 
   2040 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2041 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* Map RAIDframe I/O type onto buf flags; NOP is handled below. */
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component device; KernelWakeupFunc
		 * runs at biodone time. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2115 /* this is the callback function associated with a I/O invoked from
   2116    kernel code.
   2117  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The originating request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update from
			 * raidstart() on the next access. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2184 
   2185 
   2186 /*
   2187  * initialize a buf structure for doing an I/O in the kernel.
   2188  */
   2189 static void
   2190 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2191        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2192        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2193 {
   2194 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2195 	bp->b_oflags = 0;
   2196 	bp->b_cflags = 0;
   2197 	bp->b_bcount = numSect << logBytesPerSector;
   2198 	bp->b_bufsize = bp->b_bcount;
   2199 	bp->b_error = 0;
   2200 	bp->b_dev = dev;
   2201 	bp->b_data = bf;
   2202 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2203 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2204 	if (bp->b_bcount == 0) {
   2205 		panic("bp->b_bcount is zero in InitBP!!");
   2206 	}
   2207 	bp->b_iodone = cbFunc;
   2208 	bp->b_private = cbArg;
   2209 }
   2210 
   2211 /*
   2212  * Wait interruptibly for an exclusive lock.
   2213  *
   2214  * XXX
   2215  * Several drivers do this; it should be abstracted and made MP-safe.
   2216  * (Hmm... where have we seen this warning before :->  GO )
   2217  */
   2218 static int
   2219 raidlock(struct raid_softc *rs)
   2220 {
   2221 	int     error;
   2222 
   2223 	error = 0;
   2224 	mutex_enter(&rs->sc_mutex);
   2225 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2226 		rs->sc_flags |= RAIDF_WANTED;
   2227 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2228 		if (error != 0)
   2229 			goto done;
   2230 	}
   2231 	rs->sc_flags |= RAIDF_LOCKED;
   2232 done:
   2233 	mutex_exit(&rs->sc_mutex);
   2234 	return error;
   2235 }
   2236 /*
   2237  * Unlock and wake up any waiters.
   2238  */
   2239 static void
   2240 raidunlock(struct raid_softc *rs)
   2241 {
   2242 
   2243 	mutex_enter(&rs->sc_mutex);
   2244 	rs->sc_flags &= ~RAIDF_LOCKED;
   2245 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2246 		rs->sc_flags &= ~RAIDF_WANTED;
   2247 		cv_broadcast(&rs->sc_cv);
   2248 	}
   2249 	mutex_exit(&rs->sc_mutex);
   2250 }
   2251 
   2252 
   2253 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2254 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2255 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2256 
   2257 static daddr_t
   2258 rf_component_info_offset(void)
   2259 {
   2260 
   2261 	return RF_COMPONENT_INFO_OFFSET;
   2262 }
   2263 
   2264 static daddr_t
   2265 rf_component_info_size(unsigned secsize)
   2266 {
   2267 	daddr_t info_size;
   2268 
   2269 	KASSERT(secsize);
   2270 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2271 		info_size = secsize;
   2272 	else
   2273 		info_size = RF_COMPONENT_INFO_SIZE;
   2274 
   2275 	return info_size;
   2276 }
   2277 
   2278 static daddr_t
   2279 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2280 {
   2281 	daddr_t map_offset;
   2282 
   2283 	KASSERT(raidPtr->bytesPerSector);
   2284 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2285 		map_offset = raidPtr->bytesPerSector;
   2286 	else
   2287 		map_offset = RF_COMPONENT_INFO_SIZE;
   2288 	map_offset += rf_component_info_offset();
   2289 
   2290 	return map_offset;
   2291 }
   2292 
   2293 static daddr_t
   2294 rf_parity_map_size(RF_Raid_t *raidPtr)
   2295 {
   2296 	daddr_t map_size;
   2297 
   2298 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2299 		map_size = raidPtr->bytesPerSector;
   2300 	else
   2301 		map_size = RF_PARITY_MAP_SIZE;
   2302 
   2303 	return map_size;
   2304 }
   2305 
   2306 int
   2307 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2308 {
   2309 	RF_ComponentLabel_t *clabel;
   2310 
   2311 	clabel = raidget_component_label(raidPtr, col);
   2312 	clabel->clean = RF_RAID_CLEAN;
   2313 	raidflush_component_label(raidPtr, col);
   2314 	return(0);
   2315 }
   2316 
   2317 
   2318 int
   2319 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2320 {
   2321 	RF_ComponentLabel_t *clabel;
   2322 
   2323 	clabel = raidget_component_label(raidPtr, col);
   2324 	clabel->clean = RF_RAID_DIRTY;
   2325 	raidflush_component_label(raidPtr, col);
   2326 	return(0);
   2327 }
   2328 
   2329 int
   2330 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2331 {
   2332 	KASSERT(raidPtr->bytesPerSector);
   2333 
   2334 	return raidread_component_label(raidPtr->bytesPerSector,
   2335 	    raidPtr->Disks[col].dev,
   2336 	    raidPtr->raid_cinfo[col].ci_vp,
   2337 	    &raidPtr->raid_cinfo[col].ci_label);
   2338 }
   2339 
   2340 RF_ComponentLabel_t *
   2341 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2342 {
   2343 	return &raidPtr->raid_cinfo[col].ci_label;
   2344 }
   2345 
   2346 int
   2347 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2348 {
   2349 	RF_ComponentLabel_t *label;
   2350 
   2351 	label = &raidPtr->raid_cinfo[col].ci_label;
   2352 	label->mod_counter = raidPtr->mod_counter;
   2353 #ifndef RF_NO_PARITY_MAP
   2354 	label->parity_map_modcount = label->mod_counter;
   2355 #endif
   2356 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2357 	    raidPtr->Disks[col].dev,
   2358 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2359 }
   2360 
   2361 /*
   2362  * Swap the label endianness.
   2363  *
   2364  * Everything in the component label is 4-byte-swapped except the version,
   2365  * which is kept in the byte-swapped version at all times, and indicates
   2366  * for the writer that a swap is necessary.
   2367  *
   2368  * For reads it is expected that out_label == clabel, but writes expect
   2369  * separate labels so only the re-swapped label is written out to disk,
   2370  * leaving the swapped-except-version internally.
   2371  *
   2372  * Only support swapping label version 2.
   2373  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* Only version-2 labels (seen byte-swapped) are supported. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as an array of 32-bit words, from serial_number
	 * through the end of future_use2, swapping each word.  The span
	 * is fixed by the on-disk layout — do not change these bounds. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2391 
   2392 static int
   2393 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2394     RF_ComponentLabel_t *clabel)
   2395 {
   2396 	int error;
   2397 
   2398 	error = raidread_component_area(dev, b_vp, clabel,
   2399 	    sizeof(RF_ComponentLabel_t),
   2400 	    rf_component_info_offset(),
   2401 	    rf_component_info_size(secsize));
   2402 
   2403 	if (error == 0 &&
   2404 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2405 		rf_swap_label(clabel, clabel);
   2406 	}
   2407 
   2408 	return error;
   2409 }
   2410 
   2411 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Synchronous read: issue and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Only the first msize bytes of the dsize-byte read are wanted. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2448 
   2449 static int
   2450 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2451     RF_ComponentLabel_t *clabel)
   2452 {
   2453 	RF_ComponentLabel_t *clabel_write = clabel;
   2454 	RF_ComponentLabel_t lclabel;
   2455 	int error;
   2456 
   2457 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2458 		clabel_write = &lclabel;
   2459 		rf_swap_label(clabel, clabel_write);
   2460 	}
   2461 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2462 	    sizeof(RF_ComponentLabel_t),
   2463 	    rf_component_info_offset(),
   2464 	    rf_component_info_size(secsize));
   2465 
   2466 	return error;
   2467 }
   2468 
   2469 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE;
 	bp->b_resid = dsize;

	/* Zero-fill, then place the msize bytes of payload at the front,
	 * so the full dsize area on disk is deterministic. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	/* Synchronous write: issue and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2501 
   2502 void
   2503 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2504 {
   2505 	int c;
   2506 
   2507 	for (c = 0; c < raidPtr->numCol; c++) {
   2508 		/* Skip dead disks. */
   2509 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2510 			continue;
   2511 		/* XXXjld: what if an error occurs here? */
   2512 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2513 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2514 		    RF_PARITYMAP_NBYTE,
   2515 		    rf_parity_map_offset(raidPtr),
   2516 		    rf_parity_map_size(raidPtr));
   2517 	}
   2518 }
   2519 
   2520 void
   2521 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2522 {
   2523 	struct rf_paritymap_ondisk tmp;
   2524 	int c,first;
   2525 
   2526 	first=1;
   2527 	for (c = 0; c < raidPtr->numCol; c++) {
   2528 		/* Skip dead disks. */
   2529 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2530 			continue;
   2531 		raidread_component_area(raidPtr->Disks[c].dev,
   2532 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2533 		    RF_PARITYMAP_NBYTE,
   2534 		    rf_parity_map_offset(raidPtr),
   2535 		    rf_parity_map_size(raidPtr));
   2536 		if (first) {
   2537 			memcpy(map, &tmp, sizeof(*map));
   2538 			first = 0;
   2539 		} else {
   2540 			rf_paritymap_merge(map, &tmp);
   2541 		}
   2542 	}
   2543 }
   2544 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the mod counter so these label writes supersede older ones. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Spares in use get a label too; spares live in columns
	 * numCol..numCol+numSpare-1. */
	for (c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;

		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2605 
   2606 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* Bump the mod counter so these label writes supersede older ones. */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only a final (shutdown/unconfigure) update with
			 * known-good parity may set the clean bit. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Spares live in columns numCol..numCol+numSpare-1. */
	for (c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;

		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2685 
   2686 void
   2687 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2688 {
   2689 
   2690 	if (vp != NULL) {
   2691 		if (auto_configured == 1) {
   2692 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2693 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2694 			vput(vp);
   2695 
   2696 		} else {
   2697 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2698 		}
   2699 	}
   2700 }
   2701 
   2702 
   2703 void
   2704 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2705 {
   2706 	int r,c;
   2707 	struct vnode *vp;
   2708 	int acd;
   2709 
   2710 
   2711 	/* We take this opportunity to close the vnodes like we should.. */
   2712 
   2713 	for (c = 0; c < raidPtr->numCol; c++) {
   2714 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2715 		acd = raidPtr->Disks[c].auto_configured;
   2716 		rf_close_component(raidPtr, vp, acd);
   2717 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2718 		raidPtr->Disks[c].auto_configured = 0;
   2719 	}
   2720 
   2721 	for (r = 0; r < raidPtr->numSpare; r++) {
   2722 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2723 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2724 		rf_close_component(raidPtr, vp, acd);
   2725 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2726 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2727 	}
   2728 }
   2729 
   2730 
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: fail a disk and (optionally) reconstruct
	 * onto a spare, then exit. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Honour a forced reconstruction for the duration of the call. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2760 
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* Kernel thread body: rewrite all parity, then exit. */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2793 
   2794 
   2795 static void
   2796 rf_CopybackThread(RF_Raid_t *raidPtr)
   2797 {
   2798 	int s;
   2799 
   2800 	raidPtr->copyback_in_progress = 1;
   2801 	s = splbio();
   2802 	rf_CopybackReconstructedData(raidPtr);
   2803 	splx(s);
   2804 	raidPtr->copyback_in_progress = 0;
   2805 
   2806 	/* That's all... */
   2807 	kthread_exit(0);	/* does not return */
   2808 }
   2809 
   2810 
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* Kernel thread body: reconstruct a failed component in place,
	 * then exit. */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Honour a forced reconstruction for the duration of the call. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2838 
   2839 static RF_AutoConfig_t *
   2840 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2841     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2842     unsigned secsize)
   2843 {
   2844 	int good_one = 0;
   2845 	RF_ComponentLabel_t *clabel;
   2846 	RF_AutoConfig_t *ac;
   2847 
   2848 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
   2849 
   2850 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2851 		/* Got the label.  Does it look reasonable? */
   2852 		if (rf_reasonable_label(clabel, numsecs) &&
   2853 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2854 #ifdef DEBUG
   2855 			printf("Component on: %s: %llu\n",
   2856 				cname, (unsigned long long)size);
   2857 			rf_print_component_label(clabel);
   2858 #endif
   2859 			/* if it's reasonable, add it, else ignore it. */
   2860 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2861 				M_WAITOK);
   2862 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2863 			ac->dev = dev;
   2864 			ac->vp = vp;
   2865 			ac->clabel = clabel;
   2866 			ac->next = ac_list;
   2867 			ac_list = ac;
   2868 			good_one = 1;
   2869 		}
   2870 	}
   2871 	if (!good_one) {
   2872 		/* cleanup */
   2873 		free(clabel, M_RAIDFRAME);
   2874 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2875 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2876 		vput(vp);
   2877 	}
   2878 	return ac_list;
   2879 }
   2880 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t entries, one per component
 * found.  Components are looked for in wedges, in disklabel
 * partitions of type FS_RAID, and (as a last resort) on the raw
 * partition of disks that had no other RAID component.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges have no partitions; plain disks use RAW_PART */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedges pass: accept only wedges whose
				   partition type says RAIDframe */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the vnode */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* check every FS_RAID partition in the disklabel */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3104 
   3105 int
   3106 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3107 {
   3108 
   3109 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3110 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3111 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3112 	    (clabel->clean == RF_RAID_CLEAN ||
   3113 	     clabel->clean == RF_RAID_DIRTY) &&
   3114 	    clabel->row >=0 &&
   3115 	    clabel->column >= 0 &&
   3116 	    clabel->num_rows > 0 &&
   3117 	    clabel->num_columns > 0 &&
   3118 	    clabel->row < clabel->num_rows &&
   3119 	    clabel->column < clabel->num_columns &&
   3120 	    clabel->blockSize > 0 &&
   3121 	    /*
   3122 	     * numBlocksHi may contain garbage, but it is ok since
   3123 	     * the type is unsigned.  If it is really garbage,
   3124 	     * rf_fix_old_label_size() will fix it.
   3125 	     */
   3126 	    rf_component_label_numblocks(clabel) > 0) {
   3127 		/*
   3128 		 * label looks reasonable enough...
   3129 		 * let's make sure it has no old garbage.
   3130 		 */
   3131 		if (numsecs)
   3132 			rf_fix_old_label_size(clabel, numsecs);
   3133 		return(1);
   3134 	}
   3135 	return(0);
   3136 }
   3137 
   3138 
   3139 /*
   3140  * For reasons yet unknown, some old component labels have garbage in
   3141  * the newer numBlocksHi region, and this causes lossage.  Since those
   3142  * disks will also have numsecs set to less than 32 bits of sectors,
   3143  * we can determine when this corruption has occurred, and fix it.
   3144  *
   3145  * The exact same problem, with the same unknown reason, happens to
   3146  * the partitionSizeHi member as well.
   3147  */
   3148 static void
   3149 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3150 {
   3151 
   3152 	if (numsecs < ((uint64_t)1 << 32)) {
   3153 		if (clabel->numBlocksHi) {
   3154 			printf("WARNING: total sectors < 32 bits, yet "
   3155 			       "numBlocksHi set\n"
   3156 			       "WARNING: resetting numBlocksHi to zero.\n");
   3157 			clabel->numBlocksHi = 0;
   3158 		}
   3159 
   3160 		if (clabel->partitionSizeHi) {
   3161 			printf("WARNING: total sectors < 32 bits, yet "
   3162 			       "partitionSizeHi set\n"
   3163 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3164 			clabel->partitionSizeHi = 0;
   3165 		}
   3166 	}
   3167 }
   3168 
   3169 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console, one logical group per line.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* printable names for root_partition values 0..2; 3 is invalid */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so out-of-range values print "*invalid*" */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3203 
   3204 static RF_ConfigSet_t *
   3205 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3206 {
   3207 	RF_AutoConfig_t *ac;
   3208 	RF_ConfigSet_t *config_sets;
   3209 	RF_ConfigSet_t *cset;
   3210 	RF_AutoConfig_t *ac_next;
   3211 
   3212 
   3213 	config_sets = NULL;
   3214 
   3215 	/* Go through the AutoConfig list, and figure out which components
   3216 	   belong to what sets.  */
   3217 	ac = ac_list;
   3218 	while(ac!=NULL) {
   3219 		/* we're going to putz with ac->next, so save it here
   3220 		   for use at the end of the loop */
   3221 		ac_next = ac->next;
   3222 
   3223 		if (config_sets == NULL) {
   3224 			/* will need at least this one... */
   3225 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3226 				       M_RAIDFRAME, M_WAITOK);
   3227 			/* this one is easy :) */
   3228 			config_sets->ac = ac;
   3229 			config_sets->next = NULL;
   3230 			config_sets->rootable = 0;
   3231 			ac->next = NULL;
   3232 		} else {
   3233 			/* which set does this component fit into? */
   3234 			cset = config_sets;
   3235 			while(cset!=NULL) {
   3236 				if (rf_does_it_fit(cset, ac)) {
   3237 					/* looks like it matches... */
   3238 					ac->next = cset->ac;
   3239 					cset->ac = ac;
   3240 					break;
   3241 				}
   3242 				cset = cset->next;
   3243 			}
   3244 			if (cset==NULL) {
   3245 				/* didn't find a match above... new set..*/
   3246 				cset = malloc(sizeof(RF_ConfigSet_t),
   3247 					       M_RAIDFRAME, M_WAITOK);
   3248 				cset->ac = ac;
   3249 				ac->next = NULL;
   3250 				cset->next = config_sets;
   3251 				cset->rootable = 0;
   3252 				config_sets = cset;
   3253 			}
   3254 		}
   3255 		ac = ac_next;
   3256 	}
   3257 
   3258 
   3259 	return(config_sets);
   3260 }
   3261 
   3262 static int
   3263 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3264 {
   3265 	RF_ComponentLabel_t *clabel1, *clabel2;
   3266 
   3267 	/* If this one matches the *first* one in the set, that's good
   3268 	   enough, since the other members of the set would have been
   3269 	   through here too... */
   3270 	/* note that we are not checking partitionSize here..
   3271 
   3272 	   Note that we are also not checking the mod_counters here.
   3273 	   If everything else matches except the mod_counter, that's
   3274 	   good enough for this test.  We will deal with the mod_counters
   3275 	   a little later in the autoconfiguration process.
   3276 
   3277 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3278 
   3279 	   The reason we don't check for this is that failed disks
   3280 	   will have lower modification counts.  If those disks are
   3281 	   not added to the set they used to belong to, then they will
   3282 	   form their own set, which may result in 2 different sets,
   3283 	   for example, competing to be configured at raid0, and
   3284 	   perhaps competing to be the root filesystem set.  If the
   3285 	   wrong ones get configured, or both attempt to become /,
   3286 	   weird behaviour and or serious lossage will occur.  Thus we
   3287 	   need to bring them into the fold here, and kick them out at
   3288 	   a later point.
   3289 
   3290 	*/
   3291 
   3292 	clabel1 = cset->ac->clabel;
   3293 	clabel2 = ac->clabel;
   3294 	if ((clabel1->version == clabel2->version) &&
   3295 	    (clabel1->serial_number == clabel2->serial_number) &&
   3296 	    (clabel1->num_rows == clabel2->num_rows) &&
   3297 	    (clabel1->num_columns == clabel2->num_columns) &&
   3298 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3299 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3300 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3301 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3302 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3303 	    (clabel1->blockSize == clabel2->blockSize) &&
   3304 	    rf_component_label_numblocks(clabel1) ==
   3305 	    rf_component_label_numblocks(clabel2) &&
   3306 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3307 	    (clabel1->root_partition == clabel2->root_partition) &&
   3308 	    (clabel1->last_unit == clabel2->last_unit) &&
   3309 	    (clabel1->config_order == clabel2->config_order)) {
   3310 		/* if it get's here, it almost *has* to be a match */
   3311 	} else {
   3312 		/* it's not consistent with somebody in the set..
   3313 		   punt */
   3314 		return(0);
   3315 	}
   3316 	/* all was fine.. it must fit... */
   3317 	return(1);
   3318 }
   3319 
/*
 * Decide whether config set 'cset' has enough live components to be
 * configured.  Components whose mod_counter is below the set's
 * maximum are treated as failed.  Returns 1 if the set is viable,
 * 0 otherwise.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum over all members; stale members have lower values) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* for each column, look for a member with an up-to-date label */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* both halves of a
						   mirror pair are gone */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3422 
   3423 static void
   3424 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3425 			RF_Raid_t *raidPtr)
   3426 {
   3427 	RF_ComponentLabel_t *clabel;
   3428 	int i;
   3429 
   3430 	clabel = ac->clabel;
   3431 
   3432 	/* 1. Fill in the common stuff */
   3433 	config->numCol = clabel->num_columns;
   3434 	config->numSpare = 0; /* XXX should this be set here? */
   3435 	config->sectPerSU = clabel->sectPerSU;
   3436 	config->SUsPerPU = clabel->SUsPerPU;
   3437 	config->SUsPerRU = clabel->SUsPerRU;
   3438 	config->parityConfig = clabel->parityConfig;
   3439 	/* XXX... */
   3440 	strcpy(config->diskQueueType,"fifo");
   3441 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3442 	config->layoutSpecificSize = 0; /* XXX ?? */
   3443 
   3444 	while(ac!=NULL) {
   3445 		/* row/col values will be in range due to the checks
   3446 		   in reasonable_label() */
   3447 		strcpy(config->devnames[0][ac->clabel->column],
   3448 		       ac->devname);
   3449 		ac = ac->next;
   3450 	}
   3451 
   3452 	for(i=0;i<RF_MAXDBGV;i++) {
   3453 		config->debugVars[i][0] = 0;
   3454 	}
   3455 }
   3456 
   3457 static int
   3458 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3459 {
   3460 	RF_ComponentLabel_t *clabel;
   3461 	int column;
   3462 	int sparecol;
   3463 
   3464 	raidPtr->autoconfigure = new_value;
   3465 
   3466 	for(column=0; column<raidPtr->numCol; column++) {
   3467 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3468 			clabel = raidget_component_label(raidPtr, column);
   3469 			clabel->autoconfigure = new_value;
   3470 			raidflush_component_label(raidPtr, column);
   3471 		}
   3472 	}
   3473 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3474 		sparecol = raidPtr->numCol + column;
   3475 
   3476 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3477 			clabel = raidget_component_label(raidPtr, sparecol);
   3478 			clabel->autoconfigure = new_value;
   3479 			raidflush_component_label(raidPtr, sparecol);
   3480 		}
   3481 	}
   3482 	return(new_value);
   3483 }
   3484 
   3485 static int
   3486 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3487 {
   3488 	RF_ComponentLabel_t *clabel;
   3489 	int column;
   3490 	int sparecol;
   3491 
   3492 	raidPtr->root_partition = new_value;
   3493 	for(column=0; column<raidPtr->numCol; column++) {
   3494 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3495 			clabel = raidget_component_label(raidPtr, column);
   3496 			clabel->root_partition = new_value;
   3497 			raidflush_component_label(raidPtr, column);
   3498 		}
   3499 	}
   3500 	for (column = 0; column < raidPtr->numSpare ; column++) {
   3501 		sparecol = raidPtr->numCol + column;
   3502 
   3503 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3504 			clabel = raidget_component_label(raidPtr, sparecol);
   3505 			clabel->root_partition = new_value;
   3506 			raidflush_component_label(raidPtr, sparecol);
   3507 		}
   3508 	}
   3509 	return(new_value);
   3510 }
   3511 
   3512 static void
   3513 rf_release_all_vps(RF_ConfigSet_t *cset)
   3514 {
   3515 	RF_AutoConfig_t *ac;
   3516 
   3517 	ac = cset->ac;
   3518 	while(ac!=NULL) {
   3519 		/* Close the vp, and give it back */
   3520 		if (ac->vp) {
   3521 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3522 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3523 			vput(ac->vp);
   3524 			ac->vp = NULL;
   3525 		}
   3526 		ac = ac->next;
   3527 	}
   3528 }
   3529 
   3530 
   3531 static void
   3532 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3533 {
   3534 	RF_AutoConfig_t *ac;
   3535 	RF_AutoConfig_t *next_ac;
   3536 
   3537 	ac = cset->ac;
   3538 	while(ac!=NULL) {
   3539 		next_ac = ac->next;
   3540 		/* nuke the label */
   3541 		free(ac->clabel, M_RAIDFRAME);
   3542 		/* cleanup the config structure */
   3543 		free(ac, M_RAIDFRAME);
   3544 		/* "next.." */
   3545 		ac = next_ac;
   3546 	}
   3547 	/* and, finally, nuke the config set */
   3548 	free(cset, M_RAIDFRAME);
   3549 }
   3550 
   3551 
/*
 * Populate a component label from the current state of the RAID set:
 * identity (serial number, mod counter), geometry, layout parameters,
 * and the autoconfigure/root settings.  The caller supplies the
 * label storage and is responsible for any per-component fields.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are always 1 in this implementation */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3585 
/*
 * Configure one autodetected config set.  Picks a raid unit number
 * (preferring the one recorded in the labels), builds an RF_Config_t
 * from the labels, and runs the normal configuration path.  Returns
 * the softc of the configured set, or NULL on failure.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from the recorded unit until we find a free one */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at that unit: create one */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3657 
/*
 * Initialize one of the per-set resource pools.  w_chan (at least
 * RF_MAX_POOLNAMELEN bytes, owned by the caller) receives the pool's
 * wait-channel name; xmin items are pre-allocated and xmax is set as
 * the high-water mark.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3670 
   3671 
   3672 /*
   3673  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3674  * to see if there is IO pending and if that IO could possibly be done
   3675  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3676  * otherwise.
   3677  *
   3678  */
   3679 int
   3680 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3681 {
   3682 	struct raid_softc *rs;
   3683 	struct dk_softc *dksc;
   3684 
   3685 	rs = raidPtr->softc;
   3686 	dksc = &rs->sc_dksc;
   3687 
   3688 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3689 		return 1;
   3690 
   3691 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3692 		/* there is work to do */
   3693 		return 0;
   3694 	}
   3695 	/* default is nothing to do */
   3696 	return 1;
   3697 }
   3698 
   3699 int
   3700 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3701 {
   3702 	uint64_t numsecs;
   3703 	unsigned secsize;
   3704 	int error;
   3705 
   3706 	error = getdisksize(vp, &numsecs, &secsize);
   3707 	if (error == 0) {
   3708 		diskPtr->blockSize = secsize;
   3709 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3710 		diskPtr->partitionSize = numsecs;
   3711 		return 0;
   3712 	}
   3713 	return error;
   3714 }
   3715 
/* autoconf match: raid is a pseudo-device, so always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3721 
/* autoconf attach: nothing to do here; real setup happens at
   configuration time (see raidinit() callers). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3726 
   3727 
   3728 static int
   3729 raid_detach(device_t self, int flags)
   3730 {
   3731 	int error;
   3732 	struct raid_softc *rs = raidsoftc(self);
   3733 
   3734 	if (rs == NULL)
   3735 		return ENXIO;
   3736 
   3737 	if ((error = raidlock(rs)) != 0)
   3738 		return error;
   3739 
   3740 	error = raid_detach_unlocked(rs);
   3741 
   3742 	raidunlock(rs);
   3743 
   3744 	/* XXX raid can be referenced here */
   3745 
   3746 	if (error)
   3747 		return error;
   3748 
   3749 	/* Free the softc */
   3750 	raidput(rs);
   3751 
   3752 	return 0;
   3753 }
   3754 
/*
 * Publish the set's fake disk geometry to the dk(9) layer: total
 * sectors, sector size, and sectors per "track" taken from the
 * stripe layout.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): the factor of 4 appears to be an arbitrary
	   choice for the synthetic geometry — confirm before relying
	   on it */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3770 
   3771 /*
   3772  * Get cache info for all the components (including spares).
   3773  * Returns intersection of all the cache flags of all disks, or first
   3774  * error if any encountered.
   3775  * XXXfua feature flags can change as spares are added - lock down somehow
   3776  */
   3777 static int
   3778 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3779 {
   3780 	int c;
   3781 	int error;
   3782 	int dkwhole = 0, dkpart;
   3783 
   3784 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3785 		/*
   3786 		 * Check any non-dead disk, even when currently being
   3787 		 * reconstructed.
   3788 		 */
   3789 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   3790 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3791 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3792 			if (error) {
   3793 				if (error != ENODEV) {
   3794 					printf("raid%d: get cache for component %s failed\n",
   3795 					    raidPtr->raidid,
   3796 					    raidPtr->Disks[c].devname);
   3797 				}
   3798 
   3799 				return error;
   3800 			}
   3801 
   3802 			if (c == 0)
   3803 				dkwhole = dkpart;
   3804 			else
   3805 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3806 		}
   3807 	}
   3808 
   3809 	*data = dkwhole;
   3810 
   3811 	return 0;
   3812 }
   3813 
   3814 /*
   3815  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3816  * We end up returning whatever error was returned by the first cache flush
   3817  * that fails.
   3818  */
   3819 
   3820 static int
   3821 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3822 {
   3823 	int e = 0;
   3824 	for (int i = 0; i < 5; i++) {
   3825 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3826 		    &force, FWRITE, NOCRED);
   3827 		if (!e || e == ENODEV)
   3828 			return e;
   3829 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3830 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3831 	}
   3832 	return e;
   3833 }
   3834 
   3835 int
   3836 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3837 {
   3838 	int c, error;
   3839 
   3840 	error = 0;
   3841 	for (c = 0; c < raidPtr->numCol; c++) {
   3842 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3843 			int e = rf_sync_component_cache(raidPtr, c, force);
   3844 			if (e && !error)
   3845 				error = e;
   3846 		}
   3847 	}
   3848 
   3849 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3850 		int sparecol = raidPtr->numCol + c;
   3851 
   3852 		/* Need to ensure that the reconstruct actually completed! */
   3853 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3854 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3855 			    force);
   3856 			if (e && !error)
   3857 				error = e;
   3858 		}
   3859 	}
   3860 	return error;
   3861 }
   3862 
   3863 /* Fill in info with the current status */
   3864 void
   3865 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3866 {
   3867 
   3868 	memset(info, 0, sizeof(*info));
   3869 
   3870 	if (raidPtr->status != rf_rs_reconstructing) {
   3871 		info->total = 100;
   3872 		info->completed = 100;
   3873 	} else {
   3874 		info->total = raidPtr->reconControl->numRUsTotal;
   3875 		info->completed = raidPtr->reconControl->numRUsComplete;
   3876 	}
   3877 	info->remaining = info->total - info->completed;
   3878 }
   3879 
   3880 /* Fill in info with the current status */
   3881 void
   3882 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3883 {
   3884 
   3885 	memset(info, 0, sizeof(*info));
   3886 
   3887 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3888 		info->total = raidPtr->Layout.numStripe;
   3889 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3890 	} else {
   3891 		info->completed = 100;
   3892 		info->total = 100;
   3893 	}
   3894 	info->remaining = info->total - info->completed;
   3895 }
   3896 
   3897 /* Fill in info with the current status */
   3898 void
   3899 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3900 {
   3901 
   3902 	memset(info, 0, sizeof(*info));
   3903 
   3904 	if (raidPtr->copyback_in_progress == 1) {
   3905 		info->total = raidPtr->Layout.numStripe;
   3906 		info->completed = raidPtr->copyback_stripes_done;
   3907 		info->remaining = info->total - info->completed;
   3908 	} else {
   3909 		info->remaining = 0;
   3910 		info->completed = 100;
   3911 		info->total = 100;
   3912 	}
   3913 }
   3914 
   3915 /* Fill in config with the current info */
   3916 int
   3917 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3918 {
   3919 	int	d, i, j;
   3920 
   3921 	if (!raidPtr->valid)
   3922 		return ENODEV;
   3923 	config->cols = raidPtr->numCol;
   3924 	config->ndevs = raidPtr->numCol;
   3925 	if (config->ndevs >= RF_MAX_DISKS)
   3926 		return ENOMEM;
   3927 	config->nspares = raidPtr->numSpare;
   3928 	if (config->nspares >= RF_MAX_DISKS)
   3929 		return ENOMEM;
   3930 	config->maxqdepth = raidPtr->maxQueueDepth;
   3931 	d = 0;
   3932 	for (j = 0; j < config->cols; j++) {
   3933 		config->devs[d] = raidPtr->Disks[j];
   3934 		d++;
   3935 	}
   3936 	for (i = 0; i < config->nspares; i++) {
   3937 		config->spares[i] = raidPtr->Disks[raidPtr->numCol + i];
   3938                 if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3939                         /* raidctl(8) expects to see this as a used spare */
   3940                         config->spares[i].status = rf_ds_used_spare;
   3941                 }
   3942 	}
   3943 	return 0;
   3944 }
   3945 
   3946 int
   3947 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3948 {
   3949 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3950 	RF_ComponentLabel_t *raid_clabel;
   3951 	int column = clabel->column;
   3952 
   3953 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3954 		return EINVAL;
   3955 	raid_clabel = raidget_component_label(raidPtr, column);
   3956 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3957 	/* Fix-up for userland. */
   3958 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3959 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3960 
   3961 	return 0;
   3962 }
   3963 
   3964 /*
   3965  * Module interface
   3966  */
   3967 
/* Loadable-module declaration; depends on dk_subr and bufq_fcfs. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a module, supply our own cfdriver. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module command dispatcher and its init/fini helpers. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3977 
   3978 static int
   3979 raid_modcmd(modcmd_t cmd, void *data)
   3980 {
   3981 	int error;
   3982 
   3983 	error = 0;
   3984 	switch (cmd) {
   3985 	case MODULE_CMD_INIT:
   3986 		error = raid_modcmd_init();
   3987 		break;
   3988 	case MODULE_CMD_FINI:
   3989 		error = raid_modcmd_fini();
   3990 		break;
   3991 	default:
   3992 		error = ENOTTY;
   3993 		break;
   3994 	}
   3995 	return error;
   3996 }
   3997 
/*
 * One-time driver/module initialization: create the global raid_lock,
 * attach the block/character devsw entries and the autoconfiguration
 * glue, boot the RAIDframe core, and register a finalizer that will
 * auto-configure RAID sets once real hardware has been found.
 * Each failure path unwinds the steps already completed.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated — presumably the built-in entries; verify. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back both earlier attachments. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here: all failing paths above have already returned. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the driver works without auto-configuration. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   4068 
/*
 * Module teardown: refuse while any raid unit exists, then detach the
 * autoconfiguration glue and devsw entries, shut down the RAIDframe
 * core, and destroy the global lock.  Mirrors raid_modcmd_init() in
 * reverse order; a cfdriver detach failure re-attaches the cfattach.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach so the driver stays usable. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	/* Shut down the RAIDframe core (inverse of the init-time boot). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4109