Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.410.4.2
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.410.4.2 2023/09/18 18:57:33 martin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.410.4.2 2023/09/18 18:57:33 martin Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
/*
 * Block device switch for raid(4): standard disk entry points,
 * dispatched to the raid* functions declared above.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/*
 * Character (raw) device switch for raid(4); tty/poll/mmap-style
 * operations are stubbed out with the no* placeholders.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/*
 * Hooks used by the common disk framework (dksubr) to drive this
 * disk: I/O start, crash-dump block writer and last-close processing.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
    256 /* Internal representation of a rf_recon_req */
    257 struct rf_recon_req_internal {
    258 	RF_RowCol_t col;
    259 	RF_ReconReqFlags_t flags;
    260 	void   *raidPtr;
    261 };
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
    272  * Even it if is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 static void rf_ReconThread(struct rf_recon_req_internal *);
    298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 static int rf_autoconfig(device_t);
    302 static int rf_rescan(void);
    303 static void rf_buildroothack(RF_ConfigSet_t *);
    304 
    305 static RF_AutoConfig_t *rf_find_raid_components(void);
    306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 static int rf_set_autoconfig(RF_Raid_t *, int);
    310 static int rf_set_rootpartition(RF_Raid_t *, int);
    311 static void rf_release_all_vps(RF_ConfigSet_t *);
    312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 static int rf_have_enough_components(RF_ConfigSet_t *);
    314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct pool rf_alloclist_pool;   /* AllocList */
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
    343 static void
    344 raiddestroy(struct raid_softc *sc) {
    345 	cv_destroy(&sc->sc_cv);
    346 	mutex_destroy(&sc->sc_mutex);
    347 	kmem_free(sc, sizeof(*sc));
    348 }
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
/*
 * Unlink a softc from the global "raids" list and release it.
 * Counterpart of the create path in raidget().
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    383 
/*
 * Legacy pseudo-device attach entry point.  Intentionally empty:
 * device attachment and associated initialization now occurs as part
 * of the module initialization, so there is nothing to do here.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    393 
    394 static int
    395 rf_autoconfig(device_t self)
    396 {
    397 	RF_AutoConfig_t *ac_list;
    398 	RF_ConfigSet_t *config_sets;
    399 
    400 	if (!raidautoconfig || raidautoconfigdone == true)
    401 		return 0;
    402 
    403 	/* XXX This code can only be run once. */
    404 	raidautoconfigdone = true;
    405 
    406 #ifdef __HAVE_CPU_BOOTCONF
    407 	/*
    408 	 * 0. find the boot device if needed first so we can use it later
    409 	 * this needs to be done before we autoconfigure any raid sets,
    410 	 * because if we use wedges we are not going to be able to open
    411 	 * the boot device later
    412 	 */
    413 	if (booted_device == NULL)
    414 		cpu_bootconf();
    415 #endif
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 int
    433 rf_inited(const struct raid_softc *rs) {
    434 	return (rs->sc_flags & RAIDF_INITED) != 0;
    435 }
    436 
/* Accessor: the RF_Raid_t embedded in the softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    441 
/* Accessor: the raid(4) unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    446 
    447 static int
    448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    449 	const char *bootname;
    450 	size_t len;
    451 
    452 	/* if bdv is NULL, the set can't contain it. exit early. */
    453 	if (bdv == NULL)
    454 		return 0;
    455 
    456 	bootname = device_xname(bdv);
    457 	len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
    478 static int
    479 rf_rescan(void)
    480 {
    481 	RF_AutoConfig_t *ac_list;
    482 	RF_ConfigSet_t *config_sets, *cset, *next_cset;
    483 	struct raid_softc *sc;
    484 	int raid_added;
    485 
    486 	ac_list = rf_find_raid_components();
    487 	config_sets = rf_create_auto_sets(ac_list);
    488 
    489 	raid_added = 1;
    490 	while (raid_added > 0) {
    491 		raid_added = 0;
    492 		cset = config_sets;
    493 		while (cset != NULL) {
    494 			next_cset = cset->next;
    495 			if (rf_have_enough_components(cset) &&
    496 			    cset->ac->clabel->autoconfigure == 1) {
    497 				sc = rf_auto_config_set(cset);
    498 				if (sc != NULL) {
    499 					aprint_debug("raid%d: configured ok, rootable %d\n",
    500 						     sc->sc_unit, cset->rootable);
    501 					/* We added one RAID set */
    502 					raid_added++;
    503 				} else {
    504 					/* The autoconfig didn't work :( */
    505 					aprint_debug("Autoconfig failed\n");
    506 					rf_release_all_vps(cset);
    507 				}
    508 			} else {
    509 				/* we're not autoconfiguring this set...
    510 				   release the associated resources */
    511 				rf_release_all_vps(cset);
    512 			}
    513 			/* cleanup */
    514 			rf_cleanup_config_set(cset);
    515 			cset = next_cset;
    516 		}
    517 		if (raid_added > 0) {
    518 			/* We added at least one RAID set, so re-scan for recursive RAID */
    519 			ac_list = rf_find_raid_components();
    520 			config_sets = rf_create_auto_sets(ac_list);
    521 		}
    522 	}
    523 
    524 	return 0;
    525 }
    526 
    527 
/*
 * Configure all autoconfigurable RAID sets and, when appropriate,
 * point the kernel's root device at one of them.
 *
 * Pass 1: repeatedly configure every complete set whose component
 * labels request autoconfiguration, rescanning after each round so
 * that RAID-on-RAID stacks are assembled; remember sets whose labels
 * mark them rootable.  Pass 2: unless the user hardwired a root
 * ("rootspec"), choose booted_device:
 *  - exactly one rootable set: use it (a wedge if the set has wedges,
 *    otherwise the disk device, assuming partition 'a');
 *  - several rootable sets: pick the one containing the MD-determined
 *    boot device, or fall back to asking the user (RB_ASKNAME).
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable sets found */
	int raid_added;		/* sets configured in the current round */
	struct raid_softc *sc, *rsc;	/* rsc: candidate root set */
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Narrow the candidates down to the set that actually
		   contains the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    686 
    687 static int
    688 raidsize(dev_t dev)
    689 {
    690 	struct raid_softc *rs;
    691 	struct dk_softc *dksc;
    692 	unsigned int unit;
    693 
    694 	unit = raidunit(dev);
    695 	if ((rs = raidget(unit, false)) == NULL)
    696 		return -1;
    697 	dksc = &rs->sc_dksc;
    698 
    699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    700 		return -1;
    701 
    702 	return dk_size(dksc, dev);
    703 }
    704 
    705 static int
    706 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    707 {
    708 	unsigned int unit;
    709 	struct raid_softc *rs;
    710 	struct dk_softc *dksc;
    711 
    712 	unit = raidunit(dev);
    713 	if ((rs = raidget(unit, false)) == NULL)
    714 		return ENXIO;
    715 	dksc = &rs->sc_dksc;
    716 
    717 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    718 		return ENODEV;
    719 
    720         /*
    721            Note that blkno is relative to this particular partition.
    722            By adding adding RF_PROTECTED_SECTORS, we get a value that
    723 	   is relative to the partition used for the underlying component.
    724         */
    725 	blkno += RF_PROTECTED_SECTORS;
    726 
    727 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    728 }
    729 
/*
 * dkdriver d_dumpblocks hook: write "nblk" blocks at "va" starting at
 * component block "blkno" during a crash dump.  Only RAID 1 sets are
 * supported (exactly one data and one parity column); the dump goes
 * to a single live component, chosen by the preference order below.
 * Returns 0 on success or an errno (EINVAL/ENXIO/...).
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if any)
			   this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump straight to the component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    835 
    836 /* ARGSUSED */
    837 static int
    838 raidopen(dev_t dev, int flags, int fmt,
    839     struct lwp *l)
    840 {
    841 	int     unit = raidunit(dev);
    842 	struct raid_softc *rs;
    843 	struct dk_softc *dksc;
    844 	int     error = 0;
    845 	int     part, pmask;
    846 
    847 	if ((rs = raidget(unit, true)) == NULL)
    848 		return ENXIO;
    849 	if ((error = raidlock(rs)) != 0)
    850 		return error;
    851 
    852 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    853 		error = EBUSY;
    854 		goto bad;
    855 	}
    856 
    857 	dksc = &rs->sc_dksc;
    858 
    859 	part = DISKPART(dev);
    860 	pmask = (1 << part);
    861 
    862 	if (!DK_BUSY(dksc, pmask) &&
    863 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    864 		/* First one... mark things as dirty... Note that we *MUST*
    865 		 have done a configure before this.  I DO NOT WANT TO BE
    866 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    867 		 THAT THEY BELONG TOGETHER!!!!! */
    868 		/* XXX should check to see if we're only open for reading
    869 		   here... If so, we needn't do this, but then need some
    870 		   other way of keeping track of what's happened.. */
    871 
    872 		rf_markalldirty(&rs->sc_r);
    873 	}
    874 
    875 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    876 		error = dk_open(dksc, dev, flags, fmt, l);
    877 
    878 bad:
    879 	raidunlock(rs);
    880 
    881 	return error;
    882 
    883 
    884 }
    885 
    886 static int
    887 raid_lastclose(device_t self)
    888 {
    889 	struct raid_softc *rs = raidsoftc(self);
    890 
    891 	/* Last one... device is not unconfigured yet.
    892 	   Device shutdown has taken care of setting the
    893 	   clean bits if RAIDF_INITED is not set
    894 	   mark things as clean... */
    895 
    896 	rf_update_component_labels(&rs->sc_r,
    897 	    RF_FINAL_COMPONENT_UPDATE);
    898 
    899 	/* pass to unlocked code */
    900 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    901 		rs->sc_flags |= RAIDF_DETACH;
    902 
    903 	return 0;
    904 }
    905 
    906 /* ARGSUSED */
    907 static int
    908 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    909 {
    910 	int     unit = raidunit(dev);
    911 	struct raid_softc *rs;
    912 	struct dk_softc *dksc;
    913 	cfdata_t cf;
    914 	int     error = 0, do_detach = 0, do_put = 0;
    915 
    916 	if ((rs = raidget(unit, false)) == NULL)
    917 		return ENXIO;
    918 	dksc = &rs->sc_dksc;
    919 
    920 	if ((error = raidlock(rs)) != 0)
    921 		return error;
    922 
    923 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    924 		error = dk_close(dksc, dev, flags, fmt, l);
    925 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    926 			do_detach = 1;
    927 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    928 		do_put = 1;
    929 
    930 	raidunlock(rs);
    931 
    932 	if (do_detach) {
    933 		/* free the pseudo device attach bits */
    934 		cf = device_cfdata(dksc->sc_dev);
    935 		error = config_detach(dksc->sc_dev, 0);
    936 		if (error == 0)
    937 			free(cf, M_RAIDFRAME);
    938 	} else if (do_put) {
    939 		raidput(rs);
    940 	}
    941 
    942 	return error;
    943 
    944 }
    945 
    946 static void
    947 raid_wakeup(RF_Raid_t *raidPtr)
    948 {
    949 	rf_lock_mutex2(raidPtr->iodone_lock);
    950 	rf_signal_cond2(raidPtr->iodone_cv);
    951 	rf_unlock_mutex2(raidPtr->iodone_lock);
    952 }
    953 
    954 static void
    955 raidstrategy(struct buf *bp)
    956 {
    957 	unsigned int unit;
    958 	struct raid_softc *rs;
    959 	struct dk_softc *dksc;
    960 	RF_Raid_t *raidPtr;
    961 
    962 	unit = raidunit(bp->b_dev);
    963 	if ((rs = raidget(unit, false)) == NULL) {
    964 		bp->b_error = ENXIO;
    965 		goto fail;
    966 	}
    967 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    968 		bp->b_error = ENXIO;
    969 		goto fail;
    970 	}
    971 	dksc = &rs->sc_dksc;
    972 	raidPtr = &rs->sc_r;
    973 
    974 	/* Queue IO only */
    975 	if (dk_strategy_defer(dksc, bp))
    976 		goto done;
    977 
    978 	/* schedule the IO to happen at the next convenient time */
    979 	raid_wakeup(raidPtr);
    980 
    981 done:
    982 	return;
    983 
    984 fail:
    985 	bp->b_resid = bp->b_bcount;
    986 	biodone(bp);
    987 }
    988 
    989 static int
    990 raid_diskstart(device_t dev, struct buf *bp)
    991 {
    992 	struct raid_softc *rs = raidsoftc(dev);
    993 	RF_Raid_t *raidPtr;
    994 
    995 	raidPtr = &rs->sc_r;
    996 	if (!raidPtr->valid) {
    997 		db1_printf(("raid is not valid..\n"));
    998 		return ENODEV;
    999 	}
   1000 
   1001 	/* XXX */
   1002 	bp->b_resid = 0;
   1003 
   1004 	return raiddoaccess(raidPtr, bp);
   1005 }
   1006 
   1007 void
   1008 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
   1009 {
   1010 	struct raid_softc *rs;
   1011 	struct dk_softc *dksc;
   1012 
   1013 	rs = raidPtr->softc;
   1014 	dksc = &rs->sc_dksc;
   1015 
   1016 	dk_done(dksc, bp);
   1017 
   1018 	rf_lock_mutex2(raidPtr->mutex);
   1019 	raidPtr->openings++;
   1020 	rf_unlock_mutex2(raidPtr->mutex);
   1021 
   1022 	/* schedule more IO */
   1023 	raid_wakeup(raidPtr);
   1024 }
   1025 
   1026 /* ARGSUSED */
   1027 static int
   1028 raidread(dev_t dev, struct uio *uio, int flags)
   1029 {
   1030 	int     unit = raidunit(dev);
   1031 	struct raid_softc *rs;
   1032 
   1033 	if ((rs = raidget(unit, false)) == NULL)
   1034 		return ENXIO;
   1035 
   1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1037 		return ENXIO;
   1038 
   1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1040 
   1041 }
   1042 
   1043 /* ARGSUSED */
   1044 static int
   1045 raidwrite(dev_t dev, struct uio *uio, int flags)
   1046 {
   1047 	int     unit = raidunit(dev);
   1048 	struct raid_softc *rs;
   1049 
   1050 	if ((rs = raidget(unit, false)) == NULL)
   1051 		return ENXIO;
   1052 
   1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1054 		return ENXIO;
   1055 
   1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1057 
   1058 }
   1059 
/*
 * raid_detach_unlocked: tear down a configured RAID set.
 * Caller is expected to hold the unit lock; returns EBUSY if the
 * unit is still open or background work is running, the rf_Shutdown
 * error on shutdown failure, and 0 otherwise.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/*
	 * Refuse to detach while the unit is open or while any
	 * background operation (reconstruction, parity rewrite,
	 * copyback) is still in progress.
	 */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1097 
/*
 * rf_fail_disk: mark the component named in rr->col as failed and
 * start a reconstruction thread.  Returns EINVAL if the operation
 * makes no sense for this set (RAID 0, bad column, recon already
 * running, other failures outstanding, or spared disk), ENOMEM on
 * allocation failure, else the result of thread creation.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* All error exits above reach here with the mutex still held. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1146 
   1147 static int
   1148 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1149 {
   1150 	/* allocate a buffer for the layout-specific data, and copy it in */
   1151 	if (k_cfg->layoutSpecificSize == 0)
   1152 		return 0;
   1153 
   1154 	if (k_cfg->layoutSpecificSize > 10000) {
   1155 	    /* sanity check */
   1156 	    return EINVAL;
   1157 	}
   1158 
   1159 	u_char *specific_buf;
   1160 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1161 	if (specific_buf == NULL)
   1162 		return ENOMEM;
   1163 
   1164 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1165 	    k_cfg->layoutSpecificSize);
   1166 	if (retcode) {
   1167 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1168 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1169 		return retcode;
   1170 	}
   1171 
   1172 	k_cfg->layoutSpecific = specific_buf;
   1173 	return 0;
   1174 }
   1175 
   1176 static int
   1177 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1178 {
   1179 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1180 
   1181 	if (rs->sc_r.valid) {
   1182 		/* There is a valid RAID set running on this unit! */
   1183 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1184 		return EINVAL;
   1185 	}
   1186 
   1187 	/* copy-in the configuration information */
   1188 	/* data points to a pointer to the configuration structure */
   1189 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1190 	if (*k_cfg == NULL) {
   1191 		return ENOMEM;
   1192 	}
   1193 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1194 	if (retcode == 0)
   1195 		return 0;
   1196 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1197 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1198 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1199 	return retcode;
   1200 }
   1201 
   1202 int
   1203 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1204 {
   1205 	int retcode, i;
   1206 	RF_Raid_t *raidPtr = &rs->sc_r;
   1207 
   1208 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1209 
   1210 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1211 		goto out;
   1212 
   1213 	/* should do some kind of sanity check on the configuration.
   1214 	 * Store the sum of all the bytes in the last byte? */
   1215 
   1216 	/* Force nul-termination on all strings. */
   1217 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
   1218 	for (i = 0; i < RF_MAXCOL; i++) {
   1219 		ZERO_FINAL(k_cfg->devnames[0][i]);
   1220 	}
   1221 	for (i = 0; i < RF_MAXSPARE; i++) {
   1222 		ZERO_FINAL(k_cfg->spare_names[i]);
   1223 	}
   1224 	for (i = 0; i < RF_MAXDBGV; i++) {
   1225 		ZERO_FINAL(k_cfg->debugVars[i]);
   1226 	}
   1227 #undef ZERO_FINAL
   1228 
   1229 	/* Check some basic limits. */
   1230 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
   1231 		retcode = EINVAL;
   1232 		goto out;
   1233 	}
   1234 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
   1235 		retcode = EINVAL;
   1236 		goto out;
   1237 	}
   1238 
   1239 	/* configure the system */
   1240 
   1241 	/*
   1242 	 * Clear the entire RAID descriptor, just to make sure
   1243 	 *  there is no stale data left in the case of a
   1244 	 *  reconfiguration
   1245 	 */
   1246 	memset(raidPtr, 0, sizeof(*raidPtr));
   1247 	raidPtr->softc = rs;
   1248 	raidPtr->raidid = rs->sc_unit;
   1249 
   1250 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1251 
   1252 	if (retcode == 0) {
   1253 		/* allow this many simultaneous IO's to
   1254 		   this RAID device */
   1255 		raidPtr->openings = RAIDOUTSTANDING;
   1256 
   1257 		raidinit(rs);
   1258 		raid_wakeup(raidPtr);
   1259 		rf_markalldirty(raidPtr);
   1260 	}
   1261 
   1262 	/* free the buffers.  No return code here. */
   1263 	if (k_cfg->layoutSpecificSize) {
   1264 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1265 	}
   1266 out:
   1267 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1268 	if (retcode) {
   1269 		/*
   1270 		 * If configuration failed, set sc_flags so that we
   1271 		 * will detach the device when we close it.
   1272 		 */
   1273 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1274 	}
   1275 	return retcode;
   1276 }
   1277 
#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	int column;

	/* XXX check the label for valid stuff... */
	/*
	 * Note that some things *should not* get modified -- the user
	 * should be re-initing the labels instead of trying to patch
	 * things.
	 */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	column = clabel->column;

	if (column < 0 || column >= raidPtr->numCol)
		return EINVAL;

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1316 
   1317 static int
   1318 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1319 {
   1320 	/*
   1321 	   we only want the serial number from
   1322 	   the above.  We get all the rest of the information
   1323 	   from the config that was used to create this RAID
   1324 	   set.
   1325 	   */
   1326 
   1327 	raidPtr->serial_number = clabel->serial_number;
   1328 
   1329 	for (int column = 0; column < raidPtr->numCol; column++) {
   1330 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1331 		if (RF_DEAD_DISK(diskPtr->status))
   1332 			continue;
   1333 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1334 		    raidPtr, column);
   1335 		/* Zeroing this is important. */
   1336 		memset(ci_label, 0, sizeof(*ci_label));
   1337 		raid_init_component_label(raidPtr, ci_label);
   1338 		ci_label->serial_number = raidPtr->serial_number;
   1339 		ci_label->row = 0; /* we dont' pretend to support more */
   1340 		rf_component_label_set_partitionsize(ci_label,
   1341 		    diskPtr->partitionSize);
   1342 		ci_label->column = column;
   1343 		raidflush_component_label(raidPtr, column);
   1344 		/* XXXjld what about the spares? */
   1345 	}
   1346 
   1347 	return 0;
   1348 }
   1349 
   1350 static int
   1351 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
   1352 {
   1353 
   1354 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1355 		/* Can't do this on a RAID 0!! */
   1356 		return EINVAL;
   1357 	}
   1358 
   1359 	if (raidPtr->recon_in_progress == 1) {
   1360 		/* a reconstruct is already in progress! */
   1361 		return EINVAL;
   1362 	}
   1363 
   1364 	RF_SingleComponent_t component;
   1365 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1366 	component.row = 0; /* we don't support any more */
   1367 	int column = component.column;
   1368 
   1369 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1370 		return EINVAL;
   1371 	}
   1372 
   1373 	rf_lock_mutex2(raidPtr->mutex);
   1374 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1375 	    (raidPtr->numFailures > 0)) {
   1376 		/* XXX 0 above shouldn't be constant!!! */
   1377 		/* some component other than this has failed.
   1378 		   Let's not make things worse than they already
   1379 		   are... */
   1380 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1381 		       raidPtr->raidid);
   1382 		printf("raid%d:     Col: %d   Too many failures.\n",
   1383 		       raidPtr->raidid, column);
   1384 		rf_unlock_mutex2(raidPtr->mutex);
   1385 		return EINVAL;
   1386 	}
   1387 
   1388 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
   1389 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1390 		       raidPtr->raidid);
   1391 		printf("raid%d:    Col: %d   "
   1392 		    "Reconstruction already occurring!\n",
   1393 		    raidPtr->raidid, column);
   1394 
   1395 		rf_unlock_mutex2(raidPtr->mutex);
   1396 		return EINVAL;
   1397 	}
   1398 
   1399 	if (raidPtr->Disks[column].status == rf_ds_spared) {
   1400 		rf_unlock_mutex2(raidPtr->mutex);
   1401 		return EINVAL;
   1402 	}
   1403 
   1404 	rf_unlock_mutex2(raidPtr->mutex);
   1405 
   1406 	struct rf_recon_req_internal *rrint;
   1407 	rrint = RF_Malloc(sizeof(*rrint));
   1408 	if (rrint == NULL)
   1409 		return ENOMEM;
   1410 
   1411 	rrint->col = column;
   1412 	rrint->raidPtr = raidPtr;
   1413 
   1414 	return RF_CREATE_THREAD(raidPtr->recon_thread,
   1415 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
   1416 }
   1417 
   1418 static int
   1419 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1420 {
   1421 	/*
   1422 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1423 	 * so tell the user it's done.
   1424 	 */
   1425 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1426 	    raidPtr->status != rf_rs_reconstructing) {
   1427 		*data = 100;
   1428 		return 0;
   1429 	}
   1430 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1431 		*data = 0;
   1432 		return 0;
   1433 	}
   1434 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1435 	    / raidPtr->reconControl->numRUsTotal);
   1436 	return 0;
   1437 }
   1438 
   1439 /*
   1440  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1441  * on the component_name[] array.
   1442  */
   1443 static void
   1444 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1445 {
   1446 
   1447 	memcpy(component, data, sizeof *component);
   1448 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1449 }
   1450 
   1451 static int
   1452 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1453 {
   1454 	int     unit = raidunit(dev);
   1455 	int     part, pmask;
   1456 	struct raid_softc *rs;
   1457 	struct dk_softc *dksc;
   1458 	RF_Config_t *k_cfg;
   1459 	RF_Raid_t *raidPtr;
   1460 	RF_AccTotals_t *totals;
   1461 	RF_SingleComponent_t component;
   1462 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1463 	int retcode = 0;
   1464 	int column;
   1465 	RF_ComponentLabel_t *clabel;
   1466 	int d;
   1467 
   1468 	if ((rs = raidget(unit, false)) == NULL)
   1469 		return ENXIO;
   1470 
   1471 	dksc = &rs->sc_dksc;
   1472 	raidPtr = &rs->sc_r;
   1473 
   1474 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1475 	    (int) DISKPART(dev), (int) unit, cmd));
   1476 
   1477 	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
   1478 	switch (cmd) {
   1479 	case RAIDFRAME_CONFIGURE:
   1480 	case RAIDFRAME_RESCAN:
   1481 		break;
   1482 	default:
   1483 		if (!rf_inited(rs))
   1484 			return ENXIO;
   1485 	}
   1486 
   1487 	switch (cmd) {
   1488 		/* configure the system */
   1489 	case RAIDFRAME_CONFIGURE:
   1490 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1491 			return retcode;
   1492 		return rf_construct(rs, k_cfg);
   1493 
   1494 		/* shutdown the system */
   1495 	case RAIDFRAME_SHUTDOWN:
   1496 
   1497 		part = DISKPART(dev);
   1498 		pmask = (1 << part);
   1499 
   1500 		if ((retcode = raidlock(rs)) != 0)
   1501 			return retcode;
   1502 
   1503 		if (DK_BUSY(dksc, pmask) ||
   1504 		    raidPtr->recon_in_progress != 0 ||
   1505 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1506 		    raidPtr->copyback_in_progress != 0)
   1507 			retcode = EBUSY;
   1508 		else {
   1509 			/* detach and free on close */
   1510 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1511 			retcode = 0;
   1512 		}
   1513 
   1514 		raidunlock(rs);
   1515 
   1516 		return retcode;
   1517 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1518 		return rf_get_component_label(raidPtr, data);
   1519 
   1520 #if RF_DISABLED
   1521 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1522 		return rf_set_component_label(raidPtr, data);
   1523 #endif
   1524 
   1525 	case RAIDFRAME_INIT_LABELS:
   1526 		return rf_init_component_label(raidPtr, data);
   1527 
   1528 	case RAIDFRAME_SET_AUTOCONFIG:
   1529 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1530 		printf("raid%d: New autoconfig value is: %d\n",
   1531 		       raidPtr->raidid, d);
   1532 		*(int *) data = d;
   1533 		return retcode;
   1534 
   1535 	case RAIDFRAME_SET_ROOT:
   1536 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1537 		printf("raid%d: New rootpartition value is: %d\n",
   1538 		       raidPtr->raidid, d);
   1539 		*(int *) data = d;
   1540 		return retcode;
   1541 
   1542 		/* initialize all parity */
   1543 	case RAIDFRAME_REWRITEPARITY:
   1544 
   1545 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1546 			/* Parity for RAID 0 is trivially correct */
   1547 			raidPtr->parity_good = RF_RAID_CLEAN;
   1548 			return 0;
   1549 		}
   1550 
   1551 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1552 			/* Re-write is already in progress! */
   1553 			return EINVAL;
   1554 		}
   1555 
   1556 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1557 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1558 
   1559 	case RAIDFRAME_ADD_HOT_SPARE:
   1560 		rf_copy_single_component(&component, data);
   1561 		return rf_add_hot_spare(raidPtr, &component);
   1562 
   1563 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1564 		return retcode;
   1565 
   1566 	case RAIDFRAME_DELETE_COMPONENT:
   1567 		rf_copy_single_component(&component, data);
   1568 		return rf_delete_component(raidPtr, &component);
   1569 
   1570 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1571 		rf_copy_single_component(&component, data);
   1572 		return rf_incorporate_hot_spare(raidPtr, &component);
   1573 
   1574 	case RAIDFRAME_REBUILD_IN_PLACE:
   1575 		return rf_rebuild_in_place(raidPtr, data);
   1576 
   1577 	case RAIDFRAME_GET_INFO:
   1578 		ucfgp = *(RF_DeviceConfig_t **)data;
   1579 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1580 		if (d_cfg == NULL)
   1581 			return ENOMEM;
   1582 		retcode = rf_get_info(raidPtr, d_cfg);
   1583 		if (retcode == 0) {
   1584 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1585 		}
   1586 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1587 		return retcode;
   1588 
   1589 	case RAIDFRAME_CHECK_PARITY:
   1590 		*(int *) data = raidPtr->parity_good;
   1591 		return 0;
   1592 
   1593 	case RAIDFRAME_PARITYMAP_STATUS:
   1594 		if (rf_paritymap_ineligible(raidPtr))
   1595 			return EINVAL;
   1596 		rf_paritymap_status(raidPtr->parity_map, data);
   1597 		return 0;
   1598 
   1599 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1600 		if (rf_paritymap_ineligible(raidPtr))
   1601 			return EINVAL;
   1602 		if (raidPtr->parity_map == NULL)
   1603 			return ENOENT; /* ??? */
   1604 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1605 			return EINVAL;
   1606 		return 0;
   1607 
   1608 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1609 		if (rf_paritymap_ineligible(raidPtr))
   1610 			return EINVAL;
   1611 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1612 		return 0;
   1613 
   1614 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1615 		if (rf_paritymap_ineligible(raidPtr))
   1616 			return EINVAL;
   1617 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1618 		/* XXX should errors be passed up? */
   1619 		return 0;
   1620 
   1621 	case RAIDFRAME_RESCAN:
   1622 		return rf_rescan();
   1623 
   1624 	case RAIDFRAME_RESET_ACCTOTALS:
   1625 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1626 		return 0;
   1627 
   1628 	case RAIDFRAME_GET_ACCTOTALS:
   1629 		totals = (RF_AccTotals_t *) data;
   1630 		*totals = raidPtr->acc_totals;
   1631 		return 0;
   1632 
   1633 	case RAIDFRAME_KEEP_ACCTOTALS:
   1634 		raidPtr->keep_acc_totals = *(int *)data;
   1635 		return 0;
   1636 
   1637 	case RAIDFRAME_GET_SIZE:
   1638 		*(int *) data = raidPtr->totalSectors;
   1639 		return 0;
   1640 
   1641 	case RAIDFRAME_FAIL_DISK:
   1642 		return rf_fail_disk(raidPtr, data);
   1643 
   1644 		/* invoke a copyback operation after recon on whatever disk
   1645 		 * needs it, if any */
   1646 	case RAIDFRAME_COPYBACK:
   1647 
   1648 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1649 			/* This makes no sense on a RAID 0!! */
   1650 			return EINVAL;
   1651 		}
   1652 
   1653 		if (raidPtr->copyback_in_progress == 1) {
   1654 			/* Copyback is already in progress! */
   1655 			return EINVAL;
   1656 		}
   1657 
   1658 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1659 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1660 
   1661 		/* return the percentage completion of reconstruction */
   1662 	case RAIDFRAME_CHECK_RECON_STATUS:
   1663 		return rf_check_recon_status(raidPtr, data);
   1664 
   1665 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1666 		rf_check_recon_status_ext(raidPtr, data);
   1667 		return 0;
   1668 
   1669 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1670 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1671 			/* This makes no sense on a RAID 0, so tell the
   1672 			   user it's done. */
   1673 			*(int *) data = 100;
   1674 			return 0;
   1675 		}
   1676 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1677 			*(int *) data = 100 *
   1678 				raidPtr->parity_rewrite_stripes_done /
   1679 				raidPtr->Layout.numStripe;
   1680 		} else {
   1681 			*(int *) data = 100;
   1682 		}
   1683 		return 0;
   1684 
   1685 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1686 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1687 		return 0;
   1688 
   1689 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1690 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1691 			/* This makes no sense on a RAID 0 */
   1692 			*(int *) data = 100;
   1693 			return 0;
   1694 		}
   1695 		if (raidPtr->copyback_in_progress == 1) {
   1696 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1697 				raidPtr->Layout.numStripe;
   1698 		} else {
   1699 			*(int *) data = 100;
   1700 		}
   1701 		return 0;
   1702 
   1703 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1704 		rf_check_copyback_status_ext(raidPtr, data);
   1705 		return 0;
   1706 
   1707 	case RAIDFRAME_SET_LAST_UNIT:
   1708 		for (column = 0; column < raidPtr->numCol; column++)
   1709 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1710 				return EBUSY;
   1711 
   1712 		for (column = 0; column < raidPtr->numCol; column++) {
   1713 			clabel = raidget_component_label(raidPtr, column);
   1714 			clabel->last_unit = *(int *)data;
   1715 			raidflush_component_label(raidPtr, column);
   1716 		}
   1717 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1718 		return 0;
   1719 
   1720 		/* the sparetable daemon calls this to wait for the kernel to
   1721 		 * need a spare table. this ioctl does not return until a
   1722 		 * spare table is needed. XXX -- calling mpsleep here in the
   1723 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1724 		 * -- I should either compute the spare table in the kernel,
   1725 		 * or have a different -- XXX XXX -- interface (a different
   1726 		 * character device) for delivering the table     -- XXX */
   1727 #if RF_DISABLED
   1728 	case RAIDFRAME_SPARET_WAIT:
   1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1730 		while (!rf_sparet_wait_queue)
   1731 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1732 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1733 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1734 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1735 
   1736 		/* structure assignment */
   1737 		*((RF_SparetWait_t *) data) = *waitreq;
   1738 
   1739 		RF_Free(waitreq, sizeof(*waitreq));
   1740 		return 0;
   1741 
   1742 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1743 		 * code in it that will cause the dameon to exit */
   1744 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1745 		waitreq = RF_Malloc(sizeof(*waitreq));
   1746 		waitreq->fcol = -1;
   1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1748 		waitreq->next = rf_sparet_wait_queue;
   1749 		rf_sparet_wait_queue = waitreq;
   1750 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1752 		return 0;
   1753 
   1754 		/* used by the spare table daemon to deliver a spare table
   1755 		 * into the kernel */
   1756 	case RAIDFRAME_SEND_SPARET:
   1757 
   1758 		/* install the spare table */
   1759 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1760 
   1761 		/* respond to the requestor.  the return status of the spare
   1762 		 * table installation is passed in the "fcol" field */
   1763 		waitred = RF_Malloc(sizeof(*waitreq));
   1764 		waitreq->fcol = retcode;
   1765 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1766 		waitreq->next = rf_sparet_resp_queue;
   1767 		rf_sparet_resp_queue = waitreq;
   1768 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1769 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1770 
   1771 		return retcode;
   1772 #endif
   1773 	default:
   1774 		/*
   1775 		 * Don't bother trying to load compat modules
   1776 		 * if it is not our ioctl. This is more efficient
   1777 		 * and makes rump tests not depend on compat code
   1778 		 */
   1779 		if (IOCGROUP(cmd) != 'r')
   1780 			break;
   1781 #ifdef _LP64
   1782 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1783 			module_autoload("compat_netbsd32_raid",
   1784 			    MODULE_CLASS_EXEC);
   1785 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1786 			    (rs, cmd, data), enosys(), retcode);
   1787 			if (retcode != EPASSTHROUGH)
   1788 				return retcode;
   1789 		}
   1790 #endif
   1791 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1792 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1793 		    (rs, cmd, data), enosys(), retcode);
   1794 		if (retcode != EPASSTHROUGH)
   1795 			return retcode;
   1796 
   1797 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1798 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1799 		    (rs, cmd, data), enosys(), retcode);
   1800 		if (retcode != EPASSTHROUGH)
   1801 			return retcode;
   1802 		break; /* fall through to the os-specific code below */
   1803 
   1804 	}
   1805 
   1806 	if (!raidPtr->valid)
   1807 		return EINVAL;
   1808 
   1809 	/*
   1810 	 * Add support for "regular" device ioctls here.
   1811 	 */
   1812 
   1813 	switch (cmd) {
   1814 	case DIOCGCACHE:
   1815 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1816 		break;
   1817 
   1818 	case DIOCCACHESYNC:
   1819 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1820 		break;
   1821 
   1822 	default:
   1823 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1824 		break;
   1825 	}
   1826 
   1827 	return retcode;
   1828 
   1829 }
   1830 
   1831 
/*
 * raidinit -- complete the rest of the initialization for the
 * RAIDframe device: attach the pseudo-device, initialize the dk(9)
 * and disk(9) layers, and mark the unit usable.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* discover wedges on this disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1891 
   1892 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1893 /* wake up the daemon & tell it to get us a spare table
   1894  * XXX
   1895  * the entries in the queues should be tagged with the raidPtr
   1896  * so that in the extremely rare case that two recons happen at once,
   1897  * we know for which device were requesting a spare table
   1898  * XXX
   1899  *
   1900  * XXX This code is not currently used. GO
   1901  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Push our request onto the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response.  Note that `req' now points at a
	 * *different* RF_SparetWait_t than the one we enqueued above. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* fcol carries the daemon's result code back to the caller. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1925 #endif
   1926 
   1927 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1928  * bp & passes it down.
   1929  * any calls originating in the kernel must use non-blocking I/O
   1930  * do some extra sanity checking to return "appropriate" error values for
   1931  * certain conditions (to make some standard utilities work)
   1932  *
   1933  * Formerly known as: rf_DoAccessKernel
   1934  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Must drop the raid mutex around the label update, which
		 * does component I/O; reacquire before touching the counter
		 * so the decrement stays under the lock. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O until raidinit() has completed. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk(4) layer to start draining the buffer queue. */
	dk_start(dksc, NULL);
}
   1961 
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	/* No openings: the RAIDframe engine can't accept another access
	 * right now; EAGAIN tells the caller to retry later. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb is 1 if the transfer ends mid-sector (partial block). */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" makes this branch unconditional — looks
	 * like leftover debug scaffolding; db1_printf() may itself be
	 * compiled out.  Confirm before removing. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject accesses beyond the end of the array; the "sum <" tests
	 * also catch arithmetic wrap-around in the addition above. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Partial-sector transfers are not supported. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2028 
   2029 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2030 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* Record the owning queue so KernelWakeupFunc() can find it. */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* Fake up enough buf state for the completion callback. */
		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete immediately; no real I/O is issued. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component device I/O; completion
		 * is routed to KernelWakeupFunc() with req as its arg. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2104 /* this is the callback function associated with a I/O invoked from
   2105    kernel code.
   2106  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP()/rf_DispatchKernelIO(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is consumed by raidstart(), which
			 * triggers a component-label update. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2173 
   2174 
   2175 /*
   2176  * initialize a buf structure for doing an I/O in the kernel.
   2177  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* Preserve only the rf_b_pass bits from the old flags. */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* Convert sector number -> bytes -> DEV_BSIZE blocks. */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		/* A zero-length transfer indicates a caller bug. */
		panic("bp->b_bcount is zero in InitBP!!");
	}
	/* Completion callback (KernelWakeupFunc) and its argument (the
	 * RF_DiskQueueData_t) ride along in the buf. */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2199 
   2200 /*
   2201  * Wait interruptibly for an exclusive lock.
   2202  *
   2203  * XXX
   2204  * Several drivers do this; it should be abstracted and made MP-safe.
   2205  * (Hmm... where have we seen this warning before :->  GO )
   2206  */
   2207 static int
   2208 raidlock(struct raid_softc *rs)
   2209 {
   2210 	int     error;
   2211 
   2212 	error = 0;
   2213 	mutex_enter(&rs->sc_mutex);
   2214 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2215 		rs->sc_flags |= RAIDF_WANTED;
   2216 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2217 		if (error != 0)
   2218 			goto done;
   2219 	}
   2220 	rs->sc_flags |= RAIDF_LOCKED;
   2221 done:
   2222 	mutex_exit(&rs->sc_mutex);
   2223 	return error;
   2224 }
   2225 /*
   2226  * Unlock and wake up any waiters.
   2227  */
   2228 static void
   2229 raidunlock(struct raid_softc *rs)
   2230 {
   2231 
   2232 	mutex_enter(&rs->sc_mutex);
   2233 	rs->sc_flags &= ~RAIDF_LOCKED;
   2234 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2235 		rs->sc_flags &= ~RAIDF_WANTED;
   2236 		cv_broadcast(&rs->sc_cv);
   2237 	}
   2238 	mutex_exit(&rs->sc_mutex);
   2239 }
   2240 
   2241 
   2242 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2243 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2244 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2245 
   2246 static daddr_t
   2247 rf_component_info_offset(void)
   2248 {
   2249 
   2250 	return RF_COMPONENT_INFO_OFFSET;
   2251 }
   2252 
   2253 static daddr_t
   2254 rf_component_info_size(unsigned secsize)
   2255 {
   2256 	daddr_t info_size;
   2257 
   2258 	KASSERT(secsize);
   2259 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2260 		info_size = secsize;
   2261 	else
   2262 		info_size = RF_COMPONENT_INFO_SIZE;
   2263 
   2264 	return info_size;
   2265 }
   2266 
   2267 static daddr_t
   2268 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2269 {
   2270 	daddr_t map_offset;
   2271 
   2272 	KASSERT(raidPtr->bytesPerSector);
   2273 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2274 		map_offset = raidPtr->bytesPerSector;
   2275 	else
   2276 		map_offset = RF_COMPONENT_INFO_SIZE;
   2277 	map_offset += rf_component_info_offset();
   2278 
   2279 	return map_offset;
   2280 }
   2281 
   2282 static daddr_t
   2283 rf_parity_map_size(RF_Raid_t *raidPtr)
   2284 {
   2285 	daddr_t map_size;
   2286 
   2287 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2288 		map_size = raidPtr->bytesPerSector;
   2289 	else
   2290 		map_size = RF_PARITY_MAP_SIZE;
   2291 
   2292 	return map_size;
   2293 }
   2294 
   2295 int
   2296 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2297 {
   2298 	RF_ComponentLabel_t *clabel;
   2299 
   2300 	clabel = raidget_component_label(raidPtr, col);
   2301 	clabel->clean = RF_RAID_CLEAN;
   2302 	raidflush_component_label(raidPtr, col);
   2303 	return(0);
   2304 }
   2305 
   2306 
   2307 int
   2308 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2309 {
   2310 	RF_ComponentLabel_t *clabel;
   2311 
   2312 	clabel = raidget_component_label(raidPtr, col);
   2313 	clabel->clean = RF_RAID_DIRTY;
   2314 	raidflush_component_label(raidPtr, col);
   2315 	return(0);
   2316 }
   2317 
   2318 int
   2319 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2320 {
   2321 	KASSERT(raidPtr->bytesPerSector);
   2322 
   2323 	return raidread_component_label(raidPtr->bytesPerSector,
   2324 	    raidPtr->Disks[col].dev,
   2325 	    raidPtr->raid_cinfo[col].ci_vp,
   2326 	    &raidPtr->raid_cinfo[col].ci_label);
   2327 }
   2328 
   2329 RF_ComponentLabel_t *
   2330 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2331 {
   2332 	return &raidPtr->raid_cinfo[col].ci_label;
   2333 }
   2334 
   2335 int
   2336 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2337 {
   2338 	RF_ComponentLabel_t *label;
   2339 
   2340 	label = &raidPtr->raid_cinfo[col].ci_label;
   2341 	label->mod_counter = raidPtr->mod_counter;
   2342 #ifndef RF_NO_PARITY_MAP
   2343 	label->parity_map_modcount = label->mod_counter;
   2344 #endif
   2345 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2346 	    raidPtr->Disks[col].dev,
   2347 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2348 }
   2349 
   2350 /*
   2351  * Swap the label endianness.
   2352  *
   2353  * Everything in the component label is 4-byte-swapped except the version,
   2354  * which is kept in the byte-swapped version at all times, and indicates
   2355  * for the writer that a swap is necessary.
   2356  *
   2357  * For reads it is expected that out_label == clabel, but writes expect
   2358  * separate labels so only the re-swapped label is written out to disk,
   2359  * leaving the swapped-except-version internally.
   2360  *
   2361  * Only support swapping label version 2.
   2362  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* Only version-2 labels are supported for swapping. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as an array of 32-bit words, from serial_number
	 * through the end of future_use2, swapping each.
	 * NOTE(review): the [42] bound assumes the layout of
	 * RF_ComponentLabel_t; verify against the struct definition if
	 * the label format ever changes. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2380 
   2381 static int
   2382 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2383     RF_ComponentLabel_t *clabel)
   2384 {
   2385 	int error;
   2386 
   2387 	error = raidread_component_area(dev, b_vp, clabel,
   2388 	    sizeof(RF_ComponentLabel_t),
   2389 	    rf_component_info_offset(),
   2390 	    rf_component_info_size(secsize));
   2391 
   2392 	if (error == 0 &&
   2393 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2394 		rf_swap_label(clabel, clabel);
   2395 	}
   2396 
   2397 	return error;
   2398 }
   2399 
   2400 /* ARGSUSED */
   2401 static int
   2402 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2403     size_t msize, daddr_t offset, daddr_t dsize)
   2404 {
   2405 	struct buf *bp;
   2406 	int error;
   2407 
   2408 	/* XXX should probably ensure that we don't try to do this if
   2409 	   someone has changed rf_protected_sectors. */
   2410 
   2411 	if (b_vp == NULL) {
   2412 		/* For whatever reason, this component is not valid.
   2413 		   Don't try to read a component label from it. */
   2414 		return(EINVAL);
   2415 	}
   2416 
   2417 	/* get a block of the appropriate size... */
   2418 	bp = geteblk((int)dsize);
   2419 	bp->b_dev = dev;
   2420 
   2421 	/* get our ducks in a row for the read */
   2422 	bp->b_blkno = offset / DEV_BSIZE;
   2423 	bp->b_bcount = dsize;
   2424 	bp->b_flags |= B_READ;
   2425  	bp->b_resid = dsize;
   2426 
   2427 	bdev_strategy(bp);
   2428 	error = biowait(bp);
   2429 
   2430 	if (!error) {
   2431 		memcpy(data, bp->b_data, msize);
   2432 	}
   2433 
   2434 	brelse(bp, 0);
   2435 	return(error);
   2436 }
   2437 
   2438 static int
   2439 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2440     RF_ComponentLabel_t *clabel)
   2441 {
   2442 	RF_ComponentLabel_t *clabel_write = clabel;
   2443 	RF_ComponentLabel_t lclabel;
   2444 	int error;
   2445 
   2446 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2447 		clabel_write = &lclabel;
   2448 		rf_swap_label(clabel, clabel_write);
   2449 	}
   2450 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2451 	    sizeof(RF_ComponentLabel_t),
   2452 	    rf_component_info_offset(),
   2453 	    rf_component_info_size(secsize), 0);
   2454 
   2455 	return error;
   2456 }
   2457 
   2458 /* ARGSUSED */
   2459 static int
   2460 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2461     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2462 {
   2463 	struct buf *bp;
   2464 	int error;
   2465 
   2466 	/* get a block of the appropriate size... */
   2467 	bp = geteblk((int)dsize);
   2468 	bp->b_dev = dev;
   2469 
   2470 	/* get our ducks in a row for the write */
   2471 	bp->b_blkno = offset / DEV_BSIZE;
   2472 	bp->b_bcount = dsize;
   2473 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2474  	bp->b_resid = dsize;
   2475 
   2476 	memset(bp->b_data, 0, dsize);
   2477 	memcpy(bp->b_data, data, msize);
   2478 
   2479 	bdev_strategy(bp);
   2480 	if (asyncp)
   2481 		return 0;
   2482 	error = biowait(bp);
   2483 	brelse(bp, 0);
   2484 	if (error) {
   2485 #if 1
   2486 		printf("Failed to write RAID component info!\n");
   2487 #endif
   2488 	}
   2489 
   2490 	return(error);
   2491 }
   2492 
   2493 void
   2494 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2495 {
   2496 	int c;
   2497 
   2498 	for (c = 0; c < raidPtr->numCol; c++) {
   2499 		/* Skip dead disks. */
   2500 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2501 			continue;
   2502 		/* XXXjld: what if an error occurs here? */
   2503 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2504 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2505 		    RF_PARITYMAP_NBYTE,
   2506 		    rf_parity_map_offset(raidPtr),
   2507 		    rf_parity_map_size(raidPtr), 0);
   2508 	}
   2509 }
   2510 
   2511 void
   2512 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2513 {
   2514 	struct rf_paritymap_ondisk tmp;
   2515 	int c,first;
   2516 
   2517 	first=1;
   2518 	for (c = 0; c < raidPtr->numCol; c++) {
   2519 		/* Skip dead disks. */
   2520 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2521 			continue;
   2522 		raidread_component_area(raidPtr->Disks[c].dev,
   2523 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2524 		    RF_PARITYMAP_NBYTE,
   2525 		    rf_parity_map_offset(raidPtr),
   2526 		    rf_parity_map_size(raidPtr));
   2527 		if (first) {
   2528 			memcpy(map, &tmp, sizeof(*map));
   2529 			first = 0;
   2530 		} else {
   2531 			rf_paritymap_merge(map, &tmp);
   2532 		}
   2533 	}
   2534 }
   2535 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the mod counter so these label writes supersede older ones. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare is standing in for.
			 * NOTE(review): if no column matches, scol keeps its
			 * previous value (-1 or a stale match from an earlier
			 * spare) and that is what gets written below. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2595 
   2596 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* Bump the mod counter so these writes supersede older labels. */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only the final (shutdown/unconfigure) update may
			 * mark components clean, and only if parity is. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare replaced.
			 * NOTE(review): if no column matches, scol retains
			 * -1 or a stale value from a previous iteration. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2674 
   2675 void
   2676 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2677 {
   2678 
   2679 	if (vp != NULL) {
   2680 		if (auto_configured == 1) {
   2681 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2682 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2683 			vput(vp);
   2684 
   2685 		} else {
   2686 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2687 		}
   2688 	}
   2689 }
   2690 
   2691 
   2692 void
   2693 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2694 {
   2695 	int r,c;
   2696 	struct vnode *vp;
   2697 	int acd;
   2698 
   2699 
   2700 	/* We take this opportunity to close the vnodes like we should.. */
   2701 
   2702 	for (c = 0; c < raidPtr->numCol; c++) {
   2703 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2704 		acd = raidPtr->Disks[c].auto_configured;
   2705 		rf_close_component(raidPtr, vp, acd);
   2706 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2707 		raidPtr->Disks[c].auto_configured = 0;
   2708 	}
   2709 
   2710 	for (r = 0; r < raidPtr->numSpare; r++) {
   2711 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2712 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2713 		rf_close_component(raidPtr, vp, acd);
   2714 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2715 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2716 	}
   2717 }
   2718 
   2719 
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Block disk interrupts for the duration of the reconstruction. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Honor a forced reconstruction request for the duration only. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	/* Fail the disk; optionally start reconstruction to a spare. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was allocated by our creator; we own it now. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2749 
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* Reset progress accounting before starting the rewrite. */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2782 
   2783 
   2784 static void
   2785 rf_CopybackThread(RF_Raid_t *raidPtr)
   2786 {
   2787 	int s;
   2788 
   2789 	raidPtr->copyback_in_progress = 1;
   2790 	s = splbio();
   2791 	rf_CopybackReconstructedData(raidPtr);
   2792 	splx(s);
   2793 	raidPtr->copyback_in_progress = 0;
   2794 
   2795 	/* That's all... */
   2796 	kthread_exit(0);	/* does not return */
   2797 }
   2798 
   2799 
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* Block disk interrupts for the duration of the reconstruction. */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Honor a forced reconstruction request for the duration only. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	/* Rebuild the component in place (same column, same disk slot). */
	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was allocated by our creator; we own it now. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2827 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			/* The new RF_AutoConfig_t takes ownership of both
			 * clabel and vp; it is prepended to ac_list. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and close/release the vnode,
		 * since no autoconfig entry took ownership of them. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2869 
/*
 * Scan every disk-class device in the system looking for RAIDframe
 * components, and return a list of RF_AutoConfig_t records (one per
 * component found).  The scan is made twice: first over wedges (dk),
 * then over everything else, so that a wedge covering a whole disk is
 * preferred over that disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges have no partitions, so no RAW_PART offset */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedges: only those typed "raidframe" count */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of vp */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* check each FS_RAID partition in the disklabel */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3093 
   3094 int
   3095 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3096 {
   3097 
   3098 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3099 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3100 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3101 	    (clabel->clean == RF_RAID_CLEAN ||
   3102 	     clabel->clean == RF_RAID_DIRTY) &&
   3103 	    clabel->row >=0 &&
   3104 	    clabel->column >= 0 &&
   3105 	    clabel->num_rows > 0 &&
   3106 	    clabel->num_columns > 0 &&
   3107 	    clabel->row < clabel->num_rows &&
   3108 	    clabel->column < clabel->num_columns &&
   3109 	    clabel->blockSize > 0 &&
   3110 	    /*
   3111 	     * numBlocksHi may contain garbage, but it is ok since
   3112 	     * the type is unsigned.  If it is really garbage,
   3113 	     * rf_fix_old_label_size() will fix it.
   3114 	     */
   3115 	    rf_component_label_numblocks(clabel) > 0) {
   3116 		/*
   3117 		 * label looks reasonable enough...
   3118 		 * let's make sure it has no old garbage.
   3119 		 */
   3120 		if (numsecs)
   3121 			rf_fix_old_label_size(clabel, numsecs);
   3122 		return(1);
   3123 	}
   3124 	return(0);
   3125 }
   3126 
   3127 
   3128 /*
   3129  * For reasons yet unknown, some old component labels have garbage in
   3130  * the newer numBlocksHi region, and this causes lossage.  Since those
   3131  * disks will also have numsecs set to less than 32 bits of sectors,
   3132  * we can determine when this corruption has occurred, and fix it.
   3133  *
   3134  * The exact same problem, with the same unknown reason, happens to
   3135  * the partitionSizeHi member as well.
   3136  */
   3137 static void
   3138 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3139 {
   3140 
   3141 	if (numsecs < ((uint64_t)1 << 32)) {
   3142 		if (clabel->numBlocksHi) {
   3143 			printf("WARNING: total sectors < 32 bits, yet "
   3144 			       "numBlocksHi set\n"
   3145 			       "WARNING: resetting numBlocksHi to zero.\n");
   3146 			clabel->numBlocksHi = 0;
   3147 		}
   3148 
   3149 		if (clabel->partitionSizeHi) {
   3150 			printf("WARNING: total sectors < 32 bits, yet "
   3151 			       "partitionSizeHi set\n"
   3152 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3153 			clabel->partitionSizeHi = 0;
   3154 		}
   3155 	}
   3156 }
   3157 
   3158 
#ifdef DEBUG
/* Dump a component label to the console (debug kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Printable names for root_partition values 0..2; 3 is invalid. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask with 3 so a corrupt value can't index past rp[] */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3192 
   3193 static RF_ConfigSet_t *
   3194 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3195 {
   3196 	RF_AutoConfig_t *ac;
   3197 	RF_ConfigSet_t *config_sets;
   3198 	RF_ConfigSet_t *cset;
   3199 	RF_AutoConfig_t *ac_next;
   3200 
   3201 
   3202 	config_sets = NULL;
   3203 
   3204 	/* Go through the AutoConfig list, and figure out which components
   3205 	   belong to what sets.  */
   3206 	ac = ac_list;
   3207 	while(ac!=NULL) {
   3208 		/* we're going to putz with ac->next, so save it here
   3209 		   for use at the end of the loop */
   3210 		ac_next = ac->next;
   3211 
   3212 		if (config_sets == NULL) {
   3213 			/* will need at least this one... */
   3214 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3215 				       M_RAIDFRAME, M_WAITOK);
   3216 			/* this one is easy :) */
   3217 			config_sets->ac = ac;
   3218 			config_sets->next = NULL;
   3219 			config_sets->rootable = 0;
   3220 			ac->next = NULL;
   3221 		} else {
   3222 			/* which set does this component fit into? */
   3223 			cset = config_sets;
   3224 			while(cset!=NULL) {
   3225 				if (rf_does_it_fit(cset, ac)) {
   3226 					/* looks like it matches... */
   3227 					ac->next = cset->ac;
   3228 					cset->ac = ac;
   3229 					break;
   3230 				}
   3231 				cset = cset->next;
   3232 			}
   3233 			if (cset==NULL) {
   3234 				/* didn't find a match above... new set..*/
   3235 				cset = malloc(sizeof(RF_ConfigSet_t),
   3236 					       M_RAIDFRAME, M_WAITOK);
   3237 				cset->ac = ac;
   3238 				ac->next = NULL;
   3239 				cset->next = config_sets;
   3240 				cset->rootable = 0;
   3241 				config_sets = cset;
   3242 			}
   3243 		}
   3244 		ac = ac_next;
   3245 	}
   3246 
   3247 
   3248 	return(config_sets);
   3249 }
   3250 
   3251 static int
   3252 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3253 {
   3254 	RF_ComponentLabel_t *clabel1, *clabel2;
   3255 
   3256 	/* If this one matches the *first* one in the set, that's good
   3257 	   enough, since the other members of the set would have been
   3258 	   through here too... */
   3259 	/* note that we are not checking partitionSize here..
   3260 
   3261 	   Note that we are also not checking the mod_counters here.
   3262 	   If everything else matches except the mod_counter, that's
   3263 	   good enough for this test.  We will deal with the mod_counters
   3264 	   a little later in the autoconfiguration process.
   3265 
   3266 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3267 
   3268 	   The reason we don't check for this is that failed disks
   3269 	   will have lower modification counts.  If those disks are
   3270 	   not added to the set they used to belong to, then they will
   3271 	   form their own set, which may result in 2 different sets,
   3272 	   for example, competing to be configured at raid0, and
   3273 	   perhaps competing to be the root filesystem set.  If the
   3274 	   wrong ones get configured, or both attempt to become /,
   3275 	   weird behaviour and or serious lossage will occur.  Thus we
   3276 	   need to bring them into the fold here, and kick them out at
   3277 	   a later point.
   3278 
   3279 	*/
   3280 
   3281 	clabel1 = cset->ac->clabel;
   3282 	clabel2 = ac->clabel;
   3283 	if ((clabel1->version == clabel2->version) &&
   3284 	    (clabel1->serial_number == clabel2->serial_number) &&
   3285 	    (clabel1->num_rows == clabel2->num_rows) &&
   3286 	    (clabel1->num_columns == clabel2->num_columns) &&
   3287 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3288 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3289 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3290 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3291 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3292 	    (clabel1->blockSize == clabel2->blockSize) &&
   3293 	    rf_component_label_numblocks(clabel1) ==
   3294 	    rf_component_label_numblocks(clabel2) &&
   3295 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3296 	    (clabel1->root_partition == clabel2->root_partition) &&
   3297 	    (clabel1->last_unit == clabel2->last_unit) &&
   3298 	    (clabel1->config_order == clabel2->config_order)) {
   3299 		/* if it get's here, it almost *has* to be a match */
   3300 	} else {
   3301 		/* it's not consistent with somebody in the set..
   3302 		   punt */
   3303 		return(0);
   3304 	}
   3305 	/* all was fine.. it must fit... */
   3306 	return(1);
   3307 }
   3308 
/*
 * Decide whether a config set has enough live components (those with
 * the set's highest mod_counter) to be configured.  Returns 1 if so,
 * 0 if too many components are missing for the set's RAID level.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members.  Components with a lower counter
	   are stale (e.g. previously failed) and treated as missing. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for an up-to-date component. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished an even/odd mirror pair
				   without bailing.. reset the
				   even_pair_failed flag, and go on to
				   the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3411 
   3412 static void
   3413 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3414 			RF_Raid_t *raidPtr)
   3415 {
   3416 	RF_ComponentLabel_t *clabel;
   3417 	int i;
   3418 
   3419 	clabel = ac->clabel;
   3420 
   3421 	/* 1. Fill in the common stuff */
   3422 	config->numCol = clabel->num_columns;
   3423 	config->numSpare = 0; /* XXX should this be set here? */
   3424 	config->sectPerSU = clabel->sectPerSU;
   3425 	config->SUsPerPU = clabel->SUsPerPU;
   3426 	config->SUsPerRU = clabel->SUsPerRU;
   3427 	config->parityConfig = clabel->parityConfig;
   3428 	/* XXX... */
   3429 	strcpy(config->diskQueueType,"fifo");
   3430 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3431 	config->layoutSpecificSize = 0; /* XXX ?? */
   3432 
   3433 	while(ac!=NULL) {
   3434 		/* row/col values will be in range due to the checks
   3435 		   in reasonable_label() */
   3436 		strcpy(config->devnames[0][ac->clabel->column],
   3437 		       ac->devname);
   3438 		ac = ac->next;
   3439 	}
   3440 
   3441 	for(i=0;i<RF_MAXDBGV;i++) {
   3442 		config->debugVars[i][0] = 0;
   3443 	}
   3444 }
   3445 
   3446 static int
   3447 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3448 {
   3449 	RF_ComponentLabel_t *clabel;
   3450 	int column;
   3451 	int sparecol;
   3452 
   3453 	raidPtr->autoconfigure = new_value;
   3454 
   3455 	for(column=0; column<raidPtr->numCol; column++) {
   3456 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3457 			clabel = raidget_component_label(raidPtr, column);
   3458 			clabel->autoconfigure = new_value;
   3459 			raidflush_component_label(raidPtr, column);
   3460 		}
   3461 	}
   3462 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3463 		sparecol = raidPtr->numCol + column;
   3464 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3465 			clabel = raidget_component_label(raidPtr, sparecol);
   3466 			clabel->autoconfigure = new_value;
   3467 			raidflush_component_label(raidPtr, sparecol);
   3468 		}
   3469 	}
   3470 	return(new_value);
   3471 }
   3472 
   3473 static int
   3474 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3475 {
   3476 	RF_ComponentLabel_t *clabel;
   3477 	int column;
   3478 	int sparecol;
   3479 
   3480 	raidPtr->root_partition = new_value;
   3481 	for(column=0; column<raidPtr->numCol; column++) {
   3482 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3483 			clabel = raidget_component_label(raidPtr, column);
   3484 			clabel->root_partition = new_value;
   3485 			raidflush_component_label(raidPtr, column);
   3486 		}
   3487 	}
   3488 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3489 		sparecol = raidPtr->numCol + column;
   3490 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3491 			clabel = raidget_component_label(raidPtr, sparecol);
   3492 			clabel->root_partition = new_value;
   3493 			raidflush_component_label(raidPtr, sparecol);
   3494 		}
   3495 	}
   3496 	return(new_value);
   3497 }
   3498 
   3499 static void
   3500 rf_release_all_vps(RF_ConfigSet_t *cset)
   3501 {
   3502 	RF_AutoConfig_t *ac;
   3503 
   3504 	ac = cset->ac;
   3505 	while(ac!=NULL) {
   3506 		/* Close the vp, and give it back */
   3507 		if (ac->vp) {
   3508 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3509 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3510 			vput(ac->vp);
   3511 			ac->vp = NULL;
   3512 		}
   3513 		ac = ac->next;
   3514 	}
   3515 }
   3516 
   3517 
   3518 static void
   3519 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3520 {
   3521 	RF_AutoConfig_t *ac;
   3522 	RF_AutoConfig_t *next_ac;
   3523 
   3524 	ac = cset->ac;
   3525 	while(ac!=NULL) {
   3526 		next_ac = ac->next;
   3527 		/* nuke the label */
   3528 		free(ac->clabel, M_RAIDFRAME);
   3529 		/* cleanup the config structure */
   3530 		free(ac, M_RAIDFRAME);
   3531 		/* "next.." */
   3532 		ac = next_ac;
   3533 	}
   3534 	/* and, finally, nuke the config set */
   3535 	free(cset, M_RAIDFRAME);
   3536 }
   3537 
   3538 
/*
 * Fill in a component label from the current state of the RAID set.
 * The caller supplies row/column and any other per-component fields.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3572 
/*
 * Autoconfigure one config set: pick a unit number (preferring the
 * one recorded in the component labels), build an RF_Config_t from
 * the labels, and configure the set.  Returns the attached softc on
 * success, NULL on failure.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* scan upward from last_unit until a free (or new) unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3644 
/*
 * Initialize a per-set resource pool.  w_chan (at least
 * RF_MAX_POOLNAMELEN bytes) receives the generated pool/wait-channel
 * name; the pool is primed with xmin items and capped at xmax.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3657 
   3658 
   3659 /*
   3660  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3661  * to see if there is IO pending and if that IO could possibly be done
   3662  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3663  * otherwise.
   3664  *
   3665  */
   3666 int
   3667 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3668 {
   3669 	struct raid_softc *rs;
   3670 	struct dk_softc *dksc;
   3671 
   3672 	rs = raidPtr->softc;
   3673 	dksc = &rs->sc_dksc;
   3674 
   3675 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3676 		return 1;
   3677 
   3678 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3679 		/* there is work to do */
   3680 		return 0;
   3681 	}
   3682 	/* default is nothing to do */
   3683 	return 1;
   3684 }
   3685 
   3686 int
   3687 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3688 {
   3689 	uint64_t numsecs;
   3690 	unsigned secsize;
   3691 	int error;
   3692 
   3693 	error = getdisksize(vp, &numsecs, &secsize);
   3694 	if (error == 0) {
   3695 		diskPtr->blockSize = secsize;
   3696 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3697 		diskPtr->partitionSize = numsecs;
   3698 		return 0;
   3699 	}
   3700 	return error;
   3701 }
   3702 
/* Autoconf match function: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3708 
/* Autoconf attach function: nothing to do at attach time; the set is
   configured later (see rf_auto_config_set() / the configure ioctl). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3713 
   3714 
   3715 static int
   3716 raid_detach(device_t self, int flags)
   3717 {
   3718 	int error;
   3719 	struct raid_softc *rs = raidsoftc(self);
   3720 
   3721 	if (rs == NULL)
   3722 		return ENXIO;
   3723 
   3724 	if ((error = raidlock(rs)) != 0)
   3725 		return error;
   3726 
   3727 	error = raid_detach_unlocked(rs);
   3728 
   3729 	raidunlock(rs);
   3730 
   3731 	/* XXX raid can be referenced here */
   3732 
   3733 	if (error)
   3734 		return error;
   3735 
   3736 	/* Free the softc */
   3737 	raidput(rs);
   3738 
   3739 	return 0;
   3740 }
   3741 
/*
 * Publish the set's (synthetic) disk geometry to the disk(9) layer.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): geometry is fabricated; "4 * numCol" tracks looks
	   like an arbitrary choice rather than physical geometry. */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3757 
   3758 /*
   3759  * Get cache info for all the components (including spares).
   3760  * Returns intersection of all the cache flags of all disks, or first
   3761  * error if any encountered.
   3762  * XXXfua feature flags can change as spares are added - lock down somehow
   3763  */
   3764 static int
   3765 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3766 {
   3767 	int c;
   3768 	int error;
   3769 	int dkwhole = 0, dkpart;
   3770 
   3771 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3772 		/*
   3773 		 * Check any non-dead disk, even when currently being
   3774 		 * reconstructed.
   3775 		 */
   3776 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   3777 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3778 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3779 			if (error) {
   3780 				if (error != ENODEV) {
   3781 					printf("raid%d: get cache for component %s failed\n",
   3782 					    raidPtr->raidid,
   3783 					    raidPtr->Disks[c].devname);
   3784 				}
   3785 
   3786 				return error;
   3787 			}
   3788 
   3789 			if (c == 0)
   3790 				dkwhole = dkpart;
   3791 			else
   3792 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3793 		}
   3794 	}
   3795 
   3796 	*data = dkwhole;
   3797 
   3798 	return 0;
   3799 }
   3800 
   3801 /*
   3802  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3803  * We end up returning whatever error was returned by the first cache flush
   3804  * that fails.
   3805  */
   3806 
   3807 static int
   3808 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3809 {
   3810 	int e = 0;
   3811 	for (int i = 0; i < 5; i++) {
   3812 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3813 		    &force, FWRITE, NOCRED);
   3814 		if (!e || e == ENODEV)
   3815 			return e;
   3816 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3817 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3818 	}
   3819 	return e;
   3820 }
   3821 
   3822 int
   3823 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3824 {
   3825 	int c, error;
   3826 
   3827 	error = 0;
   3828 	for (c = 0; c < raidPtr->numCol; c++) {
   3829 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3830 			int e = rf_sync_component_cache(raidPtr, c, force);
   3831 			if (e && !error)
   3832 				error = e;
   3833 		}
   3834 	}
   3835 
   3836 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3837 		int sparecol = raidPtr->numCol + c;
   3838 		/* Need to ensure that the reconstruct actually completed! */
   3839 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3840 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3841 			    force);
   3842 			if (e && !error)
   3843 				error = e;
   3844 		}
   3845 	}
   3846 	return error;
   3847 }
   3848 
   3849 /* Fill in info with the current status */
   3850 void
   3851 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3852 {
   3853 
   3854 	memset(info, 0, sizeof(*info));
   3855 
   3856 	if (raidPtr->status != rf_rs_reconstructing) {
   3857 		info->total = 100;
   3858 		info->completed = 100;
   3859 	} else {
   3860 		info->total = raidPtr->reconControl->numRUsTotal;
   3861 		info->completed = raidPtr->reconControl->numRUsComplete;
   3862 	}
   3863 	info->remaining = info->total - info->completed;
   3864 }
   3865 
   3866 /* Fill in info with the current status */
   3867 void
   3868 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3869 {
   3870 
   3871 	memset(info, 0, sizeof(*info));
   3872 
   3873 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3874 		info->total = raidPtr->Layout.numStripe;
   3875 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3876 	} else {
   3877 		info->completed = 100;
   3878 		info->total = 100;
   3879 	}
   3880 	info->remaining = info->total - info->completed;
   3881 }
   3882 
   3883 /* Fill in info with the current status */
   3884 void
   3885 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3886 {
   3887 
   3888 	memset(info, 0, sizeof(*info));
   3889 
   3890 	if (raidPtr->copyback_in_progress == 1) {
   3891 		info->total = raidPtr->Layout.numStripe;
   3892 		info->completed = raidPtr->copyback_stripes_done;
   3893 		info->remaining = info->total - info->completed;
   3894 	} else {
   3895 		info->remaining = 0;
   3896 		info->completed = 100;
   3897 		info->total = 100;
   3898 	}
   3899 }
   3900 
   3901 /* Fill in config with the current info */
   3902 int
   3903 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3904 {
   3905 	int	d, i, j;
   3906 
   3907 	if (!raidPtr->valid)
   3908 		return ENODEV;
   3909 	config->cols = raidPtr->numCol;
   3910 	config->ndevs = raidPtr->numCol;
   3911 	if (config->ndevs >= RF_MAX_DISKS)
   3912 		return ENOMEM;
   3913 	config->nspares = raidPtr->numSpare;
   3914 	if (config->nspares >= RF_MAX_DISKS)
   3915 		return ENOMEM;
   3916 	config->maxqdepth = raidPtr->maxQueueDepth;
   3917 	d = 0;
   3918 	for (j = 0; j < config->cols; j++) {
   3919 		config->devs[d] = raidPtr->Disks[j];
   3920 		d++;
   3921 	}
   3922 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3923 		config->spares[i] = raidPtr->Disks[j];
   3924 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3925 			/* XXX: raidctl(8) expects to see this as a used spare */
   3926 			config->spares[i].status = rf_ds_used_spare;
   3927 		}
   3928 	}
   3929 	return 0;
   3930 }
   3931 
   3932 int
   3933 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3934 {
   3935 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3936 	RF_ComponentLabel_t *raid_clabel;
   3937 	int column = clabel->column;
   3938 
   3939 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3940 		return EINVAL;
   3941 	raid_clabel = raidget_component_label(raidPtr, column);
   3942 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3943 	/* Fix-up for userland. */
   3944 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3945 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3946 
   3947 	return 0;
   3948 }
   3949 
   3950 /*
   3951  * Module interface
   3952  */
   3953 
   3954 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3955 
   3956 #ifdef _MODULE
   3957 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3958 #endif
   3959 
   3960 static int raid_modcmd(modcmd_t, void *);
   3961 static int raid_modcmd_init(void);
   3962 static int raid_modcmd_fini(void);
   3963 
   3964 static int
   3965 raid_modcmd(modcmd_t cmd, void *data)
   3966 {
   3967 	int error;
   3968 
   3969 	error = 0;
   3970 	switch (cmd) {
   3971 	case MODULE_CMD_INIT:
   3972 		error = raid_modcmd_init();
   3973 		break;
   3974 	case MODULE_CMD_FINI:
   3975 		error = raid_modcmd_fini();
   3976 		break;
   3977 	default:
   3978 		error = ENOTTY;
   3979 		break;
   3980 	}
   3981 	return error;
   3982 }
   3983 
/*
 * Module initialization: register the block/character device switch
 * entries and (for modular kernels) the cfdriver/cfattach pair, boot
 * the RAIDframe core, and register a finalizer to autoconfigure RAID
 * sets.  Returns 0 on success or an errno from a failed attach step;
 * each failure path unwinds the steps that preceded it.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets the kernel choose the device major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be present. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind: remove the devsw registered above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind: remove the cfdriver and devsw registered above. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here: every failure path above returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Non-fatal: the driver works without autoconfiguration. */
		error = 0;
	}

	return error;
}
   4054 
/*
 * Module teardown: refuse to unload while any raid unit still exists,
 * then detach the autoconf glue and devsw and shut down the RAIDframe
 * core.  Returns 0 on success, EBUSY if units exist, or an errno from
 * a failed detach step (in which case earlier steps are re-attached).
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Unwind: restore the cfattach removed above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	/* error is 0 on this path. */
	return error;
}
   4095