Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.403
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.403 2022/03/11 01:59:33 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.403 2022/03/11 01:59:33 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
    212 const struct bdevsw raid_bdevsw = {
    213 	.d_open = raidopen,
    214 	.d_close = raidclose,
    215 	.d_strategy = raidstrategy,
    216 	.d_ioctl = raidioctl,
    217 	.d_dump = raiddump,
    218 	.d_psize = raidsize,
    219 	.d_discard = nodiscard,
    220 	.d_flag = D_DISK
    221 };
    222 
    223 const struct cdevsw raid_cdevsw = {
    224 	.d_open = raidopen,
    225 	.d_close = raidclose,
    226 	.d_read = raidread,
    227 	.d_write = raidwrite,
    228 	.d_ioctl = raidioctl,
    229 	.d_stop = nostop,
    230 	.d_tty = notty,
    231 	.d_poll = nopoll,
    232 	.d_mmap = nommap,
    233 	.d_kqfilter = nokqfilter,
    234 	.d_discard = nodiscard,
    235 	.d_flag = D_DISK
    236 };
    237 
    238 static struct dkdriver rf_dkdriver = {
    239 	.d_open = raidopen,
    240 	.d_close = raidclose,
    241 	.d_strategy = raidstrategy,
    242 	.d_diskstart = raid_diskstart,
    243 	.d_dumpblocks = raid_dumpblocks,
    244 	.d_lastclose = raid_lastclose,
    245 	.d_minphys = minphys
    246 };
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
    256 /* Internal representation of a rf_recon_req */
    257 struct rf_recon_req_internal {
    258 	RF_RowCol_t col;
    259 	RF_ReconReqFlags_t flags;
    260 	void   *raidPtr;
    261 };
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
    272  * Even it if is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 static void rf_ReconThread(struct rf_recon_req_internal *);
    298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 static int rf_autoconfig(device_t);
    302 static int rf_rescan(void);
    303 static void rf_buildroothack(RF_ConfigSet_t *);
    304 
    305 static RF_AutoConfig_t *rf_find_raid_components(void);
    306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 static int rf_set_autoconfig(RF_Raid_t *, int);
    310 static int rf_set_rootpartition(RF_Raid_t *, int);
    311 static void rf_release_all_vps(RF_ConfigSet_t *);
    312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 static int rf_have_enough_components(RF_ConfigSet_t *);
    314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct pool rf_alloclist_pool;   /* AllocList */
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
    343 static void
    344 raiddestroy(struct raid_softc *sc) {
    345 	cv_destroy(&sc->sc_cv);
    346 	mutex_destroy(&sc->sc_mutex);
    347 	kmem_free(sc, sizeof(*sc));
    348 }
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
    376 static void
    377 raidput(struct raid_softc *sc) {
    378 	mutex_enter(&raid_lock);
    379 	LIST_REMOVE(sc, sc_link);
    380 	mutex_exit(&raid_lock);
    381 	raiddestroy(sc);
    382 }
    383 
/*
 * Legacy pseudo-device attach hook.  Intentionally a no-op: all real
 * setup is performed during module initialization instead.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    393 
    394 static int
    395 rf_autoconfig(device_t self)
    396 {
    397 	RF_AutoConfig_t *ac_list;
    398 	RF_ConfigSet_t *config_sets;
    399 
    400 	if (!raidautoconfig || raidautoconfigdone == true)
    401 		return 0;
    402 
    403 	/* XXX This code can only be run once. */
    404 	raidautoconfigdone = true;
    405 
    406 #ifdef __HAVE_CPU_BOOTCONF
    407 	/*
    408 	 * 0. find the boot device if needed first so we can use it later
    409 	 * this needs to be done before we autoconfigure any raid sets,
    410 	 * because if we use wedges we are not going to be able to open
    411 	 * the boot device later
    412 	 */
    413 	if (booted_device == NULL)
    414 		cpu_bootconf();
    415 #endif
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 int
    433 rf_inited(const struct raid_softc *rs) {
    434 	return (rs->sc_flags & RAIDF_INITED) != 0;
    435 }
    436 
/* Accessor: return the embedded RF_Raid_t of a raid softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    441 
/* Accessor: return the unit number of a raid softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    446 
/*
 * Check whether RAID set 'r' contains the boot device 'bdv' as one of
 * its components.  Used during autoconfiguration to decide whether a
 * configured set should take over as the root device.
 *
 * Returns 1 on a match, 0 otherwise (including when bdv is NULL).
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		/* Skip past the "/dev/" prefix of the component name. */
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* Wedge component: compare against its parent disk. */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		/*
		 * NOTE(review): this is a prefix comparison (len is the
		 * boot device's length), so boot device "wd1" would also
		 * match a component on "wd10" -- presumably acceptable
		 * because the component name carries a partition suffix;
		 * confirm if exact matching is ever required.
		 */
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    477 
/*
 * Re-scan the system for RAID components and autoconfigure any
 * complete, autoconfigure-enabled sets found.  The scan repeats as
 * long as the previous pass configured at least one new set, so that
 * RAID sets built on top of other RAID sets ("recursive" RAID) are
 * also picked up.  Sets that are incomplete, not marked for
 * autoconfiguration, or that fail to configure have their resources
 * released.  Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	/* Gather all visible components and sort them into sets. */
	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
    526 
    527 
/*
 * Configure all autoconfigurable RAID sets (repeating to catch
 * recursive RAID-on-RAID), then decide whether one of them should
 * become the root device.  May set booted_device, booted_method and
 * booted_partition, or set RB_ASKNAME in boothowto when multiple
 * candidate roots cannot be disambiguated.  Does nothing to the boot
 * globals when the user hardwired a root via rootspec.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable sets configured */
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	/* Configure sets until a pass adds none (handles recursive RAID). */
	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Narrow the candidates to sets that contain the boot disk. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    686 
    687 static int
    688 raidsize(dev_t dev)
    689 {
    690 	struct raid_softc *rs;
    691 	struct dk_softc *dksc;
    692 	unsigned int unit;
    693 
    694 	unit = raidunit(dev);
    695 	if ((rs = raidget(unit, false)) == NULL)
    696 		return -1;
    697 	dksc = &rs->sc_dksc;
    698 
    699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    700 		return -1;
    701 
    702 	return dk_size(dksc, dev);
    703 }
    704 
/*
 * bdev d_dump entry point: dump kernel memory to the raid device.
 * Validates the unit, then hands off to dk_dump(), which in turn
 * calls raid_dumpblocks() for the actual component transfer.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
    729 
/*
 * Write 'nblk' blocks at 'blkno' to one live component of the set for
 * a kernel crash dump.  Only RAID 1 sets (1 data + 1 parity column)
 * are supported; a single surviving mirror side (or its used spare)
 * is selected and the dump is forwarded to that component's bdev.
 *
 * Returns 0 on success, EINVAL when the layout is unsupported or no
 * live component exists, ENXIO when the target bdev cannot be found,
 * or whatever the component's d_dump returns.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Find which column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    835 
    836 /* ARGSUSED */
    837 static int
    838 raidopen(dev_t dev, int flags, int fmt,
    839     struct lwp *l)
    840 {
    841 	int     unit = raidunit(dev);
    842 	struct raid_softc *rs;
    843 	struct dk_softc *dksc;
    844 	int     error = 0;
    845 	int     part, pmask;
    846 
    847 	if ((rs = raidget(unit, true)) == NULL)
    848 		return ENXIO;
    849 	if ((error = raidlock(rs)) != 0)
    850 		return error;
    851 
    852 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    853 		error = EBUSY;
    854 		goto bad;
    855 	}
    856 
    857 	dksc = &rs->sc_dksc;
    858 
    859 	part = DISKPART(dev);
    860 	pmask = (1 << part);
    861 
    862 	if (!DK_BUSY(dksc, pmask) &&
    863 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    864 		/* First one... mark things as dirty... Note that we *MUST*
    865 		 have done a configure before this.  I DO NOT WANT TO BE
    866 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    867 		 THAT THEY BELONG TOGETHER!!!!! */
    868 		/* XXX should check to see if we're only open for reading
    869 		   here... If so, we needn't do this, but then need some
    870 		   other way of keeping track of what's happened.. */
    871 
    872 		rf_markalldirty(&rs->sc_r);
    873 	}
    874 
    875 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    876 		error = dk_open(dksc, dev, flags, fmt, l);
    877 
    878 bad:
    879 	raidunlock(rs);
    880 
    881 	return error;
    882 
    883 
    884 }
    885 
    886 static int
    887 raid_lastclose(device_t self)
    888 {
    889 	struct raid_softc *rs = raidsoftc(self);
    890 
    891 	/* Last one... device is not unconfigured yet.
    892 	   Device shutdown has taken care of setting the
    893 	   clean bits if RAIDF_INITED is not set
    894 	   mark things as clean... */
    895 
    896 	rf_update_component_labels(&rs->sc_r,
    897 	    RF_FINAL_COMPONENT_UPDATE);
    898 
    899 	/* pass to unlocked code */
    900 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    901 		rs->sc_flags |= RAIDF_DETACH;
    902 
    903 	return 0;
    904 }
    905 
    906 /* ARGSUSED */
    907 static int
    908 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    909 {
    910 	int     unit = raidunit(dev);
    911 	struct raid_softc *rs;
    912 	struct dk_softc *dksc;
    913 	cfdata_t cf;
    914 	int     error = 0, do_detach = 0, do_put = 0;
    915 
    916 	if ((rs = raidget(unit, false)) == NULL)
    917 		return ENXIO;
    918 	dksc = &rs->sc_dksc;
    919 
    920 	if ((error = raidlock(rs)) != 0)
    921 		return error;
    922 
    923 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    924 		error = dk_close(dksc, dev, flags, fmt, l);
    925 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    926 			do_detach = 1;
    927 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    928 		do_put = 1;
    929 
    930 	raidunlock(rs);
    931 
    932 	if (do_detach) {
    933 		/* free the pseudo device attach bits */
    934 		cf = device_cfdata(dksc->sc_dev);
    935 		error = config_detach(dksc->sc_dev, 0);
    936 		if (error == 0)
    937 			free(cf, M_RAIDFRAME);
    938 	} else if (do_put) {
    939 		raidput(rs);
    940 	}
    941 
    942 	return error;
    943 
    944 }
    945 
/*
 * Poke the RAIDframe I/O completion thread: signal iodone_cv (under
 * its lock) so queued work is picked up at the next opportunity.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    953 
    954 static void
    955 raidstrategy(struct buf *bp)
    956 {
    957 	unsigned int unit;
    958 	struct raid_softc *rs;
    959 	struct dk_softc *dksc;
    960 	RF_Raid_t *raidPtr;
    961 
    962 	unit = raidunit(bp->b_dev);
    963 	if ((rs = raidget(unit, false)) == NULL) {
    964 		bp->b_error = ENXIO;
    965 		goto fail;
    966 	}
    967 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    968 		bp->b_error = ENXIO;
    969 		goto fail;
    970 	}
    971 	dksc = &rs->sc_dksc;
    972 	raidPtr = &rs->sc_r;
    973 
    974 	/* Queue IO only */
    975 	if (dk_strategy_defer(dksc, bp))
    976 		goto done;
    977 
    978 	/* schedule the IO to happen at the next convenient time */
    979 	raid_wakeup(raidPtr);
    980 
    981 done:
    982 	return;
    983 
    984 fail:
    985 	bp->b_resid = bp->b_bcount;
    986 	biodone(bp);
    987 }
    988 
    989 static int
    990 raid_diskstart(device_t dev, struct buf *bp)
    991 {
    992 	struct raid_softc *rs = raidsoftc(dev);
    993 	RF_Raid_t *raidPtr;
    994 
    995 	raidPtr = &rs->sc_r;
    996 	if (!raidPtr->valid) {
    997 		db1_printf(("raid is not valid..\n"));
    998 		return ENODEV;
    999 	}
   1000 
   1001 	/* XXX */
   1002 	bp->b_resid = 0;
   1003 
   1004 	return raiddoaccess(raidPtr, bp);
   1005 }
   1006 
   1007 void
   1008 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
   1009 {
   1010 	struct raid_softc *rs;
   1011 	struct dk_softc *dksc;
   1012 
   1013 	rs = raidPtr->softc;
   1014 	dksc = &rs->sc_dksc;
   1015 
   1016 	dk_done(dksc, bp);
   1017 
   1018 	rf_lock_mutex2(raidPtr->mutex);
   1019 	raidPtr->openings++;
   1020 	rf_unlock_mutex2(raidPtr->mutex);
   1021 
   1022 	/* schedule more IO */
   1023 	raid_wakeup(raidPtr);
   1024 }
   1025 
   1026 /* ARGSUSED */
   1027 static int
   1028 raidread(dev_t dev, struct uio *uio, int flags)
   1029 {
   1030 	int     unit = raidunit(dev);
   1031 	struct raid_softc *rs;
   1032 
   1033 	if ((rs = raidget(unit, false)) == NULL)
   1034 		return ENXIO;
   1035 
   1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1037 		return ENXIO;
   1038 
   1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1040 
   1041 }
   1042 
   1043 /* ARGSUSED */
   1044 static int
   1045 raidwrite(dev_t dev, struct uio *uio, int flags)
   1046 {
   1047 	int     unit = raidunit(dev);
   1048 	struct raid_softc *rs;
   1049 
   1050 	if ((rs = raidget(unit, false)) == NULL)
   1051 		return ENXIO;
   1052 
   1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1054 		return ENXIO;
   1055 
   1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1057 
   1058 }
   1059 
/*
 * Tear down a configured RAID set.  Caller holds the raid lock.
 * Fails with EBUSY while the device is open or a reconstruction,
 * parity rewrite, or copyback is still running.  Returns 0 if the
 * set was never configured, or once RAIDframe shutdown and the
 * disk-subsystem detach have completed.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations run. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to do if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1097 
/*
 * Return true if `cmd` is an ioctl that requires a configured RAID
 * set but this unit is NOT initialized (RAIDF_INITED clear), i.e.
 * "must be initialized, and isn't".  The caller rejects the ioctl
 * with ENXIO when this returns true.  Commands not listed here never
 * trigger the check.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
   1135 
/*
 * Mark component rr->col as failed and start a reconstruction thread.
 * Returns EINVAL for RAID 0 sets, out-of-range columns, or states in
 * which failing the disk would be unsafe (reconstruction already in
 * progress, some other component already failed, or the disk is
 * spared); ENOMEM if the request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* The state checks below must be made under the raid mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* rrint is owned by (and freed by) the recon thread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1184 
   1185 static int
   1186 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1187 {
   1188 	/* allocate a buffer for the layout-specific data, and copy it in */
   1189 	if (k_cfg->layoutSpecificSize == 0)
   1190 		return 0;
   1191 
   1192 	if (k_cfg->layoutSpecificSize > 10000) {
   1193 	    /* sanity check */
   1194 	    return EINVAL;
   1195 	}
   1196 
   1197 	u_char *specific_buf;
   1198 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1199 	if (specific_buf == NULL)
   1200 		return ENOMEM;
   1201 
   1202 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1203 	    k_cfg->layoutSpecificSize);
   1204 	if (retcode) {
   1205 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1206 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1207 		return retcode;
   1208 	}
   1209 
   1210 	k_cfg->layoutSpecific = specific_buf;
   1211 	return 0;
   1212 }
   1213 
   1214 static int
   1215 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1216 {
   1217 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1218 
   1219 	if (rs->sc_r.valid) {
   1220 		/* There is a valid RAID set running on this unit! */
   1221 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1222 		return EINVAL;
   1223 	}
   1224 
   1225 	/* copy-in the configuration information */
   1226 	/* data points to a pointer to the configuration structure */
   1227 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1228 	if (*k_cfg == NULL) {
   1229 		return ENOMEM;
   1230 	}
   1231 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1232 	if (retcode == 0)
   1233 		return 0;
   1234 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1235 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1236 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1237 	return retcode;
   1238 }
   1239 
/*
 * Configure the RAID set described by k_cfg on unit rs and bring it
 * up (attach the pseudo device, mark components dirty).  Consumes
 * k_cfg: both it and its layout-specific buffer are freed before
 * returning.  On failure RAIDF_SHUTDOWN is set so the unit detaches
 * when closed.  Returns 0 on success or an errno value.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1292 
   1293 #if RF_DISABLED
/*
 * Set a component label from a user-supplied copy.  Only compiled
 * when RF_DISABLED is set; the function currently validates the
 * column and copies the label verbatim -- see the XXX notes below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1330 #endif
   1331 
   1332 static int
   1333 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1334 {
   1335 	/*
   1336 	   we only want the serial number from
   1337 	   the above.  We get all the rest of the information
   1338 	   from the config that was used to create this RAID
   1339 	   set.
   1340 	   */
   1341 
   1342 	raidPtr->serial_number = clabel->serial_number;
   1343 
   1344 	for (int column = 0; column < raidPtr->numCol; column++) {
   1345 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1346 		if (RF_DEAD_DISK(diskPtr->status))
   1347 			continue;
   1348 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1349 		    raidPtr, column);
   1350 		/* Zeroing this is important. */
   1351 		memset(ci_label, 0, sizeof(*ci_label));
   1352 		raid_init_component_label(raidPtr, ci_label);
   1353 		ci_label->serial_number = raidPtr->serial_number;
   1354 		ci_label->row = 0; /* we dont' pretend to support more */
   1355 		rf_component_label_set_partitionsize(ci_label,
   1356 		    diskPtr->partitionSize);
   1357 		ci_label->column = column;
   1358 		raidflush_component_label(raidPtr, column);
   1359 		/* XXXjld what about the spares? */
   1360 	}
   1361 
   1362 	return 0;
   1363 }
   1364 
/*
 * Rebuild the given component in place (onto the same disk), by
 * starting a reconstruct-in-place thread.  Returns EINVAL for RAID 0
 * sets, out-of-range columns, or unsafe states (another failure
 * present, reconstruction already occurring, or the disk is spared);
 * ENOMEM if the request cannot be allocated.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Copy the request so we don't depend on the user's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* State checks below must be made under the raid mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Can't rebuild a spared disk in place. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* rrint is owned by (and freed by) the recon thread. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1432 
   1433 static int
   1434 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1435 {
   1436 	/*
   1437 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1438 	 * so tell the user it's done.
   1439 	 */
   1440 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1441 	    raidPtr->status != rf_rs_reconstructing) {
   1442 		*data = 100;
   1443 		return 0;
   1444 	}
   1445 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1446 		*data = 0;
   1447 		return 0;
   1448 	}
   1449 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1450 	    / raidPtr->reconControl->numRUsTotal);
   1451 	return 0;
   1452 }
   1453 
   1454 static int
   1455 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1456 {
   1457 	int     unit = raidunit(dev);
   1458 	int     part, pmask;
   1459 	struct raid_softc *rs;
   1460 	struct dk_softc *dksc;
   1461 	RF_Config_t *k_cfg;
   1462 	RF_Raid_t *raidPtr;
   1463 	RF_AccTotals_t *totals;
   1464 	RF_SingleComponent_t component;
   1465 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1466 	int retcode = 0;
   1467 	int column;
   1468 	RF_ComponentLabel_t *clabel;
   1469 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1470 	int d;
   1471 
   1472 	if ((rs = raidget(unit, false)) == NULL)
   1473 		return ENXIO;
   1474 
   1475 	dksc = &rs->sc_dksc;
   1476 	raidPtr = &rs->sc_r;
   1477 
   1478 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1479 	    (int) DISKPART(dev), (int) unit, cmd));
   1480 
   1481 	/* Must be initialized for these... */
   1482 	if (rf_must_be_initialized(rs, cmd))
   1483 		return ENXIO;
   1484 
   1485 	switch (cmd) {
   1486 		/* configure the system */
   1487 	case RAIDFRAME_CONFIGURE:
   1488 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1489 			return retcode;
   1490 		return rf_construct(rs, k_cfg);
   1491 
   1492 		/* shutdown the system */
   1493 	case RAIDFRAME_SHUTDOWN:
   1494 
   1495 		part = DISKPART(dev);
   1496 		pmask = (1 << part);
   1497 
   1498 		if ((retcode = raidlock(rs)) != 0)
   1499 			return retcode;
   1500 
   1501 		if (DK_BUSY(dksc, pmask) ||
   1502 		    raidPtr->recon_in_progress != 0 ||
   1503 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1504 		    raidPtr->copyback_in_progress != 0)
   1505 			retcode = EBUSY;
   1506 		else {
   1507 			/* detach and free on close */
   1508 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1509 			retcode = 0;
   1510 		}
   1511 
   1512 		raidunlock(rs);
   1513 
   1514 		return retcode;
   1515 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1516 		return rf_get_component_label(raidPtr, data);
   1517 
   1518 #if RF_DISABLED
   1519 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1520 		return rf_set_component_label(raidPtr, data);
   1521 #endif
   1522 
   1523 	case RAIDFRAME_INIT_LABELS:
   1524 		return rf_init_component_label(raidPtr, data);
   1525 
   1526 	case RAIDFRAME_SET_AUTOCONFIG:
   1527 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1528 		printf("raid%d: New autoconfig value is: %d\n",
   1529 		       raidPtr->raidid, d);
   1530 		*(int *) data = d;
   1531 		return retcode;
   1532 
   1533 	case RAIDFRAME_SET_ROOT:
   1534 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1535 		printf("raid%d: New rootpartition value is: %d\n",
   1536 		       raidPtr->raidid, d);
   1537 		*(int *) data = d;
   1538 		return retcode;
   1539 
   1540 		/* initialize all parity */
   1541 	case RAIDFRAME_REWRITEPARITY:
   1542 
   1543 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1544 			/* Parity for RAID 0 is trivially correct */
   1545 			raidPtr->parity_good = RF_RAID_CLEAN;
   1546 			return 0;
   1547 		}
   1548 
   1549 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1550 			/* Re-write is already in progress! */
   1551 			return EINVAL;
   1552 		}
   1553 
   1554 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1555 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1556 
   1557 	case RAIDFRAME_ADD_HOT_SPARE:
   1558 		sparePtr = (RF_SingleComponent_t *) data;
   1559 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1560 		return rf_add_hot_spare(raidPtr, &component);
   1561 
   1562 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1563 		return retcode;
   1564 
   1565 	case RAIDFRAME_DELETE_COMPONENT:
   1566 		componentPtr = (RF_SingleComponent_t *)data;
   1567 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1568 		return rf_delete_component(raidPtr, &component);
   1569 
   1570 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1571 		componentPtr = (RF_SingleComponent_t *)data;
   1572 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1573 		return rf_incorporate_hot_spare(raidPtr, &component);
   1574 
   1575 	case RAIDFRAME_REBUILD_IN_PLACE:
   1576 		return rf_rebuild_in_place(raidPtr, data);
   1577 
   1578 	case RAIDFRAME_GET_INFO:
   1579 		ucfgp = *(RF_DeviceConfig_t **)data;
   1580 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1581 		if (d_cfg == NULL)
   1582 			return ENOMEM;
   1583 		retcode = rf_get_info(raidPtr, d_cfg);
   1584 		if (retcode == 0) {
   1585 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1586 		}
   1587 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1588 		return retcode;
   1589 
   1590 	case RAIDFRAME_CHECK_PARITY:
   1591 		*(int *) data = raidPtr->parity_good;
   1592 		return 0;
   1593 
   1594 	case RAIDFRAME_PARITYMAP_STATUS:
   1595 		if (rf_paritymap_ineligible(raidPtr))
   1596 			return EINVAL;
   1597 		rf_paritymap_status(raidPtr->parity_map, data);
   1598 		return 0;
   1599 
   1600 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1601 		if (rf_paritymap_ineligible(raidPtr))
   1602 			return EINVAL;
   1603 		if (raidPtr->parity_map == NULL)
   1604 			return ENOENT; /* ??? */
   1605 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1606 			return EINVAL;
   1607 		return 0;
   1608 
   1609 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1610 		if (rf_paritymap_ineligible(raidPtr))
   1611 			return EINVAL;
   1612 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1613 		return 0;
   1614 
   1615 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1616 		if (rf_paritymap_ineligible(raidPtr))
   1617 			return EINVAL;
   1618 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1619 		/* XXX should errors be passed up? */
   1620 		return 0;
   1621 
   1622 	case RAIDFRAME_RESCAN:
   1623 		return rf_rescan();
   1624 
   1625 	case RAIDFRAME_RESET_ACCTOTALS:
   1626 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1627 		return 0;
   1628 
   1629 	case RAIDFRAME_GET_ACCTOTALS:
   1630 		totals = (RF_AccTotals_t *) data;
   1631 		*totals = raidPtr->acc_totals;
   1632 		return 0;
   1633 
   1634 	case RAIDFRAME_KEEP_ACCTOTALS:
   1635 		raidPtr->keep_acc_totals = *(int *)data;
   1636 		return 0;
   1637 
   1638 	case RAIDFRAME_GET_SIZE:
   1639 		*(int *) data = raidPtr->totalSectors;
   1640 		return 0;
   1641 
   1642 	case RAIDFRAME_FAIL_DISK:
   1643 		return rf_fail_disk(raidPtr, data);
   1644 
   1645 		/* invoke a copyback operation after recon on whatever disk
   1646 		 * needs it, if any */
   1647 	case RAIDFRAME_COPYBACK:
   1648 
   1649 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1650 			/* This makes no sense on a RAID 0!! */
   1651 			return EINVAL;
   1652 		}
   1653 
   1654 		if (raidPtr->copyback_in_progress == 1) {
   1655 			/* Copyback is already in progress! */
   1656 			return EINVAL;
   1657 		}
   1658 
   1659 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1660 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1661 
   1662 		/* return the percentage completion of reconstruction */
   1663 	case RAIDFRAME_CHECK_RECON_STATUS:
   1664 		return rf_check_recon_status(raidPtr, data);
   1665 
   1666 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1667 		rf_check_recon_status_ext(raidPtr, data);
   1668 		return 0;
   1669 
   1670 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1671 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1672 			/* This makes no sense on a RAID 0, so tell the
   1673 			   user it's done. */
   1674 			*(int *) data = 100;
   1675 			return 0;
   1676 		}
   1677 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1678 			*(int *) data = 100 *
   1679 				raidPtr->parity_rewrite_stripes_done /
   1680 				raidPtr->Layout.numStripe;
   1681 		} else {
   1682 			*(int *) data = 100;
   1683 		}
   1684 		return 0;
   1685 
   1686 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1687 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1688 		return 0;
   1689 
   1690 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1691 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1692 			/* This makes no sense on a RAID 0 */
   1693 			*(int *) data = 100;
   1694 			return 0;
   1695 		}
   1696 		if (raidPtr->copyback_in_progress == 1) {
   1697 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1698 				raidPtr->Layout.numStripe;
   1699 		} else {
   1700 			*(int *) data = 100;
   1701 		}
   1702 		return 0;
   1703 
   1704 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1705 		rf_check_copyback_status_ext(raidPtr, data);
   1706 		return 0;
   1707 
   1708 	case RAIDFRAME_SET_LAST_UNIT:
   1709 		for (column = 0; column < raidPtr->numCol; column++)
   1710 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1711 				return EBUSY;
   1712 
   1713 		for (column = 0; column < raidPtr->numCol; column++) {
   1714 			clabel = raidget_component_label(raidPtr, column);
   1715 			clabel->last_unit = *(int *)data;
   1716 			raidflush_component_label(raidPtr, column);
   1717 		}
   1718 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1719 		return 0;
   1720 
   1721 		/* the sparetable daemon calls this to wait for the kernel to
   1722 		 * need a spare table. this ioctl does not return until a
   1723 		 * spare table is needed. XXX -- calling mpsleep here in the
   1724 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1725 		 * -- I should either compute the spare table in the kernel,
   1726 		 * or have a different -- XXX XXX -- interface (a different
   1727 		 * character device) for delivering the table     -- XXX */
   1728 #if RF_DISABLED
   1729 	case RAIDFRAME_SPARET_WAIT:
   1730 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1731 		while (!rf_sparet_wait_queue)
   1732 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1733 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1734 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1735 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1736 
   1737 		/* structure assignment */
   1738 		*((RF_SparetWait_t *) data) = *waitreq;
   1739 
   1740 		RF_Free(waitreq, sizeof(*waitreq));
   1741 		return 0;
   1742 
   1743 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1744 		 * code in it that will cause the dameon to exit */
   1745 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1746 		waitreq = RF_Malloc(sizeof(*waitreq));
   1747 		waitreq->fcol = -1;
   1748 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1749 		waitreq->next = rf_sparet_wait_queue;
   1750 		rf_sparet_wait_queue = waitreq;
   1751 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1752 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1753 		return 0;
   1754 
   1755 		/* used by the spare table daemon to deliver a spare table
   1756 		 * into the kernel */
   1757 	case RAIDFRAME_SEND_SPARET:
   1758 
   1759 		/* install the spare table */
   1760 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1761 
   1762 		/* respond to the requestor.  the return status of the spare
   1763 		 * table installation is passed in the "fcol" field */
   1764 		waitred = RF_Malloc(sizeof(*waitreq));
   1765 		waitreq->fcol = retcode;
   1766 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1767 		waitreq->next = rf_sparet_resp_queue;
   1768 		rf_sparet_resp_queue = waitreq;
   1769 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1770 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1771 
   1772 		return retcode;
   1773 #endif
   1774 	default:
   1775 		/*
   1776 		 * Don't bother trying to load compat modules
   1777 		 * if it is not our ioctl. This is more efficient
   1778 		 * and makes rump tests not depend on compat code
   1779 		 */
   1780 		if (IOCGROUP(cmd) != 'r')
   1781 			break;
   1782 #ifdef _LP64
   1783 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1784 			module_autoload("compat_netbsd32_raid",
   1785 			    MODULE_CLASS_EXEC);
   1786 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1787 			    (rs, cmd, data), enosys(), retcode);
   1788 			if (retcode != EPASSTHROUGH)
   1789 				return retcode;
   1790 		}
   1791 #endif
   1792 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1793 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1794 		    (rs, cmd, data), enosys(), retcode);
   1795 		if (retcode != EPASSTHROUGH)
   1796 			return retcode;
   1797 
   1798 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1799 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1800 		    (rs, cmd, data), enosys(), retcode);
   1801 		if (retcode != EPASSTHROUGH)
   1802 			return retcode;
   1803 		break; /* fall through to the os-specific code below */
   1804 
   1805 	}
   1806 
   1807 	if (!raidPtr->valid)
   1808 		return EINVAL;
   1809 
   1810 	/*
   1811 	 * Add support for "regular" device ioctls here.
   1812 	 */
   1813 
   1814 	switch (cmd) {
   1815 	case DIOCGCACHE:
   1816 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1817 		break;
   1818 
   1819 	case DIOCCACHESYNC:
   1820 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1821 		break;
   1822 
   1823 	default:
   1824 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1825 		break;
   1826 	}
   1827 
   1828 	return retcode;
   1829 
   1830 }
   1831 
   1832 
   1833 /* raidinit -- complete the rest of the initialization for the
   1834    RAIDframe device.  */
   1835 
   1836 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: RAIDF_INITED stays clear, so the
		   unit remains unusable (opens will fail). */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Discover wedges (partitions) on this disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1892 
   1893 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1894 /* wake up the daemon & tell it to get us a spare table
   1895  * XXX
   1896  * the entries in the queues should be tagged with the raidPtr
   1897  * so that in the extremely rare case that two recons happen at once,
   1898  * we know for which device were requesting a spare table
   1899  * XXX
   1900  *
   1901  * XXX This code is not currently used. GO
   1902  */
/*
 * Hand a spare-table request to the user-level daemon and block until a
 * response arrives.  Returns the failed column (fcol) from the response.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Push the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response; note 'req' is reused to point at it. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1926 #endif
   1927 
   1928 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1929  * bp & passes it down.
   1930  * any calls originating in the kernel must use non-blocking I/O
   1931  * do some extra sanity checking to return "appropriate" error values for
   1932  * certain conditions (to make some standard utilities work)
   1933  *
   1934  * Formerly known as: rf_DoAccessKernel
   1935  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the lock: rf_update_component_labels does component
		 * I/O and must not be called with raidPtr->mutex held. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk layer; it will feed queued bufs to raiddoaccess(). */
	dk_start(dksc, NULL);
}
   1962 
   1963 static int
   1964 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1965 {
   1966 	RF_SectorCount_t num_blocks, pb, sum;
   1967 	RF_RaidAddr_t raid_addr;
   1968 	daddr_t blocknum;
   1969 	int rc;
   1970 
   1971 	rf_lock_mutex2(raidPtr->mutex);
   1972 	if (raidPtr->openings == 0) {
   1973 		rf_unlock_mutex2(raidPtr->mutex);
   1974 		return EAGAIN;
   1975 	}
   1976 	rf_unlock_mutex2(raidPtr->mutex);
   1977 
   1978 	blocknum = bp->b_rawblkno;
   1979 
   1980 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1981 		    (int) blocknum));
   1982 
   1983 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1984 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1985 
   1986 	/* *THIS* is where we adjust what block we're going to...
   1987 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1988 	raid_addr = blocknum;
   1989 
   1990 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1991 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1992 	sum = raid_addr + num_blocks + pb;
   1993 	if (1 || rf_debugKernelAccess) {
   1994 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1995 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1996 			    (int) pb, (int) bp->b_resid));
   1997 	}
   1998 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1999 	    || (sum < num_blocks) || (sum < pb)) {
   2000 		rc = ENOSPC;
   2001 		goto done;
   2002 	}
   2003 	/*
   2004 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2005 	 */
   2006 
   2007 	if (bp->b_bcount & raidPtr->sectorMask) {
   2008 		rc = ENOSPC;
   2009 		goto done;
   2010 	}
   2011 	db1_printf(("Calling DoAccess..\n"));
   2012 
   2013 
   2014 	rf_lock_mutex2(raidPtr->mutex);
   2015 	raidPtr->openings--;
   2016 	rf_unlock_mutex2(raidPtr->mutex);
   2017 
   2018 	/* don't ever condition on bp->b_flags & B_WRITE.
   2019 	 * always condition on B_READ instead */
   2020 
   2021 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2022 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2023 			 raid_addr, num_blocks,
   2024 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2025 
   2026 done:
   2027 	return rc;
   2028 }
   2029 
   2030 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2031 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal iodone path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Build the buf; KernelWakeupFunc runs at biodone time. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2105 /* this is the callback function associated with a I/O invoked from
   2106    kernel code.
   2107  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP()/the NOP path. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures makes raidstart() refresh the
			   component labels on the next I/O. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2174 
   2175 
   2176 /*
   2177  * initialize a buf structure for doing an I/O in the kernel.
   2178  */
   2179 static void
   2180 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2181        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2182        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2183 {
   2184 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2185 	bp->b_oflags = 0;
   2186 	bp->b_cflags = 0;
   2187 	bp->b_bcount = numSect << logBytesPerSector;
   2188 	bp->b_bufsize = bp->b_bcount;
   2189 	bp->b_error = 0;
   2190 	bp->b_dev = dev;
   2191 	bp->b_data = bf;
   2192 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2193 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2194 	if (bp->b_bcount == 0) {
   2195 		panic("bp->b_bcount is zero in InitBP!!");
   2196 	}
   2197 	bp->b_iodone = cbFunc;
   2198 	bp->b_private = cbArg;
   2199 }
   2200 
   2201 /*
   2202  * Wait interruptibly for an exclusive lock.
   2203  *
   2204  * XXX
   2205  * Several drivers do this; it should be abstracted and made MP-safe.
   2206  * (Hmm... where have we seen this warning before :->  GO )
   2207  */
   2208 static int
   2209 raidlock(struct raid_softc *rs)
   2210 {
   2211 	int     error;
   2212 
   2213 	error = 0;
   2214 	mutex_enter(&rs->sc_mutex);
   2215 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2216 		rs->sc_flags |= RAIDF_WANTED;
   2217 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2218 		if (error != 0)
   2219 			goto done;
   2220 	}
   2221 	rs->sc_flags |= RAIDF_LOCKED;
   2222 done:
   2223 	mutex_exit(&rs->sc_mutex);
   2224 	return error;
   2225 }
   2226 /*
   2227  * Unlock and wake up any waiters.
   2228  */
   2229 static void
   2230 raidunlock(struct raid_softc *rs)
   2231 {
   2232 
   2233 	mutex_enter(&rs->sc_mutex);
   2234 	rs->sc_flags &= ~RAIDF_LOCKED;
   2235 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2236 		rs->sc_flags &= ~RAIDF_WANTED;
   2237 		cv_broadcast(&rs->sc_cv);
   2238 	}
   2239 	mutex_exit(&rs->sc_mutex);
   2240 }
   2241 
   2242 
   2243 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2244 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2245 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2246 
/* Byte offset of the component label area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2253 
   2254 static daddr_t
   2255 rf_component_info_size(unsigned secsize)
   2256 {
   2257 	daddr_t info_size;
   2258 
   2259 	KASSERT(secsize);
   2260 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2261 		info_size = secsize;
   2262 	else
   2263 		info_size = RF_COMPONENT_INFO_SIZE;
   2264 
   2265 	return info_size;
   2266 }
   2267 
   2268 static daddr_t
   2269 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2270 {
   2271 	daddr_t map_offset;
   2272 
   2273 	KASSERT(raidPtr->bytesPerSector);
   2274 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2275 		map_offset = raidPtr->bytesPerSector;
   2276 	else
   2277 		map_offset = RF_COMPONENT_INFO_SIZE;
   2278 	map_offset += rf_component_info_offset();
   2279 
   2280 	return map_offset;
   2281 }
   2282 
   2283 static daddr_t
   2284 rf_parity_map_size(RF_Raid_t *raidPtr)
   2285 {
   2286 	daddr_t map_size;
   2287 
   2288 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2289 		map_size = raidPtr->bytesPerSector;
   2290 	else
   2291 		map_size = RF_PARITY_MAP_SIZE;
   2292 
   2293 	return map_size;
   2294 }
   2295 
/* Mark component 'col' clean in its in-core label and flush it to disk. */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}
   2306 
   2307 
/* Mark component 'col' dirty in its in-core label and flush it to disk. */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}
   2318 
/* Read component 'col's on-disk label into its in-core copy. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2329 
/* Return a pointer to component 'col's in-core label (no I/O). */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2335 
/*
 * Write component 'col's in-core label to disk, stamping it with the
 * current mod_counter first.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2350 
   2351 /*
   2352  * Swap the label endianness.
   2353  *
   2354  * Everything in the component label is 4-byte-swapped except the version,
   2355  * which is kept in the byte-swapped version at all times, and indicates
   2356  * for the writer that a swap is necessary.
   2357  *
   2358  * For reads it is expected that out_label == clabel, but writes expect
   2359  * separate labels so only the re-swapped label is written out to disk,
   2360  * leaving the swapped-except-version internally.
   2361  *
   2362  * Only support swapping label version 2.
   2363  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as an array of 32-bit words, from serial_number
	 * through the end of future_use2.  NOTE(review): the bound of 42
	 * presumably matches the declared size of future_use2 in
	 * RF_ComponentLabel_t — verify against the struct definition if
	 * the label layout ever changes. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2381 
/*
 * Read a component label from disk into *clabel.  If the on-disk label
 * is in the opposite byte order (version appears byte-swapped), swap it
 * in place so callers always see native-endian contents.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	int error;

	error = raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));

	if (error == 0 &&
	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		/* In-place swap: everything but 'version' goes native. */
		rf_swap_label(clabel, clabel);
	}

	return error;
}
   2400 
   2401 /* ARGSUSED */
/*
 * Read 'msize' bytes of component metadata at byte 'offset' into 'data',
 * doing a 'dsize'-byte device read through a temporary buffer.
 * Returns 0 on success or an errno.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Synchronous read: wait for completion before copying out. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2438 
/*
 * Write *clabel to the component's label area.  If the in-core label is
 * byte-swapped (version stored swapped), re-swap into a local copy so
 * the swapped-except-version original is left untouched in memory.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	RF_ComponentLabel_t *clabel_write = clabel;
	RF_ComponentLabel_t lclabel;
	int error;

	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		/* Swap into the stack copy; write that out instead. */
		clabel_write = &lclabel;
		rf_swap_label(clabel, clabel_write);
	}
	error = raidwrite_component_area(dev, b_vp, clabel_write,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);

	return error;
}
   2458 
   2459 /* ARGSUSED */
/*
 * Write 'msize' bytes of component metadata from 'data' at byte 'offset',
 * zero-padding the device write out to 'dsize' bytes.  If 'asyncp' is
 * set the write is fired B_ASYNC and 0 is returned immediately;
 * otherwise we wait and return the I/O's errno.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Pad with zeros so stale data never reaches the disk. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): async path returns without brelse();
		   presumably B_ASYNC completion releases the buffer —
		   confirm against the buffercache(9) contract. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2493 
/*
 * Write the parity map to every live component (synchronously).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2511 
/*
 * Read the parity map from every live component, merging all copies
 * into *map (first live copy seeds the result, the rest are merged in).
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			/* Combine with what we have so far. */
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2536 
/*
 * Bump the mod counter and mark every live component (and every in-use
 * spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for.
			   NOTE(review): if no column matches, scol stays
			   -1 and that value lands in clabel->column. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2596 
   2597 
/*
 * Rewrite the component labels on all optimal components and in-use
 * spares.  When 'final' is RF_FINAL_COMPONENT_UPDATE and parity is
 * known clean, also set the clean bit on each label.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2675 
   2676 void
   2677 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2678 {
   2679 
   2680 	if (vp != NULL) {
   2681 		if (auto_configured == 1) {
   2682 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2683 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2684 			vput(vp);
   2685 
   2686 		} else {
   2687 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2688 		}
   2689 	}
   2690 }
   2691 
   2692 
   2693 void
   2694 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2695 {
   2696 	int r,c;
   2697 	struct vnode *vp;
   2698 	int acd;
   2699 
   2700 
   2701 	/* We take this opportunity to close the vnodes like we should.. */
   2702 
   2703 	for (c = 0; c < raidPtr->numCol; c++) {
   2704 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2705 		acd = raidPtr->Disks[c].auto_configured;
   2706 		rf_close_component(raidPtr, vp, acd);
   2707 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2708 		raidPtr->Disks[c].auto_configured = 0;
   2709 	}
   2710 
   2711 	for (r = 0; r < raidPtr->numSpare; r++) {
   2712 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2713 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2714 		rf_close_component(raidPtr, vp, acd);
   2715 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2716 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2717 	}
   2718 }
   2719 
   2720 
/*
 * Kernel thread body: fail a disk and (optionally) reconstruct to a
 * spare.  Frees 'req' and exits via kthread_exit().
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Optionally force reconstruction for the duration of this call. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2750 
/*
 * Kernel thread body: rewrite all parity.  Sets parity_good on success,
 * wakes any shutdown waiter, and exits via kthread_exit().
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2783 
   2784 
/*
 * Kernel thread body: copy reconstructed data from spares back to the
 * replaced components, then exit via kthread_exit().
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2799 
   2800 
/*
 * Kernel thread body: reconstruct a failed component in place (onto the
 * same column).  Frees 'req' and exits via kthread_exit().
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Optionally force reconstruction for the duration of this call. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2828 
/*
 * Try to read a component label from (dev, vp).  If it looks reasonable,
 * prepend a new RF_AutoConfig_t (which takes ownership of clabel and vp)
 * to ac_list; otherwise free the label and close/release the vnode.
 * Returns the (possibly extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: release the label and the vnode we were given */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2870 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * Returns a linked list of RF_AutoConfig_t entries, one per component
 * found (NULL if none).  Each entry holds an open, unlocked vnode for
 * its component; rf_get_component() keeps the vnode for components
 * with a plausible label and closes/releases it otherwise.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only wedges whose partition
				   type is RAIDframe qualify. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* hand the open, unlocked vnode to
				   rf_get_component(), which takes it over */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3094 
   3095 int
   3096 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3097 {
   3098 
   3099 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3100 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3101 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3102 	    (clabel->clean == RF_RAID_CLEAN ||
   3103 	     clabel->clean == RF_RAID_DIRTY) &&
   3104 	    clabel->row >=0 &&
   3105 	    clabel->column >= 0 &&
   3106 	    clabel->num_rows > 0 &&
   3107 	    clabel->num_columns > 0 &&
   3108 	    clabel->row < clabel->num_rows &&
   3109 	    clabel->column < clabel->num_columns &&
   3110 	    clabel->blockSize > 0 &&
   3111 	    /*
   3112 	     * numBlocksHi may contain garbage, but it is ok since
   3113 	     * the type is unsigned.  If it is really garbage,
   3114 	     * rf_fix_old_label_size() will fix it.
   3115 	     */
   3116 	    rf_component_label_numblocks(clabel) > 0) {
   3117 		/*
   3118 		 * label looks reasonable enough...
   3119 		 * let's make sure it has no old garbage.
   3120 		 */
   3121 		if (numsecs)
   3122 			rf_fix_old_label_size(clabel, numsecs);
   3123 		return(1);
   3124 	}
   3125 	return(0);
   3126 }
   3127 
   3128 
   3129 /*
   3130  * For reasons yet unknown, some old component labels have garbage in
   3131  * the newer numBlocksHi region, and this causes lossage.  Since those
   3132  * disks will also have numsecs set to less than 32 bits of sectors,
   3133  * we can determine when this corruption has occurred, and fix it.
   3134  *
   3135  * The exact same problem, with the same unknown reason, happens to
   3136  * the partitionSizeHi member as well.
   3137  */
   3138 static void
   3139 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3140 {
   3141 
   3142 	if (numsecs < ((uint64_t)1 << 32)) {
   3143 		if (clabel->numBlocksHi) {
   3144 			printf("WARNING: total sectors < 32 bits, yet "
   3145 			       "numBlocksHi set\n"
   3146 			       "WARNING: resetting numBlocksHi to zero.\n");
   3147 			clabel->numBlocksHi = 0;
   3148 		}
   3149 
   3150 		if (clabel->partitionSizeHi) {
   3151 			printf("WARNING: total sectors < 32 bits, yet "
   3152 			       "partitionSizeHi set\n"
   3153 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3154 			clabel->partitionSizeHi = 0;
   3155 		}
   3156 	}
   3157 }
   3158 
   3159 
   3160 #ifdef DEBUG
   3161 void
   3162 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3163 {
   3164 	uint64_t numBlocks;
   3165 	static const char *rp[] = {
   3166 	    "No", "Force", "Soft", "*invalid*"
   3167 	};
   3168 
   3169 
   3170 	numBlocks = rf_component_label_numblocks(clabel);
   3171 
   3172 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3173 	       clabel->row, clabel->column,
   3174 	       clabel->num_rows, clabel->num_columns);
   3175 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3176 	       clabel->version, clabel->serial_number,
   3177 	       clabel->mod_counter);
   3178 	printf("   Clean: %s Status: %d\n",
   3179 	       clabel->clean ? "Yes" : "No", clabel->status);
   3180 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3181 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3182 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3183 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3184 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3185 	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
   3186 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3187 #if 0
   3188 	   printf("   Config order: %d\n", clabel->config_order);
   3189 #endif
   3190 
   3191 }
   3192 #endif
   3193 
   3194 static RF_ConfigSet_t *
   3195 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3196 {
   3197 	RF_AutoConfig_t *ac;
   3198 	RF_ConfigSet_t *config_sets;
   3199 	RF_ConfigSet_t *cset;
   3200 	RF_AutoConfig_t *ac_next;
   3201 
   3202 
   3203 	config_sets = NULL;
   3204 
   3205 	/* Go through the AutoConfig list, and figure out which components
   3206 	   belong to what sets.  */
   3207 	ac = ac_list;
   3208 	while(ac!=NULL) {
   3209 		/* we're going to putz with ac->next, so save it here
   3210 		   for use at the end of the loop */
   3211 		ac_next = ac->next;
   3212 
   3213 		if (config_sets == NULL) {
   3214 			/* will need at least this one... */
   3215 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3216 				       M_RAIDFRAME, M_WAITOK);
   3217 			/* this one is easy :) */
   3218 			config_sets->ac = ac;
   3219 			config_sets->next = NULL;
   3220 			config_sets->rootable = 0;
   3221 			ac->next = NULL;
   3222 		} else {
   3223 			/* which set does this component fit into? */
   3224 			cset = config_sets;
   3225 			while(cset!=NULL) {
   3226 				if (rf_does_it_fit(cset, ac)) {
   3227 					/* looks like it matches... */
   3228 					ac->next = cset->ac;
   3229 					cset->ac = ac;
   3230 					break;
   3231 				}
   3232 				cset = cset->next;
   3233 			}
   3234 			if (cset==NULL) {
   3235 				/* didn't find a match above... new set..*/
   3236 				cset = malloc(sizeof(RF_ConfigSet_t),
   3237 					       M_RAIDFRAME, M_WAITOK);
   3238 				cset->ac = ac;
   3239 				ac->next = NULL;
   3240 				cset->next = config_sets;
   3241 				cset->rootable = 0;
   3242 				config_sets = cset;
   3243 			}
   3244 		}
   3245 		ac = ac_next;
   3246 	}
   3247 
   3248 
   3249 	return(config_sets);
   3250 }
   3251 
   3252 static int
   3253 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3254 {
   3255 	RF_ComponentLabel_t *clabel1, *clabel2;
   3256 
   3257 	/* If this one matches the *first* one in the set, that's good
   3258 	   enough, since the other members of the set would have been
   3259 	   through here too... */
   3260 	/* note that we are not checking partitionSize here..
   3261 
   3262 	   Note that we are also not checking the mod_counters here.
   3263 	   If everything else matches except the mod_counter, that's
   3264 	   good enough for this test.  We will deal with the mod_counters
   3265 	   a little later in the autoconfiguration process.
   3266 
   3267 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3268 
   3269 	   The reason we don't check for this is that failed disks
   3270 	   will have lower modification counts.  If those disks are
   3271 	   not added to the set they used to belong to, then they will
   3272 	   form their own set, which may result in 2 different sets,
   3273 	   for example, competing to be configured at raid0, and
   3274 	   perhaps competing to be the root filesystem set.  If the
   3275 	   wrong ones get configured, or both attempt to become /,
   3276 	   weird behaviour and or serious lossage will occur.  Thus we
   3277 	   need to bring them into the fold here, and kick them out at
   3278 	   a later point.
   3279 
   3280 	*/
   3281 
   3282 	clabel1 = cset->ac->clabel;
   3283 	clabel2 = ac->clabel;
   3284 	if ((clabel1->version == clabel2->version) &&
   3285 	    (clabel1->serial_number == clabel2->serial_number) &&
   3286 	    (clabel1->num_rows == clabel2->num_rows) &&
   3287 	    (clabel1->num_columns == clabel2->num_columns) &&
   3288 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3289 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3290 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3291 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3292 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3293 	    (clabel1->blockSize == clabel2->blockSize) &&
   3294 	    rf_component_label_numblocks(clabel1) ==
   3295 	    rf_component_label_numblocks(clabel2) &&
   3296 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3297 	    (clabel1->root_partition == clabel2->root_partition) &&
   3298 	    (clabel1->last_unit == clabel2->last_unit) &&
   3299 	    (clabel1->config_order == clabel2->config_order)) {
   3300 		/* if it get's here, it almost *has* to be a match */
   3301 	} else {
   3302 		/* it's not consistent with somebody in the set..
   3303 		   punt */
   3304 		return(0);
   3305 	}
   3306 	/* all was fine.. it must fit... */
   3307 	return(1);
   3308 }
   3309 
/*
 * Decide whether a config set has enough live components to be worth
 * configuring.  "Live" here means present with the newest mod_counter
 * seen anywhere in the set.  RAID 1 is special-cased: components are
 * treated as (even, odd) pairs and only the loss of both halves of a
 * pair is fatal.  Otherwise missing components are simply counted and
 * checked against what the parity level tolerates (RAID 0: none,
 * RAID 4/5: one).  Returns 1 if configurable, 0 if not.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members.  Stale (failed) components have
	   lower counters and are treated as missing below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   claiming column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair without bailing out..
				   reset the even_pair_failed flag,
				   and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3412 
   3413 static void
   3414 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3415 			RF_Raid_t *raidPtr)
   3416 {
   3417 	RF_ComponentLabel_t *clabel;
   3418 	int i;
   3419 
   3420 	clabel = ac->clabel;
   3421 
   3422 	/* 1. Fill in the common stuff */
   3423 	config->numCol = clabel->num_columns;
   3424 	config->numSpare = 0; /* XXX should this be set here? */
   3425 	config->sectPerSU = clabel->sectPerSU;
   3426 	config->SUsPerPU = clabel->SUsPerPU;
   3427 	config->SUsPerRU = clabel->SUsPerRU;
   3428 	config->parityConfig = clabel->parityConfig;
   3429 	/* XXX... */
   3430 	strcpy(config->diskQueueType,"fifo");
   3431 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3432 	config->layoutSpecificSize = 0; /* XXX ?? */
   3433 
   3434 	while(ac!=NULL) {
   3435 		/* row/col values will be in range due to the checks
   3436 		   in reasonable_label() */
   3437 		strcpy(config->devnames[0][ac->clabel->column],
   3438 		       ac->devname);
   3439 		ac = ac->next;
   3440 	}
   3441 
   3442 	for(i=0;i<RF_MAXDBGV;i++) {
   3443 		config->debugVars[i][0] = 0;
   3444 	}
   3445 }
   3446 
   3447 static int
   3448 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3449 {
   3450 	RF_ComponentLabel_t *clabel;
   3451 	int column;
   3452 	int sparecol;
   3453 
   3454 	raidPtr->autoconfigure = new_value;
   3455 
   3456 	for(column=0; column<raidPtr->numCol; column++) {
   3457 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3458 			clabel = raidget_component_label(raidPtr, column);
   3459 			clabel->autoconfigure = new_value;
   3460 			raidflush_component_label(raidPtr, column);
   3461 		}
   3462 	}
   3463 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3464 		sparecol = raidPtr->numCol + column;
   3465 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3466 			clabel = raidget_component_label(raidPtr, sparecol);
   3467 			clabel->autoconfigure = new_value;
   3468 			raidflush_component_label(raidPtr, sparecol);
   3469 		}
   3470 	}
   3471 	return(new_value);
   3472 }
   3473 
   3474 static int
   3475 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3476 {
   3477 	RF_ComponentLabel_t *clabel;
   3478 	int column;
   3479 	int sparecol;
   3480 
   3481 	raidPtr->root_partition = new_value;
   3482 	for(column=0; column<raidPtr->numCol; column++) {
   3483 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3484 			clabel = raidget_component_label(raidPtr, column);
   3485 			clabel->root_partition = new_value;
   3486 			raidflush_component_label(raidPtr, column);
   3487 		}
   3488 	}
   3489 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3490 		sparecol = raidPtr->numCol + column;
   3491 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3492 			clabel = raidget_component_label(raidPtr, sparecol);
   3493 			clabel->root_partition = new_value;
   3494 			raidflush_component_label(raidPtr, sparecol);
   3495 		}
   3496 	}
   3497 	return(new_value);
   3498 }
   3499 
   3500 static void
   3501 rf_release_all_vps(RF_ConfigSet_t *cset)
   3502 {
   3503 	RF_AutoConfig_t *ac;
   3504 
   3505 	ac = cset->ac;
   3506 	while(ac!=NULL) {
   3507 		/* Close the vp, and give it back */
   3508 		if (ac->vp) {
   3509 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3510 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3511 			vput(ac->vp);
   3512 			ac->vp = NULL;
   3513 		}
   3514 		ac = ac->next;
   3515 	}
   3516 }
   3517 
   3518 
   3519 static void
   3520 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3521 {
   3522 	RF_AutoConfig_t *ac;
   3523 	RF_AutoConfig_t *next_ac;
   3524 
   3525 	ac = cset->ac;
   3526 	while(ac!=NULL) {
   3527 		next_ac = ac->next;
   3528 		/* nuke the label */
   3529 		free(ac->clabel, M_RAIDFRAME);
   3530 		/* cleanup the config structure */
   3531 		free(ac, M_RAIDFRAME);
   3532 		/* "next.." */
   3533 		ac = next_ac;
   3534 	}
   3535 	/* and, finally, nuke the config set */
   3536 	free(cset, M_RAIDFRAME);
   3537 }
   3538 
   3539 
/*
 * Fill in a component label from the current state of the RAID set.
 * Only updates the in-memory label; it is not written to disk here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are a legacy concept; always 1 now */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3573 
/*
 * Configure one autoconfigured RAID set.  Tries to reuse the unit
 * number recorded in the component labels (last_unit), scanning
 * upward past units that are already configured.  On success, returns
 * the attached softc with cset->rootable recording whether the labels
 * request (force or soft) root-device status; on failure, returns
 * NULL and releases the softc.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no free pre-existing softc found; create one at raidID */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3645 
/*
 * Initialize a per-set memory pool named "raid%d_<pool_name>".
 *
 * w_chan: caller-supplied buffer of at least RF_MAX_POOLNAMELEN bytes
 *   that receives the formatted name.  NOTE(review): pool_init()
 *   appears to retain the name pointer — confirm the buffer outlives
 *   the pool.
 * xmin: items to pre-allocate; xmax: pool high-water mark.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);	/* upper bound on cached items */
	pool_prime(p, xmin);	/* pre-allocate the minimum */
}
   3658 
   3659 
   3660 /*
   3661  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3662  * to see if there is IO pending and if that IO could possibly be done
   3663  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3664  * otherwise.
   3665  *
   3666  */
   3667 int
   3668 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3669 {
   3670 	struct raid_softc *rs;
   3671 	struct dk_softc *dksc;
   3672 
   3673 	rs = raidPtr->softc;
   3674 	dksc = &rs->sc_dksc;
   3675 
   3676 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3677 		return 1;
   3678 
   3679 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3680 		/* there is work to do */
   3681 		return 0;
   3682 	}
   3683 	/* default is nothing to do */
   3684 	return 1;
   3685 }
   3686 
   3687 int
   3688 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3689 {
   3690 	uint64_t numsecs;
   3691 	unsigned secsize;
   3692 	int error;
   3693 
   3694 	error = getdisksize(vp, &numsecs, &secsize);
   3695 	if (error == 0) {
   3696 		diskPtr->blockSize = secsize;
   3697 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3698 		diskPtr->partitionSize = numsecs;
   3699 		return 0;
   3700 	}
   3701 	return error;
   3702 }
   3703 
/* Autoconf match function: raid is a pseudo-device, always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3709 
/*
 * Autoconf attach function: nothing to do at attach time; the real
 * setup happens when a RAID set is configured on the unit.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3714 
   3715 
   3716 static int
   3717 raid_detach(device_t self, int flags)
   3718 {
   3719 	int error;
   3720 	struct raid_softc *rs = raidsoftc(self);
   3721 
   3722 	if (rs == NULL)
   3723 		return ENXIO;
   3724 
   3725 	if ((error = raidlock(rs)) != 0)
   3726 		return error;
   3727 
   3728 	error = raid_detach_unlocked(rs);
   3729 
   3730 	raidunlock(rs);
   3731 
   3732 	/* XXX raid can be referenced here */
   3733 
   3734 	if (error)
   3735 		return error;
   3736 
   3737 	/* Free the softc */
   3738 	raidput(rs);
   3739 
   3740 	return 0;
   3741 }
   3742 
/*
 * Publish a disk geometry for the RAID set.  The geometry is
 * synthetic: sectors-per-track is one stripe's worth of data sectors,
 * and the track count is an arbitrary multiple of the column count.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	/* one "track" == the data sectors of one full stripe */
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count (geometry is synthetic anyway) */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3758 
   3759 /*
   3760  * Get cache info for all the components (including spares).
   3761  * Returns intersection of all the cache flags of all disks, or first
   3762  * error if any encountered.
   3763  * XXXfua feature flags can change as spares are added - lock down somehow
   3764  */
   3765 static int
   3766 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3767 {
   3768 	int c;
   3769 	int error;
   3770 	int dkwhole = 0, dkpart;
   3771 
   3772 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3773 		/*
   3774 		 * Check any non-dead disk, even when currently being
   3775 		 * reconstructed.
   3776 		 */
   3777 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3778 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3779 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3780 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3781 			if (error) {
   3782 				if (error != ENODEV) {
   3783 					printf("raid%d: get cache for component %s failed\n",
   3784 					    raidPtr->raidid,
   3785 					    raidPtr->Disks[c].devname);
   3786 				}
   3787 
   3788 				return error;
   3789 			}
   3790 
   3791 			if (c == 0)
   3792 				dkwhole = dkpart;
   3793 			else
   3794 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3795 		}
   3796 	}
   3797 
   3798 	*data = dkwhole;
   3799 
   3800 	return 0;
   3801 }
   3802 
   3803 /*
   3804  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3805  * We end up returning whatever error was returned by the first cache flush
   3806  * that fails.
   3807  */
   3808 
   3809 static int
   3810 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3811 {
   3812 	int e = 0;
   3813 	for (int i = 0; i < 5; i++) {
   3814 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3815 		    &force, FWRITE, NOCRED);
   3816 		if (!e || e == ENODEV)
   3817 			return e;
   3818 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3819 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3820 	}
   3821 	return e;
   3822 }
   3823 
   3824 int
   3825 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3826 {
   3827 	int c, error;
   3828 
   3829 	error = 0;
   3830 	for (c = 0; c < raidPtr->numCol; c++) {
   3831 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3832 			int e = rf_sync_component_cache(raidPtr, c, force);
   3833 			if (e && !error)
   3834 				error = e;
   3835 		}
   3836 	}
   3837 
   3838 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3839 		int sparecol = raidPtr->numCol + c;
   3840 		/* Need to ensure that the reconstruct actually completed! */
   3841 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3842 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3843 			    force);
   3844 			if (e && !error)
   3845 				error = e;
   3846 		}
   3847 	}
   3848 	return error;
   3849 }
   3850 
   3851 /* Fill in info with the current status */
   3852 void
   3853 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3854 {
   3855 
   3856 	memset(info, 0, sizeof(*info));
   3857 
   3858 	if (raidPtr->status != rf_rs_reconstructing) {
   3859 		info->total = 100;
   3860 		info->completed = 100;
   3861 	} else {
   3862 		info->total = raidPtr->reconControl->numRUsTotal;
   3863 		info->completed = raidPtr->reconControl->numRUsComplete;
   3864 	}
   3865 	info->remaining = info->total - info->completed;
   3866 }
   3867 
   3868 /* Fill in info with the current status */
   3869 void
   3870 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3871 {
   3872 
   3873 	memset(info, 0, sizeof(*info));
   3874 
   3875 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3876 		info->total = raidPtr->Layout.numStripe;
   3877 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3878 	} else {
   3879 		info->completed = 100;
   3880 		info->total = 100;
   3881 	}
   3882 	info->remaining = info->total - info->completed;
   3883 }
   3884 
   3885 /* Fill in info with the current status */
   3886 void
   3887 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3888 {
   3889 
   3890 	memset(info, 0, sizeof(*info));
   3891 
   3892 	if (raidPtr->copyback_in_progress == 1) {
   3893 		info->total = raidPtr->Layout.numStripe;
   3894 		info->completed = raidPtr->copyback_stripes_done;
   3895 		info->remaining = info->total - info->completed;
   3896 	} else {
   3897 		info->remaining = 0;
   3898 		info->completed = 100;
   3899 		info->total = 100;
   3900 	}
   3901 }
   3902 
   3903 /* Fill in config with the current info */
   3904 int
   3905 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3906 {
   3907 	int	d, i, j;
   3908 
   3909 	if (!raidPtr->valid)
   3910 		return ENODEV;
   3911 	config->cols = raidPtr->numCol;
   3912 	config->ndevs = raidPtr->numCol;
   3913 	if (config->ndevs >= RF_MAX_DISKS)
   3914 		return ENOMEM;
   3915 	config->nspares = raidPtr->numSpare;
   3916 	if (config->nspares >= RF_MAX_DISKS)
   3917 		return ENOMEM;
   3918 	config->maxqdepth = raidPtr->maxQueueDepth;
   3919 	d = 0;
   3920 	for (j = 0; j < config->cols; j++) {
   3921 		config->devs[d] = raidPtr->Disks[j];
   3922 		d++;
   3923 	}
   3924 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3925 		config->spares[i] = raidPtr->Disks[j];
   3926 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3927 			/* XXX: raidctl(8) expects to see this as a used spare */
   3928 			config->spares[i].status = rf_ds_used_spare;
   3929 		}
   3930 	}
   3931 	return 0;
   3932 }
   3933 
   3934 int
   3935 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3936 {
   3937 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3938 	RF_ComponentLabel_t *raid_clabel;
   3939 	int column = clabel->column;
   3940 
   3941 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3942 		return EINVAL;
   3943 	raid_clabel = raidget_component_label(raidPtr, column);
   3944 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3945 	/* Fix-up for userland. */
   3946 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3947 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3948 
   3949 	return 0;
   3950 }
   3951 
   3952 /*
   3953  * Module interface
   3954  */
   3955 
/* Declare the "raid" driver module; depends on dk_subr and bufq_fcfs. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a loadable module, supply our own cfdriver. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3965 
   3966 static int
   3967 raid_modcmd(modcmd_t cmd, void *data)
   3968 {
   3969 	int error;
   3970 
   3971 	error = 0;
   3972 	switch (cmd) {
   3973 	case MODULE_CMD_INIT:
   3974 		error = raid_modcmd_init();
   3975 		break;
   3976 	case MODULE_CMD_FINI:
   3977 		error = raid_modcmd_fini();
   3978 		break;
   3979 	default:
   3980 		error = ENOTTY;
   3981 		break;
   3982 	}
   3983 	return error;
   3984 }
   3985 
/*
 * Module initialization: set up global locks, attach the devsw and
 * autoconf glue, boot the RAIDframe core, and register a finalizer
 * for auto-configuration of RAID sets.
 *
 * Each failure path unwinds exactly the steps already completed, in
 * reverse order, before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table requests. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be present. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 here (all failures returned above). */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   4056 
/*
 * Module teardown: refuse while any raid softc exists, then detach
 * the autoconf glue and devsw in reverse order of raid_modcmd_init(),
 * shut down the RAIDframe core, and destroy the global locks.
 *
 * Each failure path re-attaches what was already detached so the
 * module remains in a consistent, usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Undo the cfdriver/cfattach detaches above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4106