/*	rf_netbsdkintf.c — NetBSD raidframe kernel interface, revision 1.397.2.1	*/
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.397.2.1 2021/08/05 03:37:41 thorpej Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.397.2.1 2021/08/05 03:37:41 thorpej Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
    218 const struct bdevsw raid_bdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_strategy = raidstrategy,
    222 	.d_ioctl = raidioctl,
    223 	.d_dump = raiddump,
    224 	.d_psize = raidsize,
    225 	.d_discard = nodiscard,
    226 	.d_flag = D_DISK
    227 };
    228 
    229 const struct cdevsw raid_cdevsw = {
    230 	.d_open = raidopen,
    231 	.d_close = raidclose,
    232 	.d_read = raidread,
    233 	.d_write = raidwrite,
    234 	.d_ioctl = raidioctl,
    235 	.d_stop = nostop,
    236 	.d_tty = notty,
    237 	.d_poll = nopoll,
    238 	.d_mmap = nommap,
    239 	.d_kqfilter = nokqfilter,
    240 	.d_discard = nodiscard,
    241 	.d_flag = D_DISK
    242 };
    243 
    244 static struct dkdriver rf_dkdriver = {
    245 	.d_open = raidopen,
    246 	.d_close = raidclose,
    247 	.d_strategy = raidstrategy,
    248 	.d_diskstart = raid_diskstart,
    249 	.d_dumpblocks = raid_dumpblocks,
    250 	.d_lastclose = raid_lastclose,
    251 	.d_minphys = minphys
    252 };
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /* Internal representation of a rf_recon_req */
    263 struct rf_recon_req_internal {
    264 	RF_RowCol_t col;
    265 	RF_ReconReqFlags_t flags;
    266 	void   *raidPtr;
    267 };
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 static void rf_ReconThread(struct rf_recon_req_internal *);
    304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 static int rf_autoconfig(device_t);
    308 static int rf_rescan(void);
    309 static void rf_buildroothack(RF_ConfigSet_t *);
    310 
    311 static RF_AutoConfig_t *rf_find_raid_components(void);
    312 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    314 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    315 static int rf_set_autoconfig(RF_Raid_t *, int);
    316 static int rf_set_rootpartition(RF_Raid_t *, int);
    317 static void rf_release_all_vps(RF_ConfigSet_t *);
    318 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    319 static int rf_have_enough_components(RF_ConfigSet_t *);
    320 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    322 
    323 /*
    324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    326  * in the kernel config file.
    327  */
    328 #ifdef RAID_AUTOCONFIG
    329 int raidautoconfig = 1;
    330 #else
    331 int raidautoconfig = 0;
    332 #endif
    333 static bool raidautoconfigdone = false;
    334 
    335 struct pool rf_alloclist_pool;   /* AllocList */
    336 
    337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    338 static kmutex_t raid_lock;
    339 
    340 static struct raid_softc *
    341 raidcreate(int unit) {
    342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    343 	sc->sc_unit = unit;
    344 	cv_init(&sc->sc_cv, "raidunit");
    345 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    346 	return sc;
    347 }
    348 
    349 static void
    350 raiddestroy(struct raid_softc *sc) {
    351 	cv_destroy(&sc->sc_cv);
    352 	mutex_destroy(&sc->sc_mutex);
    353 	kmem_free(sc, sizeof(*sc));
    354 }
    355 
    356 static struct raid_softc *
    357 raidget(int unit, bool create) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if (!create)
    374 		return NULL;
    375 	sc = raidcreate(unit);
    376 	mutex_enter(&raid_lock);
    377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    378 	mutex_exit(&raid_lock);
    379 	return sc;
    380 }
    381 
    382 static void
    383 raidput(struct raid_softc *sc) {
    384 	mutex_enter(&raid_lock);
    385 	LIST_REMOVE(sc, sc_link);
    386 	mutex_exit(&raid_lock);
    387 	raiddestroy(sc);
    388 }
    389 
/*
 * Legacy pseudo-device attach hook; "num" is ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    399 
/*
 * Perform one-time RAID autoconfiguration at boot: locate all RAID
 * components in the system, group them into configuration sets, and
 * configure the valid ones (rf_buildroothack() also handles electing
 * a root-on-RAID device).  Returns 1 when a scan was performed, 0 if
 * autoconfig is disabled or has already been done.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    437 
/* Return non-zero if this RAID set has been configured (RAIDF_INITED). */
int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}
    442 
/* Return a pointer to the RF_Raid_t embedded in this softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    447 
/* Return the raid unit number of this softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    452 
/*
 * Check whether RAID set "r" contains the device "bdv" (typically the
 * boot device) as one of its components.  Wedge components ("dkN") are
 * compared via their parent disk's name.  Returns 1 if found, else 0.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* Skip the "/dev/" prefix of the stored component path. */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* Wedge component: compare the parent disk name. */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		/*
		 * NOTE(review): this is a prefix comparison, so bootname
		 * "sd1" would also match a component on "sd10"; presumably
		 * acceptable because a partition letter normally follows
		 * the unit number -- confirm against naming conventions.
		 */
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    483 
    484 static int
    485 rf_rescan(void)
    486 {
    487 	RF_AutoConfig_t *ac_list;
    488 	RF_ConfigSet_t *config_sets, *cset, *next_cset;
    489 	struct raid_softc *sc;
    490 	int raid_added;
    491 
    492 	ac_list = rf_find_raid_components();
    493 	config_sets = rf_create_auto_sets(ac_list);
    494 
    495 	raid_added = 1;
    496 	while (raid_added > 0) {
    497 		raid_added = 0;
    498 		cset = config_sets;
    499 		while (cset != NULL) {
    500 			next_cset = cset->next;
    501 			if (rf_have_enough_components(cset) &&
    502 			    cset->ac->clabel->autoconfigure == 1) {
    503 				sc = rf_auto_config_set(cset);
    504 				if (sc != NULL) {
    505 					aprint_debug("raid%d: configured ok, rootable %d\n",
    506 						     sc->sc_unit, cset->rootable);
    507 					/* We added one RAID set */
    508 					raid_added++;
    509 				} else {
    510 					/* The autoconfig didn't work :( */
    511 					aprint_debug("Autoconfig failed\n");
    512 					rf_release_all_vps(cset);
    513 				}
    514 			} else {
    515 				/* we're not autoconfiguring this set...
    516 				   release the associated resources */
    517 				rf_release_all_vps(cset);
    518 			}
    519 			/* cleanup */
    520 			rf_cleanup_config_set(cset);
    521 			cset = next_cset;
    522 		}
    523 		if (raid_added > 0) {
    524 			/* We added at least one RAID set, so re-scan for recursive RAID */
    525 			ac_list = rf_find_raid_components();
    526 			config_sets = rf_create_auto_sets(ac_list);
    527 		}
    528 	}
    529 
    530 	return 0;
    531 }
    532 
    533 
/*
 * Configure all autoconfigurable RAID sets found at boot and, where
 * appropriate, elect one of them as the root device.  Recursive RAID
 * (a set built on top of another set) is handled by re-scanning after
 * every pass that configures something.  booted_device/boothowto are
 * left alone when the user has hardwired a root (rootspec != NULL).
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	/* Keep configuring until a pass adds no new sets. */
	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						/* Remember the last rootable set seen. */
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Narrow the candidates to sets containing the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    692 
    693 static int
    694 raidsize(dev_t dev)
    695 {
    696 	struct raid_softc *rs;
    697 	struct dk_softc *dksc;
    698 	unsigned int unit;
    699 
    700 	unit = raidunit(dev);
    701 	if ((rs = raidget(unit, false)) == NULL)
    702 		return -1;
    703 	dksc = &rs->sc_dksc;
    704 
    705 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    706 		return -1;
    707 
    708 	return dk_size(dksc, dev);
    709 }
    710 
    711 static int
    712 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    713 {
    714 	unsigned int unit;
    715 	struct raid_softc *rs;
    716 	struct dk_softc *dksc;
    717 
    718 	unit = raidunit(dev);
    719 	if ((rs = raidget(unit, false)) == NULL)
    720 		return ENXIO;
    721 	dksc = &rs->sc_dksc;
    722 
    723 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    724 		return ENODEV;
    725 
    726         /*
    727            Note that blkno is relative to this particular partition.
    728            By adding adding RF_PROTECTED_SECTORS, we get a value that
    729 	   is relative to the partition used for the underlying component.
    730         */
    731 	blkno += RF_PROTECTED_SECTORS;
    732 
    733 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    734 }
    735 
/*
 * Dump "nblk" blocks starting at "blkno" directly to one live
 * component of this RAID set.  Only RAID 1 sets (one data column plus
 * one parity column) are supported, since each mirror member holds a
 * complete copy of the data.  Returns 0 on success or an errno value.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Find which original column this spare replaces. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump directly to the chosen component's driver. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    841 
/*
 * Block/character device open entry point.  Creates the softc on
 * first use, refuses opens during shutdown, and on the first open of
 * a configured set marks all components dirty before delegating the
 * rest of the work to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	/* Refuse new opens while the device is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}
    891 
/*
 * Final-close hook for the unit: write out the final component labels
 * and, if a shutdown was requested earlier (RAIDF_SHUTDOWN), flag the
 * unit for detach so the close path tears it down.
 */
static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}
    911 
    912 /* ARGSUSED */
/*
 * Close entry point.  Performs the dk-level close under the raid lock,
 * then — after dropping the lock — either detaches the pseudo-device
 * (RAIDF_DETACH was set by raid_lastclose) or releases the softc
 * (unit was never configured but a shutdown was requested).  The
 * detach/put work is deliberately deferred until after raidunlock().
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;

}
    951 
/*
 * Signal iodone_cv (taken and released under iodone_lock) so whatever
 * sleeps on it -- presumably the RAIDframe I/O service thread -- wakes
 * up and re-examines its queues.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    959 
    960 static void
    961 raidstrategy(struct buf *bp)
    962 {
    963 	unsigned int unit;
    964 	struct raid_softc *rs;
    965 	struct dk_softc *dksc;
    966 	RF_Raid_t *raidPtr;
    967 
    968 	unit = raidunit(bp->b_dev);
    969 	if ((rs = raidget(unit, false)) == NULL) {
    970 		bp->b_error = ENXIO;
    971 		goto fail;
    972 	}
    973 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    974 		bp->b_error = ENXIO;
    975 		goto fail;
    976 	}
    977 	dksc = &rs->sc_dksc;
    978 	raidPtr = &rs->sc_r;
    979 
    980 	/* Queue IO only */
    981 	if (dk_strategy_defer(dksc, bp))
    982 		goto done;
    983 
    984 	/* schedule the IO to happen at the next convenient time */
    985 	raid_wakeup(raidPtr);
    986 
    987 done:
    988 	return;
    989 
    990 fail:
    991 	bp->b_resid = bp->b_bcount;
    992 	biodone(bp);
    993 }
    994 
    995 static int
    996 raid_diskstart(device_t dev, struct buf *bp)
    997 {
    998 	struct raid_softc *rs = raidsoftc(dev);
    999 	RF_Raid_t *raidPtr;
   1000 
   1001 	raidPtr = &rs->sc_r;
   1002 	if (!raidPtr->valid) {
   1003 		db1_printf(("raid is not valid..\n"));
   1004 		return ENODEV;
   1005 	}
   1006 
   1007 	/* XXX */
   1008 	bp->b_resid = 0;
   1009 
   1010 	return raiddoaccess(raidPtr, bp);
   1011 }
   1012 
/*
 * Completion handler for a RAIDframe I/O: report the finished buffer to
 * the dk(9) layer, give back one concurrent-I/O slot ("opening") under
 * raidPtr->mutex, and wake the RAIDframe thread so further queued I/O
 * gets scheduled.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
   1031 
   1032 /* ARGSUSED */
   1033 static int
   1034 raidread(dev_t dev, struct uio *uio, int flags)
   1035 {
   1036 	int     unit = raidunit(dev);
   1037 	struct raid_softc *rs;
   1038 
   1039 	if ((rs = raidget(unit, false)) == NULL)
   1040 		return ENXIO;
   1041 
   1042 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1043 		return ENXIO;
   1044 
   1045 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1046 
   1047 }
   1048 
   1049 /* ARGSUSED */
   1050 static int
   1051 raidwrite(dev_t dev, struct uio *uio, int flags)
   1052 {
   1053 	int     unit = raidunit(dev);
   1054 	struct raid_softc *rs;
   1055 
   1056 	if ((rs = raidget(unit, false)) == NULL)
   1057 		return ENXIO;
   1058 
   1059 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1060 		return ENXIO;
   1061 
   1062 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1063 
   1064 }
   1065 
/*
 * Tear down a configured RAID set: shut RAIDframe down, drain and free
 * the buffer queue, then detach the wedge/disk/dk layers -- in that
 * order.  Fails with EBUSY while the device is open or while a
 * reconstruction, parity rewrite, or copyback is in progress.
 * NOTE(review): the "_unlocked" suffix presumably means the caller
 * manages the raid lock -- confirm at call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1103 
/*
 * Return true when `cmd' requires a configured RAID set but this unit
 * is not initialized (RAIDF_INITED clear); the ioctl path then rejects
 * the request with ENXIO.  Commands not listed here never require an
 * initialized set.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
   1141 
/*
 * Fail the component in column rr->col and start a reconstruction
 * thread.  Refused with EINVAL for RAID 0, an out-of-range column,
 * while a reconstruction is running, when a different component has
 * already failed (the target is still optimal), or when the target
 * is a spared disk.  The status checks are made under raidPtr->mutex.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* rejected under the mutex: drop it and report EINVAL */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1190 
   1191 static int
   1192 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1193 {
   1194 	/* allocate a buffer for the layout-specific data, and copy it in */
   1195 	if (k_cfg->layoutSpecificSize == 0)
   1196 		return 0;
   1197 
   1198 	if (k_cfg->layoutSpecificSize > 10000) {
   1199 	    /* sanity check */
   1200 	    return EINVAL;
   1201 	}
   1202 
   1203 	u_char *specific_buf;
   1204 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1205 	if (specific_buf == NULL)
   1206 		return ENOMEM;
   1207 
   1208 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1209 	    k_cfg->layoutSpecificSize);
   1210 	if (retcode) {
   1211 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1212 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1213 		return retcode;
   1214 	}
   1215 
   1216 	k_cfg->layoutSpecific = specific_buf;
   1217 	return 0;
   1218 }
   1219 
   1220 static int
   1221 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1222 {
   1223 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1224 
   1225 	if (rs->sc_r.valid) {
   1226 		/* There is a valid RAID set running on this unit! */
   1227 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1228 		return EINVAL;
   1229 	}
   1230 
   1231 	/* copy-in the configuration information */
   1232 	/* data points to a pointer to the configuration structure */
   1233 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1234 	if (*k_cfg == NULL) {
   1235 		return ENOMEM;
   1236 	}
   1237 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1238 	if (retcode == 0)
   1239 		return 0;
   1240 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1241 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1242 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1243 	return retcode;
   1244 }
   1245 
/*
 * Configure a RAID set from the kernel copy of the user configuration.
 * k_cfg (and any layout-specific buffer hung off it) is consumed and
 * freed here regardless of outcome.  On success the unit is attached
 * via raidinit() and all components are marked dirty; on failure
 * RAIDF_SHUTDOWN is set so the unit is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1298 
   1299 #if RF_DISABLED
/*
 * (Compiled only under RF_DISABLED.)  Copy a user-supplied component
 * label over the in-core label for the given column and flush it to
 * disk.  Deliberately neutered: users should re-init labels rather
 * than patch them.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1336 #endif
   1337 
/*
 * Initialize on-disk component labels for every live component.  Only
 * the serial number is taken from the user-supplied label; everything
 * else comes from the configuration used to create this RAID set.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
   1370 
/*
 * Rebuild a failed component in place: spawn
 * rf_ReconstructInPlaceThread for the requested column.  Refused with
 * EINVAL for RAID 0, an out-of-range column, when a different
 * component has already failed, when a reconstruction is already
 * running on the column, or when the disk is spared.  The status
 * checks are made under raidPtr->mutex.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* copy the request so we don't depend on the user's buffer */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1438 
   1439 static int
   1440 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1441 {
   1442 	/*
   1443 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1444 	 * so tell the user it's done.
   1445 	 */
   1446 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1447 	    raidPtr->status != rf_rs_reconstructing) {
   1448 		*data = 100;
   1449 		return 0;
   1450 	}
   1451 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1452 		*data = 0;
   1453 		return 0;
   1454 	}
   1455 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1456 	    / raidPtr->reconControl->numRUsTotal);
   1457 	return 0;
   1458 }
   1459 
   1460 static int
   1461 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1462 {
   1463 	int     unit = raidunit(dev);
   1464 	int     part, pmask;
   1465 	struct raid_softc *rs;
   1466 	struct dk_softc *dksc;
   1467 	RF_Config_t *k_cfg;
   1468 	RF_Raid_t *raidPtr;
   1469 	RF_AccTotals_t *totals;
   1470 	RF_SingleComponent_t component;
   1471 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1472 	int retcode = 0;
   1473 	int column;
   1474 	RF_ComponentLabel_t *clabel;
   1475 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1476 	int d;
   1477 
   1478 	if ((rs = raidget(unit, false)) == NULL)
   1479 		return ENXIO;
   1480 
   1481 	dksc = &rs->sc_dksc;
   1482 	raidPtr = &rs->sc_r;
   1483 
   1484 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1485 	    (int) DISKPART(dev), (int) unit, cmd));
   1486 
   1487 	/* Must be initialized for these... */
   1488 	if (rf_must_be_initialized(rs, cmd))
   1489 		return ENXIO;
   1490 
   1491 	switch (cmd) {
   1492 		/* configure the system */
   1493 	case RAIDFRAME_CONFIGURE:
   1494 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1495 			return retcode;
   1496 		return rf_construct(rs, k_cfg);
   1497 
   1498 		/* shutdown the system */
   1499 	case RAIDFRAME_SHUTDOWN:
   1500 
   1501 		part = DISKPART(dev);
   1502 		pmask = (1 << part);
   1503 
   1504 		if ((retcode = raidlock(rs)) != 0)
   1505 			return retcode;
   1506 
   1507 		if (DK_BUSY(dksc, pmask) ||
   1508 		    raidPtr->recon_in_progress != 0 ||
   1509 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1510 		    raidPtr->copyback_in_progress != 0)
   1511 			retcode = EBUSY;
   1512 		else {
   1513 			/* detach and free on close */
   1514 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1515 			retcode = 0;
   1516 		}
   1517 
   1518 		raidunlock(rs);
   1519 
   1520 		return retcode;
   1521 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1522 		return rf_get_component_label(raidPtr, data);
   1523 
   1524 #if RF_DISABLED
   1525 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1526 		return rf_set_component_label(raidPtr, data);
   1527 #endif
   1528 
   1529 	case RAIDFRAME_INIT_LABELS:
   1530 		return rf_init_component_label(raidPtr, data);
   1531 
   1532 	case RAIDFRAME_SET_AUTOCONFIG:
   1533 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1534 		printf("raid%d: New autoconfig value is: %d\n",
   1535 		       raidPtr->raidid, d);
   1536 		*(int *) data = d;
   1537 		return retcode;
   1538 
   1539 	case RAIDFRAME_SET_ROOT:
   1540 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1541 		printf("raid%d: New rootpartition value is: %d\n",
   1542 		       raidPtr->raidid, d);
   1543 		*(int *) data = d;
   1544 		return retcode;
   1545 
   1546 		/* initialize all parity */
   1547 	case RAIDFRAME_REWRITEPARITY:
   1548 
   1549 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1550 			/* Parity for RAID 0 is trivially correct */
   1551 			raidPtr->parity_good = RF_RAID_CLEAN;
   1552 			return 0;
   1553 		}
   1554 
   1555 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1556 			/* Re-write is already in progress! */
   1557 			return EINVAL;
   1558 		}
   1559 
   1560 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1561 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1562 
   1563 	case RAIDFRAME_ADD_HOT_SPARE:
   1564 		sparePtr = (RF_SingleComponent_t *) data;
   1565 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1566 		return rf_add_hot_spare(raidPtr, &component);
   1567 
   1568 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1569 		return retcode;
   1570 
   1571 	case RAIDFRAME_DELETE_COMPONENT:
   1572 		componentPtr = (RF_SingleComponent_t *)data;
   1573 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1574 		return rf_delete_component(raidPtr, &component);
   1575 
   1576 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1577 		componentPtr = (RF_SingleComponent_t *)data;
   1578 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1579 		return rf_incorporate_hot_spare(raidPtr, &component);
   1580 
   1581 	case RAIDFRAME_REBUILD_IN_PLACE:
   1582 		return rf_rebuild_in_place(raidPtr, data);
   1583 
   1584 	case RAIDFRAME_GET_INFO:
   1585 		ucfgp = *(RF_DeviceConfig_t **)data;
   1586 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1587 		if (d_cfg == NULL)
   1588 			return ENOMEM;
   1589 		retcode = rf_get_info(raidPtr, d_cfg);
   1590 		if (retcode == 0) {
   1591 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1592 		}
   1593 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1594 		return retcode;
   1595 
   1596 	case RAIDFRAME_CHECK_PARITY:
   1597 		*(int *) data = raidPtr->parity_good;
   1598 		return 0;
   1599 
   1600 	case RAIDFRAME_PARITYMAP_STATUS:
   1601 		if (rf_paritymap_ineligible(raidPtr))
   1602 			return EINVAL;
   1603 		rf_paritymap_status(raidPtr->parity_map, data);
   1604 		return 0;
   1605 
   1606 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1607 		if (rf_paritymap_ineligible(raidPtr))
   1608 			return EINVAL;
   1609 		if (raidPtr->parity_map == NULL)
   1610 			return ENOENT; /* ??? */
   1611 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1612 			return EINVAL;
   1613 		return 0;
   1614 
   1615 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1616 		if (rf_paritymap_ineligible(raidPtr))
   1617 			return EINVAL;
   1618 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1619 		return 0;
   1620 
   1621 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1622 		if (rf_paritymap_ineligible(raidPtr))
   1623 			return EINVAL;
   1624 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1625 		/* XXX should errors be passed up? */
   1626 		return 0;
   1627 
   1628 	case RAIDFRAME_RESCAN:
   1629 		return rf_rescan();
   1630 
   1631 	case RAIDFRAME_RESET_ACCTOTALS:
   1632 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1633 		return 0;
   1634 
   1635 	case RAIDFRAME_GET_ACCTOTALS:
   1636 		totals = (RF_AccTotals_t *) data;
   1637 		*totals = raidPtr->acc_totals;
   1638 		return 0;
   1639 
   1640 	case RAIDFRAME_KEEP_ACCTOTALS:
   1641 		raidPtr->keep_acc_totals = *(int *)data;
   1642 		return 0;
   1643 
   1644 	case RAIDFRAME_GET_SIZE:
   1645 		*(int *) data = raidPtr->totalSectors;
   1646 		return 0;
   1647 
   1648 	case RAIDFRAME_FAIL_DISK:
   1649 		return rf_fail_disk(raidPtr, data);
   1650 
   1651 		/* invoke a copyback operation after recon on whatever disk
   1652 		 * needs it, if any */
   1653 	case RAIDFRAME_COPYBACK:
   1654 
   1655 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1656 			/* This makes no sense on a RAID 0!! */
   1657 			return EINVAL;
   1658 		}
   1659 
   1660 		if (raidPtr->copyback_in_progress == 1) {
   1661 			/* Copyback is already in progress! */
   1662 			return EINVAL;
   1663 		}
   1664 
   1665 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1666 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1667 
   1668 		/* return the percentage completion of reconstruction */
   1669 	case RAIDFRAME_CHECK_RECON_STATUS:
   1670 		return rf_check_recon_status(raidPtr, data);
   1671 
   1672 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1673 		rf_check_recon_status_ext(raidPtr, data);
   1674 		return 0;
   1675 
   1676 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1677 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1678 			/* This makes no sense on a RAID 0, so tell the
   1679 			   user it's done. */
   1680 			*(int *) data = 100;
   1681 			return 0;
   1682 		}
   1683 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1684 			*(int *) data = 100 *
   1685 				raidPtr->parity_rewrite_stripes_done /
   1686 				raidPtr->Layout.numStripe;
   1687 		} else {
   1688 			*(int *) data = 100;
   1689 		}
   1690 		return 0;
   1691 
   1692 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1693 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1694 		return 0;
   1695 
   1696 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1697 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1698 			/* This makes no sense on a RAID 0 */
   1699 			*(int *) data = 100;
   1700 			return 0;
   1701 		}
   1702 		if (raidPtr->copyback_in_progress == 1) {
   1703 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1704 				raidPtr->Layout.numStripe;
   1705 		} else {
   1706 			*(int *) data = 100;
   1707 		}
   1708 		return 0;
   1709 
   1710 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1711 		rf_check_copyback_status_ext(raidPtr, data);
   1712 		return 0;
   1713 
   1714 	case RAIDFRAME_SET_LAST_UNIT:
   1715 		for (column = 0; column < raidPtr->numCol; column++)
   1716 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1717 				return EBUSY;
   1718 
   1719 		for (column = 0; column < raidPtr->numCol; column++) {
   1720 			clabel = raidget_component_label(raidPtr, column);
   1721 			clabel->last_unit = *(int *)data;
   1722 			raidflush_component_label(raidPtr, column);
   1723 		}
   1724 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1725 		return 0;
   1726 
   1727 		/* the sparetable daemon calls this to wait for the kernel to
   1728 		 * need a spare table. this ioctl does not return until a
   1729 		 * spare table is needed. XXX -- calling mpsleep here in the
   1730 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1731 		 * -- I should either compute the spare table in the kernel,
   1732 		 * or have a different -- XXX XXX -- interface (a different
   1733 		 * character device) for delivering the table     -- XXX */
   1734 #if RF_DISABLED
   1735 	case RAIDFRAME_SPARET_WAIT:
   1736 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1737 		while (!rf_sparet_wait_queue)
   1738 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1739 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1740 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1741 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1742 
   1743 		/* structure assignment */
   1744 		*((RF_SparetWait_t *) data) = *waitreq;
   1745 
   1746 		RF_Free(waitreq, sizeof(*waitreq));
   1747 		return 0;
   1748 
   1749 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1750 		 * code in it that will cause the dameon to exit */
   1751 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1752 		waitreq = RF_Malloc(sizeof(*waitreq));
   1753 		waitreq->fcol = -1;
   1754 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1755 		waitreq->next = rf_sparet_wait_queue;
   1756 		rf_sparet_wait_queue = waitreq;
   1757 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1758 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1759 		return 0;
   1760 
   1761 		/* used by the spare table daemon to deliver a spare table
   1762 		 * into the kernel */
   1763 	case RAIDFRAME_SEND_SPARET:
   1764 
   1765 		/* install the spare table */
   1766 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1767 
   1768 		/* respond to the requestor.  the return status of the spare
   1769 		 * table installation is passed in the "fcol" field */
   1770 		waitred = RF_Malloc(sizeof(*waitreq));
   1771 		waitreq->fcol = retcode;
   1772 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1773 		waitreq->next = rf_sparet_resp_queue;
   1774 		rf_sparet_resp_queue = waitreq;
   1775 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1776 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1777 
   1778 		return retcode;
   1779 #endif
   1780 	default:
   1781 		/*
   1782 		 * Don't bother trying to load compat modules
   1783 		 * if it is not our ioctl. This is more efficient
   1784 		 * and makes rump tests not depend on compat code
   1785 		 */
   1786 		if (IOCGROUP(cmd) != 'r')
   1787 			break;
   1788 #ifdef _LP64
   1789 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1790 			module_autoload("compat_netbsd32_raid",
   1791 			    MODULE_CLASS_EXEC);
   1792 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1793 			    (rs, cmd, data), enosys(), retcode);
   1794 			if (retcode != EPASSTHROUGH)
   1795 				return retcode;
   1796 		}
   1797 #endif
   1798 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1799 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1800 		    (rs, cmd, data), enosys(), retcode);
   1801 		if (retcode != EPASSTHROUGH)
   1802 			return retcode;
   1803 
   1804 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1805 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1806 		    (rs, cmd, data), enosys(), retcode);
   1807 		if (retcode != EPASSTHROUGH)
   1808 			return retcode;
   1809 		break; /* fall through to the os-specific code below */
   1810 
   1811 	}
   1812 
   1813 	if (!raidPtr->valid)
   1814 		return EINVAL;
   1815 
   1816 	/*
   1817 	 * Add support for "regular" device ioctls here.
   1818 	 */
   1819 
   1820 	switch (cmd) {
   1821 	case DIOCGCACHE:
   1822 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1823 		break;
   1824 
   1825 	case DIOCCACHESYNC:
   1826 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1827 		break;
   1828 
   1829 	default:
   1830 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1831 		break;
   1832 	}
   1833 
   1834 	return retcode;
   1835 
   1836 }
   1837 
   1838 
   1839 /* raidinit -- complete the rest of the initialization for the
   1840    RAIDframe device.  */
   1841 
   1842 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* On failure, free the cfdata and bail; RAIDF_INITED is
		 * never set, so the unit stays unusable. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Probe for wedges (dk devices) on the new unit. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1898 
   1899 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1900 /* wake up the daemon & tell it to get us a spare table
   1901  * XXX
   1902  * the entries in the queues should be tagged with the raidPtr
   1903  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1905  * XXX
   1906  *
   1907  * XXX This code is not currently used. GO
   1908  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue our request and wake the daemon that services it. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* Wait for a response to appear; rf_wait_cond2 drops and retakes
	 * the mutex around the sleep (comment formerly said "mpsleep"). */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon's reply carries the result in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1932 #endif
   1933 
   1934 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1935  * bp & passes it down.
   1936  * any calls originating in the kernel must use non-blocking I/O
   1937  * do some extra sanity checking to return "appropriate" error values for
   1938  * certain conditions (to make some standard utilities work)
   1939  *
   1940  * Formerly known as: rf_DoAccessKernel
   1941  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex around the label update (it may sleep),
		 * then retake it to decrement the counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Nothing to do if the unit was never fully configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Ask the dk(4) layer to push any queued buffers down to us. */
	dk_start(dksc, NULL);
}
   1968 
   1969 static int
   1970 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1971 {
   1972 	RF_SectorCount_t num_blocks, pb, sum;
   1973 	RF_RaidAddr_t raid_addr;
   1974 	daddr_t blocknum;
   1975 	int rc;
   1976 
   1977 	rf_lock_mutex2(raidPtr->mutex);
   1978 	if (raidPtr->openings == 0) {
   1979 		rf_unlock_mutex2(raidPtr->mutex);
   1980 		return EAGAIN;
   1981 	}
   1982 	rf_unlock_mutex2(raidPtr->mutex);
   1983 
   1984 	blocknum = bp->b_rawblkno;
   1985 
   1986 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1987 		    (int) blocknum));
   1988 
   1989 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1990 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1991 
   1992 	/* *THIS* is where we adjust what block we're going to...
   1993 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1994 	raid_addr = blocknum;
   1995 
   1996 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1997 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1998 	sum = raid_addr + num_blocks + pb;
   1999 	if (1 || rf_debugKernelAccess) {
   2000 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2001 			    (int) raid_addr, (int) sum, (int) num_blocks,
   2002 			    (int) pb, (int) bp->b_resid));
   2003 	}
   2004 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2005 	    || (sum < num_blocks) || (sum < pb)) {
   2006 		rc = ENOSPC;
   2007 		goto done;
   2008 	}
   2009 	/*
   2010 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2011 	 */
   2012 
   2013 	if (bp->b_bcount & raidPtr->sectorMask) {
   2014 		rc = ENOSPC;
   2015 		goto done;
   2016 	}
   2017 	db1_printf(("Calling DoAccess..\n"));
   2018 
   2019 
   2020 	rf_lock_mutex2(raidPtr->mutex);
   2021 	raidPtr->openings--;
   2022 	rf_unlock_mutex2(raidPtr->mutex);
   2023 
   2024 	/* don't ever condition on bp->b_flags & B_WRITE.
   2025 	 * always condition on B_READ instead */
   2026 
   2027 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2028 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2029 			 raid_addr, num_blocks,
   2030 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2031 
   2032 done:
   2033 	return rc;
   2034 }
   2035 
   2036 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2037 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* Fake a completed I/O so the normal completion path runs. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf; KernelWakeupFunc fires on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2111 /* this is the callback function associated with a I/O invoked from
   2112    kernel code.
   2113  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2180 
   2181 
   2182 /*
   2183  * initialize a buf structure for doing an I/O in the kernel.
   2184  */
   2185 static void
   2186 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2187        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2188        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2189 {
   2190 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2191 	bp->b_oflags = 0;
   2192 	bp->b_cflags = 0;
   2193 	bp->b_bcount = numSect << logBytesPerSector;
   2194 	bp->b_bufsize = bp->b_bcount;
   2195 	bp->b_error = 0;
   2196 	bp->b_dev = dev;
   2197 	bp->b_data = bf;
   2198 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2199 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2200 	if (bp->b_bcount == 0) {
   2201 		panic("bp->b_bcount is zero in InitBP!!");
   2202 	}
   2203 	bp->b_iodone = cbFunc;
   2204 	bp->b_private = cbArg;
   2205 }
   2206 
   2207 /*
   2208  * Wait interruptibly for an exclusive lock.
   2209  *
   2210  * XXX
   2211  * Several drivers do this; it should be abstracted and made MP-safe.
   2212  * (Hmm... where have we seen this warning before :->  GO )
   2213  */
static int
raidlock(struct raid_softc *rs)
{
	int     error;

	error = 0;
	mutex_enter(&rs->sc_mutex);
	/* Sleep interruptibly until the softc lock is free; a signal
	 * aborts the wait and returns the cv_wait_sig() error. */
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		rs->sc_flags |= RAIDF_WANTED;
		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
		if (error != 0)
			goto done;
	}
	rs->sc_flags |= RAIDF_LOCKED;
done:
	mutex_exit(&rs->sc_mutex);
	return error;
}
   2232 /*
   2233  * Unlock and wake up any waiters.
   2234  */
static void
raidunlock(struct raid_softc *rs)
{

	mutex_enter(&rs->sc_mutex);
	rs->sc_flags &= ~RAIDF_LOCKED;
	/* Wake everyone who blocked in raidlock() while we held it. */
	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
		rs->sc_flags &= ~RAIDF_WANTED;
		cv_broadcast(&rs->sc_cv);
	}
	mutex_exit(&rs->sc_mutex);
}
   2247 
   2248 
   2249 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2250 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2251 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2252 
/* Byte offset of the component label area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2259 
   2260 static daddr_t
   2261 rf_component_info_size(unsigned secsize)
   2262 {
   2263 	daddr_t info_size;
   2264 
   2265 	KASSERT(secsize);
   2266 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2267 		info_size = secsize;
   2268 	else
   2269 		info_size = RF_COMPONENT_INFO_SIZE;
   2270 
   2271 	return info_size;
   2272 }
   2273 
   2274 static daddr_t
   2275 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2276 {
   2277 	daddr_t map_offset;
   2278 
   2279 	KASSERT(raidPtr->bytesPerSector);
   2280 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2281 		map_offset = raidPtr->bytesPerSector;
   2282 	else
   2283 		map_offset = RF_COMPONENT_INFO_SIZE;
   2284 	map_offset += rf_component_info_offset();
   2285 
   2286 	return map_offset;
   2287 }
   2288 
   2289 static daddr_t
   2290 rf_parity_map_size(RF_Raid_t *raidPtr)
   2291 {
   2292 	daddr_t map_size;
   2293 
   2294 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2295 		map_size = raidPtr->bytesPerSector;
   2296 	else
   2297 		map_size = RF_PARITY_MAP_SIZE;
   2298 
   2299 	return map_size;
   2300 }
   2301 
   2302 int
   2303 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2304 {
   2305 	RF_ComponentLabel_t *clabel;
   2306 
   2307 	clabel = raidget_component_label(raidPtr, col);
   2308 	clabel->clean = RF_RAID_CLEAN;
   2309 	raidflush_component_label(raidPtr, col);
   2310 	return(0);
   2311 }
   2312 
   2313 
   2314 int
   2315 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2316 {
   2317 	RF_ComponentLabel_t *clabel;
   2318 
   2319 	clabel = raidget_component_label(raidPtr, col);
   2320 	clabel->clean = RF_RAID_DIRTY;
   2321 	raidflush_component_label(raidPtr, col);
   2322 	return(0);
   2323 }
   2324 
/* Re-read the on-disk label for column `col' into the in-core copy. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2335 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2341 
   2342 int
   2343 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2344 {
   2345 	RF_ComponentLabel_t *label;
   2346 
   2347 	label = &raidPtr->raid_cinfo[col].ci_label;
   2348 	label->mod_counter = raidPtr->mod_counter;
   2349 #ifndef RF_NO_PARITY_MAP
   2350 	label->parity_map_modcount = label->mod_counter;
   2351 #endif
   2352 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2353 	    raidPtr->Disks[col].dev,
   2354 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2355 }
   2356 
   2357 /*
   2358  * Swap the label endianness.
   2359  *
   2360  * Everything in the component label is 4-byte-swapped except the version,
   2361  * which is kept in the byte-swapped version at all times, and indicates
   2362  * for the writer that a swap is necessary.
   2363  *
   2364  * For reads it is expected that out_label == clabel, but writes expect
   2365  * separate labels so only the re-swapped label is written out to disk,
   2366  * leaving the swapped-except-version internally.
   2367  *
   2368  * Only support swapping label version 2.
   2369  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* Only version-2 labels (stored byte-swapped) are supported. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Swap every 32-bit word from serial_number up to (but not
	 * including) future_use2[42].
	 * NOTE(review): &future_use2[42] is presumably one past the last
	 * swappable word of the label — confirm against the
	 * RF_ComponentLabel_t layout if the structure ever changes. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2387 
   2388 static int
   2389 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2390     RF_ComponentLabel_t *clabel)
   2391 {
   2392 	int error;
   2393 
   2394 	error = raidread_component_area(dev, b_vp, clabel,
   2395 	    sizeof(RF_ComponentLabel_t),
   2396 	    rf_component_info_offset(),
   2397 	    rf_component_info_size(secsize));
   2398 
   2399 	if (error == 0 &&
   2400 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2401 		rf_swap_label(clabel, clabel);
   2402 	}
   2403 
   2404 	return error;
   2405 }
   2406 
   2407 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Only the first msize bytes of the dsize-byte read are wanted. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2444 
   2445 static int
   2446 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2447     RF_ComponentLabel_t *clabel)
   2448 {
   2449 	RF_ComponentLabel_t *clabel_write = clabel;
   2450 	RF_ComponentLabel_t lclabel;
   2451 	int error;
   2452 
   2453 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2454 		clabel_write = &lclabel;
   2455 		rf_swap_label(clabel, clabel_write);
   2456 	}
   2457 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2458 	    sizeof(RF_ComponentLabel_t),
   2459 	    rf_component_info_offset(),
   2460 	    rf_component_info_size(secsize), 0);
   2461 
   2462 	return error;
   2463 }
   2464 
   2465 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-pad the tail: only msize of the dsize bytes carry data. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	/* NOTE(review): in the async case we return without biowait/brelse;
	 * presumably the B_ASYNC buffer is released on I/O completion and
	 * any error goes unreported — confirm. */
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2499 
   2500 void
   2501 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2502 {
   2503 	int c;
   2504 
   2505 	for (c = 0; c < raidPtr->numCol; c++) {
   2506 		/* Skip dead disks. */
   2507 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2508 			continue;
   2509 		/* XXXjld: what if an error occurs here? */
   2510 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2511 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2512 		    RF_PARITYMAP_NBYTE,
   2513 		    rf_parity_map_offset(raidPtr),
   2514 		    rf_parity_map_size(raidPtr), 0);
   2515 	}
   2516 }
   2517 
   2518 void
   2519 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2520 {
   2521 	struct rf_paritymap_ondisk tmp;
   2522 	int c,first;
   2523 
   2524 	first=1;
   2525 	for (c = 0; c < raidPtr->numCol; c++) {
   2526 		/* Skip dead disks. */
   2527 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2528 			continue;
   2529 		raidread_component_area(raidPtr->Disks[c].dev,
   2530 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2531 		    RF_PARITYMAP_NBYTE,
   2532 		    rf_parity_map_offset(raidPtr),
   2533 		    rf_parity_map_size(raidPtr));
   2534 		if (first) {
   2535 			memcpy(map, &tmp, sizeof(*map));
   2536 			first = 0;
   2537 		} else {
   2538 			rf_paritymap_merge(map, &tmp);
   2539 		}
   2540 	}
   2541 }
   2542 
/* Bump the mod counter and mark every usable component (and used
 * spare) dirty on disk. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare is standing in for.
			 * NOTE(review): scol stays -1 (and is written to
			 * clabel->column) if no column references this
			 * spare — presumably that can't happen for a
			 * rf_ds_used_spare; confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2602 
   2603 
/* Push fresh component labels to all optimal components and used
 * spares; with RF_FINAL_COMPONENT_UPDATE also set the clean bit when
 * parity is known good (i.e. at shutdown). */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2681 
   2682 void
   2683 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2684 {
   2685 
   2686 	if (vp != NULL) {
   2687 		if (auto_configured == 1) {
   2688 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2689 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2690 			vput(vp);
   2691 
   2692 		} else {
   2693 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2694 		}
   2695 	}
   2696 }
   2697 
   2698 
   2699 void
   2700 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2701 {
   2702 	int r,c;
   2703 	struct vnode *vp;
   2704 	int acd;
   2705 
   2706 
   2707 	/* We take this opportunity to close the vnodes like we should.. */
   2708 
   2709 	for (c = 0; c < raidPtr->numCol; c++) {
   2710 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2711 		acd = raidPtr->Disks[c].auto_configured;
   2712 		rf_close_component(raidPtr, vp, acd);
   2713 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2714 		raidPtr->Disks[c].auto_configured = 0;
   2715 	}
   2716 
   2717 	for (r = 0; r < raidPtr->numSpare; r++) {
   2718 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2719 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2720 		rf_close_component(raidPtr, vp, acd);
   2721 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2722 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2723 	}
   2724 }
   2725 
   2726 
/* Kernel thread body: fail a disk and (optionally) reconstruct onto a
 * spare; frees the request and exits when done. */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* forceRecon is set for the duration of the FailDisk call only. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2756 
/* Kernel thread body: rewrite all parity, then mark the set clean on
 * success and wake anyone waiting for shutdown. */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2789 
   2790 
/* Kernel thread body: copy reconstructed data from spares back to
 * their original (replaced) components, then exit. */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2805 
   2806 
/* Kernel thread body: reconstruct a component in place (onto itself,
 * e.g. after a disk replacement in the same slot), then exit. */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* forceRecon is set only for the duration of the reconstruction. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2834 
/* Try to read a RAIDframe component label from `vp'.  If the label is
 * reasonable, prepend a new RF_AutoConfig_t (which takes ownership of
 * the label and the vnode) to ac_list; otherwise free the label and
 * close/release the vnode.  Returns the (possibly extended) list. */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: the label buffer and the vnode reference are
		 * ours to release since no RF_AutoConfig_t took them. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2876 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t entries (built via
 * rf_get_component()) describing each one found.  Wedges are scanned
 * before other devices so that a wedge covering a whole disk is
 * preferred over that disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only dk wedges typed as
				   RAIDframe are candidates. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() expects an unlocked
				   vnode and takes ownership of it. */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				/* Partition name, e.g. "wd0e". */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				/* Raw-partition name, e.g. "wd0d". */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3088 
   3089 int
   3090 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3091 {
   3092 
   3093 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3094 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3095 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3096 	    (clabel->clean == RF_RAID_CLEAN ||
   3097 	     clabel->clean == RF_RAID_DIRTY) &&
   3098 	    clabel->row >=0 &&
   3099 	    clabel->column >= 0 &&
   3100 	    clabel->num_rows > 0 &&
   3101 	    clabel->num_columns > 0 &&
   3102 	    clabel->row < clabel->num_rows &&
   3103 	    clabel->column < clabel->num_columns &&
   3104 	    clabel->blockSize > 0 &&
   3105 	    /*
   3106 	     * numBlocksHi may contain garbage, but it is ok since
   3107 	     * the type is unsigned.  If it is really garbage,
   3108 	     * rf_fix_old_label_size() will fix it.
   3109 	     */
   3110 	    rf_component_label_numblocks(clabel) > 0) {
   3111 		/*
   3112 		 * label looks reasonable enough...
   3113 		 * let's make sure it has no old garbage.
   3114 		 */
   3115 		if (numsecs)
   3116 			rf_fix_old_label_size(clabel, numsecs);
   3117 		return(1);
   3118 	}
   3119 	return(0);
   3120 }
   3121 
   3122 
   3123 /*
   3124  * For reasons yet unknown, some old component labels have garbage in
   3125  * the newer numBlocksHi region, and this causes lossage.  Since those
   3126  * disks will also have numsecs set to less than 32 bits of sectors,
   3127  * we can determine when this corruption has occurred, and fix it.
   3128  *
   3129  * The exact same problem, with the same unknown reason, happens to
   3130  * the partitionSizeHi member as well.
   3131  */
   3132 static void
   3133 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3134 {
   3135 
   3136 	if (numsecs < ((uint64_t)1 << 32)) {
   3137 		if (clabel->numBlocksHi) {
   3138 			printf("WARNING: total sectors < 32 bits, yet "
   3139 			       "numBlocksHi set\n"
   3140 			       "WARNING: resetting numBlocksHi to zero.\n");
   3141 			clabel->numBlocksHi = 0;
   3142 		}
   3143 
   3144 		if (clabel->partitionSizeHi) {
   3145 			printf("WARNING: total sectors < 32 bits, yet "
   3146 			       "partitionSizeHi set\n"
   3147 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3148 			clabel->partitionSizeHi = 0;
   3149 		}
   3150 	}
   3151 }
   3152 
   3153 
   3154 #ifdef DEBUG
/*
 * Dump a component label to the console in human-readable form.
 * Debug aid only (compiled under DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Indexed by root_partition & 3; see rp[...] use below. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3186 #endif
   3187 
   3188 static RF_ConfigSet_t *
   3189 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3190 {
   3191 	RF_AutoConfig_t *ac;
   3192 	RF_ConfigSet_t *config_sets;
   3193 	RF_ConfigSet_t *cset;
   3194 	RF_AutoConfig_t *ac_next;
   3195 
   3196 
   3197 	config_sets = NULL;
   3198 
   3199 	/* Go through the AutoConfig list, and figure out which components
   3200 	   belong to what sets.  */
   3201 	ac = ac_list;
   3202 	while(ac!=NULL) {
   3203 		/* we're going to putz with ac->next, so save it here
   3204 		   for use at the end of the loop */
   3205 		ac_next = ac->next;
   3206 
   3207 		if (config_sets == NULL) {
   3208 			/* will need at least this one... */
   3209 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3210 				       M_RAIDFRAME, M_WAITOK);
   3211 			/* this one is easy :) */
   3212 			config_sets->ac = ac;
   3213 			config_sets->next = NULL;
   3214 			config_sets->rootable = 0;
   3215 			ac->next = NULL;
   3216 		} else {
   3217 			/* which set does this component fit into? */
   3218 			cset = config_sets;
   3219 			while(cset!=NULL) {
   3220 				if (rf_does_it_fit(cset, ac)) {
   3221 					/* looks like it matches... */
   3222 					ac->next = cset->ac;
   3223 					cset->ac = ac;
   3224 					break;
   3225 				}
   3226 				cset = cset->next;
   3227 			}
   3228 			if (cset==NULL) {
   3229 				/* didn't find a match above... new set..*/
   3230 				cset = malloc(sizeof(RF_ConfigSet_t),
   3231 					       M_RAIDFRAME, M_WAITOK);
   3232 				cset->ac = ac;
   3233 				ac->next = NULL;
   3234 				cset->next = config_sets;
   3235 				cset->rootable = 0;
   3236 				config_sets = cset;
   3237 			}
   3238 		}
   3239 		ac = ac_next;
   3240 	}
   3241 
   3242 
   3243 	return(config_sets);
   3244 }
   3245 
   3246 static int
   3247 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3248 {
   3249 	RF_ComponentLabel_t *clabel1, *clabel2;
   3250 
   3251 	/* If this one matches the *first* one in the set, that's good
   3252 	   enough, since the other members of the set would have been
   3253 	   through here too... */
   3254 	/* note that we are not checking partitionSize here..
   3255 
   3256 	   Note that we are also not checking the mod_counters here.
   3257 	   If everything else matches except the mod_counter, that's
   3258 	   good enough for this test.  We will deal with the mod_counters
   3259 	   a little later in the autoconfiguration process.
   3260 
   3261 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3262 
   3263 	   The reason we don't check for this is that failed disks
   3264 	   will have lower modification counts.  If those disks are
   3265 	   not added to the set they used to belong to, then they will
   3266 	   form their own set, which may result in 2 different sets,
   3267 	   for example, competing to be configured at raid0, and
   3268 	   perhaps competing to be the root filesystem set.  If the
   3269 	   wrong ones get configured, or both attempt to become /,
   3270 	   weird behaviour and or serious lossage will occur.  Thus we
   3271 	   need to bring them into the fold here, and kick them out at
   3272 	   a later point.
   3273 
   3274 	*/
   3275 
   3276 	clabel1 = cset->ac->clabel;
   3277 	clabel2 = ac->clabel;
   3278 	if ((clabel1->version == clabel2->version) &&
   3279 	    (clabel1->serial_number == clabel2->serial_number) &&
   3280 	    (clabel1->num_rows == clabel2->num_rows) &&
   3281 	    (clabel1->num_columns == clabel2->num_columns) &&
   3282 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3283 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3284 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3285 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3286 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3287 	    (clabel1->blockSize == clabel2->blockSize) &&
   3288 	    rf_component_label_numblocks(clabel1) ==
   3289 	    rf_component_label_numblocks(clabel2) &&
   3290 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3291 	    (clabel1->root_partition == clabel2->root_partition) &&
   3292 	    (clabel1->last_unit == clabel2->last_unit) &&
   3293 	    (clabel1->config_order == clabel2->config_order)) {
   3294 		/* if it get's here, it almost *has* to be a match */
   3295 	} else {
   3296 		/* it's not consistent with somebody in the set..
   3297 		   punt */
   3298 		return(0);
   3299 	}
   3300 	/* all was fine.. it must fit... */
   3301 	return(1);
   3302 }
   3303 
/*
 * Decide whether config set "cset" has enough live components (those
 * whose mod_counter matches the set's highest) to be autoconfigured.
 * Returns 1 if so, 0 if too many components are missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The set's mod_counter is the maximum over all members; stale
	   (failed) components carry lower values. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a current (mod_counter matches)
	   component; count it missing otherwise. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Tolerate no missing components for RAID 0, at most one for
	   RAID 4/5; RAID 1 pairs were handled above. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3406 
   3407 static void
   3408 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3409 			RF_Raid_t *raidPtr)
   3410 {
   3411 	RF_ComponentLabel_t *clabel;
   3412 	int i;
   3413 
   3414 	clabel = ac->clabel;
   3415 
   3416 	/* 1. Fill in the common stuff */
   3417 	config->numCol = clabel->num_columns;
   3418 	config->numSpare = 0; /* XXX should this be set here? */
   3419 	config->sectPerSU = clabel->sectPerSU;
   3420 	config->SUsPerPU = clabel->SUsPerPU;
   3421 	config->SUsPerRU = clabel->SUsPerRU;
   3422 	config->parityConfig = clabel->parityConfig;
   3423 	/* XXX... */
   3424 	strcpy(config->diskQueueType,"fifo");
   3425 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3426 	config->layoutSpecificSize = 0; /* XXX ?? */
   3427 
   3428 	while(ac!=NULL) {
   3429 		/* row/col values will be in range due to the checks
   3430 		   in reasonable_label() */
   3431 		strcpy(config->devnames[0][ac->clabel->column],
   3432 		       ac->devname);
   3433 		ac = ac->next;
   3434 	}
   3435 
   3436 	for(i=0;i<RF_MAXDBGV;i++) {
   3437 		config->debugVars[i][0] = 0;
   3438 	}
   3439 }
   3440 
   3441 static int
   3442 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3443 {
   3444 	RF_ComponentLabel_t *clabel;
   3445 	int column;
   3446 	int sparecol;
   3447 
   3448 	raidPtr->autoconfigure = new_value;
   3449 
   3450 	for(column=0; column<raidPtr->numCol; column++) {
   3451 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3452 			clabel = raidget_component_label(raidPtr, column);
   3453 			clabel->autoconfigure = new_value;
   3454 			raidflush_component_label(raidPtr, column);
   3455 		}
   3456 	}
   3457 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3458 		sparecol = raidPtr->numCol + column;
   3459 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3460 			clabel = raidget_component_label(raidPtr, sparecol);
   3461 			clabel->autoconfigure = new_value;
   3462 			raidflush_component_label(raidPtr, sparecol);
   3463 		}
   3464 	}
   3465 	return(new_value);
   3466 }
   3467 
   3468 static int
   3469 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3470 {
   3471 	RF_ComponentLabel_t *clabel;
   3472 	int column;
   3473 	int sparecol;
   3474 
   3475 	raidPtr->root_partition = new_value;
   3476 	for(column=0; column<raidPtr->numCol; column++) {
   3477 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3478 			clabel = raidget_component_label(raidPtr, column);
   3479 			clabel->root_partition = new_value;
   3480 			raidflush_component_label(raidPtr, column);
   3481 		}
   3482 	}
   3483 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3484 		sparecol = raidPtr->numCol + column;
   3485 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3486 			clabel = raidget_component_label(raidPtr, sparecol);
   3487 			clabel->root_partition = new_value;
   3488 			raidflush_component_label(raidPtr, sparecol);
   3489 		}
   3490 	}
   3491 	return(new_value);
   3492 }
   3493 
   3494 static void
   3495 rf_release_all_vps(RF_ConfigSet_t *cset)
   3496 {
   3497 	RF_AutoConfig_t *ac;
   3498 
   3499 	ac = cset->ac;
   3500 	while(ac!=NULL) {
   3501 		/* Close the vp, and give it back */
   3502 		if (ac->vp) {
   3503 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3504 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3505 			vput(ac->vp);
   3506 			ac->vp = NULL;
   3507 		}
   3508 		ac = ac->next;
   3509 	}
   3510 }
   3511 
   3512 
   3513 static void
   3514 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3515 {
   3516 	RF_AutoConfig_t *ac;
   3517 	RF_AutoConfig_t *next_ac;
   3518 
   3519 	ac = cset->ac;
   3520 	while(ac!=NULL) {
   3521 		next_ac = ac->next;
   3522 		/* nuke the label */
   3523 		free(ac->clabel, M_RAIDFRAME);
   3524 		/* cleanup the config structure */
   3525 		free(ac, M_RAIDFRAME);
   3526 		/* "next.." */
   3527 		ac = next_ac;
   3528 	}
   3529 	/* and, finally, nuke the config set */
   3530 	free(cset, M_RAIDFRAME);
   3531 }
   3532 
   3533 
/*
 * Initialize a component label from the current state of the RAID set.
 * The caller is responsible for writing the label out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* stripe geometry */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3567 
/*
 * Autoconfigure one config set: pick a raid unit number (preferring
 * the one recorded in the labels), build an RF_Config_t from the set,
 * and configure the array.  Returns the softc on success, or NULL if
 * rf_Configure() failed.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk upward from the preferred unit until a free one is found. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3639 
   3640 void
   3641 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
   3642 	     size_t xmin, size_t xmax)
   3643 {
   3644 
   3645 	/* Format: raid%d_foo */
   3646 	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
   3647 
   3648 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3649 	pool_sethiwat(p, xmax);
   3650 	pool_prime(p, xmin);
   3651 }
   3652 
   3653 
   3654 /*
   3655  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3656  * to see if there is IO pending and if that IO could possibly be done
   3657  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3658  * otherwise.
   3659  *
   3660  */
   3661 int
   3662 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3663 {
   3664 	struct raid_softc *rs;
   3665 	struct dk_softc *dksc;
   3666 
   3667 	rs = raidPtr->softc;
   3668 	dksc = &rs->sc_dksc;
   3669 
   3670 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3671 		return 1;
   3672 
   3673 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3674 		/* there is work to do */
   3675 		return 0;
   3676 	}
   3677 	/* default is nothing to do */
   3678 	return 1;
   3679 }
   3680 
   3681 int
   3682 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3683 {
   3684 	uint64_t numsecs;
   3685 	unsigned secsize;
   3686 	int error;
   3687 
   3688 	error = getdisksize(vp, &numsecs, &secsize);
   3689 	if (error == 0) {
   3690 		diskPtr->blockSize = secsize;
   3691 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3692 		diskPtr->partitionSize = numsecs;
   3693 		return 0;
   3694 	}
   3695 	return error;
   3696 }
   3697 
/*
 * Autoconf match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3703 
/*
 * Autoconf attach function: nothing to do here; real initialization
 * happens when the unit is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3708 
   3709 
   3710 static int
   3711 raid_detach(device_t self, int flags)
   3712 {
   3713 	int error;
   3714 	struct raid_softc *rs = raidsoftc(self);
   3715 
   3716 	if (rs == NULL)
   3717 		return ENXIO;
   3718 
   3719 	if ((error = raidlock(rs)) != 0)
   3720 		return error;
   3721 
   3722 	error = raid_detach_unlocked(rs);
   3723 
   3724 	raidunlock(rs);
   3725 
   3726 	/* XXX raid can be referenced here */
   3727 
   3728 	if (error)
   3729 		return error;
   3730 
   3731 	/* Free the softc */
   3732 	raidput(rs);
   3733 
   3734 	return 0;
   3735 }
   3736 
   3737 static void
   3738 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3739 {
   3740 	struct dk_softc *dksc = &rs->sc_dksc;
   3741 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3742 
   3743 	memset(dg, 0, sizeof(*dg));
   3744 
   3745 	dg->dg_secperunit = raidPtr->totalSectors;
   3746 	dg->dg_secsize = raidPtr->bytesPerSector;
   3747 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3748 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3749 
   3750 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3751 }
   3752 
   3753 /*
   3754  * Get cache info for all the components (including spares).
   3755  * Returns intersection of all the cache flags of all disks, or first
   3756  * error if any encountered.
   3757  * XXXfua feature flags can change as spares are added - lock down somehow
   3758  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* data disks first, then spares */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache support) stays quiet */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				/* first failure aborts the whole query */
				return error;
			}

			/* intersect this disk's flags with the rest */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3796 
   3797 /*
   3798  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3799  * We end up returning whatever error was returned by the first cache flush
   3800  * that fails.
   3801  */
   3802 
   3803 static int
   3804 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3805 {
   3806 	int e = 0;
   3807 	for (int i = 0; i < 5; i++) {
   3808 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3809 		    &force, FWRITE, NOCRED);
   3810 		if (!e || e == ENODEV)
   3811 			return e;
   3812 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3813 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3814 	}
   3815 	return e;
   3816 }
   3817 
   3818 int
   3819 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3820 {
   3821 	int c, error;
   3822 
   3823 	error = 0;
   3824 	for (c = 0; c < raidPtr->numCol; c++) {
   3825 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3826 			int e = rf_sync_component_cache(raidPtr, c, force);
   3827 			if (e && !error)
   3828 				error = e;
   3829 		}
   3830 	}
   3831 
   3832 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3833 		int sparecol = raidPtr->numCol + c;
   3834 		/* Need to ensure that the reconstruct actually completed! */
   3835 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3836 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3837 			    force);
   3838 			if (e && !error)
   3839 				error = e;
   3840 		}
   3841 	}
   3842 	return error;
   3843 }
   3844 
   3845 /* Fill in info with the current status */
   3846 void
   3847 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3848 {
   3849 
   3850 	if (raidPtr->status != rf_rs_reconstructing) {
   3851 		info->total = 100;
   3852 		info->completed = 100;
   3853 	} else {
   3854 		info->total = raidPtr->reconControl->numRUsTotal;
   3855 		info->completed = raidPtr->reconControl->numRUsComplete;
   3856 	}
   3857 	info->remaining = info->total - info->completed;
   3858 }
   3859 
   3860 /* Fill in info with the current status */
   3861 void
   3862 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3863 {
   3864 
   3865 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3866 		info->total = raidPtr->Layout.numStripe;
   3867 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3868 	} else {
   3869 		info->completed = 100;
   3870 		info->total = 100;
   3871 	}
   3872 	info->remaining = info->total - info->completed;
   3873 }
   3874 
   3875 /* Fill in info with the current status */
   3876 void
   3877 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3878 {
   3879 
   3880 	if (raidPtr->copyback_in_progress == 1) {
   3881 		info->total = raidPtr->Layout.numStripe;
   3882 		info->completed = raidPtr->copyback_stripes_done;
   3883 		info->remaining = info->total - info->completed;
   3884 	} else {
   3885 		info->remaining = 0;
   3886 		info->completed = 100;
   3887 		info->total = 100;
   3888 	}
   3889 }
   3890 
   3891 /* Fill in config with the current info */
   3892 int
   3893 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3894 {
   3895 	int	d, i, j;
   3896 
   3897 	if (!raidPtr->valid)
   3898 		return ENODEV;
   3899 	config->cols = raidPtr->numCol;
   3900 	config->ndevs = raidPtr->numCol;
   3901 	if (config->ndevs >= RF_MAX_DISKS)
   3902 		return ENOMEM;
   3903 	config->nspares = raidPtr->numSpare;
   3904 	if (config->nspares >= RF_MAX_DISKS)
   3905 		return ENOMEM;
   3906 	config->maxqdepth = raidPtr->maxQueueDepth;
   3907 	d = 0;
   3908 	for (j = 0; j < config->cols; j++) {
   3909 		config->devs[d] = raidPtr->Disks[j];
   3910 		d++;
   3911 	}
   3912 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3913 		config->spares[i] = raidPtr->Disks[j];
   3914 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3915 			/* XXX: raidctl(8) expects to see this as a used spare */
   3916 			config->spares[i].status = rf_ds_used_spare;
   3917 		}
   3918 	}
   3919 	return 0;
   3920 }
   3921 
   3922 int
   3923 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3924 {
   3925 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3926 	RF_ComponentLabel_t *raid_clabel;
   3927 	int column = clabel->column;
   3928 
   3929 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3930 		return EINVAL;
   3931 	raid_clabel = raidget_component_label(raidPtr, column);
   3932 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3933 	/* Fix-up for userland. */
   3934 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3935 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3936 
   3937 	return 0;
   3938 }
   3939 
   3940 /*
   3941  * Module interface
   3942  */
   3943 
/* Driver-class module; requires the dk_subr and bufq_fcfs modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a loadable module we must supply the cfdriver ourselves. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3953 
   3954 static int
   3955 raid_modcmd(modcmd_t cmd, void *data)
   3956 {
   3957 	int error;
   3958 
   3959 	error = 0;
   3960 	switch (cmd) {
   3961 	case MODULE_CMD_INIT:
   3962 		error = raid_modcmd_init();
   3963 		break;
   3964 	case MODULE_CMD_FINI:
   3965 		error = raid_modcmd_fini();
   3966 		break;
   3967 	default:
   3968 		error = ENOTTY;
   3969 		break;
   3970 	}
   3971 	return error;
   3972 }
   3973 
/*
 * Module initialization: set up locking, attach the device switch and
 * autoconf glue (unwinding in reverse order on failure), boot the
 * RAIDframe core, and register the autoconfiguration finalizer.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach() to allocate the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST (devsw already registered, e.g. built in) is tolerated. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Undo the devsw registration before bailing out. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver and devsw registration. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error may still be EEXIST here, in which case
	 * rf_BootRaidframe() is skipped — presumably the core was already
	 * booted by a built-in instance; confirm.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: only autoconfiguration is lost. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   4044 
/*
 * Module teardown: refuse while any raid device exists, then detach the
 * autoconf glue and devsw in reverse order of attachment, re-attaching
 * the already-detached pieces if a later step fails.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Restore the cfattach we already removed. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Restore cfdriver and cfattach we already removed. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4094