/* rf_netbsdkintf.c, revision 1.408 (source-browser navigation header removed) */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.408 2022/08/10 01:16:38 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.408 2022/08/10 01:16:38 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
    212 const struct bdevsw raid_bdevsw = {
    213 	.d_open = raidopen,
    214 	.d_close = raidclose,
    215 	.d_strategy = raidstrategy,
    216 	.d_ioctl = raidioctl,
    217 	.d_dump = raiddump,
    218 	.d_psize = raidsize,
    219 	.d_discard = nodiscard,
    220 	.d_flag = D_DISK
    221 };
    222 
    223 const struct cdevsw raid_cdevsw = {
    224 	.d_open = raidopen,
    225 	.d_close = raidclose,
    226 	.d_read = raidread,
    227 	.d_write = raidwrite,
    228 	.d_ioctl = raidioctl,
    229 	.d_stop = nostop,
    230 	.d_tty = notty,
    231 	.d_poll = nopoll,
    232 	.d_mmap = nommap,
    233 	.d_kqfilter = nokqfilter,
    234 	.d_discard = nodiscard,
    235 	.d_flag = D_DISK
    236 };
    237 
    238 static struct dkdriver rf_dkdriver = {
    239 	.d_open = raidopen,
    240 	.d_close = raidclose,
    241 	.d_strategy = raidstrategy,
    242 	.d_diskstart = raid_diskstart,
    243 	.d_dumpblocks = raid_dumpblocks,
    244 	.d_lastclose = raid_lastclose,
    245 	.d_minphys = minphys
    246 };
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
    256 /* Internal representation of a rf_recon_req */
    257 struct rf_recon_req_internal {
    258 	RF_RowCol_t col;
    259 	RF_ReconReqFlags_t flags;
    260 	void   *raidPtr;
    261 };
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
    272  * Even it if is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 static void rf_ReconThread(struct rf_recon_req_internal *);
    298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 static int rf_autoconfig(device_t);
    302 static int rf_rescan(void);
    303 static void rf_buildroothack(RF_ConfigSet_t *);
    304 
    305 static RF_AutoConfig_t *rf_find_raid_components(void);
    306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 static int rf_set_autoconfig(RF_Raid_t *, int);
    310 static int rf_set_rootpartition(RF_Raid_t *, int);
    311 static void rf_release_all_vps(RF_ConfigSet_t *);
    312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 static int rf_have_enough_components(RF_ConfigSet_t *);
    314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct pool rf_alloclist_pool;   /* AllocList */
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
    343 static void
    344 raiddestroy(struct raid_softc *sc) {
    345 	cv_destroy(&sc->sc_cv);
    346 	mutex_destroy(&sc->sc_mutex);
    347 	kmem_free(sc, sizeof(*sc));
    348 }
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
    376 static void
    377 raidput(struct raid_softc *sc) {
    378 	mutex_enter(&raid_lock);
    379 	LIST_REMOVE(sc, sc_link);
    380 	mutex_exit(&raid_lock);
    381 	raiddestroy(sc);
    382 }
    383 
/*
 * raidattach: historical pseudo-device attach entry point.  Kept as an
 * empty stub because all real attachment/initialization now happens in
 * the module initialization path.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    393 
    394 static int
    395 rf_autoconfig(device_t self)
    396 {
    397 	RF_AutoConfig_t *ac_list;
    398 	RF_ConfigSet_t *config_sets;
    399 
    400 	if (!raidautoconfig || raidautoconfigdone == true)
    401 		return 0;
    402 
    403 	/* XXX This code can only be run once. */
    404 	raidautoconfigdone = true;
    405 
    406 #ifdef __HAVE_CPU_BOOTCONF
    407 	/*
    408 	 * 0. find the boot device if needed first so we can use it later
    409 	 * this needs to be done before we autoconfigure any raid sets,
    410 	 * because if we use wedges we are not going to be able to open
    411 	 * the boot device later
    412 	 */
    413 	if (booted_device == NULL)
    414 		cpu_bootconf();
    415 #endif
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 int
    433 rf_inited(const struct raid_softc *rs) {
    434 	return (rs->sc_flags & RAIDF_INITED) != 0;
    435 }
    436 
    437 RF_Raid_t *
    438 rf_get_raid(struct raid_softc *rs) {
    439 	return &rs->sc_r;
    440 }
    441 
    442 int
    443 rf_get_unit(const struct raid_softc *rs) {
    444 	return rs->sc_unit;
    445 }
    446 
    447 static int
    448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    449 	const char *bootname;
    450 	size_t len;
    451 
    452 	/* if bdv is NULL, the set can't contain it. exit early. */
    453 	if (bdv == NULL)
    454 		return 0;
    455 
    456 	bootname = device_xname(bdv);
    457 	len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
/*
 * rf_rescan: scan the system for RAID components and auto-configure
 * any complete sets that are marked for autoconfiguration.  The scan
 * is repeated as long as new sets come up, so RAID sets built on top
 * of other RAID sets (recursive RAID) are configured too.
 * Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	/* Collect all visible RAID components and sort them into sets. */
	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* cset is freed below; remember its successor now. */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
    526 
    527 
/*
 * rf_buildroothack: configure all eligible auto-config sets (repeating
 * the scan so recursive RAID-on-RAID is picked up), then decide whether
 * one of the configured sets should become the root device.
 *
 * Root selection: if exactly one rootable set was found, pick its first
 * wedge (or the 'a' partition); if several were found, fall back to the
 * set that contains the device we booted from, and if that is still
 * ambiguous, ask the user via RB_ASKNAME.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* cset is freed below; remember its successor now. */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						/* Remember the last rootable set seen. */
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Recount, keeping only sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    686 
    687 static int
    688 raidsize(dev_t dev)
    689 {
    690 	struct raid_softc *rs;
    691 	struct dk_softc *dksc;
    692 	unsigned int unit;
    693 
    694 	unit = raidunit(dev);
    695 	if ((rs = raidget(unit, false)) == NULL)
    696 		return -1;
    697 	dksc = &rs->sc_dksc;
    698 
    699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    700 		return -1;
    701 
    702 	return dk_size(dksc, dev);
    703 }
    704 
    705 static int
    706 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    707 {
    708 	unsigned int unit;
    709 	struct raid_softc *rs;
    710 	struct dk_softc *dksc;
    711 
    712 	unit = raidunit(dev);
    713 	if ((rs = raidget(unit, false)) == NULL)
    714 		return ENXIO;
    715 	dksc = &rs->sc_dksc;
    716 
    717 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    718 		return ENODEV;
    719 
    720         /*
    721            Note that blkno is relative to this particular partition.
    722            By adding adding RF_PROTECTED_SECTORS, we get a value that
    723 	   is relative to the partition used for the underlying component.
    724         */
    725 	blkno += RF_PROTECTED_SECTORS;
    726 
    727 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    728 }
    729 
/*
 * dkdriver d_dumpblocks entry: write nblk blocks at blkno directly to
 * one live component of the set.  Only RAID 1 sets (one data column,
 * one parity column) are supported for dumping.  Picks, in order of
 * preference: the first component, a used spare replacing it, the
 * second component, or a used spare replacing that.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if any)
			   this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump off to the chosen component's block driver. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    835 
/*
 * bdevsw/cdevsw d_open entry: open a partition of the RAID device.
 * Creates the softc on first reference, refuses opens while a shutdown
 * is pending, and marks all components dirty on the first open of a
 * configured set before handing off to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	/* create=true: opening an unconfigured unit allocates its softc. */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}
    885 
    886 static int
    887 raid_lastclose(device_t self)
    888 {
    889 	struct raid_softc *rs = raidsoftc(self);
    890 
    891 	/* Last one... device is not unconfigured yet.
    892 	   Device shutdown has taken care of setting the
    893 	   clean bits if RAIDF_INITED is not set
    894 	   mark things as clean... */
    895 
    896 	rf_update_component_labels(&rs->sc_r,
    897 	    RF_FINAL_COMPONENT_UPDATE);
    898 
    899 	/* pass to unlocked code */
    900 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    901 		rs->sc_flags |= RAIDF_DETACH;
    902 
    903 	return 0;
    904 }
    905 
    906 /* ARGSUSED */
/*
 * raidclose: close entry point for the raid device.
 *
 * Closes via the dk(9) layer when configured; once the last reference
 * is gone, either detaches the pseudo-device (RAIDF_DETACH, set by
 * raid_lastclose) or drops the softc reference (RAIDF_SHUTDOWN on an
 * unconfigured unit).  The detach/put work is deliberately done after
 * raidunlock(), since config_detach() must not run under the unit lock.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;

}
    945 
/*
 * raid_wakeup: signal the iodone condition variable (under its lock)
 * to wake the RAIDframe I/O thread so it can process queued work.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    953 
/*
 * raidstrategy: block-I/O entry point.
 *
 * Validates the unit, queues the buffer via the dk(9) layer, and wakes
 * the RAIDframe I/O thread to service it.  On error the buffer is
 * failed immediately with b_error set and b_resid == b_bcount.
 */
static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	/* Fail the buffer: nothing was transferred. */
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    988 
    989 static int
    990 raid_diskstart(device_t dev, struct buf *bp)
    991 {
    992 	struct raid_softc *rs = raidsoftc(dev);
    993 	RF_Raid_t *raidPtr;
    994 
    995 	raidPtr = &rs->sc_r;
    996 	if (!raidPtr->valid) {
    997 		db1_printf(("raid is not valid..\n"));
    998 		return ENODEV;
    999 	}
   1000 
   1001 	/* XXX */
   1002 	bp->b_resid = 0;
   1003 
   1004 	return raiddoaccess(raidPtr, bp);
   1005 }
   1006 
/*
 * raiddone: completion callback invoked when a RAIDframe access
 * finishes.  Hands the buffer back to the dk(9) layer, releases one
 * outstanding-I/O slot ("opening") under the RAID mutex, and wakes
 * the I/O thread so queued work can proceed.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
   1025 
   1026 /* ARGSUSED */
   1027 static int
   1028 raidread(dev_t dev, struct uio *uio, int flags)
   1029 {
   1030 	int     unit = raidunit(dev);
   1031 	struct raid_softc *rs;
   1032 
   1033 	if ((rs = raidget(unit, false)) == NULL)
   1034 		return ENXIO;
   1035 
   1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1037 		return ENXIO;
   1038 
   1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1040 
   1041 }
   1042 
   1043 /* ARGSUSED */
   1044 static int
   1045 raidwrite(dev_t dev, struct uio *uio, int flags)
   1046 {
   1047 	int     unit = raidunit(dev);
   1048 	struct raid_softc *rs;
   1049 
   1050 	if ((rs = raidget(unit, false)) == NULL)
   1051 		return ENXIO;
   1052 
   1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1054 		return ENXIO;
   1055 
   1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1057 
   1058 }
   1059 
/*
 * raid_detach_unlocked: tear down a RAID set and its disk(9)/dk(9)
 * presence.  NOTE(review): the "_unlocked" suffix suggests the caller
 * is responsible for any needed locking — confirm against callers.
 *
 * Returns EBUSY while the device is open or a recon/parity-rewrite/
 * copyback operation is in flight; returns 0 (nothing to do) when the
 * set was never configured.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Shutdown is being serviced now; clear the pending flag. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1097 
/*
 * rf_must_be_initialized: return true when 'cmd' is one of the ioctls
 * that requires a configured set (RAIDF_INITED) but this unit is not
 * configured.  Callers reject such requests with ENXIO.  Commands not
 * listed here are allowed regardless of configuration state.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
   1135 
   1136 int
   1137 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
   1138 {
   1139 	struct rf_recon_req_internal *rrint;
   1140 
   1141 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1142 		/* Can't do this on a RAID 0!! */
   1143 		return EINVAL;
   1144 	}
   1145 
   1146 	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
   1147 		/* bad column */
   1148 		return EINVAL;
   1149 	}
   1150 
   1151 	rf_lock_mutex2(raidPtr->mutex);
   1152 	if (raidPtr->status == rf_rs_reconstructing) {
   1153 		/* you can't fail a disk while we're reconstructing! */
   1154 		/* XXX wrong for RAID6 */
   1155 		goto out;
   1156 	}
   1157 	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
   1158 	    (raidPtr->numFailures > 0)) {
   1159 		/* some other component has failed.  Let's not make
   1160 		   things worse. XXX wrong for RAID6 */
   1161 		goto out;
   1162 	}
   1163 	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1164 		/* Can't fail a spared disk! */
   1165 		goto out;
   1166 	}
   1167 	rf_unlock_mutex2(raidPtr->mutex);
   1168 
   1169 	/* make a copy of the recon request so that we don't rely on
   1170 	 * the user's buffer */
   1171 	rrint = RF_Malloc(sizeof(*rrint));
   1172 	if (rrint == NULL)
   1173 		return(ENOMEM);
   1174 	rrint->col = rr->col;
   1175 	rrint->flags = rr->flags;
   1176 	rrint->raidPtr = raidPtr;
   1177 
   1178 	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
   1179 	    rrint, "raid_recon");
   1180 out:
   1181 	rf_unlock_mutex2(raidPtr->mutex);
   1182 	return EINVAL;
   1183 }
   1184 
   1185 static int
   1186 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1187 {
   1188 	/* allocate a buffer for the layout-specific data, and copy it in */
   1189 	if (k_cfg->layoutSpecificSize == 0)
   1190 		return 0;
   1191 
   1192 	if (k_cfg->layoutSpecificSize > 10000) {
   1193 	    /* sanity check */
   1194 	    return EINVAL;
   1195 	}
   1196 
   1197 	u_char *specific_buf;
   1198 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1199 	if (specific_buf == NULL)
   1200 		return ENOMEM;
   1201 
   1202 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1203 	    k_cfg->layoutSpecificSize);
   1204 	if (retcode) {
   1205 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1206 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1207 		return retcode;
   1208 	}
   1209 
   1210 	k_cfg->layoutSpecific = specific_buf;
   1211 	return 0;
   1212 }
   1213 
   1214 static int
   1215 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1216 {
   1217 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1218 
   1219 	if (rs->sc_r.valid) {
   1220 		/* There is a valid RAID set running on this unit! */
   1221 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1222 		return EINVAL;
   1223 	}
   1224 
   1225 	/* copy-in the configuration information */
   1226 	/* data points to a pointer to the configuration structure */
   1227 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1228 	if (*k_cfg == NULL) {
   1229 		return ENOMEM;
   1230 	}
   1231 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1232 	if (retcode == 0)
   1233 		return 0;
   1234 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1235 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1236 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1237 	return retcode;
   1238 }
   1239 
/*
 * rf_construct: configure and bring up a RAID set from the kernel copy
 * of the user's configuration.  Consumes k_cfg: both the layout-specific
 * buffer and k_cfg itself are freed before returning.  On failure,
 * RAIDF_SHUTDOWN is set so the device detaches when closed.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull the layout-specific data in from userland, if any. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1315 
   1316 #if RF_DISABLED
/*
 * rf_set_component_label: (compiled out via RF_DISABLED) copy a
 * user-supplied component label over the in-core label for the given
 * column and flush it.  Deliberately inert pending validation of the
 * label contents — see the XXX comments below.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* Only row 0 is supported. */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1353 #endif
   1354 
/*
 * rf_init_component_label: (re)initialize the component labels of every
 * live column.  Only the serial number is taken from the user-supplied
 * label; all other fields are regenerated from the running
 * configuration and flushed to each component.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* Skip failed/absent components. */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we dont' pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
   1387 
   1388 static int
   1389 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
   1390 {
   1391 
   1392 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1393 		/* Can't do this on a RAID 0!! */
   1394 		return EINVAL;
   1395 	}
   1396 
   1397 	if (raidPtr->recon_in_progress == 1) {
   1398 		/* a reconstruct is already in progress! */
   1399 		return EINVAL;
   1400 	}
   1401 
   1402 	RF_SingleComponent_t component;
   1403 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1404 	component.row = 0; /* we don't support any more */
   1405 	int column = component.column;
   1406 
   1407 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1408 		return EINVAL;
   1409 	}
   1410 
   1411 	rf_lock_mutex2(raidPtr->mutex);
   1412 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1413 	    (raidPtr->numFailures > 0)) {
   1414 		/* XXX 0 above shouldn't be constant!!! */
   1415 		/* some component other than this has failed.
   1416 		   Let's not make things worse than they already
   1417 		   are... */
   1418 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1419 		       raidPtr->raidid);
   1420 		printf("raid%d:     Col: %d   Too many failures.\n",
   1421 		       raidPtr->raidid, column);
   1422 		rf_unlock_mutex2(raidPtr->mutex);
   1423 		return EINVAL;
   1424 	}
   1425 
   1426 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
   1427 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1428 		       raidPtr->raidid);
   1429 		printf("raid%d:    Col: %d   "
   1430 		    "Reconstruction already occurring!\n",
   1431 		    raidPtr->raidid, column);
   1432 
   1433 		rf_unlock_mutex2(raidPtr->mutex);
   1434 		return EINVAL;
   1435 	}
   1436 
   1437 	if (raidPtr->Disks[column].status == rf_ds_spared) {
   1438 		rf_unlock_mutex2(raidPtr->mutex);
   1439 		return EINVAL;
   1440 	}
   1441 
   1442 	rf_unlock_mutex2(raidPtr->mutex);
   1443 
   1444 	struct rf_recon_req_internal *rrint;
   1445 	rrint = RF_Malloc(sizeof(*rrint));
   1446 	if (rrint == NULL)
   1447 		return ENOMEM;
   1448 
   1449 	rrint->col = column;
   1450 	rrint->raidPtr = raidPtr;
   1451 
   1452 	return RF_CREATE_THREAD(raidPtr->recon_thread,
   1453 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
   1454 }
   1455 
   1456 static int
   1457 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1458 {
   1459 	/*
   1460 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1461 	 * so tell the user it's done.
   1462 	 */
   1463 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1464 	    raidPtr->status != rf_rs_reconstructing) {
   1465 		*data = 100;
   1466 		return 0;
   1467 	}
   1468 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1469 		*data = 0;
   1470 		return 0;
   1471 	}
   1472 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1473 	    / raidPtr->reconControl->numRUsTotal);
   1474 	return 0;
   1475 }
   1476 
   1477 /*
   1478  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1479  * on the component_name[] array.
   1480  */
   1481 static void
   1482 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1483 {
   1484 
   1485 	memcpy(component, data, sizeof *component);
   1486 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1487 }
   1488 
   1489 static int
   1490 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1491 {
   1492 	int     unit = raidunit(dev);
   1493 	int     part, pmask;
   1494 	struct raid_softc *rs;
   1495 	struct dk_softc *dksc;
   1496 	RF_Config_t *k_cfg;
   1497 	RF_Raid_t *raidPtr;
   1498 	RF_AccTotals_t *totals;
   1499 	RF_SingleComponent_t component;
   1500 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1501 	int retcode = 0;
   1502 	int column;
   1503 	RF_ComponentLabel_t *clabel;
   1504 	int d;
   1505 
   1506 	if ((rs = raidget(unit, false)) == NULL)
   1507 		return ENXIO;
   1508 
   1509 	dksc = &rs->sc_dksc;
   1510 	raidPtr = &rs->sc_r;
   1511 
   1512 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1513 	    (int) DISKPART(dev), (int) unit, cmd));
   1514 
   1515 	/* Must be initialized for these... */
   1516 	if (rf_must_be_initialized(rs, cmd))
   1517 		return ENXIO;
   1518 
   1519 	switch (cmd) {
   1520 		/* configure the system */
   1521 	case RAIDFRAME_CONFIGURE:
   1522 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1523 			return retcode;
   1524 		return rf_construct(rs, k_cfg);
   1525 
   1526 		/* shutdown the system */
   1527 	case RAIDFRAME_SHUTDOWN:
   1528 
   1529 		part = DISKPART(dev);
   1530 		pmask = (1 << part);
   1531 
   1532 		if ((retcode = raidlock(rs)) != 0)
   1533 			return retcode;
   1534 
   1535 		if (DK_BUSY(dksc, pmask) ||
   1536 		    raidPtr->recon_in_progress != 0 ||
   1537 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1538 		    raidPtr->copyback_in_progress != 0)
   1539 			retcode = EBUSY;
   1540 		else {
   1541 			/* detach and free on close */
   1542 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1543 			retcode = 0;
   1544 		}
   1545 
   1546 		raidunlock(rs);
   1547 
   1548 		return retcode;
   1549 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1550 		return rf_get_component_label(raidPtr, data);
   1551 
   1552 #if RF_DISABLED
   1553 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1554 		return rf_set_component_label(raidPtr, data);
   1555 #endif
   1556 
   1557 	case RAIDFRAME_INIT_LABELS:
   1558 		return rf_init_component_label(raidPtr, data);
   1559 
   1560 	case RAIDFRAME_SET_AUTOCONFIG:
   1561 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1562 		printf("raid%d: New autoconfig value is: %d\n",
   1563 		       raidPtr->raidid, d);
   1564 		*(int *) data = d;
   1565 		return retcode;
   1566 
   1567 	case RAIDFRAME_SET_ROOT:
   1568 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1569 		printf("raid%d: New rootpartition value is: %d\n",
   1570 		       raidPtr->raidid, d);
   1571 		*(int *) data = d;
   1572 		return retcode;
   1573 
   1574 		/* initialize all parity */
   1575 	case RAIDFRAME_REWRITEPARITY:
   1576 
   1577 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1578 			/* Parity for RAID 0 is trivially correct */
   1579 			raidPtr->parity_good = RF_RAID_CLEAN;
   1580 			return 0;
   1581 		}
   1582 
   1583 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1584 			/* Re-write is already in progress! */
   1585 			return EINVAL;
   1586 		}
   1587 
   1588 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1589 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1590 
   1591 	case RAIDFRAME_ADD_HOT_SPARE:
   1592 		rf_copy_single_component(&component, data);
   1593 		return rf_add_hot_spare(raidPtr, &component);
   1594 
   1595 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1596 		return retcode;
   1597 
   1598 	case RAIDFRAME_DELETE_COMPONENT:
   1599 		rf_copy_single_component(&component, data);
   1600 		return rf_delete_component(raidPtr, &component);
   1601 
   1602 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1603 		rf_copy_single_component(&component, data);
   1604 		return rf_incorporate_hot_spare(raidPtr, &component);
   1605 
   1606 	case RAIDFRAME_REBUILD_IN_PLACE:
   1607 		return rf_rebuild_in_place(raidPtr, data);
   1608 
   1609 	case RAIDFRAME_GET_INFO:
   1610 		ucfgp = *(RF_DeviceConfig_t **)data;
   1611 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1612 		if (d_cfg == NULL)
   1613 			return ENOMEM;
   1614 		retcode = rf_get_info(raidPtr, d_cfg);
   1615 		if (retcode == 0) {
   1616 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1617 		}
   1618 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1619 		return retcode;
   1620 
   1621 	case RAIDFRAME_CHECK_PARITY:
   1622 		*(int *) data = raidPtr->parity_good;
   1623 		return 0;
   1624 
   1625 	case RAIDFRAME_PARITYMAP_STATUS:
   1626 		if (rf_paritymap_ineligible(raidPtr))
   1627 			return EINVAL;
   1628 		rf_paritymap_status(raidPtr->parity_map, data);
   1629 		return 0;
   1630 
   1631 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1632 		if (rf_paritymap_ineligible(raidPtr))
   1633 			return EINVAL;
   1634 		if (raidPtr->parity_map == NULL)
   1635 			return ENOENT; /* ??? */
   1636 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1637 			return EINVAL;
   1638 		return 0;
   1639 
   1640 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1641 		if (rf_paritymap_ineligible(raidPtr))
   1642 			return EINVAL;
   1643 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1644 		return 0;
   1645 
   1646 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1647 		if (rf_paritymap_ineligible(raidPtr))
   1648 			return EINVAL;
   1649 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1650 		/* XXX should errors be passed up? */
   1651 		return 0;
   1652 
   1653 	case RAIDFRAME_RESCAN:
   1654 		return rf_rescan();
   1655 
   1656 	case RAIDFRAME_RESET_ACCTOTALS:
   1657 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1658 		return 0;
   1659 
   1660 	case RAIDFRAME_GET_ACCTOTALS:
   1661 		totals = (RF_AccTotals_t *) data;
   1662 		*totals = raidPtr->acc_totals;
   1663 		return 0;
   1664 
   1665 	case RAIDFRAME_KEEP_ACCTOTALS:
   1666 		raidPtr->keep_acc_totals = *(int *)data;
   1667 		return 0;
   1668 
   1669 	case RAIDFRAME_GET_SIZE:
   1670 		*(int *) data = raidPtr->totalSectors;
   1671 		return 0;
   1672 
   1673 	case RAIDFRAME_FAIL_DISK:
   1674 		return rf_fail_disk(raidPtr, data);
   1675 
   1676 		/* invoke a copyback operation after recon on whatever disk
   1677 		 * needs it, if any */
   1678 	case RAIDFRAME_COPYBACK:
   1679 
   1680 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1681 			/* This makes no sense on a RAID 0!! */
   1682 			return EINVAL;
   1683 		}
   1684 
   1685 		if (raidPtr->copyback_in_progress == 1) {
   1686 			/* Copyback is already in progress! */
   1687 			return EINVAL;
   1688 		}
   1689 
   1690 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1691 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1692 
   1693 		/* return the percentage completion of reconstruction */
   1694 	case RAIDFRAME_CHECK_RECON_STATUS:
   1695 		return rf_check_recon_status(raidPtr, data);
   1696 
   1697 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1698 		rf_check_recon_status_ext(raidPtr, data);
   1699 		return 0;
   1700 
   1701 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1702 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1703 			/* This makes no sense on a RAID 0, so tell the
   1704 			   user it's done. */
   1705 			*(int *) data = 100;
   1706 			return 0;
   1707 		}
   1708 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1709 			*(int *) data = 100 *
   1710 				raidPtr->parity_rewrite_stripes_done /
   1711 				raidPtr->Layout.numStripe;
   1712 		} else {
   1713 			*(int *) data = 100;
   1714 		}
   1715 		return 0;
   1716 
   1717 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1718 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1719 		return 0;
   1720 
   1721 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1722 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1723 			/* This makes no sense on a RAID 0 */
   1724 			*(int *) data = 100;
   1725 			return 0;
   1726 		}
   1727 		if (raidPtr->copyback_in_progress == 1) {
   1728 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1729 				raidPtr->Layout.numStripe;
   1730 		} else {
   1731 			*(int *) data = 100;
   1732 		}
   1733 		return 0;
   1734 
   1735 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1736 		rf_check_copyback_status_ext(raidPtr, data);
   1737 		return 0;
   1738 
   1739 	case RAIDFRAME_SET_LAST_UNIT:
   1740 		for (column = 0; column < raidPtr->numCol; column++)
   1741 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1742 				return EBUSY;
   1743 
   1744 		for (column = 0; column < raidPtr->numCol; column++) {
   1745 			clabel = raidget_component_label(raidPtr, column);
   1746 			clabel->last_unit = *(int *)data;
   1747 			raidflush_component_label(raidPtr, column);
   1748 		}
   1749 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1750 		return 0;
   1751 
   1752 		/* the sparetable daemon calls this to wait for the kernel to
   1753 		 * need a spare table. this ioctl does not return until a
   1754 		 * spare table is needed. XXX -- calling mpsleep here in the
   1755 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1756 		 * -- I should either compute the spare table in the kernel,
   1757 		 * or have a different -- XXX XXX -- interface (a different
   1758 		 * character device) for delivering the table     -- XXX */
   1759 #if RF_DISABLED
   1760 	case RAIDFRAME_SPARET_WAIT:
   1761 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1762 		while (!rf_sparet_wait_queue)
   1763 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1764 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1765 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1766 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1767 
   1768 		/* structure assignment */
   1769 		*((RF_SparetWait_t *) data) = *waitreq;
   1770 
   1771 		RF_Free(waitreq, sizeof(*waitreq));
   1772 		return 0;
   1773 
   1774 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1775 		 * code in it that will cause the dameon to exit */
   1776 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1777 		waitreq = RF_Malloc(sizeof(*waitreq));
   1778 		waitreq->fcol = -1;
   1779 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1780 		waitreq->next = rf_sparet_wait_queue;
   1781 		rf_sparet_wait_queue = waitreq;
   1782 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1783 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1784 		return 0;
   1785 
   1786 		/* used by the spare table daemon to deliver a spare table
   1787 		 * into the kernel */
   1788 	case RAIDFRAME_SEND_SPARET:
   1789 
   1790 		/* install the spare table */
   1791 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1792 
   1793 		/* respond to the requestor.  the return status of the spare
   1794 		 * table installation is passed in the "fcol" field */
   1795 		waitred = RF_Malloc(sizeof(*waitreq));
   1796 		waitreq->fcol = retcode;
   1797 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1798 		waitreq->next = rf_sparet_resp_queue;
   1799 		rf_sparet_resp_queue = waitreq;
   1800 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1801 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1802 
   1803 		return retcode;
   1804 #endif
   1805 	default:
   1806 		/*
   1807 		 * Don't bother trying to load compat modules
   1808 		 * if it is not our ioctl. This is more efficient
   1809 		 * and makes rump tests not depend on compat code
   1810 		 */
   1811 		if (IOCGROUP(cmd) != 'r')
   1812 			break;
   1813 #ifdef _LP64
   1814 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1815 			module_autoload("compat_netbsd32_raid",
   1816 			    MODULE_CLASS_EXEC);
   1817 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1818 			    (rs, cmd, data), enosys(), retcode);
   1819 			if (retcode != EPASSTHROUGH)
   1820 				return retcode;
   1821 		}
   1822 #endif
   1823 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1824 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1825 		    (rs, cmd, data), enosys(), retcode);
   1826 		if (retcode != EPASSTHROUGH)
   1827 			return retcode;
   1828 
   1829 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1830 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1831 		    (rs, cmd, data), enosys(), retcode);
   1832 		if (retcode != EPASSTHROUGH)
   1833 			return retcode;
   1834 		break; /* fall through to the os-specific code below */
   1835 
   1836 	}
   1837 
   1838 	if (!raidPtr->valid)
   1839 		return EINVAL;
   1840 
   1841 	/*
   1842 	 * Add support for "regular" device ioctls here.
   1843 	 */
   1844 
   1845 	switch (cmd) {
   1846 	case DIOCGCACHE:
   1847 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1848 		break;
   1849 
   1850 	case DIOCCACHESYNC:
   1851 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1852 		break;
   1853 
   1854 	default:
   1855 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1856 		break;
   1857 	}
   1858 
   1859 	return retcode;
   1860 
   1861 }
   1862 
   1863 
/*
 * raidinit -- complete the rest of the initialization for the
 * RAIDframe device: attach the pseudo-device, hook it up to the
 * dk(4)/disk(9) layers, and kick off wedge discovery.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* attach failed: free the cfdata and leave the unit
		 * without RAIDF_INITED so later entry points refuse it */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* look for wedges (GPT partitions, etc.) on the new unit */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1923 
   1924 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1925 /* wake up the daemon & tell it to get us a spare table
   1926  * XXX
   1927  * the entries in the queues should be tagged with the raidPtr
   1928  * so that in the extremely rare case that two recons happen at once,
   1929  * we know for which device were requesting a spare table
   1930  * XXX
   1931  *
   1932  * XXX This code is not currently used. GO
   1933  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* queue the request and wake the spare-table daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/*
	 * Wait for the daemon to post a response.  NOTE(review): any
	 * response wakes us -- entries are not tagged with the
	 * requesting raidPtr (see the XXX above), so two concurrent
	 * recons could pick up each other's response.
	 */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon passes its status back in the "fcol" field */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1957 #endif
   1958 
   1959 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1960  * bp & passes it down.
   1961  * any calls originating in the kernel must use non-blocking I/O
   1962  * do some extra sanity checking to return "appropriate" error values for
   1963  * certain conditions (to make some standard utilities work)
   1964  *
   1965  * Formerly known as: rf_DoAccessKernel
   1966  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex across the label update, then retake
		 * it to decrement the new-failure count */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to start I/O on a unit that never finished raidinit */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand queued buffers to the dk(4) layer for dispatch */
	dk_start(dksc, NULL);
}
   1993 
   1994 static int
   1995 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1996 {
   1997 	RF_SectorCount_t num_blocks, pb, sum;
   1998 	RF_RaidAddr_t raid_addr;
   1999 	daddr_t blocknum;
   2000 	int rc;
   2001 
   2002 	rf_lock_mutex2(raidPtr->mutex);
   2003 	if (raidPtr->openings == 0) {
   2004 		rf_unlock_mutex2(raidPtr->mutex);
   2005 		return EAGAIN;
   2006 	}
   2007 	rf_unlock_mutex2(raidPtr->mutex);
   2008 
   2009 	blocknum = bp->b_rawblkno;
   2010 
   2011 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2012 		    (int) blocknum));
   2013 
   2014 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2015 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2016 
   2017 	/* *THIS* is where we adjust what block we're going to...
   2018 	 * but DO NOT TOUCH bp->b_blkno!!! */
   2019 	raid_addr = blocknum;
   2020 
   2021 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2022 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2023 	sum = raid_addr + num_blocks + pb;
   2024 	if (1 || rf_debugKernelAccess) {
   2025 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2026 			    (int) raid_addr, (int) sum, (int) num_blocks,
   2027 			    (int) pb, (int) bp->b_resid));
   2028 	}
   2029 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2030 	    || (sum < num_blocks) || (sum < pb)) {
   2031 		rc = ENOSPC;
   2032 		goto done;
   2033 	}
   2034 	/*
   2035 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2036 	 */
   2037 
   2038 	if (bp->b_bcount & raidPtr->sectorMask) {
   2039 		rc = ENOSPC;
   2040 		goto done;
   2041 	}
   2042 	db1_printf(("Calling DoAccess..\n"));
   2043 
   2044 
   2045 	rf_lock_mutex2(raidPtr->mutex);
   2046 	raidPtr->openings--;
   2047 	rf_unlock_mutex2(raidPtr->mutex);
   2048 
   2049 	/* don't ever condition on bp->b_flags & B_WRITE.
   2050 	 * always condition on B_READ instead */
   2051 
   2052 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2053 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2054 			 raid_addr, num_blocks,
   2055 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2056 
   2057 done:
   2058 	return rc;
   2059 }
   2060 
/*
 * Invoke an I/O from kernel mode.  The disk queue mutex should be
 * held on entry; it is dropped (and retaken) around bdev_strategy()
 * since that call can block.  Completion is reported asynchronously
 * through KernelWakeupFunc().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* fake a completed I/O: route straight to the iodone path */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device and arrange for
		 * KernelWakeupFunc() to run at biodone time */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
/*
 * Biodone callback for component I/O issued from kernel code (see
 * InitBP()/rf_DispatchKernelIO()).  Records trace timing, marks the
 * component failed on I/O error (unless that would break the set
 * beyond its fault tolerance), and hands the completed request to the
 * raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP() stashed the request pointer in b_private */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2205 
   2206 
   2207 /*
   2208  * initialize a buf structure for doing an I/O in the kernel.
   2209  */
   2210 static void
   2211 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2212        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2213        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2214 {
   2215 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2216 	bp->b_oflags = 0;
   2217 	bp->b_cflags = 0;
   2218 	bp->b_bcount = numSect << logBytesPerSector;
   2219 	bp->b_bufsize = bp->b_bcount;
   2220 	bp->b_error = 0;
   2221 	bp->b_dev = dev;
   2222 	bp->b_data = bf;
   2223 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2224 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2225 	if (bp->b_bcount == 0) {
   2226 		panic("bp->b_bcount is zero in InitBP!!");
   2227 	}
   2228 	bp->b_iodone = cbFunc;
   2229 	bp->b_private = cbArg;
   2230 }
   2231 
   2232 /*
   2233  * Wait interruptibly for an exclusive lock.
   2234  *
   2235  * XXX
   2236  * Several drivers do this; it should be abstracted and made MP-safe.
   2237  * (Hmm... where have we seen this warning before :->  GO )
   2238  */
   2239 static int
   2240 raidlock(struct raid_softc *rs)
   2241 {
   2242 	int     error;
   2243 
   2244 	error = 0;
   2245 	mutex_enter(&rs->sc_mutex);
   2246 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2247 		rs->sc_flags |= RAIDF_WANTED;
   2248 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2249 		if (error != 0)
   2250 			goto done;
   2251 	}
   2252 	rs->sc_flags |= RAIDF_LOCKED;
   2253 done:
   2254 	mutex_exit(&rs->sc_mutex);
   2255 	return error;
   2256 }
   2257 /*
   2258  * Unlock and wake up any waiters.
   2259  */
   2260 static void
   2261 raidunlock(struct raid_softc *rs)
   2262 {
   2263 
   2264 	mutex_enter(&rs->sc_mutex);
   2265 	rs->sc_flags &= ~RAIDF_LOCKED;
   2266 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2267 		rs->sc_flags &= ~RAIDF_WANTED;
   2268 		cv_broadcast(&rs->sc_cv);
   2269 	}
   2270 	mutex_exit(&rs->sc_mutex);
   2271 }
   2272 
   2273 
   2274 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2275 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2276 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2277 
   2278 static daddr_t
   2279 rf_component_info_offset(void)
   2280 {
   2281 
   2282 	return RF_COMPONENT_INFO_OFFSET;
   2283 }
   2284 
   2285 static daddr_t
   2286 rf_component_info_size(unsigned secsize)
   2287 {
   2288 	daddr_t info_size;
   2289 
   2290 	KASSERT(secsize);
   2291 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2292 		info_size = secsize;
   2293 	else
   2294 		info_size = RF_COMPONENT_INFO_SIZE;
   2295 
   2296 	return info_size;
   2297 }
   2298 
   2299 static daddr_t
   2300 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2301 {
   2302 	daddr_t map_offset;
   2303 
   2304 	KASSERT(raidPtr->bytesPerSector);
   2305 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2306 		map_offset = raidPtr->bytesPerSector;
   2307 	else
   2308 		map_offset = RF_COMPONENT_INFO_SIZE;
   2309 	map_offset += rf_component_info_offset();
   2310 
   2311 	return map_offset;
   2312 }
   2313 
   2314 static daddr_t
   2315 rf_parity_map_size(RF_Raid_t *raidPtr)
   2316 {
   2317 	daddr_t map_size;
   2318 
   2319 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2320 		map_size = raidPtr->bytesPerSector;
   2321 	else
   2322 		map_size = RF_PARITY_MAP_SIZE;
   2323 
   2324 	return map_size;
   2325 }
   2326 
   2327 int
   2328 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2329 {
   2330 	RF_ComponentLabel_t *clabel;
   2331 
   2332 	clabel = raidget_component_label(raidPtr, col);
   2333 	clabel->clean = RF_RAID_CLEAN;
   2334 	raidflush_component_label(raidPtr, col);
   2335 	return(0);
   2336 }
   2337 
   2338 
   2339 int
   2340 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2341 {
   2342 	RF_ComponentLabel_t *clabel;
   2343 
   2344 	clabel = raidget_component_label(raidPtr, col);
   2345 	clabel->clean = RF_RAID_DIRTY;
   2346 	raidflush_component_label(raidPtr, col);
   2347 	return(0);
   2348 }
   2349 
   2350 int
   2351 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2352 {
   2353 	KASSERT(raidPtr->bytesPerSector);
   2354 
   2355 	return raidread_component_label(raidPtr->bytesPerSector,
   2356 	    raidPtr->Disks[col].dev,
   2357 	    raidPtr->raid_cinfo[col].ci_vp,
   2358 	    &raidPtr->raid_cinfo[col].ci_label);
   2359 }
   2360 
/* Return the in-core component label for column 'col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2366 
   2367 int
   2368 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2369 {
   2370 	RF_ComponentLabel_t *label;
   2371 
   2372 	label = &raidPtr->raid_cinfo[col].ci_label;
   2373 	label->mod_counter = raidPtr->mod_counter;
   2374 #ifndef RF_NO_PARITY_MAP
   2375 	label->parity_map_modcount = label->mod_counter;
   2376 #endif
   2377 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2378 	    raidPtr->Disks[col].dev,
   2379 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2380 }
   2381 
   2382 /*
   2383  * Swap the label endianness.
   2384  *
   2385  * Everything in the component label is 4-byte-swapped except the version,
   2386  * which is kept in the byte-swapped version at all times, and indicates
   2387  * for the writer that a swap is necessary.
   2388  *
   2389  * For reads it is expected that out_label == clabel, but writes expect
   2390  * separate labels so only the re-swapped label is written out to disk,
   2391  * leaving the swapped-except-version internally.
   2392  *
   2393  * Only support swapping label version 2.
   2394  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Swap every 32-bit word from serial_number up to (and
	 * including) future_use2[41]; in_last is one past the end.
	 * NOTE(review): depends on the RF_ComponentLabel_t layout --
	 * confirm future_use2 has at least 42 elements and that no
	 * swapped field lies beyond it. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2412 
   2413 static int
   2414 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2415     RF_ComponentLabel_t *clabel)
   2416 {
   2417 	int error;
   2418 
   2419 	error = raidread_component_area(dev, b_vp, clabel,
   2420 	    sizeof(RF_ComponentLabel_t),
   2421 	    rf_component_info_offset(),
   2422 	    rf_component_info_size(secsize));
   2423 
   2424 	if (error == 0 &&
   2425 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2426 		rf_swap_label(clabel, clabel);
   2427 	}
   2428 
   2429 	return error;
   2430 }
   2431 
   2432 /* ARGSUSED */
/*
 * Read a component area (label or parity map) from 'dev' at byte
 * offset 'offset'.  'dsize' bytes are read raw from the device; only
 * the first 'msize' bytes are copied out into 'data'.  Returns 0 or
 * an errno from the I/O.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* synchronous read: issue and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2469 
   2470 static int
   2471 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2472     RF_ComponentLabel_t *clabel)
   2473 {
   2474 	RF_ComponentLabel_t *clabel_write = clabel;
   2475 	RF_ComponentLabel_t lclabel;
   2476 	int error;
   2477 
   2478 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2479 		clabel_write = &lclabel;
   2480 		rf_swap_label(clabel, clabel_write);
   2481 	}
   2482 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2483 	    sizeof(RF_ComponentLabel_t),
   2484 	    rf_component_info_offset(),
   2485 	    rf_component_info_size(secsize), 0);
   2486 
   2487 	return error;
   2488 }
   2489 
   2490 /* ARGSUSED */
/*
 * Write 'msize' bytes from 'data' into a 'dsize'-byte component area
 * at byte offset 'offset' on 'dev'; the remainder of the area is
 * zero-filled.  If 'asyncp' is set the write is issued B_ASYNC and we
 * return 0 immediately without waiting (NOTE(review): assumes the
 * async buffer is released at biodone time -- verify); otherwise we
 * wait and return the biowait() status.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad the area beyond msize */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2524 
   2525 void
   2526 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2527 {
   2528 	int c;
   2529 
   2530 	for (c = 0; c < raidPtr->numCol; c++) {
   2531 		/* Skip dead disks. */
   2532 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2533 			continue;
   2534 		/* XXXjld: what if an error occurs here? */
   2535 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2536 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2537 		    RF_PARITYMAP_NBYTE,
   2538 		    rf_parity_map_offset(raidPtr),
   2539 		    rf_parity_map_size(raidPtr), 0);
   2540 	}
   2541 }
   2542 
   2543 void
   2544 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2545 {
   2546 	struct rf_paritymap_ondisk tmp;
   2547 	int c,first;
   2548 
   2549 	first=1;
   2550 	for (c = 0; c < raidPtr->numCol; c++) {
   2551 		/* Skip dead disks. */
   2552 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2553 			continue;
   2554 		raidread_component_area(raidPtr->Disks[c].dev,
   2555 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2556 		    RF_PARITYMAP_NBYTE,
   2557 		    rf_parity_map_offset(raidPtr),
   2558 		    rf_parity_map_size(raidPtr));
   2559 		if (first) {
   2560 			memcpy(map, &tmp, sizeof(*map));
   2561 			first = 0;
   2562 		} else {
   2563 			rf_paritymap_merge(map, &tmp);
   2564 		}
   2565 	}
   2566 }
   2567 
/*
 * Bump the mod counter and mark the component label dirty on every
 * accessible component and every in-use spare, so an unclean shutdown
 * can be detected later.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for.
			 * NOTE(review): if no column claims the spare,
			 * scol keeps its previous value (initially -1)
			 * -- confirm that cannot happen for a disk in
			 * rf_ds_used_spare state. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2627 
   2628 
/*
 * Write updated component labels (new mod_counter, status, last_unit)
 * to every optimal component and every in-use spare.  When 'final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels are
 * also marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2706 
   2707 void
   2708 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2709 {
   2710 
   2711 	if (vp != NULL) {
   2712 		if (auto_configured == 1) {
   2713 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2714 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2715 			vput(vp);
   2716 
   2717 		} else {
   2718 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2719 		}
   2720 	}
   2721 }
   2722 
   2723 
   2724 void
   2725 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2726 {
   2727 	int r,c;
   2728 	struct vnode *vp;
   2729 	int acd;
   2730 
   2731 
   2732 	/* We take this opportunity to close the vnodes like we should.. */
   2733 
   2734 	for (c = 0; c < raidPtr->numCol; c++) {
   2735 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2736 		acd = raidPtr->Disks[c].auto_configured;
   2737 		rf_close_component(raidPtr, vp, acd);
   2738 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2739 		raidPtr->Disks[c].auto_configured = 0;
   2740 	}
   2741 
   2742 	for (r = 0; r < raidPtr->numSpare; r++) {
   2743 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2744 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2745 		rf_close_component(raidPtr, vp, acd);
   2746 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2747 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2748 	}
   2749 }
   2750 
   2751 
/*
 * Kernel thread body: fail component req->col (optionally initiating
 * reconstruction to a spare, per RF_FDFLAGS_RECON).  Frees req and
 * exits the thread when done.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* honor a "force" request for the duration of the operation */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2781 
/*
 * Kernel thread body: rewrite all parity for the set.  On success the
 * in-core parity state is marked clean; anyone blocked in shutdown
 * waiting for the rewrite is notified before the thread exits.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2814 
   2815 
   2816 static void
   2817 rf_CopybackThread(RF_Raid_t *raidPtr)
   2818 {
   2819 	int s;
   2820 
   2821 	raidPtr->copyback_in_progress = 1;
   2822 	s = splbio();
   2823 	rf_CopybackReconstructedData(raidPtr);
   2824 	splx(s);
   2825 	raidPtr->copyback_in_progress = 0;
   2826 
   2827 	/* That's all... */
   2828 	kthread_exit(0);	/* does not return */
   2829 }
   2830 
   2831 
/*
 * Kernel thread body: reconstruct component req->col in place (back
 * onto the same component).  Frees req and exits the thread when done.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* honor a "force" request for the duration of the operation */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2859 
   2860 static RF_AutoConfig_t *
   2861 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2862     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2863     unsigned secsize)
   2864 {
   2865 	int good_one = 0;
   2866 	RF_ComponentLabel_t *clabel;
   2867 	RF_AutoConfig_t *ac;
   2868 
   2869 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
   2870 
   2871 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2872 		/* Got the label.  Does it look reasonable? */
   2873 		if (rf_reasonable_label(clabel, numsecs) &&
   2874 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2875 #ifdef DEBUG
   2876 			printf("Component on: %s: %llu\n",
   2877 				cname, (unsigned long long)size);
   2878 			rf_print_component_label(clabel);
   2879 #endif
   2880 			/* if it's reasonable, add it, else ignore it. */
   2881 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2882 				M_WAITOK);
   2883 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2884 			ac->dev = dev;
   2885 			ac->vp = vp;
   2886 			ac->clabel = clabel;
   2887 			ac->next = ac_list;
   2888 			ac_list = ac;
   2889 			good_one = 1;
   2890 		}
   2891 	}
   2892 	if (!good_one) {
   2893 		/* cleanup */
   2894 		free(clabel, M_RAIDFRAME);
   2895 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2896 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2897 		vput(vp);
   2898 	}
   2899 	return ac_list;
   2900 }
   2901 
/*
 * Walk every disk-class device in the system looking for RAIDframe
 * components and return them as a list of RF_AutoConfig_t entries,
 * each holding an open vnode plus a copy of the on-disk component
 * label (see rf_get_component(), which takes ownership of both).
 *
 * The scan runs twice: first over wedges (dk), then over everything
 * else, so that a wedge covering a whole disk is preferred over that
 * disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			VOP_UNLOCK(vp);
			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept only RAID-typed wedges,
				   and hand the open vnode to rf_get_component */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* check every FS_RAID partition in the disklabel */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3129 
   3130 int
   3131 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3132 {
   3133 
   3134 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3135 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3136 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3137 	    (clabel->clean == RF_RAID_CLEAN ||
   3138 	     clabel->clean == RF_RAID_DIRTY) &&
   3139 	    clabel->row >=0 &&
   3140 	    clabel->column >= 0 &&
   3141 	    clabel->num_rows > 0 &&
   3142 	    clabel->num_columns > 0 &&
   3143 	    clabel->row < clabel->num_rows &&
   3144 	    clabel->column < clabel->num_columns &&
   3145 	    clabel->blockSize > 0 &&
   3146 	    /*
   3147 	     * numBlocksHi may contain garbage, but it is ok since
   3148 	     * the type is unsigned.  If it is really garbage,
   3149 	     * rf_fix_old_label_size() will fix it.
   3150 	     */
   3151 	    rf_component_label_numblocks(clabel) > 0) {
   3152 		/*
   3153 		 * label looks reasonable enough...
   3154 		 * let's make sure it has no old garbage.
   3155 		 */
   3156 		if (numsecs)
   3157 			rf_fix_old_label_size(clabel, numsecs);
   3158 		return(1);
   3159 	}
   3160 	return(0);
   3161 }
   3162 
   3163 
   3164 /*
   3165  * For reasons yet unknown, some old component labels have garbage in
   3166  * the newer numBlocksHi region, and this causes lossage.  Since those
   3167  * disks will also have numsecs set to less than 32 bits of sectors,
   3168  * we can determine when this corruption has occurred, and fix it.
   3169  *
   3170  * The exact same problem, with the same unknown reason, happens to
   3171  * the partitionSizeHi member as well.
   3172  */
   3173 static void
   3174 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3175 {
   3176 
   3177 	if (numsecs < ((uint64_t)1 << 32)) {
   3178 		if (clabel->numBlocksHi) {
   3179 			printf("WARNING: total sectors < 32 bits, yet "
   3180 			       "numBlocksHi set\n"
   3181 			       "WARNING: resetting numBlocksHi to zero.\n");
   3182 			clabel->numBlocksHi = 0;
   3183 		}
   3184 
   3185 		if (clabel->partitionSizeHi) {
   3186 			printf("WARNING: total sectors < 32 bits, yet "
   3187 			       "partitionSizeHi set\n"
   3188 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3189 			clabel->partitionSizeHi = 0;
   3190 		}
   3191 	}
   3192 }
   3193 
   3194 
   3195 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console.
 * Compiled in only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* index by root_partition & 3: 0=No, 1=Force, 2=Soft */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3227 #endif
   3228 
   3229 static RF_ConfigSet_t *
   3230 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3231 {
   3232 	RF_AutoConfig_t *ac;
   3233 	RF_ConfigSet_t *config_sets;
   3234 	RF_ConfigSet_t *cset;
   3235 	RF_AutoConfig_t *ac_next;
   3236 
   3237 
   3238 	config_sets = NULL;
   3239 
   3240 	/* Go through the AutoConfig list, and figure out which components
   3241 	   belong to what sets.  */
   3242 	ac = ac_list;
   3243 	while(ac!=NULL) {
   3244 		/* we're going to putz with ac->next, so save it here
   3245 		   for use at the end of the loop */
   3246 		ac_next = ac->next;
   3247 
   3248 		if (config_sets == NULL) {
   3249 			/* will need at least this one... */
   3250 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3251 				       M_RAIDFRAME, M_WAITOK);
   3252 			/* this one is easy :) */
   3253 			config_sets->ac = ac;
   3254 			config_sets->next = NULL;
   3255 			config_sets->rootable = 0;
   3256 			ac->next = NULL;
   3257 		} else {
   3258 			/* which set does this component fit into? */
   3259 			cset = config_sets;
   3260 			while(cset!=NULL) {
   3261 				if (rf_does_it_fit(cset, ac)) {
   3262 					/* looks like it matches... */
   3263 					ac->next = cset->ac;
   3264 					cset->ac = ac;
   3265 					break;
   3266 				}
   3267 				cset = cset->next;
   3268 			}
   3269 			if (cset==NULL) {
   3270 				/* didn't find a match above... new set..*/
   3271 				cset = malloc(sizeof(RF_ConfigSet_t),
   3272 					       M_RAIDFRAME, M_WAITOK);
   3273 				cset->ac = ac;
   3274 				ac->next = NULL;
   3275 				cset->next = config_sets;
   3276 				cset->rootable = 0;
   3277 				config_sets = cset;
   3278 			}
   3279 		}
   3280 		ac = ac_next;
   3281 	}
   3282 
   3283 
   3284 	return(config_sets);
   3285 }
   3286 
   3287 static int
   3288 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3289 {
   3290 	RF_ComponentLabel_t *clabel1, *clabel2;
   3291 
   3292 	/* If this one matches the *first* one in the set, that's good
   3293 	   enough, since the other members of the set would have been
   3294 	   through here too... */
   3295 	/* note that we are not checking partitionSize here..
   3296 
   3297 	   Note that we are also not checking the mod_counters here.
   3298 	   If everything else matches except the mod_counter, that's
   3299 	   good enough for this test.  We will deal with the mod_counters
   3300 	   a little later in the autoconfiguration process.
   3301 
   3302 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3303 
   3304 	   The reason we don't check for this is that failed disks
   3305 	   will have lower modification counts.  If those disks are
   3306 	   not added to the set they used to belong to, then they will
   3307 	   form their own set, which may result in 2 different sets,
   3308 	   for example, competing to be configured at raid0, and
   3309 	   perhaps competing to be the root filesystem set.  If the
   3310 	   wrong ones get configured, or both attempt to become /,
   3311 	   weird behaviour and or serious lossage will occur.  Thus we
   3312 	   need to bring them into the fold here, and kick them out at
   3313 	   a later point.
   3314 
   3315 	*/
   3316 
   3317 	clabel1 = cset->ac->clabel;
   3318 	clabel2 = ac->clabel;
   3319 	if ((clabel1->version == clabel2->version) &&
   3320 	    (clabel1->serial_number == clabel2->serial_number) &&
   3321 	    (clabel1->num_rows == clabel2->num_rows) &&
   3322 	    (clabel1->num_columns == clabel2->num_columns) &&
   3323 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3324 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3325 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3326 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3327 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3328 	    (clabel1->blockSize == clabel2->blockSize) &&
   3329 	    rf_component_label_numblocks(clabel1) ==
   3330 	    rf_component_label_numblocks(clabel2) &&
   3331 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3332 	    (clabel1->root_partition == clabel2->root_partition) &&
   3333 	    (clabel1->last_unit == clabel2->last_unit) &&
   3334 	    (clabel1->config_order == clabel2->config_order)) {
   3335 		/* if it get's here, it almost *has* to be a match */
   3336 	} else {
   3337 		/* it's not consistent with somebody in the set..
   3338 		   punt */
   3339 		return(0);
   3340 	}
   3341 	/* all was fine.. it must fit... */
   3342 	return(1);
   3343 }
   3344 
/*
 * Decide whether a config set has enough live components to be
 * configured.  "Live" means: present at the newest mod_counter seen
 * anywhere in the set.  RAID 1 gets special treatment: components are
 * paired (even index with the following odd index), and losing both
 * members of a pair is fatal.  For the other levels a simple count of
 * missing columns is compared against what the parity type tolerates.
 * Returns 1 if the set can be configured, 0 if too much is missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members (failed disks lag behind). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd (second) member
				   of a pair without bailing.. reset the
				   even_pair_failed flag, and go on to
				   the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3447 
   3448 static void
   3449 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3450 			RF_Raid_t *raidPtr)
   3451 {
   3452 	RF_ComponentLabel_t *clabel;
   3453 	int i;
   3454 
   3455 	clabel = ac->clabel;
   3456 
   3457 	/* 1. Fill in the common stuff */
   3458 	config->numCol = clabel->num_columns;
   3459 	config->numSpare = 0; /* XXX should this be set here? */
   3460 	config->sectPerSU = clabel->sectPerSU;
   3461 	config->SUsPerPU = clabel->SUsPerPU;
   3462 	config->SUsPerRU = clabel->SUsPerRU;
   3463 	config->parityConfig = clabel->parityConfig;
   3464 	/* XXX... */
   3465 	strcpy(config->diskQueueType,"fifo");
   3466 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3467 	config->layoutSpecificSize = 0; /* XXX ?? */
   3468 
   3469 	while(ac!=NULL) {
   3470 		/* row/col values will be in range due to the checks
   3471 		   in reasonable_label() */
   3472 		strcpy(config->devnames[0][ac->clabel->column],
   3473 		       ac->devname);
   3474 		ac = ac->next;
   3475 	}
   3476 
   3477 	for(i=0;i<RF_MAXDBGV;i++) {
   3478 		config->debugVars[i][0] = 0;
   3479 	}
   3480 }
   3481 
   3482 static int
   3483 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3484 {
   3485 	RF_ComponentLabel_t *clabel;
   3486 	int column;
   3487 	int sparecol;
   3488 
   3489 	raidPtr->autoconfigure = new_value;
   3490 
   3491 	for(column=0; column<raidPtr->numCol; column++) {
   3492 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3493 			clabel = raidget_component_label(raidPtr, column);
   3494 			clabel->autoconfigure = new_value;
   3495 			raidflush_component_label(raidPtr, column);
   3496 		}
   3497 	}
   3498 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3499 		sparecol = raidPtr->numCol + column;
   3500 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3501 			clabel = raidget_component_label(raidPtr, sparecol);
   3502 			clabel->autoconfigure = new_value;
   3503 			raidflush_component_label(raidPtr, sparecol);
   3504 		}
   3505 	}
   3506 	return(new_value);
   3507 }
   3508 
   3509 static int
   3510 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3511 {
   3512 	RF_ComponentLabel_t *clabel;
   3513 	int column;
   3514 	int sparecol;
   3515 
   3516 	raidPtr->root_partition = new_value;
   3517 	for(column=0; column<raidPtr->numCol; column++) {
   3518 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3519 			clabel = raidget_component_label(raidPtr, column);
   3520 			clabel->root_partition = new_value;
   3521 			raidflush_component_label(raidPtr, column);
   3522 		}
   3523 	}
   3524 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3525 		sparecol = raidPtr->numCol + column;
   3526 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3527 			clabel = raidget_component_label(raidPtr, sparecol);
   3528 			clabel->root_partition = new_value;
   3529 			raidflush_component_label(raidPtr, sparecol);
   3530 		}
   3531 	}
   3532 	return(new_value);
   3533 }
   3534 
   3535 static void
   3536 rf_release_all_vps(RF_ConfigSet_t *cset)
   3537 {
   3538 	RF_AutoConfig_t *ac;
   3539 
   3540 	ac = cset->ac;
   3541 	while(ac!=NULL) {
   3542 		/* Close the vp, and give it back */
   3543 		if (ac->vp) {
   3544 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3545 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3546 			vput(ac->vp);
   3547 			ac->vp = NULL;
   3548 		}
   3549 		ac = ac->next;
   3550 	}
   3551 }
   3552 
   3553 
   3554 static void
   3555 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3556 {
   3557 	RF_AutoConfig_t *ac;
   3558 	RF_AutoConfig_t *next_ac;
   3559 
   3560 	ac = cset->ac;
   3561 	while(ac!=NULL) {
   3562 		next_ac = ac->next;
   3563 		/* nuke the label */
   3564 		free(ac->clabel, M_RAIDFRAME);
   3565 		/* cleanup the config structure */
   3566 		free(ac, M_RAIDFRAME);
   3567 		/* "next.." */
   3568 		ac = next_ac;
   3569 	}
   3570 	/* and, finally, nuke the config set */
   3571 	free(cset, M_RAIDFRAME);
   3572 }
   3573 
   3574 
/*
 * Fill in a component label from the current in-core state of
 * raidPtr: serial number, mod counter, geometry, and the policy
 * fields (autoconfigure, root_partition, last_unit, config_order).
 * The clean flag is always set to RF_RAID_DIRTY here; it is raised
 * to clean elsewhere.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3608 
/*
 * Configure a RAID set from an autoconfig config set.
 *
 * Tries to come up at the unit recorded in the component labels
 * (last_unit), falling back to the next free unit if that one is
 * already valid.  On success returns the softc of the newly
 * configured set, with cset->rootable noting any root eligibility;
 * on configuration failure the softc is released and NULL returned.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3680 
/*
 * Initialize a per-set resource pool.  w_chan (at least
 * RF_MAX_POOLNAMELEN bytes) receives the generated pool/wait-channel
 * name "raid%d_<pool_name>"; the pool is primed with xmin items and
 * its high-water mark set to xmax.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3693 
   3694 
   3695 /*
   3696  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3697  * to see if there is IO pending and if that IO could possibly be done
   3698  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3699  * otherwise.
   3700  *
   3701  */
   3702 int
   3703 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3704 {
   3705 	struct raid_softc *rs;
   3706 	struct dk_softc *dksc;
   3707 
   3708 	rs = raidPtr->softc;
   3709 	dksc = &rs->sc_dksc;
   3710 
   3711 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3712 		return 1;
   3713 
   3714 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3715 		/* there is work to do */
   3716 		return 0;
   3717 	}
   3718 	/* default is nothing to do */
   3719 	return 1;
   3720 }
   3721 
   3722 int
   3723 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3724 {
   3725 	uint64_t numsecs;
   3726 	unsigned secsize;
   3727 	int error;
   3728 
   3729 	error = getdisksize(vp, &numsecs, &secsize);
   3730 	if (error == 0) {
   3731 		diskPtr->blockSize = secsize;
   3732 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3733 		diskPtr->partitionSize = numsecs;
   3734 		return 0;
   3735 	}
   3736 	return error;
   3737 }
   3738 
/*
 * Autoconf match hook: always matches (pseudo-device).
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3744 
/*
 * Autoconf attach hook: intentionally empty; no per-device state is
 * set up at attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3749 
   3750 
/*
 * Autoconf detach hook: take the softc lock, tear the set down via
 * raid_detach_unlocked(), and on success free the softc.  Returns
 * ENXIO if there is no softc, otherwise the error from locking or
 * from the detach itself.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return error;

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3777 
   3778 static void
   3779 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3780 {
   3781 	struct dk_softc *dksc = &rs->sc_dksc;
   3782 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3783 
   3784 	memset(dg, 0, sizeof(*dg));
   3785 
   3786 	dg->dg_secperunit = raidPtr->totalSectors;
   3787 	dg->dg_secsize = raidPtr->bytesPerSector;
   3788 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3789 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3790 
   3791 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3792 }
   3793 
   3794 /*
   3795  * Get cache info for all the components (including spares).
   3796  * Returns intersection of all the cache flags of all disks, or first
   3797  * error if any encountered.
   3798  * XXXfua feature flags can change as spares are added - lock down somehow
   3799  */
   3800 static int
   3801 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3802 {
   3803 	int c;
   3804 	int error;
   3805 	int dkwhole = 0, dkpart;
   3806 
   3807 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3808 		/*
   3809 		 * Check any non-dead disk, even when currently being
   3810 		 * reconstructed.
   3811 		 */
   3812 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3813 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3814 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3815 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3816 			if (error) {
   3817 				if (error != ENODEV) {
   3818 					printf("raid%d: get cache for component %s failed\n",
   3819 					    raidPtr->raidid,
   3820 					    raidPtr->Disks[c].devname);
   3821 				}
   3822 
   3823 				return error;
   3824 			}
   3825 
   3826 			if (c == 0)
   3827 				dkwhole = dkpart;
   3828 			else
   3829 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3830 		}
   3831 	}
   3832 
   3833 	*data = dkwhole;
   3834 
   3835 	return 0;
   3836 }
   3837 
   3838 /*
   3839  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3840  * We end up returning whatever error was returned by the first cache flush
   3841  * that fails.
   3842  */
   3843 
   3844 static int
   3845 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3846 {
   3847 	int e = 0;
   3848 	for (int i = 0; i < 5; i++) {
   3849 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3850 		    &force, FWRITE, NOCRED);
   3851 		if (!e || e == ENODEV)
   3852 			return e;
   3853 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3854 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3855 	}
   3856 	return e;
   3857 }
   3858 
   3859 int
   3860 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3861 {
   3862 	int c, error;
   3863 
   3864 	error = 0;
   3865 	for (c = 0; c < raidPtr->numCol; c++) {
   3866 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3867 			int e = rf_sync_component_cache(raidPtr, c, force);
   3868 			if (e && !error)
   3869 				error = e;
   3870 		}
   3871 	}
   3872 
   3873 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3874 		int sparecol = raidPtr->numCol + c;
   3875 		/* Need to ensure that the reconstruct actually completed! */
   3876 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3877 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3878 			    force);
   3879 			if (e && !error)
   3880 				error = e;
   3881 		}
   3882 	}
   3883 	return error;
   3884 }
   3885 
   3886 /* Fill in info with the current status */
   3887 void
   3888 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3889 {
   3890 
   3891 	memset(info, 0, sizeof(*info));
   3892 
   3893 	if (raidPtr->status != rf_rs_reconstructing) {
   3894 		info->total = 100;
   3895 		info->completed = 100;
   3896 	} else {
   3897 		info->total = raidPtr->reconControl->numRUsTotal;
   3898 		info->completed = raidPtr->reconControl->numRUsComplete;
   3899 	}
   3900 	info->remaining = info->total - info->completed;
   3901 }
   3902 
   3903 /* Fill in info with the current status */
   3904 void
   3905 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3906 {
   3907 
   3908 	memset(info, 0, sizeof(*info));
   3909 
   3910 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3911 		info->total = raidPtr->Layout.numStripe;
   3912 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3913 	} else {
   3914 		info->completed = 100;
   3915 		info->total = 100;
   3916 	}
   3917 	info->remaining = info->total - info->completed;
   3918 }
   3919 
   3920 /* Fill in info with the current status */
   3921 void
   3922 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3923 {
   3924 
   3925 	memset(info, 0, sizeof(*info));
   3926 
   3927 	if (raidPtr->copyback_in_progress == 1) {
   3928 		info->total = raidPtr->Layout.numStripe;
   3929 		info->completed = raidPtr->copyback_stripes_done;
   3930 		info->remaining = info->total - info->completed;
   3931 	} else {
   3932 		info->remaining = 0;
   3933 		info->completed = 100;
   3934 		info->total = 100;
   3935 	}
   3936 }
   3937 
   3938 /* Fill in config with the current info */
   3939 int
   3940 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3941 {
   3942 	int	d, i, j;
   3943 
   3944 	if (!raidPtr->valid)
   3945 		return ENODEV;
   3946 	config->cols = raidPtr->numCol;
   3947 	config->ndevs = raidPtr->numCol;
   3948 	if (config->ndevs >= RF_MAX_DISKS)
   3949 		return ENOMEM;
   3950 	config->nspares = raidPtr->numSpare;
   3951 	if (config->nspares >= RF_MAX_DISKS)
   3952 		return ENOMEM;
   3953 	config->maxqdepth = raidPtr->maxQueueDepth;
   3954 	d = 0;
   3955 	for (j = 0; j < config->cols; j++) {
   3956 		config->devs[d] = raidPtr->Disks[j];
   3957 		d++;
   3958 	}
   3959 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3960 		config->spares[i] = raidPtr->Disks[j];
   3961 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3962 			/* XXX: raidctl(8) expects to see this as a used spare */
   3963 			config->spares[i].status = rf_ds_used_spare;
   3964 		}
   3965 	}
   3966 	return 0;
   3967 }
   3968 
   3969 int
   3970 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3971 {
   3972 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3973 	RF_ComponentLabel_t *raid_clabel;
   3974 	int column = clabel->column;
   3975 
   3976 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3977 		return EINVAL;
   3978 	raid_clabel = raidget_component_label(raidPtr, column);
   3979 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3980 	/* Fix-up for userland. */
   3981 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3982 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3983 
   3984 	return 0;
   3985 }
   3986 
/*
 * Module interface
 */

/* Depends on the disk(9) helpers and the FCFS buffer queue strategy. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* Built-in kernels declare this via config(1); modules do it here. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   4000 
   4001 static int
   4002 raid_modcmd(modcmd_t cmd, void *data)
   4003 {
   4004 	int error;
   4005 
   4006 	error = 0;
   4007 	switch (cmd) {
   4008 	case MODULE_CMD_INIT:
   4009 		error = raid_modcmd_init();
   4010 		break;
   4011 	case MODULE_CMD_FINI:
   4012 		error = raid_modcmd_fini();
   4013 		break;
   4014 	default:
   4015 		error = ENOTTY;
   4016 		break;
   4017 	}
   4018 	return error;
   4019 }
   4020 
/*
 * Module initialization: set up global locks, attach the devsw entries
 * and (for modules) the cfdriver/cfattach, boot the RAIDframe core, and
 * register a finalizer that autoconfigures RAID sets once all hardware
 * has been found.  On failure, everything attached so far is unwound in
 * reverse order before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in) - OK. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind cfdriver (module only) and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Not fatal: sets can still be configured manually. */
		error = 0;
	}

	return error;
}
   4091 
/*
 * Module finalization: refuse to unload while any RAID set exists,
 * then detach the cfattach/cfdriver/devsw entries in reverse order of
 * attachment, shut down the RAIDframe core, and destroy the global
 * locks.  On partial failure, already-detached pieces are re-attached
 * so the module stays in a consistent state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back the cfattach detach done above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4132