Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.410
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.410 2022/08/28 00:37:41 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.410 2022/08/28 00:37:41 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
/* Block-device switch: entry points used for block-device access to raid units. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    222 
/* Character-device switch: raw (character) access to raid units. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    237 
/* Hooks handed to the generic disk (dk) framework for raid devices. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* target component column */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* the affected RF_Raid_t */
};
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
     272  * Even if it is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 static void rf_ReconThread(struct rf_recon_req_internal *);
    298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 static int rf_autoconfig(device_t);
    302 static int rf_rescan(void);
    303 static void rf_buildroothack(RF_ConfigSet_t *);
    304 
    305 static RF_AutoConfig_t *rf_find_raid_components(void);
    306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 static int rf_set_autoconfig(RF_Raid_t *, int);
    310 static int rf_set_rootpartition(RF_Raid_t *, int);
    311 static void rf_release_all_vps(RF_ConfigSet_t *);
    312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 static int rf_have_enough_components(RF_ConfigSet_t *);
    314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct pool rf_alloclist_pool;   /* AllocList */
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
/*
 * Release a raid_softc created by raidcreate(): destroy its
 * synchronization primitives and free the structure.  The caller is
 * responsible for having removed it from the global list first
 * (see raidput()).
 */
static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
/*
 * Unlink "sc" from the global raids list and free it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    383 
/*
 * Legacy pseudo-device attach hook; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    393 
/*
 * One-shot RAID autoconfiguration: locate all RAID components in the
 * system, group them into configuration sets, and configure the valid
 * sets (done by rf_buildroothack(), which may also select the root
 * device).  Returns 1 if a scan was performed, 0 if autoconfig is
 * disabled or has already been run.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    431 
/* Return nonzero iff the given raid unit has been configured. */
int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}
    436 
/* Accessor: the RF_Raid_t embedded in a raid_softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    441 
/* Accessor: the unit number of a raid_softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    446 
    447 static int
    448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    449 	const char *bootname;
    450 	size_t len;
    451 
    452 	/* if bdv is NULL, the set can't contain it. exit early. */
    453 	if (bdv == NULL)
    454 		return 0;
    455 
    456 	bootname = device_xname(bdv);
    457 	len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
/*
 * Re-scan the system for RAID components and autoconfigure any newly
 * complete sets (e.g. after components appear at runtime).  The scan
 * is repeated whenever at least one set was configured, so that RAID
 * sets stacked on top of freshly configured RAID sets are also found.
 * Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
    526 
    527 
/*
 * Configure every autoconfigurable RAID set found at boot (repeating
 * the scan to catch RAID-on-RAID), then decide whether one of the
 * configured sets should become the root device.  If exactly one
 * rootable set is found, booted_device is pointed at it (or at its
 * first wedge); if several are found, the one containing the original
 * boot device wins, otherwise the user is asked via RB_ASKNAME.
 * An explicit rootspec from the user is always left alone.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			/* count only the sets that contain the boot device */
			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    686 
    687 static int
    688 raidsize(dev_t dev)
    689 {
    690 	struct raid_softc *rs;
    691 	struct dk_softc *dksc;
    692 	unsigned int unit;
    693 
    694 	unit = raidunit(dev);
    695 	if ((rs = raidget(unit, false)) == NULL)
    696 		return -1;
    697 	dksc = &rs->sc_dksc;
    698 
    699 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    700 		return -1;
    701 
    702 	return dk_size(dksc, dev);
    703 }
    704 
/*
 * d_dump entry point: crash-dump "size" bytes at "va" starting at
 * block "blkno" of this RAID partition, delegating to dk_dump()
 * (which in turn calls raid_dumpblocks()).
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
    729 
/*
 * dkdriver d_dumpblocks hook: write "nblk" blocks from "va" to block
 * "blkno" of one live component of the set.  Only RAID 1 sets (one
 * data column, one parity column) are supported; the component chosen
 * is the first optimal column, a spare of the first column, the
 * second column, or a spare of the second, in that order of
 * preference.  Returns 0 on success or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    835 
/*
 * d_open entry point.  On the first open of a configured unit, mark
 * all components dirty (so an unclean shutdown is detectable), then
 * hand off to dk_open().  Fails with EBUSY while the unit is shutting
 * down.
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}
    885 
    886 static int
    887 raid_lastclose(device_t self)
    888 {
    889 	struct raid_softc *rs = raidsoftc(self);
    890 
    891 	/* Last one... device is not unconfigured yet.
    892 	   Device shutdown has taken care of setting the
    893 	   clean bits if RAIDF_INITED is not set
    894 	   mark things as clean... */
    895 
    896 	rf_update_component_labels(&rs->sc_r,
    897 	    RF_FINAL_COMPONENT_UPDATE);
    898 
    899 	/* pass to unlocked code */
    900 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    901 		rs->sc_flags |= RAIDF_DETACH;
    902 
    903 	return 0;
    904 }
    905 
    906 /* ARGSUSED */
    907 static int
    908 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    909 {
    910 	int     unit = raidunit(dev);
    911 	struct raid_softc *rs;
    912 	struct dk_softc *dksc;
    913 	cfdata_t cf;
    914 	int     error = 0, do_detach = 0, do_put = 0;
    915 
    916 	if ((rs = raidget(unit, false)) == NULL)
    917 		return ENXIO;
    918 	dksc = &rs->sc_dksc;
    919 
    920 	if ((error = raidlock(rs)) != 0)
    921 		return error;
    922 
    923 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    924 		error = dk_close(dksc, dev, flags, fmt, l);
    925 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    926 			do_detach = 1;
    927 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    928 		do_put = 1;
    929 
    930 	raidunlock(rs);
    931 
    932 	if (do_detach) {
    933 		/* free the pseudo device attach bits */
    934 		cf = device_cfdata(dksc->sc_dev);
    935 		error = config_detach(dksc->sc_dev, 0);
    936 		if (error == 0)
    937 			free(cf, M_RAIDFRAME);
    938 	} else if (do_put) {
    939 		raidput(rs);
    940 	}
    941 
    942 	return error;
    943 
    944 }
    945 
/*
 * raid_wakeup: signal iodone_cv under iodone_lock so that whatever is
 * waiting on it (the unit's I/O handling) re-examines its work --
 * called after new buffers are queued or an opening is returned.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    953 
    954 static void
    955 raidstrategy(struct buf *bp)
    956 {
    957 	unsigned int unit;
    958 	struct raid_softc *rs;
    959 	struct dk_softc *dksc;
    960 	RF_Raid_t *raidPtr;
    961 
    962 	unit = raidunit(bp->b_dev);
    963 	if ((rs = raidget(unit, false)) == NULL) {
    964 		bp->b_error = ENXIO;
    965 		goto fail;
    966 	}
    967 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    968 		bp->b_error = ENXIO;
    969 		goto fail;
    970 	}
    971 	dksc = &rs->sc_dksc;
    972 	raidPtr = &rs->sc_r;
    973 
    974 	/* Queue IO only */
    975 	if (dk_strategy_defer(dksc, bp))
    976 		goto done;
    977 
    978 	/* schedule the IO to happen at the next convenient time */
    979 	raid_wakeup(raidPtr);
    980 
    981 done:
    982 	return;
    983 
    984 fail:
    985 	bp->b_resid = bp->b_bcount;
    986 	biodone(bp);
    987 }
    988 
    989 static int
    990 raid_diskstart(device_t dev, struct buf *bp)
    991 {
    992 	struct raid_softc *rs = raidsoftc(dev);
    993 	RF_Raid_t *raidPtr;
    994 
    995 	raidPtr = &rs->sc_r;
    996 	if (!raidPtr->valid) {
    997 		db1_printf(("raid is not valid..\n"));
    998 		return ENODEV;
    999 	}
   1000 
   1001 	/* XXX */
   1002 	bp->b_resid = 0;
   1003 
   1004 	return raiddoaccess(raidPtr, bp);
   1005 }
   1006 
/*
 * raiddone: called when RAIDframe has finished with a buffer.
 *
 * Completes the buffer at the dk(4) layer, returns the I/O "opening"
 * this access consumed, and wakes the unit so further queued I/O can
 * be started.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* Give back one outstanding-I/O slot. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
   1025 
   1026 /* ARGSUSED */
   1027 static int
   1028 raidread(dev_t dev, struct uio *uio, int flags)
   1029 {
   1030 	int     unit = raidunit(dev);
   1031 	struct raid_softc *rs;
   1032 
   1033 	if ((rs = raidget(unit, false)) == NULL)
   1034 		return ENXIO;
   1035 
   1036 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1037 		return ENXIO;
   1038 
   1039 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1040 
   1041 }
   1042 
   1043 /* ARGSUSED */
   1044 static int
   1045 raidwrite(dev_t dev, struct uio *uio, int flags)
   1046 {
   1047 	int     unit = raidunit(dev);
   1048 	struct raid_softc *rs;
   1049 
   1050 	if ((rs = raidget(unit, false)) == NULL)
   1051 		return ENXIO;
   1052 
   1053 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1054 		return ENXIO;
   1055 
   1056 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1057 
   1058 }
   1059 
/*
 * raid_detach_unlocked: tear down a configured RAID set.
 *
 * NOTE(review): the "_unlocked" suffix suggests the caller is expected
 * to hold the unit lock (raidlock) -- confirm against callers.
 *
 * Fails with EBUSY while the device is open or a reconstruction,
 * parity rewrite, or copyback is in flight.  On success the RAIDframe
 * engine is shut down, queued buffers are drained, and the dk(4)/
 * disk(9) attachment is dismantled.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while open or busy with recovery operations. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to tear down if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1097 
   1098 int
   1099 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
   1100 {
   1101 	struct rf_recon_req_internal *rrint;
   1102 
   1103 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1104 		/* Can't do this on a RAID 0!! */
   1105 		return EINVAL;
   1106 	}
   1107 
   1108 	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
   1109 		/* bad column */
   1110 		return EINVAL;
   1111 	}
   1112 
   1113 	rf_lock_mutex2(raidPtr->mutex);
   1114 	if (raidPtr->status == rf_rs_reconstructing) {
   1115 		/* you can't fail a disk while we're reconstructing! */
   1116 		/* XXX wrong for RAID6 */
   1117 		goto out;
   1118 	}
   1119 	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
   1120 	    (raidPtr->numFailures > 0)) {
   1121 		/* some other component has failed.  Let's not make
   1122 		   things worse. XXX wrong for RAID6 */
   1123 		goto out;
   1124 	}
   1125 	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1126 		/* Can't fail a spared disk! */
   1127 		goto out;
   1128 	}
   1129 	rf_unlock_mutex2(raidPtr->mutex);
   1130 
   1131 	/* make a copy of the recon request so that we don't rely on
   1132 	 * the user's buffer */
   1133 	rrint = RF_Malloc(sizeof(*rrint));
   1134 	if (rrint == NULL)
   1135 		return(ENOMEM);
   1136 	rrint->col = rr->col;
   1137 	rrint->flags = rr->flags;
   1138 	rrint->raidPtr = raidPtr;
   1139 
   1140 	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
   1141 	    rrint, "raid_recon");
   1142 out:
   1143 	rf_unlock_mutex2(raidPtr->mutex);
   1144 	return EINVAL;
   1145 }
   1146 
   1147 static int
   1148 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1149 {
   1150 	/* allocate a buffer for the layout-specific data, and copy it in */
   1151 	if (k_cfg->layoutSpecificSize == 0)
   1152 		return 0;
   1153 
   1154 	if (k_cfg->layoutSpecificSize > 10000) {
   1155 	    /* sanity check */
   1156 	    return EINVAL;
   1157 	}
   1158 
   1159 	u_char *specific_buf;
   1160 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1161 	if (specific_buf == NULL)
   1162 		return ENOMEM;
   1163 
   1164 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1165 	    k_cfg->layoutSpecificSize);
   1166 	if (retcode) {
   1167 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1168 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1169 		return retcode;
   1170 	}
   1171 
   1172 	k_cfg->layoutSpecific = specific_buf;
   1173 	return 0;
   1174 }
   1175 
   1176 static int
   1177 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1178 {
   1179 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1180 
   1181 	if (rs->sc_r.valid) {
   1182 		/* There is a valid RAID set running on this unit! */
   1183 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1184 		return EINVAL;
   1185 	}
   1186 
   1187 	/* copy-in the configuration information */
   1188 	/* data points to a pointer to the configuration structure */
   1189 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1190 	if (*k_cfg == NULL) {
   1191 		return ENOMEM;
   1192 	}
   1193 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1194 	if (retcode == 0)
   1195 		return 0;
   1196 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1197 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1198 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1199 	return retcode;
   1200 }
   1201 
   1202 int
   1203 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1204 {
   1205 	int retcode, i;
   1206 	RF_Raid_t *raidPtr = &rs->sc_r;
   1207 
   1208 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1209 
   1210 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1211 		goto out;
   1212 
   1213 	/* should do some kind of sanity check on the configuration.
   1214 	 * Store the sum of all the bytes in the last byte? */
   1215 
   1216 	/* Force nul-termination on all strings. */
   1217 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
   1218 	for (i = 0; i < RF_MAXCOL; i++) {
   1219 		ZERO_FINAL(k_cfg->devnames[0][i]);
   1220 	}
   1221 	for (i = 0; i < RF_MAXSPARE; i++) {
   1222 		ZERO_FINAL(k_cfg->spare_names[i]);
   1223 	}
   1224 	for (i = 0; i < RF_MAXDBGV; i++) {
   1225 		ZERO_FINAL(k_cfg->debugVars[i]);
   1226 	}
   1227 #undef ZERO_FINAL
   1228 
   1229 	/* Check some basic limits. */
   1230 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
   1231 		retcode = EINVAL;
   1232 		goto out;
   1233 	}
   1234 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
   1235 		retcode = EINVAL;
   1236 		goto out;
   1237 	}
   1238 
   1239 	/* configure the system */
   1240 
   1241 	/*
   1242 	 * Clear the entire RAID descriptor, just to make sure
   1243 	 *  there is no stale data left in the case of a
   1244 	 *  reconfiguration
   1245 	 */
   1246 	memset(raidPtr, 0, sizeof(*raidPtr));
   1247 	raidPtr->softc = rs;
   1248 	raidPtr->raidid = rs->sc_unit;
   1249 
   1250 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1251 
   1252 	if (retcode == 0) {
   1253 		/* allow this many simultaneous IO's to
   1254 		   this RAID device */
   1255 		raidPtr->openings = RAIDOUTSTANDING;
   1256 
   1257 		raidinit(rs);
   1258 		raid_wakeup(raidPtr);
   1259 		rf_markalldirty(raidPtr);
   1260 	}
   1261 
   1262 	/* free the buffers.  No return code here. */
   1263 	if (k_cfg->layoutSpecificSize) {
   1264 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1265 	}
   1266 out:
   1267 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1268 	if (retcode) {
   1269 		/*
   1270 		 * If configuration failed, set sc_flags so that we
   1271 		 * will detach the device when we close it.
   1272 		 */
   1273 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1274 	}
   1275 	return retcode;
   1276 }
   1277 
   1278 #if RF_DISABLED
/*
 * rf_set_component_label: overwrite one column's component label with
 * a user-supplied label.  Currently compiled out (RF_DISABLED); only
 * the column index is validated before the label is copied in and
 * flushed to disk.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	/* Reject out-of-range columns. */
	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1315 #endif
   1316 
   1317 static int
   1318 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1319 {
   1320 	/*
   1321 	   we only want the serial number from
   1322 	   the above.  We get all the rest of the information
   1323 	   from the config that was used to create this RAID
   1324 	   set.
   1325 	   */
   1326 
   1327 	raidPtr->serial_number = clabel->serial_number;
   1328 
   1329 	for (int column = 0; column < raidPtr->numCol; column++) {
   1330 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1331 		if (RF_DEAD_DISK(diskPtr->status))
   1332 			continue;
   1333 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1334 		    raidPtr, column);
   1335 		/* Zeroing this is important. */
   1336 		memset(ci_label, 0, sizeof(*ci_label));
   1337 		raid_init_component_label(raidPtr, ci_label);
   1338 		ci_label->serial_number = raidPtr->serial_number;
   1339 		ci_label->row = 0; /* we dont' pretend to support more */
   1340 		rf_component_label_set_partitionsize(ci_label,
   1341 		    diskPtr->partitionSize);
   1342 		ci_label->column = column;
   1343 		raidflush_component_label(raidPtr, column);
   1344 		/* XXXjld what about the spares? */
   1345 	}
   1346 
   1347 	return 0;
   1348 }
   1349 
   1350 static int
   1351 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
   1352 {
   1353 
   1354 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1355 		/* Can't do this on a RAID 0!! */
   1356 		return EINVAL;
   1357 	}
   1358 
   1359 	if (raidPtr->recon_in_progress == 1) {
   1360 		/* a reconstruct is already in progress! */
   1361 		return EINVAL;
   1362 	}
   1363 
   1364 	RF_SingleComponent_t component;
   1365 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1366 	component.row = 0; /* we don't support any more */
   1367 	int column = component.column;
   1368 
   1369 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1370 		return EINVAL;
   1371 	}
   1372 
   1373 	rf_lock_mutex2(raidPtr->mutex);
   1374 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1375 	    (raidPtr->numFailures > 0)) {
   1376 		/* XXX 0 above shouldn't be constant!!! */
   1377 		/* some component other than this has failed.
   1378 		   Let's not make things worse than they already
   1379 		   are... */
   1380 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1381 		       raidPtr->raidid);
   1382 		printf("raid%d:     Col: %d   Too many failures.\n",
   1383 		       raidPtr->raidid, column);
   1384 		rf_unlock_mutex2(raidPtr->mutex);
   1385 		return EINVAL;
   1386 	}
   1387 
   1388 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
   1389 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1390 		       raidPtr->raidid);
   1391 		printf("raid%d:    Col: %d   "
   1392 		    "Reconstruction already occurring!\n",
   1393 		    raidPtr->raidid, column);
   1394 
   1395 		rf_unlock_mutex2(raidPtr->mutex);
   1396 		return EINVAL;
   1397 	}
   1398 
   1399 	if (raidPtr->Disks[column].status == rf_ds_spared) {
   1400 		rf_unlock_mutex2(raidPtr->mutex);
   1401 		return EINVAL;
   1402 	}
   1403 
   1404 	rf_unlock_mutex2(raidPtr->mutex);
   1405 
   1406 	struct rf_recon_req_internal *rrint;
   1407 	rrint = RF_Malloc(sizeof(*rrint));
   1408 	if (rrint == NULL)
   1409 		return ENOMEM;
   1410 
   1411 	rrint->col = column;
   1412 	rrint->raidPtr = raidPtr;
   1413 
   1414 	return RF_CREATE_THREAD(raidPtr->recon_thread,
   1415 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
   1416 }
   1417 
   1418 static int
   1419 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1420 {
   1421 	/*
   1422 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1423 	 * so tell the user it's done.
   1424 	 */
   1425 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1426 	    raidPtr->status != rf_rs_reconstructing) {
   1427 		*data = 100;
   1428 		return 0;
   1429 	}
   1430 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1431 		*data = 0;
   1432 		return 0;
   1433 	}
   1434 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1435 	    / raidPtr->reconControl->numRUsTotal);
   1436 	return 0;
   1437 }
   1438 
   1439 /*
   1440  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1441  * on the component_name[] array.
   1442  */
   1443 static void
   1444 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1445 {
   1446 
   1447 	memcpy(component, data, sizeof *component);
   1448 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1449 }
   1450 
   1451 static int
   1452 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1453 {
   1454 	int     unit = raidunit(dev);
   1455 	int     part, pmask;
   1456 	struct raid_softc *rs;
   1457 	struct dk_softc *dksc;
   1458 	RF_Config_t *k_cfg;
   1459 	RF_Raid_t *raidPtr;
   1460 	RF_AccTotals_t *totals;
   1461 	RF_SingleComponent_t component;
   1462 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1463 	int retcode = 0;
   1464 	int column;
   1465 	RF_ComponentLabel_t *clabel;
   1466 	int d;
   1467 
   1468 	if ((rs = raidget(unit, false)) == NULL)
   1469 		return ENXIO;
   1470 
   1471 	dksc = &rs->sc_dksc;
   1472 	raidPtr = &rs->sc_r;
   1473 
   1474 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1475 	    (int) DISKPART(dev), (int) unit, cmd));
   1476 
   1477 	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
   1478 	switch (cmd) {
   1479 	case RAIDFRAME_CONFIGURE:
   1480 	case RAIDFRAME_RESCAN:
   1481 		break;
   1482 	default:
   1483 		if (!rf_inited(rs))
   1484 			return ENXIO;
   1485 	}
   1486 
   1487 	switch (cmd) {
   1488 		/* configure the system */
   1489 	case RAIDFRAME_CONFIGURE:
   1490 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1491 			return retcode;
   1492 		return rf_construct(rs, k_cfg);
   1493 
   1494 		/* shutdown the system */
   1495 	case RAIDFRAME_SHUTDOWN:
   1496 
   1497 		part = DISKPART(dev);
   1498 		pmask = (1 << part);
   1499 
   1500 		if ((retcode = raidlock(rs)) != 0)
   1501 			return retcode;
   1502 
   1503 		if (DK_BUSY(dksc, pmask) ||
   1504 		    raidPtr->recon_in_progress != 0 ||
   1505 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1506 		    raidPtr->copyback_in_progress != 0)
   1507 			retcode = EBUSY;
   1508 		else {
   1509 			/* detach and free on close */
   1510 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1511 			retcode = 0;
   1512 		}
   1513 
   1514 		raidunlock(rs);
   1515 
   1516 		return retcode;
   1517 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1518 		return rf_get_component_label(raidPtr, data);
   1519 
   1520 #if RF_DISABLED
   1521 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1522 		return rf_set_component_label(raidPtr, data);
   1523 #endif
   1524 
   1525 	case RAIDFRAME_INIT_LABELS:
   1526 		return rf_init_component_label(raidPtr, data);
   1527 
   1528 	case RAIDFRAME_SET_AUTOCONFIG:
   1529 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1530 		printf("raid%d: New autoconfig value is: %d\n",
   1531 		       raidPtr->raidid, d);
   1532 		*(int *) data = d;
   1533 		return retcode;
   1534 
   1535 	case RAIDFRAME_SET_ROOT:
   1536 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1537 		printf("raid%d: New rootpartition value is: %d\n",
   1538 		       raidPtr->raidid, d);
   1539 		*(int *) data = d;
   1540 		return retcode;
   1541 
   1542 		/* initialize all parity */
   1543 	case RAIDFRAME_REWRITEPARITY:
   1544 
   1545 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1546 			/* Parity for RAID 0 is trivially correct */
   1547 			raidPtr->parity_good = RF_RAID_CLEAN;
   1548 			return 0;
   1549 		}
   1550 
   1551 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1552 			/* Re-write is already in progress! */
   1553 			return EINVAL;
   1554 		}
   1555 
   1556 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1557 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1558 
   1559 	case RAIDFRAME_ADD_HOT_SPARE:
   1560 		rf_copy_single_component(&component, data);
   1561 		return rf_add_hot_spare(raidPtr, &component);
   1562 
   1563 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1564 		return retcode;
   1565 
   1566 	case RAIDFRAME_DELETE_COMPONENT:
   1567 		rf_copy_single_component(&component, data);
   1568 		return rf_delete_component(raidPtr, &component);
   1569 
   1570 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1571 		rf_copy_single_component(&component, data);
   1572 		return rf_incorporate_hot_spare(raidPtr, &component);
   1573 
   1574 	case RAIDFRAME_REBUILD_IN_PLACE:
   1575 		return rf_rebuild_in_place(raidPtr, data);
   1576 
   1577 	case RAIDFRAME_GET_INFO:
   1578 		ucfgp = *(RF_DeviceConfig_t **)data;
   1579 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1580 		if (d_cfg == NULL)
   1581 			return ENOMEM;
   1582 		retcode = rf_get_info(raidPtr, d_cfg);
   1583 		if (retcode == 0) {
   1584 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1585 		}
   1586 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1587 		return retcode;
   1588 
   1589 	case RAIDFRAME_CHECK_PARITY:
   1590 		*(int *) data = raidPtr->parity_good;
   1591 		return 0;
   1592 
   1593 	case RAIDFRAME_PARITYMAP_STATUS:
   1594 		if (rf_paritymap_ineligible(raidPtr))
   1595 			return EINVAL;
   1596 		rf_paritymap_status(raidPtr->parity_map, data);
   1597 		return 0;
   1598 
   1599 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1600 		if (rf_paritymap_ineligible(raidPtr))
   1601 			return EINVAL;
   1602 		if (raidPtr->parity_map == NULL)
   1603 			return ENOENT; /* ??? */
   1604 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1605 			return EINVAL;
   1606 		return 0;
   1607 
   1608 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1609 		if (rf_paritymap_ineligible(raidPtr))
   1610 			return EINVAL;
   1611 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1612 		return 0;
   1613 
   1614 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1615 		if (rf_paritymap_ineligible(raidPtr))
   1616 			return EINVAL;
   1617 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1618 		/* XXX should errors be passed up? */
   1619 		return 0;
   1620 
   1621 	case RAIDFRAME_RESCAN:
   1622 		return rf_rescan();
   1623 
   1624 	case RAIDFRAME_RESET_ACCTOTALS:
   1625 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1626 		return 0;
   1627 
   1628 	case RAIDFRAME_GET_ACCTOTALS:
   1629 		totals = (RF_AccTotals_t *) data;
   1630 		*totals = raidPtr->acc_totals;
   1631 		return 0;
   1632 
   1633 	case RAIDFRAME_KEEP_ACCTOTALS:
   1634 		raidPtr->keep_acc_totals = *(int *)data;
   1635 		return 0;
   1636 
   1637 	case RAIDFRAME_GET_SIZE:
   1638 		*(int *) data = raidPtr->totalSectors;
   1639 		return 0;
   1640 
   1641 	case RAIDFRAME_FAIL_DISK:
   1642 		return rf_fail_disk(raidPtr, data);
   1643 
   1644 		/* invoke a copyback operation after recon on whatever disk
   1645 		 * needs it, if any */
   1646 	case RAIDFRAME_COPYBACK:
   1647 
   1648 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1649 			/* This makes no sense on a RAID 0!! */
   1650 			return EINVAL;
   1651 		}
   1652 
   1653 		if (raidPtr->copyback_in_progress == 1) {
   1654 			/* Copyback is already in progress! */
   1655 			return EINVAL;
   1656 		}
   1657 
   1658 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1659 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1660 
   1661 		/* return the percentage completion of reconstruction */
   1662 	case RAIDFRAME_CHECK_RECON_STATUS:
   1663 		return rf_check_recon_status(raidPtr, data);
   1664 
   1665 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1666 		rf_check_recon_status_ext(raidPtr, data);
   1667 		return 0;
   1668 
   1669 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1670 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1671 			/* This makes no sense on a RAID 0, so tell the
   1672 			   user it's done. */
   1673 			*(int *) data = 100;
   1674 			return 0;
   1675 		}
   1676 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1677 			*(int *) data = 100 *
   1678 				raidPtr->parity_rewrite_stripes_done /
   1679 				raidPtr->Layout.numStripe;
   1680 		} else {
   1681 			*(int *) data = 100;
   1682 		}
   1683 		return 0;
   1684 
   1685 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1686 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1687 		return 0;
   1688 
   1689 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1690 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1691 			/* This makes no sense on a RAID 0 */
   1692 			*(int *) data = 100;
   1693 			return 0;
   1694 		}
   1695 		if (raidPtr->copyback_in_progress == 1) {
   1696 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1697 				raidPtr->Layout.numStripe;
   1698 		} else {
   1699 			*(int *) data = 100;
   1700 		}
   1701 		return 0;
   1702 
   1703 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1704 		rf_check_copyback_status_ext(raidPtr, data);
   1705 		return 0;
   1706 
   1707 	case RAIDFRAME_SET_LAST_UNIT:
   1708 		for (column = 0; column < raidPtr->numCol; column++)
   1709 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1710 				return EBUSY;
   1711 
   1712 		for (column = 0; column < raidPtr->numCol; column++) {
   1713 			clabel = raidget_component_label(raidPtr, column);
   1714 			clabel->last_unit = *(int *)data;
   1715 			raidflush_component_label(raidPtr, column);
   1716 		}
   1717 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1718 		return 0;
   1719 
   1720 		/* the sparetable daemon calls this to wait for the kernel to
   1721 		 * need a spare table. this ioctl does not return until a
   1722 		 * spare table is needed. XXX -- calling mpsleep here in the
   1723 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1724 		 * -- I should either compute the spare table in the kernel,
   1725 		 * or have a different -- XXX XXX -- interface (a different
   1726 		 * character device) for delivering the table     -- XXX */
   1727 #if RF_DISABLED
   1728 	case RAIDFRAME_SPARET_WAIT:
   1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1730 		while (!rf_sparet_wait_queue)
   1731 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1732 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1733 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1734 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1735 
   1736 		/* structure assignment */
   1737 		*((RF_SparetWait_t *) data) = *waitreq;
   1738 
   1739 		RF_Free(waitreq, sizeof(*waitreq));
   1740 		return 0;
   1741 
   1742 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1743 		 * code in it that will cause the dameon to exit */
   1744 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1745 		waitreq = RF_Malloc(sizeof(*waitreq));
   1746 		waitreq->fcol = -1;
   1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1748 		waitreq->next = rf_sparet_wait_queue;
   1749 		rf_sparet_wait_queue = waitreq;
   1750 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1752 		return 0;
   1753 
   1754 		/* used by the spare table daemon to deliver a spare table
   1755 		 * into the kernel */
   1756 	case RAIDFRAME_SEND_SPARET:
   1757 
   1758 		/* install the spare table */
   1759 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1760 
   1761 		/* respond to the requestor.  the return status of the spare
   1762 		 * table installation is passed in the "fcol" field */
   1763 		waitred = RF_Malloc(sizeof(*waitreq));
   1764 		waitreq->fcol = retcode;
   1765 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1766 		waitreq->next = rf_sparet_resp_queue;
   1767 		rf_sparet_resp_queue = waitreq;
   1768 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1769 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1770 
   1771 		return retcode;
   1772 #endif
   1773 	default:
   1774 		/*
   1775 		 * Don't bother trying to load compat modules
   1776 		 * if it is not our ioctl. This is more efficient
   1777 		 * and makes rump tests not depend on compat code
   1778 		 */
   1779 		if (IOCGROUP(cmd) != 'r')
   1780 			break;
   1781 #ifdef _LP64
   1782 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1783 			module_autoload("compat_netbsd32_raid",
   1784 			    MODULE_CLASS_EXEC);
   1785 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1786 			    (rs, cmd, data), enosys(), retcode);
   1787 			if (retcode != EPASSTHROUGH)
   1788 				return retcode;
   1789 		}
   1790 #endif
   1791 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1792 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1793 		    (rs, cmd, data), enosys(), retcode);
   1794 		if (retcode != EPASSTHROUGH)
   1795 			return retcode;
   1796 
   1797 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1798 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1799 		    (rs, cmd, data), enosys(), retcode);
   1800 		if (retcode != EPASSTHROUGH)
   1801 			return retcode;
   1802 		break; /* fall through to the os-specific code below */
   1803 
   1804 	}
   1805 
   1806 	if (!raidPtr->valid)
   1807 		return EINVAL;
   1808 
   1809 	/*
   1810 	 * Add support for "regular" device ioctls here.
   1811 	 */
   1812 
   1813 	switch (cmd) {
   1814 	case DIOCGCACHE:
   1815 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1816 		break;
   1817 
   1818 	case DIOCCACHESYNC:
   1819 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1820 		break;
   1821 
   1822 	default:
   1823 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1824 		break;
   1825 	}
   1826 
   1827 	return retcode;
   1828 
   1829 }
   1830 
   1831 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, hook up the dk(4)/
   disk(9) layers, and mark the unit usable.  Called after a
   successful rf_Configure().  On config_attach_pseudo() failure the
   unit is left without RAIDF_INITED set. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* First-come-first-served queue for the unit's buffers. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Look for wedges (dk(4) partitions) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1891 
   1892 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1893 /* wake up the daemon & tell it to get us a spare table
   1894  * XXX
   1895  * the entries in the queues should be tagged with the raidPtr
   1896  * so that in the extremely rare case that two recons happen at once,
   1897  * we know for which device were requesting a spare table
   1898  * XXX
   1899  *
   1900  * XXX This code is not currently used. GO
   1901  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue our request and wake the daemon servicing the queue. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Sleep (mutex released while waiting) until a response appears. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/*
	 * NOTE(review): fcol is reused here as the daemon's return code --
	 * confirm against the daemon side if this code path is revived.
	 */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1925 #endif
   1926 
   1927 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1928  * bp & passes it down.
   1929  * any calls originating in the kernel must use non-blocking I/O
   1930  * do some extra sanity checking to return "appropriate" error values for
   1931  * certain conditions (to make some standard utilities work)
   1932  *
   1933  * Formerly known as: rf_DoAccessKernel
   1934  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/*
		 * Label update can block, so drop the raid mutex around
		 * it and retake it before decrementing the counter.
		 */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Don't feed I/O to a unit that never finished initializing. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk layer to issue queued buffers via raiddoaccess(). */
	dk_start(dksc, NULL);
}
   1961 
   1962 static int
   1963 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1964 {
   1965 	RF_SectorCount_t num_blocks, pb, sum;
   1966 	RF_RaidAddr_t raid_addr;
   1967 	daddr_t blocknum;
   1968 	int rc;
   1969 
   1970 	rf_lock_mutex2(raidPtr->mutex);
   1971 	if (raidPtr->openings == 0) {
   1972 		rf_unlock_mutex2(raidPtr->mutex);
   1973 		return EAGAIN;
   1974 	}
   1975 	rf_unlock_mutex2(raidPtr->mutex);
   1976 
   1977 	blocknum = bp->b_rawblkno;
   1978 
   1979 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1980 		    (int) blocknum));
   1981 
   1982 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1983 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1984 
   1985 	/* *THIS* is where we adjust what block we're going to...
   1986 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1987 	raid_addr = blocknum;
   1988 
   1989 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1990 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1991 	sum = raid_addr + num_blocks + pb;
   1992 	if (1 || rf_debugKernelAccess) {
   1993 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1994 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1995 			    (int) pb, (int) bp->b_resid));
   1996 	}
   1997 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1998 	    || (sum < num_blocks) || (sum < pb)) {
   1999 		rc = ENOSPC;
   2000 		goto done;
   2001 	}
   2002 	/*
   2003 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2004 	 */
   2005 
   2006 	if (bp->b_bcount & raidPtr->sectorMask) {
   2007 		rc = ENOSPC;
   2008 		goto done;
   2009 	}
   2010 	db1_printf(("Calling DoAccess..\n"));
   2011 
   2012 
   2013 	rf_lock_mutex2(raidPtr->mutex);
   2014 	raidPtr->openings--;
   2015 	rf_unlock_mutex2(raidPtr->mutex);
   2016 
   2017 	/* don't ever condition on bp->b_flags & B_WRITE.
   2018 	 * always condition on B_READ instead */
   2019 
   2020 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2021 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2022 			 raid_addr, num_blocks,
   2023 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2024 
   2025 done:
   2026 	return rc;
   2027 }
   2028 
   2029 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2030 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* Fake an immediate completion instead of issuing I/O. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start timing the physical I/O for access tracing. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp to target the component's vnode/device;
		 * KernelWakeupFunc fires from biodone on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2104 /* this is the callback function associated with a I/O invoked from
   2105    kernel code.
   2106  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by InitBP(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Stop the per-request timer and account the physical I/O. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update from
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2173 
   2174 
   2175 /*
   2176  * initialize a buf structure for doing an I/O in the kernel.
   2177  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* Preserve only the flags listed in rf_b_pass across reuse. */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* Convert sectors -> bytes -> DEV_BSIZE units for b_blkno. */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	/* Completion callback (KernelWakeupFunc) and its argument. */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2199 
   2200 /*
   2201  * Wait interruptibly for an exclusive lock.
   2202  *
   2203  * XXX
   2204  * Several drivers do this; it should be abstracted and made MP-safe.
   2205  * (Hmm... where have we seen this warning before :->  GO )
   2206  */
   2207 static int
   2208 raidlock(struct raid_softc *rs)
   2209 {
   2210 	int     error;
   2211 
   2212 	error = 0;
   2213 	mutex_enter(&rs->sc_mutex);
   2214 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2215 		rs->sc_flags |= RAIDF_WANTED;
   2216 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2217 		if (error != 0)
   2218 			goto done;
   2219 	}
   2220 	rs->sc_flags |= RAIDF_LOCKED;
   2221 done:
   2222 	mutex_exit(&rs->sc_mutex);
   2223 	return error;
   2224 }
   2225 /*
   2226  * Unlock and wake up any waiters.
   2227  */
   2228 static void
   2229 raidunlock(struct raid_softc *rs)
   2230 {
   2231 
   2232 	mutex_enter(&rs->sc_mutex);
   2233 	rs->sc_flags &= ~RAIDF_LOCKED;
   2234 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2235 		rs->sc_flags &= ~RAIDF_WANTED;
   2236 		cv_broadcast(&rs->sc_cv);
   2237 	}
   2238 	mutex_exit(&rs->sc_mutex);
   2239 }
   2240 
   2241 
   2242 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2243 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2244 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2245 
   2246 static daddr_t
   2247 rf_component_info_offset(void)
   2248 {
   2249 
   2250 	return RF_COMPONENT_INFO_OFFSET;
   2251 }
   2252 
   2253 static daddr_t
   2254 rf_component_info_size(unsigned secsize)
   2255 {
   2256 	daddr_t info_size;
   2257 
   2258 	KASSERT(secsize);
   2259 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2260 		info_size = secsize;
   2261 	else
   2262 		info_size = RF_COMPONENT_INFO_SIZE;
   2263 
   2264 	return info_size;
   2265 }
   2266 
   2267 static daddr_t
   2268 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2269 {
   2270 	daddr_t map_offset;
   2271 
   2272 	KASSERT(raidPtr->bytesPerSector);
   2273 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2274 		map_offset = raidPtr->bytesPerSector;
   2275 	else
   2276 		map_offset = RF_COMPONENT_INFO_SIZE;
   2277 	map_offset += rf_component_info_offset();
   2278 
   2279 	return map_offset;
   2280 }
   2281 
   2282 static daddr_t
   2283 rf_parity_map_size(RF_Raid_t *raidPtr)
   2284 {
   2285 	daddr_t map_size;
   2286 
   2287 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2288 		map_size = raidPtr->bytesPerSector;
   2289 	else
   2290 		map_size = RF_PARITY_MAP_SIZE;
   2291 
   2292 	return map_size;
   2293 }
   2294 
   2295 int
   2296 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2297 {
   2298 	RF_ComponentLabel_t *clabel;
   2299 
   2300 	clabel = raidget_component_label(raidPtr, col);
   2301 	clabel->clean = RF_RAID_CLEAN;
   2302 	raidflush_component_label(raidPtr, col);
   2303 	return(0);
   2304 }
   2305 
   2306 
   2307 int
   2308 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2309 {
   2310 	RF_ComponentLabel_t *clabel;
   2311 
   2312 	clabel = raidget_component_label(raidPtr, col);
   2313 	clabel->clean = RF_RAID_DIRTY;
   2314 	raidflush_component_label(raidPtr, col);
   2315 	return(0);
   2316 }
   2317 
   2318 int
   2319 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2320 {
   2321 	KASSERT(raidPtr->bytesPerSector);
   2322 
   2323 	return raidread_component_label(raidPtr->bytesPerSector,
   2324 	    raidPtr->Disks[col].dev,
   2325 	    raidPtr->raid_cinfo[col].ci_vp,
   2326 	    &raidPtr->raid_cinfo[col].ci_label);
   2327 }
   2328 
/* Return a pointer to column `col's in-core component label. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2334 
   2335 int
   2336 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2337 {
   2338 	RF_ComponentLabel_t *label;
   2339 
   2340 	label = &raidPtr->raid_cinfo[col].ci_label;
   2341 	label->mod_counter = raidPtr->mod_counter;
   2342 #ifndef RF_NO_PARITY_MAP
   2343 	label->parity_map_modcount = label->mod_counter;
   2344 #endif
   2345 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2346 	    raidPtr->Disks[col].dev,
   2347 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2348 }
   2349 
   2350 /*
   2351  * Swap the label endianness.
   2352  *
   2353  * Everything in the component label is 4-byte-swapped except the version,
   2354  * which is kept in the byte-swapped version at all times, and indicates
   2355  * for the writer that a swap is necessary.
   2356  *
   2357  * For reads it is expected that out_label == clabel, but writes expect
   2358  * separate labels so only the re-swapped label is written out to disk,
   2359  * leaving the swapped-except-version internally.
   2360  *
   2361  * Only support swapping label version 2.
   2362  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* Only a byte-swapped version-2 label may be swapped. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as an array of 32-bit words, from the first
	 * field after `version' (serial_number) up to (but excluding)
	 * future_use2[42].  NOTE(review): the end bound is tied to the
	 * version-2 struct layout -- revisit if the label ever grows. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2380 
   2381 static int
   2382 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2383     RF_ComponentLabel_t *clabel)
   2384 {
   2385 	int error;
   2386 
   2387 	error = raidread_component_area(dev, b_vp, clabel,
   2388 	    sizeof(RF_ComponentLabel_t),
   2389 	    rf_component_info_offset(),
   2390 	    rf_component_info_size(secsize));
   2391 
   2392 	if (error == 0 &&
   2393 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2394 		rf_swap_label(clabel, clabel);
   2395 	}
   2396 
   2397 	return error;
   2398 }
   2399 
   2400 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	/* Issue the read synchronously and wait for completion. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Copy only the caller's msize bytes out of the dsize buffer. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2437 
   2438 static int
   2439 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2440     RF_ComponentLabel_t *clabel)
   2441 {
   2442 	RF_ComponentLabel_t *clabel_write = clabel;
   2443 	RF_ComponentLabel_t lclabel;
   2444 	int error;
   2445 
   2446 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2447 		clabel_write = &lclabel;
   2448 		rf_swap_label(clabel, clabel_write);
   2449 	}
   2450 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2451 	    sizeof(RF_ComponentLabel_t),
   2452 	    rf_component_info_offset(),
   2453 	    rf_component_info_size(secsize), 0);
   2454 
   2455 	return error;
   2456 }
   2457 
   2458 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	/* Zero-pad the device-sized buffer around the caller's msize
	 * bytes so no stack/heap garbage reaches the disk. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* Async: report success immediately; the write completes
		 * in the background.  NOTE(review): presumably biodone
		 * releases the B_ASYNC buffer -- confirm bp is not leaked
		 * on this path. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2492 
   2493 void
   2494 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2495 {
   2496 	int c;
   2497 
   2498 	for (c = 0; c < raidPtr->numCol; c++) {
   2499 		/* Skip dead disks. */
   2500 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2501 			continue;
   2502 		/* XXXjld: what if an error occurs here? */
   2503 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2504 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2505 		    RF_PARITYMAP_NBYTE,
   2506 		    rf_parity_map_offset(raidPtr),
   2507 		    rf_parity_map_size(raidPtr), 0);
   2508 	}
   2509 }
   2510 
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	/* Read the parity map from every live component and merge them,
	 * so any region marked dirty on any component stays dirty. */
	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* NOTE(review): the return value is ignored -- a failed
		 * read would merge stale stack contents; confirm whether
		 * that is acceptable here. */
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			/* First live component seeds the result. */
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2535 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the mod counter so these label writes supersede older ones. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Spares live at columns numCol..numCol+numSpare-1. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2595 
   2596 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* Bump the mod counter so these label writes supersede older ones. */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only a final update with clean parity may set
			 * the clean bit. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Spares live at columns numCol..numCol+numSpare-1. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2674 
   2675 void
   2676 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2677 {
   2678 
   2679 	if (vp != NULL) {
   2680 		if (auto_configured == 1) {
   2681 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2682 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2683 			vput(vp);
   2684 
   2685 		} else {
   2686 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2687 		}
   2688 	}
   2689 }
   2690 
   2691 
   2692 void
   2693 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2694 {
   2695 	int r,c;
   2696 	struct vnode *vp;
   2697 	int acd;
   2698 
   2699 
   2700 	/* We take this opportunity to close the vnodes like we should.. */
   2701 
   2702 	for (c = 0; c < raidPtr->numCol; c++) {
   2703 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2704 		acd = raidPtr->Disks[c].auto_configured;
   2705 		rf_close_component(raidPtr, vp, acd);
   2706 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2707 		raidPtr->Disks[c].auto_configured = 0;
   2708 	}
   2709 
   2710 	for (r = 0; r < raidPtr->numSpare; r++) {
   2711 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2712 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2713 		rf_close_component(raidPtr, vp, acd);
   2714 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2715 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2716 	}
   2717 }
   2718 
   2719 
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Force-recon requests set forceRecon for the duration of the
	 * rf_FailDisk call only. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	/* Fail the disk; second arg selects whether to reconstruct
	 * onto a spare as well. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* This thread owns req and must free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2749 
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2782 
   2783 
/* Kernel thread: copy reconstructed data back from the spare to the
 * replaced component, then exit. */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2798 
   2799 
/* Kernel thread: reconstruct a failed component in place (back onto
 * the same column), then exit. */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* forceRecon is only set for the duration of the reconstruct. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* This thread owns req and must free it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2827 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			/* On success the new entry takes ownership of
			 * clabel and vp; entry is pushed onto ac_list. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		/* Not usable: free the label and close/release the vnode. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2869 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 * The device tree is walked twice: once looking only at wedges (dk),
 * then at everything else, so that a wedge covering a whole disk is
 * preferred over that disk's raw partition.  Candidate partitions and
 * wedges are handed to rf_get_component(), which accumulates matches
 * on the RF_AutoConfig_t list that is returned.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			VOP_UNLOCK(vp);
			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: accept the wedge only if its
				   partition type is RAIDframe, then hand the
				   whole wedge to rf_get_component() */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3097 
   3098 int
   3099 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3100 {
   3101 
   3102 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3103 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3104 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3105 	    (clabel->clean == RF_RAID_CLEAN ||
   3106 	     clabel->clean == RF_RAID_DIRTY) &&
   3107 	    clabel->row >=0 &&
   3108 	    clabel->column >= 0 &&
   3109 	    clabel->num_rows > 0 &&
   3110 	    clabel->num_columns > 0 &&
   3111 	    clabel->row < clabel->num_rows &&
   3112 	    clabel->column < clabel->num_columns &&
   3113 	    clabel->blockSize > 0 &&
   3114 	    /*
   3115 	     * numBlocksHi may contain garbage, but it is ok since
   3116 	     * the type is unsigned.  If it is really garbage,
   3117 	     * rf_fix_old_label_size() will fix it.
   3118 	     */
   3119 	    rf_component_label_numblocks(clabel) > 0) {
   3120 		/*
   3121 		 * label looks reasonable enough...
   3122 		 * let's make sure it has no old garbage.
   3123 		 */
   3124 		if (numsecs)
   3125 			rf_fix_old_label_size(clabel, numsecs);
   3126 		return(1);
   3127 	}
   3128 	return(0);
   3129 }
   3130 
   3131 
   3132 /*
   3133  * For reasons yet unknown, some old component labels have garbage in
   3134  * the newer numBlocksHi region, and this causes lossage.  Since those
   3135  * disks will also have numsecs set to less than 32 bits of sectors,
   3136  * we can determine when this corruption has occurred, and fix it.
   3137  *
   3138  * The exact same problem, with the same unknown reason, happens to
   3139  * the partitionSizeHi member as well.
   3140  */
   3141 static void
   3142 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3143 {
   3144 
   3145 	if (numsecs < ((uint64_t)1 << 32)) {
   3146 		if (clabel->numBlocksHi) {
   3147 			printf("WARNING: total sectors < 32 bits, yet "
   3148 			       "numBlocksHi set\n"
   3149 			       "WARNING: resetting numBlocksHi to zero.\n");
   3150 			clabel->numBlocksHi = 0;
   3151 		}
   3152 
   3153 		if (clabel->partitionSizeHi) {
   3154 			printf("WARNING: total sectors < 32 bits, yet "
   3155 			       "partitionSizeHi set\n"
   3156 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3157 			clabel->partitionSizeHi = 0;
   3158 		}
   3159 	}
   3160 }
   3161 
   3162 
#ifdef DEBUG
/*
 * Dump a component label to the console (DEBUG kernels only).
 * Prints geometry, version/serial/mod-counter, layout parameters and
 * the autoconfig/root settings recorded in the label.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* index by (root_partition & 3): 0=No, 1=Force, 2=Soft */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3196 
   3197 static RF_ConfigSet_t *
   3198 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3199 {
   3200 	RF_AutoConfig_t *ac;
   3201 	RF_ConfigSet_t *config_sets;
   3202 	RF_ConfigSet_t *cset;
   3203 	RF_AutoConfig_t *ac_next;
   3204 
   3205 
   3206 	config_sets = NULL;
   3207 
   3208 	/* Go through the AutoConfig list, and figure out which components
   3209 	   belong to what sets.  */
   3210 	ac = ac_list;
   3211 	while(ac!=NULL) {
   3212 		/* we're going to putz with ac->next, so save it here
   3213 		   for use at the end of the loop */
   3214 		ac_next = ac->next;
   3215 
   3216 		if (config_sets == NULL) {
   3217 			/* will need at least this one... */
   3218 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3219 				       M_RAIDFRAME, M_WAITOK);
   3220 			/* this one is easy :) */
   3221 			config_sets->ac = ac;
   3222 			config_sets->next = NULL;
   3223 			config_sets->rootable = 0;
   3224 			ac->next = NULL;
   3225 		} else {
   3226 			/* which set does this component fit into? */
   3227 			cset = config_sets;
   3228 			while(cset!=NULL) {
   3229 				if (rf_does_it_fit(cset, ac)) {
   3230 					/* looks like it matches... */
   3231 					ac->next = cset->ac;
   3232 					cset->ac = ac;
   3233 					break;
   3234 				}
   3235 				cset = cset->next;
   3236 			}
   3237 			if (cset==NULL) {
   3238 				/* didn't find a match above... new set..*/
   3239 				cset = malloc(sizeof(RF_ConfigSet_t),
   3240 					       M_RAIDFRAME, M_WAITOK);
   3241 				cset->ac = ac;
   3242 				ac->next = NULL;
   3243 				cset->next = config_sets;
   3244 				cset->rootable = 0;
   3245 				config_sets = cset;
   3246 			}
   3247 		}
   3248 		ac = ac_next;
   3249 	}
   3250 
   3251 
   3252 	return(config_sets);
   3253 }
   3254 
   3255 static int
   3256 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3257 {
   3258 	RF_ComponentLabel_t *clabel1, *clabel2;
   3259 
   3260 	/* If this one matches the *first* one in the set, that's good
   3261 	   enough, since the other members of the set would have been
   3262 	   through here too... */
   3263 	/* note that we are not checking partitionSize here..
   3264 
   3265 	   Note that we are also not checking the mod_counters here.
   3266 	   If everything else matches except the mod_counter, that's
   3267 	   good enough for this test.  We will deal with the mod_counters
   3268 	   a little later in the autoconfiguration process.
   3269 
   3270 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3271 
   3272 	   The reason we don't check for this is that failed disks
   3273 	   will have lower modification counts.  If those disks are
   3274 	   not added to the set they used to belong to, then they will
   3275 	   form their own set, which may result in 2 different sets,
   3276 	   for example, competing to be configured at raid0, and
   3277 	   perhaps competing to be the root filesystem set.  If the
   3278 	   wrong ones get configured, or both attempt to become /,
   3279 	   weird behaviour and or serious lossage will occur.  Thus we
   3280 	   need to bring them into the fold here, and kick them out at
   3281 	   a later point.
   3282 
   3283 	*/
   3284 
   3285 	clabel1 = cset->ac->clabel;
   3286 	clabel2 = ac->clabel;
   3287 	if ((clabel1->version == clabel2->version) &&
   3288 	    (clabel1->serial_number == clabel2->serial_number) &&
   3289 	    (clabel1->num_rows == clabel2->num_rows) &&
   3290 	    (clabel1->num_columns == clabel2->num_columns) &&
   3291 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3292 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3293 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3294 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3295 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3296 	    (clabel1->blockSize == clabel2->blockSize) &&
   3297 	    rf_component_label_numblocks(clabel1) ==
   3298 	    rf_component_label_numblocks(clabel2) &&
   3299 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3300 	    (clabel1->root_partition == clabel2->root_partition) &&
   3301 	    (clabel1->last_unit == clabel2->last_unit) &&
   3302 	    (clabel1->config_order == clabel2->config_order)) {
   3303 		/* if it get's here, it almost *has* to be a match */
   3304 	} else {
   3305 		/* it's not consistent with somebody in the set..
   3306 		   punt */
   3307 		return(0);
   3308 	}
   3309 	/* all was fine.. it must fit... */
   3310 	return(1);
   3311 }
   3312 
/*
 * Decide whether configuration set cset has enough live components to
 * be configured.  A component counts as "present" for a column only if
 * its mod_counter equals the highest mod_counter seen in the set.
 * Returns 1 if the set can be configured, 0 otherwise.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum over all members; lower values mark stale disks) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a current (mod_counter matches)
	   component; tally the columns that have none */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3415 
   3416 static void
   3417 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3418 			RF_Raid_t *raidPtr)
   3419 {
   3420 	RF_ComponentLabel_t *clabel;
   3421 	int i;
   3422 
   3423 	clabel = ac->clabel;
   3424 
   3425 	/* 1. Fill in the common stuff */
   3426 	config->numCol = clabel->num_columns;
   3427 	config->numSpare = 0; /* XXX should this be set here? */
   3428 	config->sectPerSU = clabel->sectPerSU;
   3429 	config->SUsPerPU = clabel->SUsPerPU;
   3430 	config->SUsPerRU = clabel->SUsPerRU;
   3431 	config->parityConfig = clabel->parityConfig;
   3432 	/* XXX... */
   3433 	strcpy(config->diskQueueType,"fifo");
   3434 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3435 	config->layoutSpecificSize = 0; /* XXX ?? */
   3436 
   3437 	while(ac!=NULL) {
   3438 		/* row/col values will be in range due to the checks
   3439 		   in reasonable_label() */
   3440 		strcpy(config->devnames[0][ac->clabel->column],
   3441 		       ac->devname);
   3442 		ac = ac->next;
   3443 	}
   3444 
   3445 	for(i=0;i<RF_MAXDBGV;i++) {
   3446 		config->debugVars[i][0] = 0;
   3447 	}
   3448 }
   3449 
   3450 static int
   3451 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3452 {
   3453 	RF_ComponentLabel_t *clabel;
   3454 	int column;
   3455 	int sparecol;
   3456 
   3457 	raidPtr->autoconfigure = new_value;
   3458 
   3459 	for(column=0; column<raidPtr->numCol; column++) {
   3460 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3461 			clabel = raidget_component_label(raidPtr, column);
   3462 			clabel->autoconfigure = new_value;
   3463 			raidflush_component_label(raidPtr, column);
   3464 		}
   3465 	}
   3466 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3467 		sparecol = raidPtr->numCol + column;
   3468 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3469 			clabel = raidget_component_label(raidPtr, sparecol);
   3470 			clabel->autoconfigure = new_value;
   3471 			raidflush_component_label(raidPtr, sparecol);
   3472 		}
   3473 	}
   3474 	return(new_value);
   3475 }
   3476 
   3477 static int
   3478 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3479 {
   3480 	RF_ComponentLabel_t *clabel;
   3481 	int column;
   3482 	int sparecol;
   3483 
   3484 	raidPtr->root_partition = new_value;
   3485 	for(column=0; column<raidPtr->numCol; column++) {
   3486 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3487 			clabel = raidget_component_label(raidPtr, column);
   3488 			clabel->root_partition = new_value;
   3489 			raidflush_component_label(raidPtr, column);
   3490 		}
   3491 	}
   3492 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3493 		sparecol = raidPtr->numCol + column;
   3494 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3495 			clabel = raidget_component_label(raidPtr, sparecol);
   3496 			clabel->root_partition = new_value;
   3497 			raidflush_component_label(raidPtr, sparecol);
   3498 		}
   3499 	}
   3500 	return(new_value);
   3501 }
   3502 
   3503 static void
   3504 rf_release_all_vps(RF_ConfigSet_t *cset)
   3505 {
   3506 	RF_AutoConfig_t *ac;
   3507 
   3508 	ac = cset->ac;
   3509 	while(ac!=NULL) {
   3510 		/* Close the vp, and give it back */
   3511 		if (ac->vp) {
   3512 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3513 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3514 			vput(ac->vp);
   3515 			ac->vp = NULL;
   3516 		}
   3517 		ac = ac->next;
   3518 	}
   3519 }
   3520 
   3521 
   3522 static void
   3523 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3524 {
   3525 	RF_AutoConfig_t *ac;
   3526 	RF_AutoConfig_t *next_ac;
   3527 
   3528 	ac = cset->ac;
   3529 	while(ac!=NULL) {
   3530 		next_ac = ac->next;
   3531 		/* nuke the label */
   3532 		free(ac->clabel, M_RAIDFRAME);
   3533 		/* cleanup the config structure */
   3534 		free(ac, M_RAIDFRAME);
   3535 		/* "next.." */
   3536 		ac = next_ac;
   3537 	}
   3538 	/* and, finally, nuke the config set */
   3539 	free(cset, M_RAIDFRAME);
   3540 }
   3541 
   3542 
/*
 * Initialize a component label from the current state of the RAID set.
 * Per-component fields (row, column, partition size) are not touched
 * here.  The label is marked dirty and optimal.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3576 
/*
 * Autoconfigure one configuration set: find a free raid unit (starting
 * at the unit recorded in the label), build an RF_Config_t from the
 * labels, and configure the set.  Returns the softc on success, or
 * NULL if rf_Configure() failed (in which case the softc is released).
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from the label's last unit until we find a unit
	   that is not already configured (sc_r.valid == 0) or absent */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);	/* create the softc */
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3648 
/*
 * Initialize a per-set resource pool.  w_chan (at least
 * RF_MAX_POOLNAMELEN bytes, owned by the caller) receives the pool's
 * wait-channel name "raid<unit>_<pool_name>"; the pool is primed with
 * xmin items and its high-water mark set to xmax.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3661 
   3662 
   3663 /*
   3664  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3665  * to see if there is IO pending and if that IO could possibly be done
   3666  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3667  * otherwise.
   3668  *
   3669  */
   3670 int
   3671 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3672 {
   3673 	struct raid_softc *rs;
   3674 	struct dk_softc *dksc;
   3675 
   3676 	rs = raidPtr->softc;
   3677 	dksc = &rs->sc_dksc;
   3678 
   3679 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3680 		return 1;
   3681 
   3682 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3683 		/* there is work to do */
   3684 		return 0;
   3685 	}
   3686 	/* default is nothing to do */
   3687 	return 1;
   3688 }
   3689 
   3690 int
   3691 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3692 {
   3693 	uint64_t numsecs;
   3694 	unsigned secsize;
   3695 	int error;
   3696 
   3697 	error = getdisksize(vp, &numsecs, &secsize);
   3698 	if (error == 0) {
   3699 		diskPtr->blockSize = secsize;
   3700 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3701 		diskPtr->partitionSize = numsecs;
   3702 		return 0;
   3703 	}
   3704 	return error;
   3705 }
   3706 
/* Autoconf match hook: the raid pseudo-device always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3712 
/* Autoconf attach hook: no per-device setup is done here. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3717 
   3718 
/*
 * Autoconf detach hook: tear down the RAID set under the softc lock,
 * then release the softc on success.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	/* serialize with other users of the softc */
	if ((error = raidlock(rs)) != 0)
		return error;

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3745 
/*
 * Publish a synthetic disk geometry for the RAID set to the disk
 * layer: sectors-per-track is the data stripe width and the track
 * count is 4 * numCol.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3761 
   3762 /*
   3763  * Get cache info for all the components (including spares).
   3764  * Returns intersection of all the cache flags of all disks, or first
   3765  * error if any encountered.
   3766  * XXXfua feature flags can change as spares are added - lock down somehow
   3767  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* iterate over data columns and spares alike */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache support) is not
				   worth a console message */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* accumulate the intersection of all flags */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3805 
   3806 /*
   3807  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3808  * We end up returning whatever error was returned by the first cache flush
   3809  * that fails.
   3810  */
   3811 
   3812 static int
   3813 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3814 {
   3815 	int e = 0;
   3816 	for (int i = 0; i < 5; i++) {
   3817 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3818 		    &force, FWRITE, NOCRED);
   3819 		if (!e || e == ENODEV)
   3820 			return e;
   3821 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3822 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3823 	}
   3824 	return e;
   3825 }
   3826 
   3827 int
   3828 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3829 {
   3830 	int c, error;
   3831 
   3832 	error = 0;
   3833 	for (c = 0; c < raidPtr->numCol; c++) {
   3834 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3835 			int e = rf_sync_component_cache(raidPtr, c, force);
   3836 			if (e && !error)
   3837 				error = e;
   3838 		}
   3839 	}
   3840 
   3841 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3842 		int sparecol = raidPtr->numCol + c;
   3843 		/* Need to ensure that the reconstruct actually completed! */
   3844 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3845 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3846 			    force);
   3847 			if (e && !error)
   3848 				error = e;
   3849 		}
   3850 	}
   3851 	return error;
   3852 }
   3853 
   3854 /* Fill in info with the current status */
   3855 void
   3856 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3857 {
   3858 
   3859 	memset(info, 0, sizeof(*info));
   3860 
   3861 	if (raidPtr->status != rf_rs_reconstructing) {
   3862 		info->total = 100;
   3863 		info->completed = 100;
   3864 	} else {
   3865 		info->total = raidPtr->reconControl->numRUsTotal;
   3866 		info->completed = raidPtr->reconControl->numRUsComplete;
   3867 	}
   3868 	info->remaining = info->total - info->completed;
   3869 }
   3870 
   3871 /* Fill in info with the current status */
   3872 void
   3873 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3874 {
   3875 
   3876 	memset(info, 0, sizeof(*info));
   3877 
   3878 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3879 		info->total = raidPtr->Layout.numStripe;
   3880 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3881 	} else {
   3882 		info->completed = 100;
   3883 		info->total = 100;
   3884 	}
   3885 	info->remaining = info->total - info->completed;
   3886 }
   3887 
   3888 /* Fill in info with the current status */
   3889 void
   3890 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3891 {
   3892 
   3893 	memset(info, 0, sizeof(*info));
   3894 
   3895 	if (raidPtr->copyback_in_progress == 1) {
   3896 		info->total = raidPtr->Layout.numStripe;
   3897 		info->completed = raidPtr->copyback_stripes_done;
   3898 		info->remaining = info->total - info->completed;
   3899 	} else {
   3900 		info->remaining = 0;
   3901 		info->completed = 100;
   3902 		info->total = 100;
   3903 	}
   3904 }
   3905 
   3906 /* Fill in config with the current info */
   3907 int
   3908 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3909 {
   3910 	int	d, i, j;
   3911 
   3912 	if (!raidPtr->valid)
   3913 		return ENODEV;
   3914 	config->cols = raidPtr->numCol;
   3915 	config->ndevs = raidPtr->numCol;
   3916 	if (config->ndevs >= RF_MAX_DISKS)
   3917 		return ENOMEM;
   3918 	config->nspares = raidPtr->numSpare;
   3919 	if (config->nspares >= RF_MAX_DISKS)
   3920 		return ENOMEM;
   3921 	config->maxqdepth = raidPtr->maxQueueDepth;
   3922 	d = 0;
   3923 	for (j = 0; j < config->cols; j++) {
   3924 		config->devs[d] = raidPtr->Disks[j];
   3925 		d++;
   3926 	}
   3927 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3928 		config->spares[i] = raidPtr->Disks[j];
   3929 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3930 			/* XXX: raidctl(8) expects to see this as a used spare */
   3931 			config->spares[i].status = rf_ds_used_spare;
   3932 		}
   3933 	}
   3934 	return 0;
   3935 }
   3936 
   3937 int
   3938 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3939 {
   3940 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3941 	RF_ComponentLabel_t *raid_clabel;
   3942 	int column = clabel->column;
   3943 
   3944 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3945 		return EINVAL;
   3946 	raid_clabel = raidget_component_label(raidPtr, column);
   3947 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3948 	/* Fix-up for userland. */
   3949 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3950 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3951 
   3952 	return 0;
   3953 }
   3954 
   3955 /*
   3956  * Module interface
   3957  */
   3958 
   3959 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3960 
   3961 #ifdef _MODULE
   3962 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3963 #endif
   3964 
   3965 static int raid_modcmd(modcmd_t, void *);
   3966 static int raid_modcmd_init(void);
   3967 static int raid_modcmd_fini(void);
   3968 
   3969 static int
   3970 raid_modcmd(modcmd_t cmd, void *data)
   3971 {
   3972 	int error;
   3973 
   3974 	error = 0;
   3975 	switch (cmd) {
   3976 	case MODULE_CMD_INIT:
   3977 		error = raid_modcmd_init();
   3978 		break;
   3979 	case MODULE_CMD_FINI:
   3980 		error = raid_modcmd_fini();
   3981 		break;
   3982 	default:
   3983 		error = ENOTTY;
   3984 		break;
   3985 	}
   3986 	return error;
   3987 }
   3988 
/*
 * Module initialization: set up the global lock, register the device
 * switch and autoconf glue, boot the RAIDframe core, and schedule
 * autoconfiguration of RAID sets for after device discovery.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach() to allocate the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be registered. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw registration before failing. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind in reverse order of registration. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is always 0 at this point (all failing
	 * paths returned above), so this test is redundant but harmless.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Autoconfig is best-effort; don't fail the module load. */
		error = 0;
	}

	return error;
}
   4059 
/*
 * Module finalization: undo everything raid_modcmd_init() did, in
 * reverse order.  Refuses to unload (EBUSY) while any raid unit exists.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-register the cfattach we just removed. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	/* Shut down the RAIDframe core and its global state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4100