Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.356.2.9
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.356.2.9 2019/01/18 00:01:01 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.356.2.9 2019/01/18 00:01:01 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 #include <sys/compat_stub.h>
    132 
    133 #include <prop/proplib.h>
    134 
    135 #include <dev/raidframe/raidframevar.h>
    136 #include <dev/raidframe/raidframeio.h>
    137 #include <dev/raidframe/rf_paritymap.h>
    138 
    139 #include "rf_raid.h"
    140 #include "rf_copyback.h"
    141 #include "rf_dag.h"
    142 #include "rf_dagflags.h"
    143 #include "rf_desc.h"
    144 #include "rf_diskqueue.h"
    145 #include "rf_etimer.h"
    146 #include "rf_general.h"
    147 #include "rf_kintf.h"
    148 #include "rf_options.h"
    149 #include "rf_driver.h"
    150 #include "rf_parityscan.h"
    151 #include "rf_threadstuff.h"
    152 
    153 #include "rf_compat50.h"
    154 
    155 #include "rf_compat80.h"
    156 
    157 #ifdef COMPAT_NETBSD32
    158 #include "rf_compat32.h"
    159 #endif
    160 
    161 #include "ioconf.h"
    162 
/*
 * Debug output helpers.
 *
 * db1_printf((fmt, ...)) takes a fully parenthesised printf argument list.
 * With DEBUG it prints only while rf_kdebug_level is greater than zero;
 * without DEBUG it expands to an empty statement and the arguments are
 * never evaluated.  Both forms are wrapped in do { } while (0) so the
 * macro behaves as exactly one statement, including when it is the body
 * of an unbraced if/else.
 *
 * DPRINTF(fmt, ...) is plain printf when DEBUG_ROOT is defined and
 * expands to nothing otherwise.  It needs at least one argument after
 * the format string.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
    175 
/*
 * State that is only compiled in when parity declustering support is
 * enabled (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0): one mutex, two
 * condition variables and the two spare-table queues.
 * NOTE(review): the code that uses these is outside this part of the
 * file; presumably the mutex guards both queues -- confirm.
 */
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif
    186 
/* malloc type used for RAIDframe allocations, e.g. free(cf, M_RAIDFRAME) */
MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
/* forward declaration; the structure itself is defined further down */
struct raid_softc;
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

/* match/attach/detach hooks handed to CFATTACH_DECL3_NEW() */
static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

/* raw component area I/O */
static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

/* component label I/O */
static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

/* callbacks referenced from rf_dkdriver */
static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

/* entry points referenced from raid_bdevsw / raid_cdevsw */
static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);
    225 
/*
 * Block device switch for the raid driver.  Discard is not supported
 * (nodiscard); the device is flagged as a disk (D_DISK).
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    236 
/*
 * Character device switch for the raid driver.  Open/close/ioctl share
 * the handlers used by raid_bdevsw; stop, tty, poll, mmap, kqfilter and
 * discard are wired to the generic "not supported" stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    251 
/*
 * dkdriver callback table for the raid driver: the open/close/strategy
 * entry points plus the raid_diskstart, raid_dumpblocks and
 * raid_lastclose hooks defined in this file.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    261 
/*
 * Per-unit software state of a raid device.  Instances are allocated by
 * raidcreate() and kept on the global `raids' list via sc_link.
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk (dk) state */
	int	sc_unit;	/* unit number, set by raidcreate() */
	int     sc_flags;	/* RAIDF_* flags, see below */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe state for this unit */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the `raids' list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* unit number encoded in a dev_t */
#define	raidunit(x)	DISKUNIT(x)
/*
 * softc for an autoconf device_t: read back through the sc_r.softc
 * pointer stored in the device's private data, not the private data
 * pointer itself.
 */
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    284 
/*
 * Autoconfiguration attachment for "raid": per-device private data is a
 * struct raid_softc, with raid_match/raid_attach/raid_detach as hooks.
 * The remaining three hooks are left NULL and the attachment carries
 * the DVF_DETACH_SHUTDOWN flag.
 */
extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);
    289 
/*
 * Internal representation of a rf_recon_req.  This is the argument type
 * of rf_ReconThread() and rf_ReconstructInPlaceThread().
 */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* NOTE(review): presumably the column to reconstruct -- confirm */
	RF_ReconReqFlags_t flags;
	void   *raidPtr;		/* untyped here; presumably an RF_Raid_t * -- confirm */
};
    296 
    297 /*
    298  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    299  * Be aware that large numbers can allow the driver to consume a lot of
    300  * kernel memory, especially on writes, and in degraded mode reads.
    301  *
    302  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    303  * a single 64K write will typically require 64K for the old data,
    304  * 64K for the old parity, and 64K for the new parity, for a total
    305  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
    307  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    308  *
    309  * Now in degraded mode, for example, a 64K read on the above setup may
    310  * require data reconstruction, which will require *all* of the 4 remaining
    311  * disks to participate -- 4 * 32K/disk == 128K again.
    312  */
    313 
/* Default number of simultaneous I/Os; may be overridden at build time. */
#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

/*
 * dev_t naming the raw partition (RAW_PART) of the raid unit that `dev'
 * refers to, keeping dev's major number.
 */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    320 
/* declared here, and made public, for the benefit of KVM stuff.. */
/*
 * NOTE(review): the comment above looks stale -- the declarations that
 * immediately follow it are all static.
 */

/* per-unit lock helpers */
static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

/* thread entry points and autoconfiguration drivers */
void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

/* autoconfiguration helpers */
RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    350 
/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
/* set by rf_autoconfig() so that the autoconfiguration pass runs only once */
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

/* all known raid softcs; raidget()/raidput() access it under raid_lock */
static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;
    367 
    368 static struct raid_softc *
    369 raidcreate(int unit) {
    370 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    371 	sc->sc_unit = unit;
    372 	cv_init(&sc->sc_cv, "raidunit");
    373 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    374 	return sc;
    375 }
    376 
    377 static void
    378 raiddestroy(struct raid_softc *sc) {
    379 	cv_destroy(&sc->sc_cv);
    380 	mutex_destroy(&sc->sc_mutex);
    381 	kmem_free(sc, sizeof(*sc));
    382 }
    383 
    384 static struct raid_softc *
    385 raidget(int unit, bool create) {
    386 	struct raid_softc *sc;
    387 	if (unit < 0) {
    388 #ifdef DIAGNOSTIC
    389 		panic("%s: unit %d!", __func__, unit);
    390 #endif
    391 		return NULL;
    392 	}
    393 	mutex_enter(&raid_lock);
    394 	LIST_FOREACH(sc, &raids, sc_link) {
    395 		if (sc->sc_unit == unit) {
    396 			mutex_exit(&raid_lock);
    397 			return sc;
    398 		}
    399 	}
    400 	mutex_exit(&raid_lock);
    401 	if (!create)
    402 		return NULL;
    403 	if ((sc = raidcreate(unit)) == NULL)
    404 		return NULL;
    405 	mutex_enter(&raid_lock);
    406 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    407 	mutex_exit(&raid_lock);
    408 	return sc;
    409 }
    410 
    411 static void
    412 raidput(struct raid_softc *sc) {
    413 	mutex_enter(&raid_lock);
    414 	LIST_REMOVE(sc, sc_link);
    415 	mutex_exit(&raid_lock);
    416 	raiddestroy(sc);
    417 }
    418 
/*
 * Pseudo-device attach hook.  Deliberately a no-op: device attachment
 * and the associated initialization are now performed as part of module
 * initialization, so `num' is unused.
 */
void
raidattach(int num)
{
}
    428 
    429 int
    430 rf_autoconfig(device_t self)
    431 {
    432 	RF_AutoConfig_t *ac_list;
    433 	RF_ConfigSet_t *config_sets;
    434 
    435 	if (!raidautoconfig || raidautoconfigdone == true)
    436 		return (0);
    437 
    438 	/* XXX This code can only be run once. */
    439 	raidautoconfigdone = true;
    440 
    441 #ifdef __HAVE_CPU_BOOTCONF
    442 	/*
    443 	 * 0. find the boot device if needed first so we can use it later
    444 	 * this needs to be done before we autoconfigure any raid sets,
    445 	 * because if we use wedges we are not going to be able to open
    446 	 * the boot device later
    447 	 */
    448 	if (booted_device == NULL)
    449 		cpu_bootconf();
    450 #endif
    451 	/* 1. locate all RAID components on the system */
    452 	aprint_debug("Searching for RAID components...\n");
    453 	ac_list = rf_find_raid_components();
    454 
    455 	/* 2. Sort them into their respective sets. */
    456 	config_sets = rf_create_auto_sets(ac_list);
    457 
    458 	/*
    459 	 * 3. Evaluate each set and configure the valid ones.
    460 	 * This gets done in rf_buildroothack().
    461 	 */
    462 	rf_buildroothack(config_sets);
    463 
    464 	return 1;
    465 }
    466 
    467 static int
    468 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    469 	const char *bootname = device_xname(bdv);
    470 	size_t len = strlen(bootname);
    471 
    472 	for (int col = 0; col < r->numCol; col++) {
    473 		const char *devname = r->Disks[col].devname;
    474 		devname += sizeof("/dev/") - 1;
    475 		if (strncmp(devname, "dk", 2) == 0) {
    476 			const char *parent =
    477 			    dkwedge_get_parent_name(r->Disks[col].dev);
    478 			if (parent != NULL)
    479 				devname = parent;
    480 		}
    481 		if (strncmp(devname, bootname, len) == 0) {
    482 			struct raid_softc *sc = r->softc;
    483 			aprint_debug("raid%d includes boot device %s\n",
    484 			    sc->sc_unit, devname);
    485 			return 1;
    486 		}
    487 	}
    488 	return 0;
    489 }
    490 
    491 void
    492 rf_buildroothack(RF_ConfigSet_t *config_sets)
    493 {
    494 	RF_ConfigSet_t *cset;
    495 	RF_ConfigSet_t *next_cset;
    496 	int num_root;
    497 	struct raid_softc *sc, *rsc;
    498 	struct dk_softc *dksc;
    499 
    500 	sc = rsc = NULL;
    501 	num_root = 0;
    502 	cset = config_sets;
    503 	while (cset != NULL) {
    504 		next_cset = cset->next;
    505 		if (rf_have_enough_components(cset) &&
    506 		    cset->ac->clabel->autoconfigure == 1) {
    507 			sc = rf_auto_config_set(cset);
    508 			if (sc != NULL) {
    509 				aprint_debug("raid%d: configured ok\n",
    510 				    sc->sc_unit);
    511 				if (cset->rootable) {
    512 					rsc = sc;
    513 					num_root++;
    514 				}
    515 			} else {
    516 				/* The autoconfig didn't work :( */
    517 				aprint_debug("Autoconfig failed\n");
    518 				rf_release_all_vps(cset);
    519 			}
    520 		} else {
    521 			/* we're not autoconfiguring this set...
    522 			   release the associated resources */
    523 			rf_release_all_vps(cset);
    524 		}
    525 		/* cleanup */
    526 		rf_cleanup_config_set(cset);
    527 		cset = next_cset;
    528 	}
    529 	dksc = &rsc->sc_dksc;
    530 
    531 	/* if the user has specified what the root device should be
    532 	   then we don't touch booted_device or boothowto... */
    533 
    534 	if (rootspec != NULL)
    535 		return;
    536 
    537 	/* we found something bootable... */
    538 
    539 	/*
    540 	 * XXX: The following code assumes that the root raid
    541 	 * is the first ('a') partition. This is about the best
    542 	 * we can do with a BSD disklabel, but we might be able
    543 	 * to do better with a GPT label, by setting a specified
    544 	 * attribute to indicate the root partition. We can then
    545 	 * stash the partition number in the r->root_partition
    546 	 * high bits (the bottom 2 bits are already used). For
    547 	 * now we just set booted_partition to 0 when we override
    548 	 * root.
    549 	 */
    550 	if (num_root == 1) {
    551 		device_t candidate_root;
    552 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    553 			char cname[sizeof(cset->ac->devname)];
    554 			/* XXX: assume partition 'a' first */
    555 			snprintf(cname, sizeof(cname), "%s%c",
    556 			    device_xname(dksc->sc_dev), 'a');
    557 			candidate_root = dkwedge_find_by_wname(cname);
    558 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    559 			    cname);
    560 			if (candidate_root == NULL) {
    561 				/*
    562 				 * If that is not found, because we don't use
    563 				 * disklabel, return the first dk child
    564 				 * XXX: we can skip the 'a' check above
    565 				 * and always do this...
    566 				 */
    567 				size_t i = 0;
    568 				candidate_root = dkwedge_find_by_parent(
    569 				    device_xname(dksc->sc_dev), &i);
    570 			}
    571 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    572 			    candidate_root);
    573 		} else
    574 			candidate_root = dksc->sc_dev;
    575 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    576 		DPRINTF("%s: booted_device=%p root_partition=%d "
    577 		   "contains_boot=%d\n", __func__, booted_device,
    578 		   rsc->sc_r.root_partition,
    579 		   rf_containsboot(&rsc->sc_r, booted_device));
    580 		if (booted_device == NULL ||
    581 		    rsc->sc_r.root_partition == 1 ||
    582 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    583 			booted_device = candidate_root;
    584 			booted_method = "raidframe/single";
    585 			booted_partition = 0;	/* XXX assume 'a' */
    586 		}
    587 	} else if (num_root > 1) {
    588 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    589 		    booted_device);
    590 
    591 		/*
    592 		 * Maybe the MD code can help. If it cannot, then
    593 		 * setroot() will discover that we have no
    594 		 * booted_device and will ask the user if nothing was
    595 		 * hardwired in the kernel config file
    596 		 */
    597 		if (booted_device == NULL)
    598 			return;
    599 
    600 		num_root = 0;
    601 		mutex_enter(&raid_lock);
    602 		LIST_FOREACH(sc, &raids, sc_link) {
    603 			RF_Raid_t *r = &sc->sc_r;
    604 			if (r->valid == 0)
    605 				continue;
    606 
    607 			if (r->root_partition == 0)
    608 				continue;
    609 
    610 			if (rf_containsboot(r, booted_device)) {
    611 				num_root++;
    612 				rsc = sc;
    613 				dksc = &rsc->sc_dksc;
    614 			}
    615 		}
    616 		mutex_exit(&raid_lock);
    617 
    618 		if (num_root == 1) {
    619 			booted_device = dksc->sc_dev;
    620 			booted_method = "raidframe/multi";
    621 			booted_partition = 0;	/* XXX assume 'a' */
    622 		} else {
    623 			/* we can't guess.. require the user to answer... */
    624 			boothowto |= RB_ASKNAME;
    625 		}
    626 	}
    627 }
    628 
    629 static int
    630 raidsize(dev_t dev)
    631 {
    632 	struct raid_softc *rs;
    633 	struct dk_softc *dksc;
    634 	unsigned int unit;
    635 
    636 	unit = raidunit(dev);
    637 	if ((rs = raidget(unit, false)) == NULL)
    638 		return -1;
    639 	dksc = &rs->sc_dksc;
    640 
    641 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    642 		return -1;
    643 
    644 	return dk_size(dksc, dev);
    645 }
    646 
    647 static int
    648 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    649 {
    650 	unsigned int unit;
    651 	struct raid_softc *rs;
    652 	struct dk_softc *dksc;
    653 
    654 	unit = raidunit(dev);
    655 	if ((rs = raidget(unit, false)) == NULL)
    656 		return ENXIO;
    657 	dksc = &rs->sc_dksc;
    658 
    659 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    660 		return ENODEV;
    661 
    662         /*
    663            Note that blkno is relative to this particular partition.
    664            By adding adding RF_PROTECTED_SECTORS, we get a value that
    665 	   is relative to the partition used for the underlying component.
    666         */
    667 	blkno += RF_PROTECTED_SECTORS;
    668 
    669 	return dk_dump(dksc, dev, blkno, va, size);
    670 }
    671 
    672 static int
    673 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    674 {
    675 	struct raid_softc *rs = raidsoftc(dev);
    676 	const struct bdevsw *bdev;
    677 	RF_Raid_t *raidPtr;
    678 	int     c, sparecol, j, scol, dumpto;
    679 	int     error = 0;
    680 
    681 	raidPtr = &rs->sc_r;
    682 
    683 	/* we only support dumping to RAID 1 sets */
    684 	if (raidPtr->Layout.numDataCol != 1 ||
    685 	    raidPtr->Layout.numParityCol != 1)
    686 		return EINVAL;
    687 
    688 	if ((error = raidlock(rs)) != 0)
    689 		return error;
    690 
    691 	/* figure out what device is alive.. */
    692 
    693 	/*
    694 	   Look for a component to dump to.  The preference for the
    695 	   component to dump to is as follows:
    696 	   1) the master
    697 	   2) a used_spare of the master
    698 	   3) the slave
    699 	   4) a used_spare of the slave
    700 	*/
    701 
    702 	dumpto = -1;
    703 	for (c = 0; c < raidPtr->numCol; c++) {
    704 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    705 			/* this might be the one */
    706 			dumpto = c;
    707 			break;
    708 		}
    709 	}
    710 
    711 	/*
    712 	   At this point we have possibly selected a live master or a
    713 	   live slave.  We now check to see if there is a spared
    714 	   master (or a spared slave), if we didn't find a live master
    715 	   or a live slave.
    716 	*/
    717 
    718 	for (c = 0; c < raidPtr->numSpare; c++) {
    719 		sparecol = raidPtr->numCol + c;
    720 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    721 			/* How about this one? */
    722 			scol = -1;
    723 			for(j=0;j<raidPtr->numCol;j++) {
    724 				if (raidPtr->Disks[j].spareCol == sparecol) {
    725 					scol = j;
    726 					break;
    727 				}
    728 			}
    729 			if (scol == 0) {
    730 				/*
    731 				   We must have found a spared master!
    732 				   We'll take that over anything else
    733 				   found so far.  (We couldn't have
    734 				   found a real master before, since
    735 				   this is a used spare, and it's
    736 				   saying that it's replacing the
    737 				   master.)  On reboot (with
    738 				   autoconfiguration turned on)
    739 				   sparecol will become the 1st
    740 				   component (component0) of this set.
    741 				*/
    742 				dumpto = sparecol;
    743 				break;
    744 			} else if (scol != -1) {
    745 				/*
    746 				   Must be a spared slave.  We'll dump
    747 				   to that if we havn't found anything
    748 				   else so far.
    749 				*/
    750 				if (dumpto == -1)
    751 					dumpto = sparecol;
    752 			}
    753 		}
    754 	}
    755 
    756 	if (dumpto == -1) {
    757 		/* we couldn't find any live components to dump to!?!?
    758 		 */
    759 		error = EINVAL;
    760 		goto out;
    761 	}
    762 
    763 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    764 	if (bdev == NULL) {
    765 		error = ENXIO;
    766 		goto out;
    767 	}
    768 
    769 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    770 				blkno, va, nblk * raidPtr->bytesPerSector);
    771 
    772 out:
    773 	raidunlock(rs);
    774 
    775 	return error;
    776 }
    777 
    778 /* ARGSUSED */
    779 static int
    780 raidopen(dev_t dev, int flags, int fmt,
    781     struct lwp *l)
    782 {
    783 	int     unit = raidunit(dev);
    784 	struct raid_softc *rs;
    785 	struct dk_softc *dksc;
    786 	int     error = 0;
    787 	int     part, pmask;
    788 
    789 	if ((rs = raidget(unit, true)) == NULL)
    790 		return ENXIO;
    791 	if ((error = raidlock(rs)) != 0)
    792 		return (error);
    793 
    794 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    795 		error = EBUSY;
    796 		goto bad;
    797 	}
    798 
    799 	dksc = &rs->sc_dksc;
    800 
    801 	part = DISKPART(dev);
    802 	pmask = (1 << part);
    803 
    804 	if (!DK_BUSY(dksc, pmask) &&
    805 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    806 		/* First one... mark things as dirty... Note that we *MUST*
    807 		 have done a configure before this.  I DO NOT WANT TO BE
    808 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    809 		 THAT THEY BELONG TOGETHER!!!!! */
    810 		/* XXX should check to see if we're only open for reading
    811 		   here... If so, we needn't do this, but then need some
    812 		   other way of keeping track of what's happened.. */
    813 
    814 		rf_markalldirty(&rs->sc_r);
    815 	}
    816 
    817 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    818 		error = dk_open(dksc, dev, flags, fmt, l);
    819 
    820 bad:
    821 	raidunlock(rs);
    822 
    823 	return (error);
    824 
    825 
    826 }
    827 
    828 static int
    829 raid_lastclose(device_t self)
    830 {
    831 	struct raid_softc *rs = raidsoftc(self);
    832 
    833 	/* Last one... device is not unconfigured yet.
    834 	   Device shutdown has taken care of setting the
    835 	   clean bits if RAIDF_INITED is not set
    836 	   mark things as clean... */
    837 
    838 	rf_update_component_labels(&rs->sc_r,
    839 	    RF_FINAL_COMPONENT_UPDATE);
    840 
    841 	/* pass to unlocked code */
    842 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    843 		rs->sc_flags |= RAIDF_DETACH;
    844 
    845 	return 0;
    846 }
    847 
    848 /* ARGSUSED */
    849 static int
    850 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    851 {
    852 	int     unit = raidunit(dev);
    853 	struct raid_softc *rs;
    854 	struct dk_softc *dksc;
    855 	cfdata_t cf;
    856 	int     error = 0, do_detach = 0, do_put = 0;
    857 
    858 	if ((rs = raidget(unit, false)) == NULL)
    859 		return ENXIO;
    860 	dksc = &rs->sc_dksc;
    861 
    862 	if ((error = raidlock(rs)) != 0)
    863 		return (error);
    864 
    865 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    866 		error = dk_close(dksc, dev, flags, fmt, l);
    867 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    868 			do_detach = 1;
    869 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    870 		do_put = 1;
    871 
    872 	raidunlock(rs);
    873 
    874 	if (do_detach) {
    875 		/* free the pseudo device attach bits */
    876 		cf = device_cfdata(dksc->sc_dev);
    877 		error = config_detach(dksc->sc_dev, 0);
    878 		if (error == 0)
    879 			free(cf, M_RAIDFRAME);
    880 	} else if (do_put) {
    881 		raidput(rs);
    882 	}
    883 
    884 	return (error);
    885 
    886 }
    887 
    888 static void
    889 raid_wakeup(RF_Raid_t *raidPtr)
    890 {
    891 	rf_lock_mutex2(raidPtr->iodone_lock);
    892 	rf_signal_cond2(raidPtr->iodone_cv);
    893 	rf_unlock_mutex2(raidPtr->iodone_lock);
    894 }
    895 
    896 static void
    897 raidstrategy(struct buf *bp)
    898 {
    899 	unsigned int unit;
    900 	struct raid_softc *rs;
    901 	struct dk_softc *dksc;
    902 	RF_Raid_t *raidPtr;
    903 
    904 	unit = raidunit(bp->b_dev);
    905 	if ((rs = raidget(unit, false)) == NULL) {
    906 		bp->b_error = ENXIO;
    907 		goto fail;
    908 	}
    909 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    910 		bp->b_error = ENXIO;
    911 		goto fail;
    912 	}
    913 	dksc = &rs->sc_dksc;
    914 	raidPtr = &rs->sc_r;
    915 
    916 	/* Queue IO only */
    917 	if (dk_strategy_defer(dksc, bp))
    918 		goto done;
    919 
    920 	/* schedule the IO to happen at the next convenient time */
    921 	raid_wakeup(raidPtr);
    922 
    923 done:
    924 	return;
    925 
    926 fail:
    927 	bp->b_resid = bp->b_bcount;
    928 	biodone(bp);
    929 }
    930 
    931 static int
    932 raid_diskstart(device_t dev, struct buf *bp)
    933 {
    934 	struct raid_softc *rs = raidsoftc(dev);
    935 	RF_Raid_t *raidPtr;
    936 
    937 	raidPtr = &rs->sc_r;
    938 	if (!raidPtr->valid) {
    939 		db1_printf(("raid is not valid..\n"));
    940 		return ENODEV;
    941 	}
    942 
    943 	/* XXX */
    944 	bp->b_resid = 0;
    945 
    946 	return raiddoaccess(raidPtr, bp);
    947 }
    948 
    949 void
    950 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    951 {
    952 	struct raid_softc *rs;
    953 	struct dk_softc *dksc;
    954 
    955 	rs = raidPtr->softc;
    956 	dksc = &rs->sc_dksc;
    957 
    958 	dk_done(dksc, bp);
    959 
    960 	rf_lock_mutex2(raidPtr->mutex);
    961 	raidPtr->openings++;
    962 	rf_unlock_mutex2(raidPtr->mutex);
    963 
    964 	/* schedule more IO */
    965 	raid_wakeup(raidPtr);
    966 }
    967 
    968 /* ARGSUSED */
    969 static int
    970 raidread(dev_t dev, struct uio *uio, int flags)
    971 {
    972 	int     unit = raidunit(dev);
    973 	struct raid_softc *rs;
    974 
    975 	if ((rs = raidget(unit, false)) == NULL)
    976 		return ENXIO;
    977 
    978 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    979 		return (ENXIO);
    980 
    981 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    982 
    983 }
    984 
    985 /* ARGSUSED */
    986 static int
    987 raidwrite(dev_t dev, struct uio *uio, int flags)
    988 {
    989 	int     unit = raidunit(dev);
    990 	struct raid_softc *rs;
    991 
    992 	if ((rs = raidget(unit, false)) == NULL)
    993 		return ENXIO;
    994 
    995 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    996 		return (ENXIO);
    997 
    998 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    999 
   1000 }
   1001 
   1002 static int
   1003 raid_detach_unlocked(struct raid_softc *rs)
   1004 {
   1005 	struct dk_softc *dksc = &rs->sc_dksc;
   1006 	RF_Raid_t *raidPtr;
   1007 	int error;
   1008 
   1009 	raidPtr = &rs->sc_r;
   1010 
   1011 	if (DK_BUSY(dksc, 0) ||
   1012 	    raidPtr->recon_in_progress != 0 ||
   1013 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1014 	    raidPtr->copyback_in_progress != 0)
   1015 		return EBUSY;
   1016 
   1017 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1018 		return 0;
   1019 
   1020 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1021 
   1022 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1023 		return error;
   1024 
   1025 	rs->sc_flags &= ~RAIDF_INITED;
   1026 
   1027 	/* Kill off any queued buffers */
   1028 	dk_drain(dksc);
   1029 	bufq_free(dksc->sc_bufq);
   1030 
   1031 	/* Detach the disk. */
   1032 	dkwedge_delall(&dksc->sc_dkdev);
   1033 	disk_detach(&dksc->sc_dkdev);
   1034 	disk_destroy(&dksc->sc_dkdev);
   1035 	dk_detach(dksc);
   1036 
   1037 	return 0;
   1038 }
   1039 
   1040 /* Hooks to call the 5.0 and 8.0 ioctl compat code */
   1041 MODULE_CALL_HOOK_DECL(raidframe50_ioctl_hook, int,
   1042     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1043      RF_Config_t **k_cfg));
   1044 MODULE_CALL_HOOK(raidframe50_ioctl_hook, int,
   1045     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1046      RF_Config_t **k_cfg),
   1047     (cmd, initted, raidPtr, unit, data, k_cfg),
   1048     enosys());
   1049 
   1050 MODULE_CALL_HOOK_DECL(raidframe80_ioctl_hook, int,
   1051     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1052      RF_Config_t **k_cfg));
   1053 MODULE_CALL_HOOK(raidframe80_ioctl_hook, int,
   1054     (int cmd, int initted, RF_Raid_t *raidPtr, int unit, void *data,
   1055      RF_Config_t **k_cfg),
   1056     (cmd, initted, raidPtr, unit, data, k_cfg),
   1057     enosys());
   1058 
   1059 static int
   1060 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1061 {
   1062 	int     unit = raidunit(dev);
   1063 	int     error = 0;
   1064 	int     part, pmask;
   1065 	struct raid_softc *rs;
   1066 	struct dk_softc *dksc;
   1067 	RF_Config_t *k_cfg, *u_cfg;
   1068 	RF_Raid_t *raidPtr;
   1069 	RF_RaidDisk_t *diskPtr;
   1070 	RF_AccTotals_t *totals;
   1071 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1072 	u_char *specific_buf;
   1073 	int retcode = 0;
   1074 	int column;
   1075 /*	int raidid; */
   1076 	struct rf_recon_req *rr;
   1077 	struct rf_recon_req_internal *rrint;
   1078 	RF_ComponentLabel_t *clabel;
   1079 	RF_ComponentLabel_t *ci_label;
   1080 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1081 	RF_SingleComponent_t component;
   1082 	int d;
   1083 
   1084 	if ((rs = raidget(unit, false)) == NULL)
   1085 		return ENXIO;
   1086 	dksc = &rs->sc_dksc;
   1087 	raidPtr = &rs->sc_r;
   1088 
   1089 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1090 		(int) DISKPART(dev), (int) unit, cmd));
   1091 
   1092 	/* Must be initialized for these... */
   1093 	switch (cmd) {
   1094 	case RAIDFRAME_REWRITEPARITY:
   1095 	case RAIDFRAME_GET_INFO:
   1096 	case RAIDFRAME_RESET_ACCTOTALS:
   1097 	case RAIDFRAME_GET_ACCTOTALS:
   1098 	case RAIDFRAME_KEEP_ACCTOTALS:
   1099 	case RAIDFRAME_GET_SIZE:
   1100 	case RAIDFRAME_FAIL_DISK:
   1101 	case RAIDFRAME_COPYBACK:
   1102 	case RAIDFRAME_CHECK_RECON_STATUS:
   1103 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1104 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1105 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1106 	case RAIDFRAME_ADD_HOT_SPARE:
   1107 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1108 	case RAIDFRAME_INIT_LABELS:
   1109 	case RAIDFRAME_REBUILD_IN_PLACE:
   1110 	case RAIDFRAME_CHECK_PARITY:
   1111 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1112 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1113 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1114 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1115 	case RAIDFRAME_SET_AUTOCONFIG:
   1116 	case RAIDFRAME_SET_ROOT:
   1117 	case RAIDFRAME_DELETE_COMPONENT:
   1118 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1119 	case RAIDFRAME_PARITYMAP_STATUS:
   1120 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1121 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1122 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1123 #ifdef COMPAT_NETBSD32
   1124 #ifdef _LP64
   1125 	case RAIDFRAME_GET_INFO32:
   1126 #endif
   1127 #endif
   1128 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1129 			return (ENXIO);
   1130 	}
   1131 
   1132 	/*
   1133 	 * Handle compat ioctl calls
   1134 	 *
   1135 	 * * If compat code is not loaded, stub returns ENOSYS and we just
   1136 	 *   check the "native" cmd's
   1137 	 * * If compat code is loaded but does not recognize the cmd, it
   1138 	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
   1139 	 * * If compat code returns EAGAIN, we need to finish via config
   1140 	 * * Otherwise the cmd has been handled and we just return
   1141 	 */
   1142 	retcode = raidframe50_ioctl_hook_call(cmd,
   1143 	    (rs->sc_flags & RAIDF_INITED), raidPtr, unit, data, &k_cfg);
   1144 	if (retcode == ENOSYS)
   1145 		retcode = 0;
   1146 	else if (retcode == EAGAIN)
   1147 		goto config;
   1148 	else if (retcode != EPASSTHROUGH)
   1149 		return retcode;
   1150 
   1151 	retcode = raidframe80_ioctl_hook_call(cmd,
   1152 	    (rs->sc_flags & RAIDF_INITED), raidPtr, unit, data, &k_cfg);
   1153 	if (retcode == ENOSYS)
   1154 		retcode = 0;
   1155 	else if (retcode == EAGAIN)
   1156 		goto config;
   1157 	else if (retcode != EPASSTHROUGH)
   1158 		return retcode;
   1159 
   1160 	/*
   1161 	 * XXX
   1162 	 * Handling of FAIL_DISK80 command requires us to retain retcode's
   1163 	 * value of EPASSTHROUGH.  If you add more compat code later, make
   1164 	 * sure you don't overwrite retcode and break this!
   1165 	 */
   1166 
   1167 	switch (cmd) {
   1168 
   1169 		/* configure the system */
   1170 	case RAIDFRAME_CONFIGURE:
   1171 #ifdef COMPAT_NETBSD32
   1172 #ifdef _LP64
   1173 	case RAIDFRAME_CONFIGURE32:
   1174 #endif
   1175 #endif
   1176 
   1177 		if (raidPtr->valid) {
   1178 			/* There is a valid RAID set running on this unit! */
   1179 			printf("raid%d: Device already configured!\n",unit);
   1180 			return(EINVAL);
   1181 		}
   1182 
   1183 		/* copy-in the configuration information */
   1184 		/* data points to a pointer to the configuration structure */
   1185 
   1186 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1187 		if (k_cfg == NULL) {
   1188 			return (ENOMEM);
   1189 		}
   1190 #ifdef COMPAT_NETBSD32
   1191 #ifdef _LP64
   1192 		if (cmd == RAIDFRAME_CONFIGURE32 &&
   1193 		    (l->l_proc->p_flag & PK_32) != 0)
   1194 			retcode = rf_config_netbsd32(data, k_cfg);
   1195 		else
   1196 #endif
   1197 #endif
   1198 		{
   1199 			u_cfg = *((RF_Config_t **) data);
   1200 			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1201 		}
   1202 		if (retcode) {
   1203 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1204 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1205 				retcode));
   1206 			goto no_config;
   1207 		}
   1208 		goto config;
   1209 	config:
   1210 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1211 
   1212 		/* allocate a buffer for the layout-specific data, and copy it
   1213 		 * in */
   1214 		if (k_cfg->layoutSpecificSize) {
   1215 			if (k_cfg->layoutSpecificSize > 10000) {
   1216 				/* sanity check */
   1217 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1218 				retcode = EINVAL;
   1219 				goto no_config;
   1220 			}
   1221 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1222 			    (u_char *));
   1223 			if (specific_buf == NULL) {
   1224 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1225 				retcode = ENOMEM;
   1226 				goto no_config;
   1227 			}
   1228 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1229 			    k_cfg->layoutSpecificSize);
   1230 			if (retcode) {
   1231 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1232 				RF_Free(specific_buf,
   1233 					k_cfg->layoutSpecificSize);
   1234 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1235 					retcode));
   1236 				goto no_config;
   1237 			}
   1238 		} else
   1239 			specific_buf = NULL;
   1240 		k_cfg->layoutSpecific = specific_buf;
   1241 
   1242 		/* should do some kind of sanity check on the configuration.
   1243 		 * Store the sum of all the bytes in the last byte? */
   1244 
   1245 		/* configure the system */
   1246 
   1247 		/*
   1248 		 * Clear the entire RAID descriptor, just to make sure
   1249 		 *  there is no stale data left in the case of a
   1250 		 *  reconfiguration
   1251 		 */
   1252 		memset(raidPtr, 0, sizeof(*raidPtr));
   1253 		raidPtr->softc = rs;
   1254 		raidPtr->raidid = unit;
   1255 
   1256 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1257 
   1258 		if (retcode == 0) {
   1259 
   1260 			/* allow this many simultaneous IO's to
   1261 			   this RAID device */
   1262 			raidPtr->openings = RAIDOUTSTANDING;
   1263 
   1264 			raidinit(rs);
   1265 			raid_wakeup(raidPtr);
   1266 			rf_markalldirty(raidPtr);
   1267 		}
   1268 		/* free the buffers.  No return code here. */
   1269 		if (k_cfg->layoutSpecificSize) {
   1270 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1271 		}
   1272 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1273 
   1274 	no_config:
   1275 		/*
   1276 		 * If configuration failed, set sc_flags so that we
   1277 		 * will detach the device when we close it.
   1278 		 */
   1279 		if (retcode != 0)
   1280 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1281 		return (retcode);
   1282 
   1283 		/* shutdown the system */
   1284 	case RAIDFRAME_SHUTDOWN:
   1285 
   1286 		part = DISKPART(dev);
   1287 		pmask = (1 << part);
   1288 
   1289 		if ((error = raidlock(rs)) != 0)
   1290 			return (error);
   1291 
   1292 		if (DK_BUSY(dksc, pmask) ||
   1293 		    raidPtr->recon_in_progress != 0 ||
   1294 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1295 		    raidPtr->copyback_in_progress != 0)
   1296 			retcode = EBUSY;
   1297 		else {
   1298 			/* detach and free on close */
   1299 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1300 			retcode = 0;
   1301 		}
   1302 
   1303 		raidunlock(rs);
   1304 
   1305 		return (retcode);
   1306 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1307 		return rf_get_component_label(raidPtr, data);
   1308 
   1309 #if 0
   1310 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1311 		clabel = (RF_ComponentLabel_t *) data;
   1312 
   1313 		/* XXX check the label for valid stuff... */
   1314 		/* Note that some things *should not* get modified --
   1315 		   the user should be re-initing the labels instead of
   1316 		   trying to patch things.
   1317 		   */
   1318 
   1319 		raidid = raidPtr->raidid;
   1320 #ifdef DEBUG
   1321 		printf("raid%d: Got component label:\n", raidid);
   1322 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1323 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1324 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1325 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1326 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1327 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1328 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1329 #endif
   1330 		clabel->row = 0;
   1331 		column = clabel->column;
   1332 
   1333 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1334 			return(EINVAL);
   1335 		}
   1336 
   1337 		/* XXX this isn't allowed to do anything for now :-) */
   1338 
   1339 		/* XXX and before it is, we need to fill in the rest
   1340 		   of the fields!?!?!?! */
   1341 		memcpy(raidget_component_label(raidPtr, column),
   1342 		    clabel, sizeof(*clabel));
   1343 		raidflush_component_label(raidPtr, column);
   1344 		return (0);
   1345 #endif
   1346 
   1347 	case RAIDFRAME_INIT_LABELS:
   1348 		clabel = (RF_ComponentLabel_t *) data;
   1349 		/*
   1350 		   we only want the serial number from
   1351 		   the above.  We get all the rest of the information
   1352 		   from the config that was used to create this RAID
   1353 		   set.
   1354 		   */
   1355 
   1356 		raidPtr->serial_number = clabel->serial_number;
   1357 
   1358 		for(column=0;column<raidPtr->numCol;column++) {
   1359 			diskPtr = &raidPtr->Disks[column];
   1360 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1361 				ci_label = raidget_component_label(raidPtr,
   1362 				    column);
   1363 				/* Zeroing this is important. */
   1364 				memset(ci_label, 0, sizeof(*ci_label));
   1365 				raid_init_component_label(raidPtr, ci_label);
   1366 				ci_label->serial_number =
   1367 				    raidPtr->serial_number;
   1368 				ci_label->row = 0; /* we dont' pretend to support more */
   1369 				rf_component_label_set_partitionsize(ci_label,
   1370 				    diskPtr->partitionSize);
   1371 				ci_label->column = column;
   1372 				raidflush_component_label(raidPtr, column);
   1373 			}
   1374 			/* XXXjld what about the spares? */
   1375 		}
   1376 
   1377 		return (retcode);
   1378 	case RAIDFRAME_SET_AUTOCONFIG:
   1379 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1380 		printf("raid%d: New autoconfig value is: %d\n",
   1381 		       raidPtr->raidid, d);
   1382 		*(int *) data = d;
   1383 		return (retcode);
   1384 
   1385 	case RAIDFRAME_SET_ROOT:
   1386 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1387 		printf("raid%d: New rootpartition value is: %d\n",
   1388 		       raidPtr->raidid, d);
   1389 		*(int *) data = d;
   1390 		return (retcode);
   1391 
   1392 		/* initialize all parity */
   1393 	case RAIDFRAME_REWRITEPARITY:
   1394 
   1395 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1396 			/* Parity for RAID 0 is trivially correct */
   1397 			raidPtr->parity_good = RF_RAID_CLEAN;
   1398 			return(0);
   1399 		}
   1400 
   1401 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1402 			/* Re-write is already in progress! */
   1403 			return(EINVAL);
   1404 		}
   1405 
   1406 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1407 					   rf_RewriteParityThread,
   1408 					   raidPtr,"raid_parity");
   1409 		return (retcode);
   1410 
   1411 
   1412 	case RAIDFRAME_ADD_HOT_SPARE:
   1413 		sparePtr = (RF_SingleComponent_t *) data;
   1414 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1415 		retcode = rf_add_hot_spare(raidPtr, &component);
   1416 		return(retcode);
   1417 
   1418 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1419 		return(retcode);
   1420 
   1421 	case RAIDFRAME_DELETE_COMPONENT:
   1422 		componentPtr = (RF_SingleComponent_t *)data;
   1423 		memcpy( &component, componentPtr,
   1424 			sizeof(RF_SingleComponent_t));
   1425 		retcode = rf_delete_component(raidPtr, &component);
   1426 		return(retcode);
   1427 
   1428 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1429 		componentPtr = (RF_SingleComponent_t *)data;
   1430 		memcpy( &component, componentPtr,
   1431 			sizeof(RF_SingleComponent_t));
   1432 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1433 		return(retcode);
   1434 
   1435 	case RAIDFRAME_REBUILD_IN_PLACE:
   1436 
   1437 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1438 			/* Can't do this on a RAID 0!! */
   1439 			return(EINVAL);
   1440 		}
   1441 
   1442 		if (raidPtr->recon_in_progress == 1) {
   1443 			/* a reconstruct is already in progress! */
   1444 			return(EINVAL);
   1445 		}
   1446 
   1447 		componentPtr = (RF_SingleComponent_t *) data;
   1448 		memcpy( &component, componentPtr,
   1449 			sizeof(RF_SingleComponent_t));
   1450 		component.row = 0; /* we don't support any more */
   1451 		column = component.column;
   1452 
   1453 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1454 			return(EINVAL);
   1455 		}
   1456 
   1457 		rf_lock_mutex2(raidPtr->mutex);
   1458 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1459 		    (raidPtr->numFailures > 0)) {
   1460 			/* XXX 0 above shouldn't be constant!!! */
   1461 			/* some component other than this has failed.
   1462 			   Let's not make things worse than they already
   1463 			   are... */
   1464 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1465 			       raidPtr->raidid);
   1466 			printf("raid%d:     Col: %d   Too many failures.\n",
   1467 			       raidPtr->raidid, column);
   1468 			rf_unlock_mutex2(raidPtr->mutex);
   1469 			return (EINVAL);
   1470 		}
   1471 		if (raidPtr->Disks[column].status ==
   1472 		    rf_ds_reconstructing) {
   1473 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1474 			       raidPtr->raidid);
   1475 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1476 
   1477 			rf_unlock_mutex2(raidPtr->mutex);
   1478 			return (EINVAL);
   1479 		}
   1480 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1481 			rf_unlock_mutex2(raidPtr->mutex);
   1482 			return (EINVAL);
   1483 		}
   1484 		rf_unlock_mutex2(raidPtr->mutex);
   1485 
   1486 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1487 		if (rrint == NULL)
   1488 			return(ENOMEM);
   1489 
   1490 		rrint->col = column;
   1491 		rrint->raidPtr = raidPtr;
   1492 
   1493 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1494 					   rf_ReconstructInPlaceThread,
   1495 					   rrint, "raid_reconip");
   1496 		return(retcode);
   1497 
   1498 	case RAIDFRAME_GET_INFO:
   1499 #ifdef COMPAT_NETBSD32
   1500 #ifdef _LP64
   1501 	case RAIDFRAME_GET_INFO32:
   1502 #endif
   1503 #endif
   1504 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1505 			  (RF_DeviceConfig_t *));
   1506 		if (d_cfg == NULL)
   1507 			return (ENOMEM);
   1508 		retcode = rf_get_info(raidPtr, d_cfg);
   1509 		if (retcode == 0) {
   1510 #ifdef COMPAT_NETBSD32
   1511 #ifdef _LP64
   1512 			if (cmd == RAIDFRAME_GET_INFO32)
   1513 				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
   1514 			else
   1515 #endif
   1516 #endif
   1517 				ucfgp = *(RF_DeviceConfig_t **)data;
   1518 			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
   1519 		}
   1520 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1521 
   1522 		return (retcode);
   1523 
   1524 	case RAIDFRAME_CHECK_PARITY:
   1525 		*(int *) data = raidPtr->parity_good;
   1526 		return (0);
   1527 
   1528 	case RAIDFRAME_PARITYMAP_STATUS:
   1529 		if (rf_paritymap_ineligible(raidPtr))
   1530 			return EINVAL;
   1531 		rf_paritymap_status(raidPtr->parity_map,
   1532 		    (struct rf_pmstat *)data);
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		if (raidPtr->parity_map == NULL)
   1539 			return ENOENT; /* ??? */
   1540 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1541 			(struct rf_pmparams *)data, 1))
   1542 			return EINVAL;
   1543 		return 0;
   1544 
   1545 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1546 		if (rf_paritymap_ineligible(raidPtr))
   1547 			return EINVAL;
   1548 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1549 		return 0;
   1550 
   1551 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1552 		if (rf_paritymap_ineligible(raidPtr))
   1553 			return EINVAL;
   1554 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1555 		/* XXX should errors be passed up? */
   1556 		return 0;
   1557 
   1558 	case RAIDFRAME_RESET_ACCTOTALS:
   1559 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1560 		return (0);
   1561 
   1562 	case RAIDFRAME_GET_ACCTOTALS:
   1563 		totals = (RF_AccTotals_t *) data;
   1564 		*totals = raidPtr->acc_totals;
   1565 		return (0);
   1566 
   1567 	case RAIDFRAME_KEEP_ACCTOTALS:
   1568 		raidPtr->keep_acc_totals = *(int *)data;
   1569 		return (0);
   1570 
   1571 	case RAIDFRAME_GET_SIZE:
   1572 		*(int *) data = raidPtr->totalSectors;
   1573 		return (0);
   1574 
   1575 		/* fail a disk & optionally start reconstruction */
   1576 	case RAIDFRAME_FAIL_DISK80:
   1577 		/* Check if we called compat code for this cmd */
   1578 		if (retcode != EPASSTHROUGH)
   1579 			return EINVAL;
   1580 		/* FALLTHRU */
   1581 	case RAIDFRAME_FAIL_DISK:
   1582 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1583 			/* Can't do this on a RAID 0!! */
   1584 			return(EINVAL);
   1585 		}
   1586 
   1587 		rr = (struct rf_recon_req *) data;
   1588 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1589 			return (EINVAL);
   1590 
   1591 		rf_lock_mutex2(raidPtr->mutex);
   1592 		if (raidPtr->status == rf_rs_reconstructing) {
   1593 			/* you can't fail a disk while we're reconstructing! */
   1594 			/* XXX wrong for RAID6 */
   1595 			rf_unlock_mutex2(raidPtr->mutex);
   1596 			return (EINVAL);
   1597 		}
   1598 		if ((raidPtr->Disks[rr->col].status ==
   1599 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1600 			/* some other component has failed.  Let's not make
   1601 			   things worse. XXX wrong for RAID6 */
   1602 			rf_unlock_mutex2(raidPtr->mutex);
   1603 			return (EINVAL);
   1604 		}
   1605 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1606 			/* Can't fail a spared disk! */
   1607 			rf_unlock_mutex2(raidPtr->mutex);
   1608 			return (EINVAL);
   1609 		}
   1610 		rf_unlock_mutex2(raidPtr->mutex);
   1611 
   1612 		/* make a copy of the recon request so that we don't rely on
   1613 		 * the user's buffer */
   1614 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1615 		if (rrint == NULL)
   1616 			return(ENOMEM);
   1617 		rrint->col = rr->col;
   1618 		rrint->flags = rr->flags;
   1619 		rrint->raidPtr = raidPtr;
   1620 
   1621 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1622 					   rf_ReconThread,
   1623 					   rrint, "raid_recon");
   1624 		return (0);
   1625 
   1626 		/* invoke a copyback operation after recon on whatever disk
   1627 		 * needs it, if any */
   1628 	case RAIDFRAME_COPYBACK:
   1629 
   1630 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1631 			/* This makes no sense on a RAID 0!! */
   1632 			return(EINVAL);
   1633 		}
   1634 
   1635 		if (raidPtr->copyback_in_progress == 1) {
   1636 			/* Copyback is already in progress! */
   1637 			return(EINVAL);
   1638 		}
   1639 
   1640 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1641 					   rf_CopybackThread,
   1642 					   raidPtr,"raid_copyback");
   1643 		return (retcode);
   1644 
   1645 		/* return the percentage completion of reconstruction */
   1646 	case RAIDFRAME_CHECK_RECON_STATUS:
   1647 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1648 			/* This makes no sense on a RAID 0, so tell the
   1649 			   user it's done. */
   1650 			*(int *) data = 100;
   1651 			return(0);
   1652 		}
   1653 		if (raidPtr->status != rf_rs_reconstructing)
   1654 			*(int *) data = 100;
   1655 		else {
   1656 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1657 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1658 			} else {
   1659 				*(int *) data = 0;
   1660 			}
   1661 		}
   1662 		return (0);
   1663 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1664 		rf_check_recon_status_ext(raidPtr, data);
   1665 		return (0);
   1666 
   1667 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1668 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1669 			/* This makes no sense on a RAID 0, so tell the
   1670 			   user it's done. */
   1671 			*(int *) data = 100;
   1672 			return(0);
   1673 		}
   1674 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1675 			*(int *) data = 100 *
   1676 				raidPtr->parity_rewrite_stripes_done /
   1677 				raidPtr->Layout.numStripe;
   1678 		} else {
   1679 			*(int *) data = 100;
   1680 		}
   1681 		return (0);
   1682 
   1683 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1684 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1685 		return (0);
   1686 
   1687 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1688 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1689 			/* This makes no sense on a RAID 0 */
   1690 			*(int *) data = 100;
   1691 			return(0);
   1692 		}
   1693 		if (raidPtr->copyback_in_progress == 1) {
   1694 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1695 				raidPtr->Layout.numStripe;
   1696 		} else {
   1697 			*(int *) data = 100;
   1698 		}
   1699 		return (0);
   1700 
   1701 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1702 		rf_check_copyback_status_ext(raidPtr, data);
   1703 		return 0;
   1704 
   1705 	case RAIDFRAME_SET_LAST_UNIT:
   1706 		for (column = 0; column < raidPtr->numCol; column++)
   1707 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1708 				return EBUSY;
   1709 
   1710 		for (column = 0; column < raidPtr->numCol; column++) {
   1711 			clabel = raidget_component_label(raidPtr, column);
   1712 			clabel->last_unit = *(int *)data;
   1713 			raidflush_component_label(raidPtr, column);
   1714 		}
   1715 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1716 		return 0;
   1717 
   1718 		/* the sparetable daemon calls this to wait for the kernel to
   1719 		 * need a spare table. this ioctl does not return until a
   1720 		 * spare table is needed. XXX -- calling mpsleep here in the
   1721 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1722 		 * -- I should either compute the spare table in the kernel,
   1723 		 * or have a different -- XXX XXX -- interface (a different
   1724 		 * character device) for delivering the table     -- XXX */
   1725 #if 0
   1726 	case RAIDFRAME_SPARET_WAIT:
   1727 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1728 		while (!rf_sparet_wait_queue)
   1729 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1730 		waitreq = rf_sparet_wait_queue;
   1731 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1732 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1733 
   1734 		/* structure assignment */
   1735 		*((RF_SparetWait_t *) data) = *waitreq;
   1736 
   1737 		RF_Free(waitreq, sizeof(*waitreq));
   1738 		return (0);
   1739 
   1740 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1741 		 * code in it that will cause the dameon to exit */
   1742 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1743 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1744 		waitreq->fcol = -1;
   1745 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1746 		waitreq->next = rf_sparet_wait_queue;
   1747 		rf_sparet_wait_queue = waitreq;
   1748 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1749 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1750 		return (0);
   1751 
   1752 		/* used by the spare table daemon to deliver a spare table
   1753 		 * into the kernel */
   1754 	case RAIDFRAME_SEND_SPARET:
   1755 
   1756 		/* install the spare table */
   1757 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1758 
   1759 		/* respond to the requestor.  the return status of the spare
   1760 		 * table installation is passed in the "fcol" field */
   1761 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1762 		waitreq->fcol = retcode;
   1763 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1764 		waitreq->next = rf_sparet_resp_queue;
   1765 		rf_sparet_resp_queue = waitreq;
   1766 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1767 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1768 
   1769 		return (retcode);
   1770 #endif
   1771 
   1772 	default:
   1773 		break; /* fall through to the os-specific code below */
   1774 
   1775 	}
   1776 
   1777 	if (!raidPtr->valid)
   1778 		return (EINVAL);
   1779 
   1780 	/*
   1781 	 * Add support for "regular" device ioctls here.
   1782 	 */
   1783 
   1784 	switch (cmd) {
   1785 	case DIOCGCACHE:
   1786 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1787 		break;
   1788 
   1789 	case DIOCCACHESYNC:
   1790 		retcode = rf_sync_component_caches(raidPtr);
   1791 		break;
   1792 
   1793 	default:
   1794 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1795 		break;
   1796 	}
   1797 
   1798 	return (retcode);
   1799 
   1800 }
   1801 
   1802 
   1803 /* raidinit -- complete the rest of the initialization for the
   1804    RAIDframe device.  */
   1805 
   1806 
   1807 static void
   1808 raidinit(struct raid_softc *rs)
   1809 {
   1810 	cfdata_t cf;
   1811 	unsigned int unit;
   1812 	struct dk_softc *dksc = &rs->sc_dksc;
   1813 	RF_Raid_t *raidPtr = &rs->sc_r;
   1814 	device_t dev;
   1815 
   1816 	unit = raidPtr->raidid;
   1817 
   1818 	/* XXX doesn't check bounds. */
   1819 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1820 
   1821 	/* attach the pseudo device */
   1822 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1823 	cf->cf_name = raid_cd.cd_name;
   1824 	cf->cf_atname = raid_cd.cd_name;
   1825 	cf->cf_unit = unit;
   1826 	cf->cf_fstate = FSTATE_STAR;
   1827 
   1828 	dev = config_attach_pseudo(cf);
   1829 	if (dev == NULL) {
   1830 		printf("raid%d: config_attach_pseudo failed\n",
   1831 		    raidPtr->raidid);
   1832 		free(cf, M_RAIDFRAME);
   1833 		return;
   1834 	}
   1835 
   1836 	/* provide a backpointer to the real softc */
   1837 	raidsoftc(dev) = rs;
   1838 
   1839 	/* disk_attach actually creates space for the CPU disklabel, among
   1840 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1841 	 * with disklabels. */
   1842 	dk_init(dksc, dev, DKTYPE_RAID);
   1843 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1844 
   1845 	/* XXX There may be a weird interaction here between this, and
   1846 	 * protectedSectors, as used in RAIDframe.  */
   1847 
   1848 	rs->sc_size = raidPtr->totalSectors;
   1849 
   1850 	/* Attach dk and disk subsystems */
   1851 	dk_attach(dksc);
   1852 	disk_attach(&dksc->sc_dkdev);
   1853 	rf_set_geometry(rs, raidPtr);
   1854 
   1855 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1856 
   1857 	/* mark unit as usuable */
   1858 	rs->sc_flags |= RAIDF_INITED;
   1859 
   1860 	dkwedge_discover(&dksc->sc_dkdev);
   1861 }
   1862 
   1863 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1864 /* wake up the daemon & tell it to get us a spare table
   1865  * XXX
   1866  * the entries in the queues should be tagged with the raidPtr
   1867  * so that in the extremely rare case that two recons happen at once,
   1868  * we know for which device were requesting a spare table
   1869  * XXX
   1870  *
   1871  * XXX This code is not currently used. GO
   1872  */
   1873 int
   1874 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1875 {
   1876 	int     retcode;
   1877 
   1878 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1879 	req->next = rf_sparet_wait_queue;
   1880 	rf_sparet_wait_queue = req;
   1881 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1882 
   1883 	/* mpsleep unlocks the mutex */
   1884 	while (!rf_sparet_resp_queue) {
   1885 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1886 	}
   1887 	req = rf_sparet_resp_queue;
   1888 	rf_sparet_resp_queue = req->next;
   1889 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1890 
   1891 	retcode = req->fcol;
   1892 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1893 					 * alloc'd */
   1894 	return (retcode);
   1895 }
   1896 #endif
   1897 
   1898 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1899  * bp & passes it down.
   1900  * any calls originating in the kernel must use non-blocking I/O
   1901  * do some extra sanity checking to return "appropriate" error values for
   1902  * certain conditions (to make some standard utilities work)
   1903  *
   1904  * Formerly known as: rf_DoAccessKernel
   1905  */
   1906 void
   1907 raidstart(RF_Raid_t *raidPtr)
   1908 {
   1909 	struct raid_softc *rs;
   1910 	struct dk_softc *dksc;
   1911 
   1912 	rs = raidPtr->softc;
   1913 	dksc = &rs->sc_dksc;
   1914 	/* quick check to see if anything has died recently */
   1915 	rf_lock_mutex2(raidPtr->mutex);
   1916 	if (raidPtr->numNewFailures > 0) {
   1917 		rf_unlock_mutex2(raidPtr->mutex);
   1918 		rf_update_component_labels(raidPtr,
   1919 					   RF_NORMAL_COMPONENT_UPDATE);
   1920 		rf_lock_mutex2(raidPtr->mutex);
   1921 		raidPtr->numNewFailures--;
   1922 	}
   1923 	rf_unlock_mutex2(raidPtr->mutex);
   1924 
   1925 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1926 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1927 		return;
   1928 	}
   1929 
   1930 	dk_start(dksc, NULL);
   1931 }
   1932 
   1933 static int
   1934 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1935 {
   1936 	RF_SectorCount_t num_blocks, pb, sum;
   1937 	RF_RaidAddr_t raid_addr;
   1938 	daddr_t blocknum;
   1939 	int     do_async;
   1940 	int rc;
   1941 
   1942 	rf_lock_mutex2(raidPtr->mutex);
   1943 	if (raidPtr->openings == 0) {
   1944 		rf_unlock_mutex2(raidPtr->mutex);
   1945 		return EAGAIN;
   1946 	}
   1947 	rf_unlock_mutex2(raidPtr->mutex);
   1948 
   1949 	blocknum = bp->b_rawblkno;
   1950 
   1951 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1952 		    (int) blocknum));
   1953 
   1954 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1955 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1956 
   1957 	/* *THIS* is where we adjust what block we're going to...
   1958 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1959 	raid_addr = blocknum;
   1960 
   1961 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1962 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1963 	sum = raid_addr + num_blocks + pb;
   1964 	if (1 || rf_debugKernelAccess) {
   1965 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1966 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1967 			    (int) pb, (int) bp->b_resid));
   1968 	}
   1969 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1970 	    || (sum < num_blocks) || (sum < pb)) {
   1971 		rc = ENOSPC;
   1972 		goto done;
   1973 	}
   1974 	/*
   1975 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1976 	 */
   1977 
   1978 	if (bp->b_bcount & raidPtr->sectorMask) {
   1979 		rc = ENOSPC;
   1980 		goto done;
   1981 	}
   1982 	db1_printf(("Calling DoAccess..\n"));
   1983 
   1984 
   1985 	rf_lock_mutex2(raidPtr->mutex);
   1986 	raidPtr->openings--;
   1987 	rf_unlock_mutex2(raidPtr->mutex);
   1988 
   1989 	/*
   1990 	 * Everything is async.
   1991 	 */
   1992 	do_async = 1;
   1993 
   1994 	/* don't ever condition on bp->b_flags & B_WRITE.
   1995 	 * always condition on B_READ instead */
   1996 
   1997 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1998 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1999 			 do_async, raid_addr, num_blocks,
   2000 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2001 
   2002 done:
   2003 	return rc;
   2004 }
   2005 
   2006 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2007 
   2008 int
   2009 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2010 {
   2011 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2012 	struct buf *bp;
   2013 
   2014 	req->queue = queue;
   2015 	bp = req->bp;
   2016 
   2017 	switch (req->type) {
   2018 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2019 		/* XXX need to do something extra here.. */
   2020 		/* I'm leaving this in, as I've never actually seen it used,
   2021 		 * and I'd like folks to report it... GO */
   2022 		printf(("WAKEUP CALLED\n"));
   2023 		queue->numOutstanding++;
   2024 
   2025 		bp->b_flags = 0;
   2026 		bp->b_private = req;
   2027 
   2028 		KernelWakeupFunc(bp);
   2029 		break;
   2030 
   2031 	case RF_IO_TYPE_READ:
   2032 	case RF_IO_TYPE_WRITE:
   2033 #if RF_ACC_TRACE > 0
   2034 		if (req->tracerec) {
   2035 			RF_ETIMER_START(req->tracerec->timer);
   2036 		}
   2037 #endif
   2038 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2039 		    op, queue->rf_cinfo->ci_dev,
   2040 		    req->sectorOffset, req->numSector,
   2041 		    req->buf, KernelWakeupFunc, (void *) req,
   2042 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2043 
   2044 		if (rf_debugKernelAccess) {
   2045 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2046 				(long) bp->b_blkno));
   2047 		}
   2048 		queue->numOutstanding++;
   2049 		queue->last_deq_sector = req->sectorOffset;
   2050 		/* acc wouldn't have been let in if there were any pending
   2051 		 * reqs at any other priority */
   2052 		queue->curPriority = req->priority;
   2053 
   2054 		db1_printf(("Going for %c to unit %d col %d\n",
   2055 			    req->type, queue->raidPtr->raidid,
   2056 			    queue->col));
   2057 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2058 			(int) req->sectorOffset, (int) req->numSector,
   2059 			(int) (req->numSector <<
   2060 			    queue->raidPtr->logBytesPerSector),
   2061 			(int) queue->raidPtr->logBytesPerSector));
   2062 
   2063 		/*
   2064 		 * XXX: drop lock here since this can block at
   2065 		 * least with backing SCSI devices.  Retake it
   2066 		 * to minimize fuss with calling interfaces.
   2067 		 */
   2068 
   2069 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2070 		bdev_strategy(bp);
   2071 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2072 		break;
   2073 
   2074 	default:
   2075 		panic("bad req->type in rf_DispatchKernelIO");
   2076 	}
   2077 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2078 
   2079 	return (0);
   2080 }
   2081 /* this is the callback function associated with a I/O invoked from
   2082    kernel code.
   2083  */
   2084 static void
   2085 KernelWakeupFunc(struct buf *bp)
   2086 {
   2087 	RF_DiskQueueData_t *req = NULL;
   2088 	RF_DiskQueue_t *queue;
   2089 
   2090 	db1_printf(("recovering the request queue:\n"));
   2091 
   2092 	req = bp->b_private;
   2093 
   2094 	queue = (RF_DiskQueue_t *) req->queue;
   2095 
   2096 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2097 
   2098 #if RF_ACC_TRACE > 0
   2099 	if (req->tracerec) {
   2100 		RF_ETIMER_STOP(req->tracerec->timer);
   2101 		RF_ETIMER_EVAL(req->tracerec->timer);
   2102 		rf_lock_mutex2(rf_tracing_mutex);
   2103 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2104 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2105 		req->tracerec->num_phys_ios++;
   2106 		rf_unlock_mutex2(rf_tracing_mutex);
   2107 	}
   2108 #endif
   2109 
   2110 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2111 	 * ballistic, and mark the component as hosed... */
   2112 
   2113 	if (bp->b_error != 0) {
   2114 		/* Mark the disk as dead */
   2115 		/* but only mark it once... */
   2116 		/* and only if it wouldn't leave this RAID set
   2117 		   completely broken */
   2118 		if (((queue->raidPtr->Disks[queue->col].status ==
   2119 		      rf_ds_optimal) ||
   2120 		     (queue->raidPtr->Disks[queue->col].status ==
   2121 		      rf_ds_used_spare)) &&
   2122 		     (queue->raidPtr->numFailures <
   2123 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2124 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2125 			       queue->raidPtr->raidid,
   2126 			       bp->b_error,
   2127 			       queue->raidPtr->Disks[queue->col].devname);
   2128 			queue->raidPtr->Disks[queue->col].status =
   2129 			    rf_ds_failed;
   2130 			queue->raidPtr->status = rf_rs_degraded;
   2131 			queue->raidPtr->numFailures++;
   2132 			queue->raidPtr->numNewFailures++;
   2133 		} else {	/* Disk is already dead... */
   2134 			/* printf("Disk already marked as dead!\n"); */
   2135 		}
   2136 
   2137 	}
   2138 
   2139 	/* Fill in the error value */
   2140 	req->error = bp->b_error;
   2141 
   2142 	/* Drop this one on the "finished" queue... */
   2143 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2144 
   2145 	/* Let the raidio thread know there is work to be done. */
   2146 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2147 
   2148 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2149 }
   2150 
   2151 
   2152 /*
   2153  * initialize a buf structure for doing an I/O in the kernel.
   2154  */
   2155 static void
   2156 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2157        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2158        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2159        struct proc *b_proc)
   2160 {
   2161 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2162 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2163 	bp->b_oflags = 0;
   2164 	bp->b_cflags = 0;
   2165 	bp->b_bcount = numSect << logBytesPerSector;
   2166 	bp->b_bufsize = bp->b_bcount;
   2167 	bp->b_error = 0;
   2168 	bp->b_dev = dev;
   2169 	bp->b_data = bf;
   2170 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2171 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2172 	if (bp->b_bcount == 0) {
   2173 		panic("bp->b_bcount is zero in InitBP!!");
   2174 	}
   2175 	bp->b_proc = b_proc;
   2176 	bp->b_iodone = cbFunc;
   2177 	bp->b_private = cbArg;
   2178 }
   2179 
   2180 /*
   2181  * Wait interruptibly for an exclusive lock.
   2182  *
   2183  * XXX
   2184  * Several drivers do this; it should be abstracted and made MP-safe.
   2185  * (Hmm... where have we seen this warning before :->  GO )
   2186  */
   2187 static int
   2188 raidlock(struct raid_softc *rs)
   2189 {
   2190 	int     error;
   2191 
   2192 	error = 0;
   2193 	mutex_enter(&rs->sc_mutex);
   2194 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2195 		rs->sc_flags |= RAIDF_WANTED;
   2196 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2197 		if (error != 0)
   2198 			goto done;
   2199 	}
   2200 	rs->sc_flags |= RAIDF_LOCKED;
   2201 done:
   2202 	mutex_exit(&rs->sc_mutex);
   2203 	return (error);
   2204 }
   2205 /*
   2206  * Unlock and wake up any waiters.
   2207  */
   2208 static void
   2209 raidunlock(struct raid_softc *rs)
   2210 {
   2211 
   2212 	mutex_enter(&rs->sc_mutex);
   2213 	rs->sc_flags &= ~RAIDF_LOCKED;
   2214 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2215 		rs->sc_flags &= ~RAIDF_WANTED;
   2216 		cv_broadcast(&rs->sc_cv);
   2217 	}
   2218 	mutex_exit(&rs->sc_mutex);
   2219 }
   2220 
   2221 
   2222 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2223 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2224 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2225 
   2226 static daddr_t
   2227 rf_component_info_offset(void)
   2228 {
   2229 
   2230 	return RF_COMPONENT_INFO_OFFSET;
   2231 }
   2232 
   2233 static daddr_t
   2234 rf_component_info_size(unsigned secsize)
   2235 {
   2236 	daddr_t info_size;
   2237 
   2238 	KASSERT(secsize);
   2239 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2240 		info_size = secsize;
   2241 	else
   2242 		info_size = RF_COMPONENT_INFO_SIZE;
   2243 
   2244 	return info_size;
   2245 }
   2246 
   2247 static daddr_t
   2248 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2249 {
   2250 	daddr_t map_offset;
   2251 
   2252 	KASSERT(raidPtr->bytesPerSector);
   2253 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2254 		map_offset = raidPtr->bytesPerSector;
   2255 	else
   2256 		map_offset = RF_COMPONENT_INFO_SIZE;
   2257 	map_offset += rf_component_info_offset();
   2258 
   2259 	return map_offset;
   2260 }
   2261 
   2262 static daddr_t
   2263 rf_parity_map_size(RF_Raid_t *raidPtr)
   2264 {
   2265 	daddr_t map_size;
   2266 
   2267 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2268 		map_size = raidPtr->bytesPerSector;
   2269 	else
   2270 		map_size = RF_PARITY_MAP_SIZE;
   2271 
   2272 	return map_size;
   2273 }
   2274 
   2275 int
   2276 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2277 {
   2278 	RF_ComponentLabel_t *clabel;
   2279 
   2280 	clabel = raidget_component_label(raidPtr, col);
   2281 	clabel->clean = RF_RAID_CLEAN;
   2282 	raidflush_component_label(raidPtr, col);
   2283 	return(0);
   2284 }
   2285 
   2286 
   2287 int
   2288 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2289 {
   2290 	RF_ComponentLabel_t *clabel;
   2291 
   2292 	clabel = raidget_component_label(raidPtr, col);
   2293 	clabel->clean = RF_RAID_DIRTY;
   2294 	raidflush_component_label(raidPtr, col);
   2295 	return(0);
   2296 }
   2297 
   2298 int
   2299 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2300 {
   2301 	KASSERT(raidPtr->bytesPerSector);
   2302 	return raidread_component_label(raidPtr->bytesPerSector,
   2303 	    raidPtr->Disks[col].dev,
   2304 	    raidPtr->raid_cinfo[col].ci_vp,
   2305 	    &raidPtr->raid_cinfo[col].ci_label);
   2306 }
   2307 
   2308 RF_ComponentLabel_t *
   2309 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2310 {
   2311 	return &raidPtr->raid_cinfo[col].ci_label;
   2312 }
   2313 
   2314 int
   2315 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2316 {
   2317 	RF_ComponentLabel_t *label;
   2318 
   2319 	label = &raidPtr->raid_cinfo[col].ci_label;
   2320 	label->mod_counter = raidPtr->mod_counter;
   2321 #ifndef RF_NO_PARITY_MAP
   2322 	label->parity_map_modcount = label->mod_counter;
   2323 #endif
   2324 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2325 	    raidPtr->Disks[col].dev,
   2326 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2327 }
   2328 
   2329 
   2330 static int
   2331 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2332     RF_ComponentLabel_t *clabel)
   2333 {
   2334 	return raidread_component_area(dev, b_vp, clabel,
   2335 	    sizeof(RF_ComponentLabel_t),
   2336 	    rf_component_info_offset(),
   2337 	    rf_component_info_size(secsize));
   2338 }
   2339 
   2340 /* ARGSUSED */
   2341 static int
   2342 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2343     size_t msize, daddr_t offset, daddr_t dsize)
   2344 {
   2345 	struct buf *bp;
   2346 	int error;
   2347 
   2348 	/* XXX should probably ensure that we don't try to do this if
   2349 	   someone has changed rf_protected_sectors. */
   2350 
   2351 	if (b_vp == NULL) {
   2352 		/* For whatever reason, this component is not valid.
   2353 		   Don't try to read a component label from it. */
   2354 		return(EINVAL);
   2355 	}
   2356 
   2357 	/* get a block of the appropriate size... */
   2358 	bp = geteblk((int)dsize);
   2359 	bp->b_dev = dev;
   2360 
   2361 	/* get our ducks in a row for the read */
   2362 	bp->b_blkno = offset / DEV_BSIZE;
   2363 	bp->b_bcount = dsize;
   2364 	bp->b_flags |= B_READ;
   2365  	bp->b_resid = dsize;
   2366 
   2367 	bdev_strategy(bp);
   2368 	error = biowait(bp);
   2369 
   2370 	if (!error) {
   2371 		memcpy(data, bp->b_data, msize);
   2372 	}
   2373 
   2374 	brelse(bp, 0);
   2375 	return(error);
   2376 }
   2377 
   2378 
   2379 static int
   2380 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2381     RF_ComponentLabel_t *clabel)
   2382 {
   2383 	return raidwrite_component_area(dev, b_vp, clabel,
   2384 	    sizeof(RF_ComponentLabel_t),
   2385 	    rf_component_info_offset(),
   2386 	    rf_component_info_size(secsize), 0);
   2387 }
   2388 
   2389 /* ARGSUSED */
   2390 static int
   2391 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2392     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2393 {
   2394 	struct buf *bp;
   2395 	int error;
   2396 
   2397 	/* get a block of the appropriate size... */
   2398 	bp = geteblk((int)dsize);
   2399 	bp->b_dev = dev;
   2400 
   2401 	/* get our ducks in a row for the write */
   2402 	bp->b_blkno = offset / DEV_BSIZE;
   2403 	bp->b_bcount = dsize;
   2404 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2405  	bp->b_resid = dsize;
   2406 
   2407 	memset(bp->b_data, 0, dsize);
   2408 	memcpy(bp->b_data, data, msize);
   2409 
   2410 	bdev_strategy(bp);
   2411 	if (asyncp)
   2412 		return 0;
   2413 	error = biowait(bp);
   2414 	brelse(bp, 0);
   2415 	if (error) {
   2416 #if 1
   2417 		printf("Failed to write RAID component info!\n");
   2418 #endif
   2419 	}
   2420 
   2421 	return(error);
   2422 }
   2423 
   2424 void
   2425 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2426 {
   2427 	int c;
   2428 
   2429 	for (c = 0; c < raidPtr->numCol; c++) {
   2430 		/* Skip dead disks. */
   2431 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2432 			continue;
   2433 		/* XXXjld: what if an error occurs here? */
   2434 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2435 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2436 		    RF_PARITYMAP_NBYTE,
   2437 		    rf_parity_map_offset(raidPtr),
   2438 		    rf_parity_map_size(raidPtr), 0);
   2439 	}
   2440 }
   2441 
   2442 void
   2443 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2444 {
   2445 	struct rf_paritymap_ondisk tmp;
   2446 	int c,first;
   2447 
   2448 	first=1;
   2449 	for (c = 0; c < raidPtr->numCol; c++) {
   2450 		/* Skip dead disks. */
   2451 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2452 			continue;
   2453 		raidread_component_area(raidPtr->Disks[c].dev,
   2454 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2455 		    RF_PARITYMAP_NBYTE,
   2456 		    rf_parity_map_offset(raidPtr),
   2457 		    rf_parity_map_size(raidPtr));
   2458 		if (first) {
   2459 			memcpy(map, &tmp, sizeof(*map));
   2460 			first = 0;
   2461 		} else {
   2462 			rf_paritymap_merge(map, &tmp);
   2463 		}
   2464 	}
   2465 }
   2466 
   2467 void
   2468 rf_markalldirty(RF_Raid_t *raidPtr)
   2469 {
   2470 	RF_ComponentLabel_t *clabel;
   2471 	int sparecol;
   2472 	int c;
   2473 	int j;
   2474 	int scol = -1;
   2475 
   2476 	raidPtr->mod_counter++;
   2477 	for (c = 0; c < raidPtr->numCol; c++) {
   2478 		/* we don't want to touch (at all) a disk that has
   2479 		   failed */
   2480 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2481 			clabel = raidget_component_label(raidPtr, c);
   2482 			if (clabel->status == rf_ds_spared) {
   2483 				/* XXX do something special...
   2484 				   but whatever you do, don't
   2485 				   try to access it!! */
   2486 			} else {
   2487 				raidmarkdirty(raidPtr, c);
   2488 			}
   2489 		}
   2490 	}
   2491 
   2492 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2493 		sparecol = raidPtr->numCol + c;
   2494 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2495 			/*
   2496 
   2497 			   we claim this disk is "optimal" if it's
   2498 			   rf_ds_used_spare, as that means it should be
   2499 			   directly substitutable for the disk it replaced.
   2500 			   We note that too...
   2501 
   2502 			 */
   2503 
   2504 			for(j=0;j<raidPtr->numCol;j++) {
   2505 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2506 					scol = j;
   2507 					break;
   2508 				}
   2509 			}
   2510 
   2511 			clabel = raidget_component_label(raidPtr, sparecol);
   2512 			/* make sure status is noted */
   2513 
   2514 			raid_init_component_label(raidPtr, clabel);
   2515 
   2516 			clabel->row = 0;
   2517 			clabel->column = scol;
   2518 			/* Note: we *don't* change status from rf_ds_used_spare
   2519 			   to rf_ds_optimal */
   2520 			/* clabel.status = rf_ds_optimal; */
   2521 
   2522 			raidmarkdirty(raidPtr, sparecol);
   2523 		}
   2524 	}
   2525 }
   2526 
   2527 
   2528 void
   2529 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2530 {
   2531 	RF_ComponentLabel_t *clabel;
   2532 	int sparecol;
   2533 	int c;
   2534 	int j;
   2535 	int scol;
   2536 	struct raid_softc *rs = raidPtr->softc;
   2537 
   2538 	scol = -1;
   2539 
   2540 	/* XXX should do extra checks to make sure things really are clean,
   2541 	   rather than blindly setting the clean bit... */
   2542 
   2543 	raidPtr->mod_counter++;
   2544 
   2545 	for (c = 0; c < raidPtr->numCol; c++) {
   2546 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2547 			clabel = raidget_component_label(raidPtr, c);
   2548 			/* make sure status is noted */
   2549 			clabel->status = rf_ds_optimal;
   2550 
   2551 			/* note what unit we are configured as */
   2552 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2553 				clabel->last_unit = raidPtr->raidid;
   2554 
   2555 			raidflush_component_label(raidPtr, c);
   2556 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2557 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2558 					raidmarkclean(raidPtr, c);
   2559 				}
   2560 			}
   2561 		}
   2562 		/* else we don't touch it.. */
   2563 	}
   2564 
   2565 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2566 		sparecol = raidPtr->numCol + c;
   2567 		/* Need to ensure that the reconstruct actually completed! */
   2568 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2569 			/*
   2570 
   2571 			   we claim this disk is "optimal" if it's
   2572 			   rf_ds_used_spare, as that means it should be
   2573 			   directly substitutable for the disk it replaced.
   2574 			   We note that too...
   2575 
   2576 			 */
   2577 
   2578 			for(j=0;j<raidPtr->numCol;j++) {
   2579 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2580 					scol = j;
   2581 					break;
   2582 				}
   2583 			}
   2584 
   2585 			/* XXX shouldn't *really* need this... */
   2586 			clabel = raidget_component_label(raidPtr, sparecol);
   2587 			/* make sure status is noted */
   2588 
   2589 			raid_init_component_label(raidPtr, clabel);
   2590 
   2591 			clabel->column = scol;
   2592 			clabel->status = rf_ds_optimal;
   2593 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2594 				clabel->last_unit = raidPtr->raidid;
   2595 
   2596 			raidflush_component_label(raidPtr, sparecol);
   2597 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2598 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2599 					raidmarkclean(raidPtr, sparecol);
   2600 				}
   2601 			}
   2602 		}
   2603 	}
   2604 }
   2605 
   2606 void
   2607 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2608 {
   2609 
   2610 	if (vp != NULL) {
   2611 		if (auto_configured == 1) {
   2612 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2613 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2614 			vput(vp);
   2615 
   2616 		} else {
   2617 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2618 		}
   2619 	}
   2620 }
   2621 
   2622 
   2623 void
   2624 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2625 {
   2626 	int r,c;
   2627 	struct vnode *vp;
   2628 	int acd;
   2629 
   2630 
   2631 	/* We take this opportunity to close the vnodes like we should.. */
   2632 
   2633 	for (c = 0; c < raidPtr->numCol; c++) {
   2634 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2635 		acd = raidPtr->Disks[c].auto_configured;
   2636 		rf_close_component(raidPtr, vp, acd);
   2637 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2638 		raidPtr->Disks[c].auto_configured = 0;
   2639 	}
   2640 
   2641 	for (r = 0; r < raidPtr->numSpare; r++) {
   2642 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2643 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2644 		rf_close_component(raidPtr, vp, acd);
   2645 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2646 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2647 	}
   2648 }
   2649 
   2650 
   2651 void
   2652 rf_ReconThread(struct rf_recon_req_internal *req)
   2653 {
   2654 	int     s;
   2655 	RF_Raid_t *raidPtr;
   2656 
   2657 	s = splbio();
   2658 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2659 	raidPtr->recon_in_progress = 1;
   2660 
   2661 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2662 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2663 
   2664 	RF_Free(req, sizeof(*req));
   2665 
   2666 	raidPtr->recon_in_progress = 0;
   2667 	splx(s);
   2668 
   2669 	/* That's all... */
   2670 	kthread_exit(0);	/* does not return */
   2671 }
   2672 
   2673 void
   2674 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2675 {
   2676 	int retcode;
   2677 	int s;
   2678 
   2679 	raidPtr->parity_rewrite_stripes_done = 0;
   2680 	raidPtr->parity_rewrite_in_progress = 1;
   2681 	s = splbio();
   2682 	retcode = rf_RewriteParity(raidPtr);
   2683 	splx(s);
   2684 	if (retcode) {
   2685 		printf("raid%d: Error re-writing parity (%d)!\n",
   2686 		    raidPtr->raidid, retcode);
   2687 	} else {
   2688 		/* set the clean bit!  If we shutdown correctly,
   2689 		   the clean bit on each component label will get
   2690 		   set */
   2691 		raidPtr->parity_good = RF_RAID_CLEAN;
   2692 	}
   2693 	raidPtr->parity_rewrite_in_progress = 0;
   2694 
   2695 	/* Anyone waiting for us to stop?  If so, inform them... */
   2696 	if (raidPtr->waitShutdown) {
   2697 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2698 	}
   2699 
   2700 	/* That's all... */
   2701 	kthread_exit(0);	/* does not return */
   2702 }
   2703 
   2704 
   2705 void
   2706 rf_CopybackThread(RF_Raid_t *raidPtr)
   2707 {
   2708 	int s;
   2709 
   2710 	raidPtr->copyback_in_progress = 1;
   2711 	s = splbio();
   2712 	rf_CopybackReconstructedData(raidPtr);
   2713 	splx(s);
   2714 	raidPtr->copyback_in_progress = 0;
   2715 
   2716 	/* That's all... */
   2717 	kthread_exit(0);	/* does not return */
   2718 }
   2719 
   2720 
   2721 void
   2722 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2723 {
   2724 	int s;
   2725 	RF_Raid_t *raidPtr;
   2726 
   2727 	s = splbio();
   2728 	raidPtr = req->raidPtr;
   2729 	raidPtr->recon_in_progress = 1;
   2730 	rf_ReconstructInPlace(raidPtr, req->col);
   2731 	RF_Free(req, sizeof(*req));
   2732 	raidPtr->recon_in_progress = 0;
   2733 	splx(s);
   2734 
   2735 	/* That's all... */
   2736 	kthread_exit(0);	/* does not return */
   2737 }
   2738 
   2739 static RF_AutoConfig_t *
   2740 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2741     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2742     unsigned secsize)
   2743 {
   2744 	int good_one = 0;
   2745 	RF_ComponentLabel_t *clabel;
   2746 	RF_AutoConfig_t *ac;
   2747 
   2748 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2749 	if (clabel == NULL) {
   2750 oomem:
   2751 		    while(ac_list) {
   2752 			    ac = ac_list;
   2753 			    if (ac->clabel)
   2754 				    free(ac->clabel, M_RAIDFRAME);
   2755 			    ac_list = ac_list->next;
   2756 			    free(ac, M_RAIDFRAME);
   2757 		    }
   2758 		    printf("RAID auto config: out of memory!\n");
   2759 		    return NULL; /* XXX probably should panic? */
   2760 	}
   2761 
   2762 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2763 		/* Got the label.  Does it look reasonable? */
   2764 		if (rf_reasonable_label(clabel, numsecs) &&
   2765 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2766 #ifdef DEBUG
   2767 			printf("Component on: %s: %llu\n",
   2768 				cname, (unsigned long long)size);
   2769 			rf_print_component_label(clabel);
   2770 #endif
   2771 			/* if it's reasonable, add it, else ignore it. */
   2772 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2773 				M_NOWAIT);
   2774 			if (ac == NULL) {
   2775 				free(clabel, M_RAIDFRAME);
   2776 				goto oomem;
   2777 			}
   2778 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2779 			ac->dev = dev;
   2780 			ac->vp = vp;
   2781 			ac->clabel = clabel;
   2782 			ac->next = ac_list;
   2783 			ac_list = ac;
   2784 			good_one = 1;
   2785 		}
   2786 	}
   2787 	if (!good_one) {
   2788 		/* cleanup */
   2789 		free(clabel, M_RAIDFRAME);
   2790 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2791 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2792 		vput(vp);
   2793 	}
   2794 	return ac_list;
   2795 }
   2796 
   2797 RF_AutoConfig_t *
   2798 rf_find_raid_components(void)
   2799 {
   2800 	struct vnode *vp;
   2801 	struct disklabel label;
   2802 	device_t dv;
   2803 	deviter_t di;
   2804 	dev_t dev;
   2805 	int bmajor, bminor, wedge, rf_part_found;
   2806 	int error;
   2807 	int i;
   2808 	RF_AutoConfig_t *ac_list;
   2809 	uint64_t numsecs;
   2810 	unsigned secsize;
   2811 	int dowedges;
   2812 
   2813 	/* initialize the AutoConfig list */
   2814 	ac_list = NULL;
   2815 
   2816 	/*
   2817 	 * we begin by trolling through *all* the devices on the system *twice*
   2818 	 * first we scan for wedges, second for other devices. This avoids
   2819 	 * using a raw partition instead of a wedge that covers the whole disk
   2820 	 */
   2821 
   2822 	for (dowedges=1; dowedges>=0; --dowedges) {
   2823 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2824 		     dv = deviter_next(&di)) {
   2825 
   2826 			/* we are only interested in disks... */
   2827 			if (device_class(dv) != DV_DISK)
   2828 				continue;
   2829 
   2830 			/* we don't care about floppies... */
   2831 			if (device_is_a(dv, "fd")) {
   2832 				continue;
   2833 			}
   2834 
   2835 			/* we don't care about CD's... */
   2836 			if (device_is_a(dv, "cd")) {
   2837 				continue;
   2838 			}
   2839 
   2840 			/* we don't care about md's... */
   2841 			if (device_is_a(dv, "md")) {
   2842 				continue;
   2843 			}
   2844 
   2845 			/* hdfd is the Atari/Hades floppy driver */
   2846 			if (device_is_a(dv, "hdfd")) {
   2847 				continue;
   2848 			}
   2849 
   2850 			/* fdisa is the Atari/Milan floppy driver */
   2851 			if (device_is_a(dv, "fdisa")) {
   2852 				continue;
   2853 			}
   2854 
   2855 			/* are we in the wedges pass ? */
   2856 			wedge = device_is_a(dv, "dk");
   2857 			if (wedge != dowedges) {
   2858 				continue;
   2859 			}
   2860 
   2861 			/* need to find the device_name_to_block_device_major stuff */
   2862 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2863 
   2864 			rf_part_found = 0; /*No raid partition as yet*/
   2865 
   2866 			/* get a vnode for the raw partition of this disk */
   2867 			bminor = minor(device_unit(dv));
   2868 			dev = wedge ? makedev(bmajor, bminor) :
   2869 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2870 			if (bdevvp(dev, &vp))
   2871 				panic("RAID can't alloc vnode");
   2872 
   2873 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2874 
   2875 			if (error) {
   2876 				/* "Who cares."  Continue looking
   2877 				   for something that exists*/
   2878 				vput(vp);
   2879 				continue;
   2880 			}
   2881 
   2882 			error = getdisksize(vp, &numsecs, &secsize);
   2883 			if (error) {
   2884 				/*
   2885 				 * Pseudo devices like vnd and cgd can be
   2886 				 * opened but may still need some configuration.
   2887 				 * Ignore these quietly.
   2888 				 */
   2889 				if (error != ENXIO)
   2890 					printf("RAIDframe: can't get disk size"
   2891 					    " for dev %s (%d)\n",
   2892 					    device_xname(dv), error);
   2893 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2894 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2895 				vput(vp);
   2896 				continue;
   2897 			}
   2898 			if (wedge) {
   2899 				struct dkwedge_info dkw;
   2900 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2901 				    NOCRED);
   2902 				if (error) {
   2903 					printf("RAIDframe: can't get wedge info for "
   2904 					    "dev %s (%d)\n", device_xname(dv), error);
   2905 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2906 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2907 					vput(vp);
   2908 					continue;
   2909 				}
   2910 
   2911 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2912 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2913 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2914 					vput(vp);
   2915 					continue;
   2916 				}
   2917 
   2918 				ac_list = rf_get_component(ac_list, dev, vp,
   2919 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2920 				rf_part_found = 1; /*There is a raid component on this disk*/
   2921 				continue;
   2922 			}
   2923 
   2924 			/* Ok, the disk exists.  Go get the disklabel. */
   2925 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2926 			if (error) {
   2927 				/*
   2928 				 * XXX can't happen - open() would
   2929 				 * have errored out (or faked up one)
   2930 				 */
   2931 				if (error != ENOTTY)
   2932 					printf("RAIDframe: can't get label for dev "
   2933 					    "%s (%d)\n", device_xname(dv), error);
   2934 			}
   2935 
   2936 			/* don't need this any more.  We'll allocate it again
   2937 			   a little later if we really do... */
   2938 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2939 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2940 			vput(vp);
   2941 
   2942 			if (error)
   2943 				continue;
   2944 
   2945 			rf_part_found = 0; /*No raid partitions yet*/
   2946 			for (i = 0; i < label.d_npartitions; i++) {
   2947 				char cname[sizeof(ac_list->devname)];
   2948 
   2949 				/* We only support partitions marked as RAID */
   2950 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2951 					continue;
   2952 
   2953 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2954 				if (bdevvp(dev, &vp))
   2955 					panic("RAID can't alloc vnode");
   2956 
   2957 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2958 				if (error) {
   2959 					/* Whatever... */
   2960 					vput(vp);
   2961 					continue;
   2962 				}
   2963 				snprintf(cname, sizeof(cname), "%s%c",
   2964 				    device_xname(dv), 'a' + i);
   2965 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2966 					label.d_partitions[i].p_size, numsecs, secsize);
   2967 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2968 			}
   2969 
   2970 			/*
   2971 			 *If there is no raid component on this disk, either in a
   2972 			 *disklabel or inside a wedge, check the raw partition as well,
   2973 			 *as it is possible to configure raid components on raw disk
   2974 			 *devices.
   2975 			 */
   2976 
   2977 			if (!rf_part_found) {
   2978 				char cname[sizeof(ac_list->devname)];
   2979 
   2980 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2981 				if (bdevvp(dev, &vp))
   2982 					panic("RAID can't alloc vnode");
   2983 
   2984 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2985 				if (error) {
   2986 					/* Whatever... */
   2987 					vput(vp);
   2988 					continue;
   2989 				}
   2990 				snprintf(cname, sizeof(cname), "%s%c",
   2991 				    device_xname(dv), 'a' + RAW_PART);
   2992 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2993 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2994 			}
   2995 		}
   2996 		deviter_release(&di);
   2997 	}
   2998 	return ac_list;
   2999 }
   3000 
   3001 
   3002 int
   3003 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3004 {
   3005 
   3006 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3007 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3008 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3009 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3010 	    clabel->row >=0 &&
   3011 	    clabel->column >= 0 &&
   3012 	    clabel->num_rows > 0 &&
   3013 	    clabel->num_columns > 0 &&
   3014 	    clabel->row < clabel->num_rows &&
   3015 	    clabel->column < clabel->num_columns &&
   3016 	    clabel->blockSize > 0 &&
   3017 	    /*
   3018 	     * numBlocksHi may contain garbage, but it is ok since
   3019 	     * the type is unsigned.  If it is really garbage,
   3020 	     * rf_fix_old_label_size() will fix it.
   3021 	     */
   3022 	    rf_component_label_numblocks(clabel) > 0) {
   3023 		/*
   3024 		 * label looks reasonable enough...
   3025 		 * let's make sure it has no old garbage.
   3026 		 */
   3027 		if (numsecs)
   3028 			rf_fix_old_label_size(clabel, numsecs);
   3029 		return(1);
   3030 	}
   3031 	return(0);
   3032 }
   3033 
   3034 
   3035 /*
   3036  * For reasons yet unknown, some old component labels have garbage in
   3037  * the newer numBlocksHi region, and this causes lossage.  Since those
   3038  * disks will also have numsecs set to less than 32 bits of sectors,
   3039  * we can determine when this corruption has occurred, and fix it.
   3040  *
   3041  * The exact same problem, with the same unknown reason, happens to
   3042  * the partitionSizeHi member as well.
   3043  */
   3044 static void
   3045 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3046 {
   3047 
   3048 	if (numsecs < ((uint64_t)1 << 32)) {
   3049 		if (clabel->numBlocksHi) {
   3050 			printf("WARNING: total sectors < 32 bits, yet "
   3051 			       "numBlocksHi set\n"
   3052 			       "WARNING: resetting numBlocksHi to zero.\n");
   3053 			clabel->numBlocksHi = 0;
   3054 		}
   3055 
   3056 		if (clabel->partitionSizeHi) {
   3057 			printf("WARNING: total sectors < 32 bits, yet "
   3058 			       "partitionSizeHi set\n"
   3059 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3060 			clabel->partitionSizeHi = 0;
   3061 		}
   3062 	}
   3063 }
   3064 
   3065 
#ifdef DEBUG
/*
 * Print the contents of a component label, one group of fields per
 * line.  Only compiled when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Names for root_partition, indexed by its low two bits. */
	static const char *rootpart_names[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblocks;
	const char *clean_str;
	const char *autoconf_str;

	nblocks = rf_component_label_numblocks(clabel);
	clean_str = clabel->clean ? "Yes" : "No";
	autoconf_str = clabel->autoconfigure ? "Yes" : "No";

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number, clabel->mod_counter);
	printf("   Clean: %s Status: %d\n", clean_str, clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, nblocks);
	printf("   Autoconfig: %s\n", autoconf_str);
	printf("   Root partition: %s\n",
	    rootpart_names[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
   3099 
   3100 RF_ConfigSet_t *
   3101 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3102 {
   3103 	RF_AutoConfig_t *ac;
   3104 	RF_ConfigSet_t *config_sets;
   3105 	RF_ConfigSet_t *cset;
   3106 	RF_AutoConfig_t *ac_next;
   3107 
   3108 
   3109 	config_sets = NULL;
   3110 
   3111 	/* Go through the AutoConfig list, and figure out which components
   3112 	   belong to what sets.  */
   3113 	ac = ac_list;
   3114 	while(ac!=NULL) {
   3115 		/* we're going to putz with ac->next, so save it here
   3116 		   for use at the end of the loop */
   3117 		ac_next = ac->next;
   3118 
   3119 		if (config_sets == NULL) {
   3120 			/* will need at least this one... */
   3121 			config_sets = (RF_ConfigSet_t *)
   3122 				malloc(sizeof(RF_ConfigSet_t),
   3123 				       M_RAIDFRAME, M_NOWAIT);
   3124 			if (config_sets == NULL) {
   3125 				panic("rf_create_auto_sets: No memory!");
   3126 			}
   3127 			/* this one is easy :) */
   3128 			config_sets->ac = ac;
   3129 			config_sets->next = NULL;
   3130 			config_sets->rootable = 0;
   3131 			ac->next = NULL;
   3132 		} else {
   3133 			/* which set does this component fit into? */
   3134 			cset = config_sets;
   3135 			while(cset!=NULL) {
   3136 				if (rf_does_it_fit(cset, ac)) {
   3137 					/* looks like it matches... */
   3138 					ac->next = cset->ac;
   3139 					cset->ac = ac;
   3140 					break;
   3141 				}
   3142 				cset = cset->next;
   3143 			}
   3144 			if (cset==NULL) {
   3145 				/* didn't find a match above... new set..*/
   3146 				cset = (RF_ConfigSet_t *)
   3147 					malloc(sizeof(RF_ConfigSet_t),
   3148 					       M_RAIDFRAME, M_NOWAIT);
   3149 				if (cset == NULL) {
   3150 					panic("rf_create_auto_sets: No memory!");
   3151 				}
   3152 				cset->ac = ac;
   3153 				ac->next = NULL;
   3154 				cset->next = config_sets;
   3155 				cset->rootable = 0;
   3156 				config_sets = cset;
   3157 			}
   3158 		}
   3159 		ac = ac_next;
   3160 	}
   3161 
   3162 
   3163 	return(config_sets);
   3164 }
   3165 
   3166 static int
   3167 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3168 {
   3169 	RF_ComponentLabel_t *clabel1, *clabel2;
   3170 
   3171 	/* If this one matches the *first* one in the set, that's good
   3172 	   enough, since the other members of the set would have been
   3173 	   through here too... */
   3174 	/* note that we are not checking partitionSize here..
   3175 
   3176 	   Note that we are also not checking the mod_counters here.
   3177 	   If everything else matches except the mod_counter, that's
   3178 	   good enough for this test.  We will deal with the mod_counters
   3179 	   a little later in the autoconfiguration process.
   3180 
   3181 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3182 
   3183 	   The reason we don't check for this is that failed disks
   3184 	   will have lower modification counts.  If those disks are
   3185 	   not added to the set they used to belong to, then they will
   3186 	   form their own set, which may result in 2 different sets,
   3187 	   for example, competing to be configured at raid0, and
   3188 	   perhaps competing to be the root filesystem set.  If the
   3189 	   wrong ones get configured, or both attempt to become /,
   3190 	   weird behaviour and or serious lossage will occur.  Thus we
   3191 	   need to bring them into the fold here, and kick them out at
   3192 	   a later point.
   3193 
   3194 	*/
   3195 
   3196 	clabel1 = cset->ac->clabel;
   3197 	clabel2 = ac->clabel;
   3198 	if ((clabel1->version == clabel2->version) &&
   3199 	    (clabel1->serial_number == clabel2->serial_number) &&
   3200 	    (clabel1->num_rows == clabel2->num_rows) &&
   3201 	    (clabel1->num_columns == clabel2->num_columns) &&
   3202 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3203 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3204 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3205 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3206 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3207 	    (clabel1->blockSize == clabel2->blockSize) &&
   3208 	    rf_component_label_numblocks(clabel1) ==
   3209 	    rf_component_label_numblocks(clabel2) &&
   3210 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3211 	    (clabel1->root_partition == clabel2->root_partition) &&
   3212 	    (clabel1->last_unit == clabel2->last_unit) &&
   3213 	    (clabel1->config_order == clabel2->config_order)) {
   3214 		/* if it get's here, it almost *has* to be a match */
   3215 	} else {
   3216 		/* it's not consistent with somebody in the set..
   3217 		   punt */
   3218 		return(0);
   3219 	}
   3220 	/* all was fine.. it must fit... */
   3221 	return(1);
   3222 }
   3223 
   3224 int
   3225 rf_have_enough_components(RF_ConfigSet_t *cset)
   3226 {
   3227 	RF_AutoConfig_t *ac;
   3228 	RF_AutoConfig_t *auto_config;
   3229 	RF_ComponentLabel_t *clabel;
   3230 	int c;
   3231 	int num_cols;
   3232 	int num_missing;
   3233 	int mod_counter;
   3234 	int mod_counter_found;
   3235 	int even_pair_failed;
   3236 	char parity_type;
   3237 
   3238 
   3239 	/* check to see that we have enough 'live' components
   3240 	   of this set.  If so, we can configure it if necessary */
   3241 
   3242 	num_cols = cset->ac->clabel->num_columns;
   3243 	parity_type = cset->ac->clabel->parityConfig;
   3244 
   3245 	/* XXX Check for duplicate components!?!?!? */
   3246 
   3247 	/* Determine what the mod_counter is supposed to be for this set. */
   3248 
   3249 	mod_counter_found = 0;
   3250 	mod_counter = 0;
   3251 	ac = cset->ac;
   3252 	while(ac!=NULL) {
   3253 		if (mod_counter_found==0) {
   3254 			mod_counter = ac->clabel->mod_counter;
   3255 			mod_counter_found = 1;
   3256 		} else {
   3257 			if (ac->clabel->mod_counter > mod_counter) {
   3258 				mod_counter = ac->clabel->mod_counter;
   3259 			}
   3260 		}
   3261 		ac = ac->next;
   3262 	}
   3263 
   3264 	num_missing = 0;
   3265 	auto_config = cset->ac;
   3266 
   3267 	even_pair_failed = 0;
   3268 	for(c=0; c<num_cols; c++) {
   3269 		ac = auto_config;
   3270 		while(ac!=NULL) {
   3271 			if ((ac->clabel->column == c) &&
   3272 			    (ac->clabel->mod_counter == mod_counter)) {
   3273 				/* it's this one... */
   3274 #ifdef DEBUG
   3275 				printf("Found: %s at %d\n",
   3276 				       ac->devname,c);
   3277 #endif
   3278 				break;
   3279 			}
   3280 			ac=ac->next;
   3281 		}
   3282 		if (ac==NULL) {
   3283 				/* Didn't find one here! */
   3284 				/* special case for RAID 1, especially
   3285 				   where there are more than 2
   3286 				   components (where RAIDframe treats
   3287 				   things a little differently :( ) */
   3288 			if (parity_type == '1') {
   3289 				if (c%2 == 0) { /* even component */
   3290 					even_pair_failed = 1;
   3291 				} else { /* odd component.  If
   3292 					    we're failed, and
   3293 					    so is the even
   3294 					    component, it's
   3295 					    "Good Night, Charlie" */
   3296 					if (even_pair_failed == 1) {
   3297 						return(0);
   3298 					}
   3299 				}
   3300 			} else {
   3301 				/* normal accounting */
   3302 				num_missing++;
   3303 			}
   3304 		}
   3305 		if ((parity_type == '1') && (c%2 == 1)) {
   3306 				/* Just did an even component, and we didn't
   3307 				   bail.. reset the even_pair_failed flag,
   3308 				   and go on to the next component.... */
   3309 			even_pair_failed = 0;
   3310 		}
   3311 	}
   3312 
   3313 	clabel = cset->ac->clabel;
   3314 
   3315 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3316 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3317 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3318 		/* XXX this needs to be made *much* more general */
   3319 		/* Too many failures */
   3320 		return(0);
   3321 	}
   3322 	/* otherwise, all is well, and we've got enough to take a kick
   3323 	   at autoconfiguring this set */
   3324 	return(1);
   3325 }
   3326 
   3327 void
   3328 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3329 			RF_Raid_t *raidPtr)
   3330 {
   3331 	RF_ComponentLabel_t *clabel;
   3332 	int i;
   3333 
   3334 	clabel = ac->clabel;
   3335 
   3336 	/* 1. Fill in the common stuff */
   3337 	config->numCol = clabel->num_columns;
   3338 	config->numSpare = 0; /* XXX should this be set here? */
   3339 	config->sectPerSU = clabel->sectPerSU;
   3340 	config->SUsPerPU = clabel->SUsPerPU;
   3341 	config->SUsPerRU = clabel->SUsPerRU;
   3342 	config->parityConfig = clabel->parityConfig;
   3343 	/* XXX... */
   3344 	strcpy(config->diskQueueType,"fifo");
   3345 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3346 	config->layoutSpecificSize = 0; /* XXX ?? */
   3347 
   3348 	while(ac!=NULL) {
   3349 		/* row/col values will be in range due to the checks
   3350 		   in reasonable_label() */
   3351 		strcpy(config->devnames[0][ac->clabel->column],
   3352 		       ac->devname);
   3353 		ac = ac->next;
   3354 	}
   3355 
   3356 	for(i=0;i<RF_MAXDBGV;i++) {
   3357 		config->debugVars[i][0] = 0;
   3358 	}
   3359 }
   3360 
   3361 int
   3362 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3363 {
   3364 	RF_ComponentLabel_t *clabel;
   3365 	int column;
   3366 	int sparecol;
   3367 
   3368 	raidPtr->autoconfigure = new_value;
   3369 
   3370 	for(column=0; column<raidPtr->numCol; column++) {
   3371 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3372 			clabel = raidget_component_label(raidPtr, column);
   3373 			clabel->autoconfigure = new_value;
   3374 			raidflush_component_label(raidPtr, column);
   3375 		}
   3376 	}
   3377 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3378 		sparecol = raidPtr->numCol + column;
   3379 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3380 			clabel = raidget_component_label(raidPtr, sparecol);
   3381 			clabel->autoconfigure = new_value;
   3382 			raidflush_component_label(raidPtr, sparecol);
   3383 		}
   3384 	}
   3385 	return(new_value);
   3386 }
   3387 
   3388 int
   3389 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3390 {
   3391 	RF_ComponentLabel_t *clabel;
   3392 	int column;
   3393 	int sparecol;
   3394 
   3395 	raidPtr->root_partition = new_value;
   3396 	for(column=0; column<raidPtr->numCol; column++) {
   3397 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3398 			clabel = raidget_component_label(raidPtr, column);
   3399 			clabel->root_partition = new_value;
   3400 			raidflush_component_label(raidPtr, column);
   3401 		}
   3402 	}
   3403 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3404 		sparecol = raidPtr->numCol + column;
   3405 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3406 			clabel = raidget_component_label(raidPtr, sparecol);
   3407 			clabel->root_partition = new_value;
   3408 			raidflush_component_label(raidPtr, sparecol);
   3409 		}
   3410 	}
   3411 	return(new_value);
   3412 }
   3413 
   3414 void
   3415 rf_release_all_vps(RF_ConfigSet_t *cset)
   3416 {
   3417 	RF_AutoConfig_t *ac;
   3418 
   3419 	ac = cset->ac;
   3420 	while(ac!=NULL) {
   3421 		/* Close the vp, and give it back */
   3422 		if (ac->vp) {
   3423 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3424 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3425 			vput(ac->vp);
   3426 			ac->vp = NULL;
   3427 		}
   3428 		ac = ac->next;
   3429 	}
   3430 }
   3431 
   3432 
   3433 void
   3434 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3435 {
   3436 	RF_AutoConfig_t *ac;
   3437 	RF_AutoConfig_t *next_ac;
   3438 
   3439 	ac = cset->ac;
   3440 	while(ac!=NULL) {
   3441 		next_ac = ac->next;
   3442 		/* nuke the label */
   3443 		free(ac->clabel, M_RAIDFRAME);
   3444 		/* cleanup the config structure */
   3445 		free(ac, M_RAIDFRAME);
   3446 		/* "next.." */
   3447 		ac = next_ac;
   3448 	}
   3449 	/* and, finally, nuke the config set */
   3450 	free(cset, M_RAIDFRAME);
   3451 }
   3452 
   3453 
   3454 void
   3455 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3456 {
   3457 	/* current version number */
   3458 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3459 	clabel->serial_number = raidPtr->serial_number;
   3460 	clabel->mod_counter = raidPtr->mod_counter;
   3461 
   3462 	clabel->num_rows = 1;
   3463 	clabel->num_columns = raidPtr->numCol;
   3464 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3465 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3466 
   3467 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3468 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3469 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3470 
   3471 	clabel->blockSize = raidPtr->bytesPerSector;
   3472 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3473 
   3474 	/* XXX not portable */
   3475 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3476 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3477 	clabel->autoconfigure = raidPtr->autoconfigure;
   3478 	clabel->root_partition = raidPtr->root_partition;
   3479 	clabel->last_unit = raidPtr->raidid;
   3480 	clabel->config_order = raidPtr->config_order;
   3481 
   3482 #ifndef RF_NO_PARITY_MAP
   3483 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3484 #endif
   3485 }
   3486 
   3487 struct raid_softc *
   3488 rf_auto_config_set(RF_ConfigSet_t *cset)
   3489 {
   3490 	RF_Raid_t *raidPtr;
   3491 	RF_Config_t *config;
   3492 	int raidID;
   3493 	struct raid_softc *sc;
   3494 
   3495 #ifdef DEBUG
   3496 	printf("RAID autoconfigure\n");
   3497 #endif
   3498 
   3499 	/* 1. Create a config structure */
   3500 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3501 	if (config == NULL) {
   3502 		printf("%s: Out of mem - config!?!?\n", __func__);
   3503 				/* XXX do something more intelligent here. */
   3504 		return NULL;
   3505 	}
   3506 
   3507 	/*
   3508 	   2. Figure out what RAID ID this one is supposed to live at
   3509 	   See if we can get the same RAID dev that it was configured
   3510 	   on last time..
   3511 	*/
   3512 
   3513 	raidID = cset->ac->clabel->last_unit;
   3514 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
   3515 	     sc = raidget(++raidID, false))
   3516 		continue;
   3517 #ifdef DEBUG
   3518 	printf("Configuring raid%d:\n",raidID);
   3519 #endif
   3520 
   3521 	if (sc == NULL)
   3522 		sc = raidget(raidID, true);
   3523 	if (sc == NULL) {
   3524 		printf("%s: Out of mem - softc!?!?\n", __func__);
   3525 				/* XXX do something more intelligent here. */
   3526 		free(config, M_RAIDFRAME);
   3527 		return NULL;
   3528 	}
   3529 
   3530 	raidPtr = &sc->sc_r;
   3531 
   3532 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3533 	raidPtr->softc = sc;
   3534 	raidPtr->raidid = raidID;
   3535 	raidPtr->openings = RAIDOUTSTANDING;
   3536 
   3537 	/* 3. Build the configuration structure */
   3538 	rf_create_configuration(cset->ac, config, raidPtr);
   3539 
   3540 	/* 4. Do the configuration */
   3541 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3542 		raidinit(sc);
   3543 
   3544 		rf_markalldirty(raidPtr);
   3545 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3546 		switch (cset->ac->clabel->root_partition) {
   3547 		case 1:	/* Force Root */
   3548 		case 2:	/* Soft Root: root when boot partition part of raid */
   3549 			/*
   3550 			 * everything configured just fine.  Make a note
   3551 			 * that this set is eligible to be root,
   3552 			 * or forced to be root
   3553 			 */
   3554 			cset->rootable = cset->ac->clabel->root_partition;
   3555 			/* XXX do this here? */
   3556 			raidPtr->root_partition = cset->rootable;
   3557 			break;
   3558 		default:
   3559 			break;
   3560 		}
   3561 	} else {
   3562 		raidput(sc);
   3563 		sc = NULL;
   3564 	}
   3565 
   3566 	/* 5. Cleanup */
   3567 	free(config, M_RAIDFRAME);
   3568 	return sc;
   3569 }
   3570 
   3571 void
   3572 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3573 	     size_t xmin, size_t xmax)
   3574 {
   3575 	int error;
   3576 
   3577 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3578 	pool_sethiwat(p, xmax);
   3579 	if ((error = pool_prime(p, xmin)) != 0)
   3580 		panic("%s: failed to prime pool: %d", __func__, error);
   3581 	pool_setlowat(p, xmin);
   3582 }
   3583 
   3584 /*
   3585  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3586  * to see if there is IO pending and if that IO could possibly be done
   3587  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3588  * otherwise.
   3589  *
   3590  */
   3591 int
   3592 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3593 {
   3594 	struct raid_softc *rs;
   3595 	struct dk_softc *dksc;
   3596 
   3597 	rs = raidPtr->softc;
   3598 	dksc = &rs->sc_dksc;
   3599 
   3600 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3601 		return 1;
   3602 
   3603 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3604 		/* there is work to do */
   3605 		return 0;
   3606 	}
   3607 	/* default is nothing to do */
   3608 	return 1;
   3609 }
   3610 
   3611 int
   3612 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3613 {
   3614 	uint64_t numsecs;
   3615 	unsigned secsize;
   3616 	int error;
   3617 
   3618 	error = getdisksize(vp, &numsecs, &secsize);
   3619 	if (error == 0) {
   3620 		diskPtr->blockSize = secsize;
   3621 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3622 		diskPtr->partitionSize = numsecs;
   3623 		return 0;
   3624 	}
   3625 	return error;
   3626 }
   3627 
   3628 static int
   3629 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3630 {
   3631 	return 1;
   3632 }
   3633 
   3634 static void
   3635 raid_attach(device_t parent, device_t self, void *aux)
   3636 {
   3637 }
   3638 
   3639 
   3640 static int
   3641 raid_detach(device_t self, int flags)
   3642 {
   3643 	int error;
   3644 	struct raid_softc *rs = raidsoftc(self);
   3645 
   3646 	if (rs == NULL)
   3647 		return ENXIO;
   3648 
   3649 	if ((error = raidlock(rs)) != 0)
   3650 		return (error);
   3651 
   3652 	error = raid_detach_unlocked(rs);
   3653 
   3654 	raidunlock(rs);
   3655 
   3656 	/* XXX raid can be referenced here */
   3657 
   3658 	if (error)
   3659 		return error;
   3660 
   3661 	/* Free the softc */
   3662 	raidput(rs);
   3663 
   3664 	return 0;
   3665 }
   3666 
   3667 static void
   3668 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3669 {
   3670 	struct dk_softc *dksc = &rs->sc_dksc;
   3671 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3672 
   3673 	memset(dg, 0, sizeof(*dg));
   3674 
   3675 	dg->dg_secperunit = raidPtr->totalSectors;
   3676 	dg->dg_secsize = raidPtr->bytesPerSector;
   3677 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3678 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3679 
   3680 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3681 }
   3682 
   3683 /*
   3684  * Get cache info for all the components (including spares).
   3685  * Returns intersection of all the cache flags of all disks, or first
   3686  * error if any encountered.
   3687  * XXXfua feature flags can change as spares are added - lock down somehow
   3688  */
   3689 static int
   3690 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3691 {
   3692 	int c;
   3693 	int error;
   3694 	int dkwhole = 0, dkpart;
   3695 
   3696 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3697 		/*
   3698 		 * Check any non-dead disk, even when currently being
   3699 		 * reconstructed.
   3700 		 */
   3701 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3702 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3703 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3704 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3705 			if (error) {
   3706 				if (error != ENODEV) {
   3707 					printf("raid%d: get cache for component %s failed\n",
   3708 					    raidPtr->raidid,
   3709 					    raidPtr->Disks[c].devname);
   3710 				}
   3711 
   3712 				return error;
   3713 			}
   3714 
   3715 			if (c == 0)
   3716 				dkwhole = dkpart;
   3717 			else
   3718 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3719 		}
   3720 	}
   3721 
   3722 	*data = dkwhole;
   3723 
   3724 	return 0;
   3725 }
   3726 
   3727 /*
   3728  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3729  * We end up returning whatever error was returned by the first cache flush
   3730  * that fails.
   3731  */
   3732 
   3733 int
   3734 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3735 {
   3736 	int c, sparecol;
   3737 	int e,error;
   3738 	int force = 1;
   3739 
   3740 	error = 0;
   3741 	for (c = 0; c < raidPtr->numCol; c++) {
   3742 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3743 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3744 					  &force, FWRITE, NOCRED);
   3745 			if (e) {
   3746 				if (e != ENODEV)
   3747 					printf("raid%d: cache flush to component %s failed.\n",
   3748 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3749 				if (error == 0) {
   3750 					error = e;
   3751 				}
   3752 			}
   3753 		}
   3754 	}
   3755 
   3756 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3757 		sparecol = raidPtr->numCol + c;
   3758 		/* Need to ensure that the reconstruct actually completed! */
   3759 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3760 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3761 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3762 			if (e) {
   3763 				if (e != ENODEV)
   3764 					printf("raid%d: cache flush to component %s failed.\n",
   3765 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3766 				if (error == 0) {
   3767 					error = e;
   3768 				}
   3769 			}
   3770 		}
   3771 	}
   3772 	return error;
   3773 }
   3774 
   3775 /* Fill in info with the current status */
   3776 void
   3777 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3778 {
   3779 
   3780 	if (raidPtr->status != rf_rs_reconstructing) {
   3781 		info->total = 100;
   3782 		info->completed = 100;
   3783 	} else {
   3784 		info->total = raidPtr->reconControl->numRUsTotal;
   3785 		info->completed = raidPtr->reconControl->numRUsComplete;
   3786 	}
   3787 	info->remaining = info->total - info->completed;
   3788 }
   3789 
   3790 /* Fill in info with the current status */
   3791 void
   3792 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3793 {
   3794 
   3795 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3796 		info->total = raidPtr->Layout.numStripe;
   3797 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3798 	} else {
   3799 		info->completed = 100;
   3800 		info->total = 100;
   3801 	}
   3802 	info->remaining = info->total - info->completed;
   3803 }
   3804 
   3805 /* Fill in info with the current status */
   3806 void
   3807 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3808 {
   3809 
   3810 	if (raidPtr->copyback_in_progress == 1) {
   3811 		info->total = raidPtr->Layout.numStripe;
   3812 		info->completed = raidPtr->copyback_stripes_done;
   3813 		info->remaining = info->total - info->completed;
   3814 	} else {
   3815 		info->remaining = 0;
   3816 		info->completed = 100;
   3817 		info->total = 100;
   3818 	}
   3819 }
   3820 
   3821 /* Fill in config with the current info */
   3822 int
   3823 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3824 {
   3825 	int	d, i, j;
   3826 
   3827 	if (!raidPtr->valid)
   3828 		return (ENODEV);
   3829 	config->cols = raidPtr->numCol;
   3830 	config->ndevs = raidPtr->numCol;
   3831 	if (config->ndevs >= RF_MAX_DISKS)
   3832 		return (ENOMEM);
   3833 	config->nspares = raidPtr->numSpare;
   3834 	if (config->nspares >= RF_MAX_DISKS)
   3835 		return (ENOMEM);
   3836 	config->maxqdepth = raidPtr->maxQueueDepth;
   3837 	d = 0;
   3838 	for (j = 0; j < config->cols; j++) {
   3839 		config->devs[d] = raidPtr->Disks[j];
   3840 		d++;
   3841 	}
   3842 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3843 		config->spares[i] = raidPtr->Disks[j];
   3844 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3845 			/* XXX: raidctl(8) expects to see this as a used spare */
   3846 			config->spares[i].status = rf_ds_used_spare;
   3847 		}
   3848 	}
   3849 	return 0;
   3850 }
   3851 
   3852 int
   3853 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3854 {
   3855 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3856 	RF_ComponentLabel_t *raid_clabel;
   3857 	int column = clabel->column;
   3858 
   3859 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3860 		return EINVAL;
   3861 	raid_clabel = raidget_component_label(raidPtr, column);
   3862 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3863 
   3864 	return 0;
   3865 }
   3866 
   3867 /*
   3868  * Module interface
   3869  */
   3870 
   3871 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3872 
   3873 #ifdef _MODULE
   3874 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3875 #endif
   3876 
   3877 static int raid_modcmd(modcmd_t, void *);
   3878 static int raid_modcmd_init(void);
   3879 static int raid_modcmd_fini(void);
   3880 
   3881 static int
   3882 raid_modcmd(modcmd_t cmd, void *data)
   3883 {
   3884 	int error;
   3885 
   3886 	error = 0;
   3887 	switch (cmd) {
   3888 	case MODULE_CMD_INIT:
   3889 		error = raid_modcmd_init();
   3890 		break;
   3891 	case MODULE_CMD_FINI:
   3892 		error = raid_modcmd_fini();
   3893 		break;
   3894 	default:
   3895 		error = ENOTTY;
   3896 		break;
   3897 	}
   3898 	return error;
   3899 }
   3900 
   3901 static int
   3902 raid_modcmd_init(void)
   3903 {
   3904 	int error;
   3905 	int bmajor, cmajor;
   3906 
   3907 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3908 	mutex_enter(&raid_lock);
   3909 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3910 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3911 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3912 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3913 
   3914 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3915 #endif
   3916 
   3917 	bmajor = cmajor = -1;
   3918 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3919 	    &raid_cdevsw, &cmajor);
   3920 	if (error != 0 && error != EEXIST) {
   3921 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3922 		mutex_exit(&raid_lock);
   3923 		return error;
   3924 	}
   3925 #ifdef _MODULE
   3926 	error = config_cfdriver_attach(&raid_cd);
   3927 	if (error != 0) {
   3928 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3929 		    __func__, error);
   3930 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3931 		mutex_exit(&raid_lock);
   3932 		return error;
   3933 	}
   3934 #endif
   3935 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3936 	if (error != 0) {
   3937 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3938 		    __func__, error);
   3939 #ifdef _MODULE
   3940 		config_cfdriver_detach(&raid_cd);
   3941 #endif
   3942 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3943 		mutex_exit(&raid_lock);
   3944 		return error;
   3945 	}
   3946 
   3947 	raidautoconfigdone = false;
   3948 
   3949 	mutex_exit(&raid_lock);
   3950 
   3951 	if (error == 0) {
   3952 		if (rf_BootRaidframe(true) == 0)
   3953 			aprint_verbose("Kernelized RAIDframe activated\n");
   3954 		else
   3955 			panic("Serious error activating RAID!!");
   3956 	}
   3957 
   3958 	/*
   3959 	 * Register a finalizer which will be used to auto-config RAID
   3960 	 * sets once all real hardware devices have been found.
   3961 	 */
   3962 	error = config_finalize_register(NULL, rf_autoconfig);
   3963 	if (error != 0) {
   3964 		aprint_error("WARNING: unable to register RAIDframe "
   3965 		    "finalizer\n");
   3966 		error = 0;
   3967 	}
   3968 
   3969 	return error;
   3970 }
   3971 
   3972 static int
   3973 raid_modcmd_fini(void)
   3974 {
   3975 	int error;
   3976 
   3977 	mutex_enter(&raid_lock);
   3978 
   3979 	/* Don't allow unload if raid device(s) exist.  */
   3980 	if (!LIST_EMPTY(&raids)) {
   3981 		mutex_exit(&raid_lock);
   3982 		return EBUSY;
   3983 	}
   3984 
   3985 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
   3986 	if (error != 0) {
   3987 		aprint_error("%s: cannot detach cfattach\n",__func__);
   3988 		mutex_exit(&raid_lock);
   3989 		return error;
   3990 	}
   3991 #ifdef _MODULE
   3992 	error = config_cfdriver_detach(&raid_cd);
   3993 	if (error != 0) {
   3994 		aprint_error("%s: cannot detach cfdriver\n",__func__);
   3995 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3996 		mutex_exit(&raid_lock);
   3997 		return error;
   3998 	}
   3999 #endif
   4000 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
   4001 	if (error != 0) {
   4002 		aprint_error("%s: cannot detach devsw\n",__func__);
   4003 #ifdef _MODULE
   4004 		config_cfdriver_attach(&raid_cd);
   4005 #endif
   4006 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   4007 		mutex_exit(&raid_lock);
   4008 		return error;
   4009 	}
   4010 	rf_BootRaidframe(false);
   4011 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   4012 	rf_destroy_mutex2(rf_sparet_wait_mutex);
   4013 	rf_destroy_cond2(rf_sparet_wait_cv);
   4014 	rf_destroy_cond2(rf_sparet_resp_cv);
   4015 #endif
   4016 	mutex_exit(&raid_lock);
   4017 	mutex_destroy(&raid_lock);
   4018 
   4019 	return error;
   4020 }
   4021