Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.361
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.361 2019/02/03 08:02:24 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.361 2019/02/03 08:02:24 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #endif
    109 
    110 #include <sys/param.h>
    111 #include <sys/errno.h>
    112 #include <sys/pool.h>
    113 #include <sys/proc.h>
    114 #include <sys/queue.h>
    115 #include <sys/disk.h>
    116 #include <sys/device.h>
    117 #include <sys/stat.h>
    118 #include <sys/ioctl.h>
    119 #include <sys/fcntl.h>
    120 #include <sys/systm.h>
    121 #include <sys/vnode.h>
    122 #include <sys/disklabel.h>
    123 #include <sys/conf.h>
    124 #include <sys/buf.h>
    125 #include <sys/bufq.h>
    126 #include <sys/reboot.h>
    127 #include <sys/kauth.h>
    128 #include <sys/module.h>
    129 #include <sys/compat_stub.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #include "rf_compat80.h"
    152 
    153 #ifdef _LP64
    154 #ifndef COMPAT_NETBSD32
    155 #define COMPAT_NETBSD32
    156 #endif
    157 #include "rf_compat32.h"
    158 #endif
    159 
    160 #include "ioconf.h"
    161 
    162 #ifdef DEBUG
    163 int     rf_kdebug_level = 0;
    164 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    165 #else				/* DEBUG */
    166 #define db1_printf(a) { }
    167 #endif				/* DEBUG */
    168 
    169 #ifdef DEBUG_ROOT
    170 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    171 #else
    172 #define DPRINTF(a, ...)
    173 #endif
    174 
    175 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    176 static rf_declare_mutex2(rf_sparet_wait_mutex);
    177 static rf_declare_cond2(rf_sparet_wait_cv);
    178 static rf_declare_cond2(rf_sparet_resp_cv);
    179 
    180 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    181 						 * spare table */
    182 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    183 						 * installation process */
    184 #endif
    185 
    186 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    187 
    188 /* prototypes */
    189 static void KernelWakeupFunc(struct buf *);
    190 static void InitBP(struct buf *, struct vnode *, unsigned,
    191     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    192     void *, int, struct proc *);
    193 struct raid_softc;
    194 static void raidinit(struct raid_softc *);
    195 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    196 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    197 
    198 static int raid_match(device_t, cfdata_t, void *);
    199 static void raid_attach(device_t, device_t, void *);
    200 static int raid_detach(device_t, int);
    201 
    202 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    203     daddr_t, daddr_t);
    204 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    205     daddr_t, daddr_t, int);
    206 
    207 static int raidwrite_component_label(unsigned,
    208     dev_t, struct vnode *, RF_ComponentLabel_t *);
    209 static int raidread_component_label(unsigned,
    210     dev_t, struct vnode *, RF_ComponentLabel_t *);
    211 
    212 static int raid_diskstart(device_t, struct buf *bp);
    213 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    214 static int raid_lastclose(device_t);
    215 
    216 static dev_type_open(raidopen);
    217 static dev_type_close(raidclose);
    218 static dev_type_read(raidread);
    219 static dev_type_write(raidwrite);
    220 static dev_type_ioctl(raidioctl);
    221 static dev_type_strategy(raidstrategy);
    222 static dev_type_dump(raiddump);
    223 static dev_type_size(raidsize);
    224 
/* Block-device switch: entry points for the raidN block devices. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    235 
/* Character-device switch: entry points for the raw raidN devices. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    250 
/* Hooks handed to the common dk(9) disk framework for raid devices. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    260 
/* Per-unit driver state for one configured (or configuring) RAID set. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic dk(9) disk state; must be first */
	int	sc_unit;		/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe state for this set */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
    273 /* sc_flags */
    274 #define RAIDF_INITED		0x01	/* unit has been initialized */
    275 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    276 #define RAIDF_DETACH  		0x04	/* detach after final close */
    277 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    278 #define RAIDF_LOCKED		0x10	/* unit is locked */
    279 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    280 
    281 #define	raidunit(x)	DISKUNIT(x)
    282 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    283 
    284 extern struct cfdriver raid_cd;
    285 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    286     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    287     DVF_DETACH_SHUTDOWN);
    288 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;	/* component column the request applies to */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;	/* RF_Raid_t * of the affected set */
};
    295 
    296 /*
    297  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    298  * Be aware that large numbers can allow the driver to consume a lot of
    299  * kernel memory, especially on writes, and in degraded mode reads.
    300  *
    301  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    302  * a single 64K write will typically require 64K for the old data,
    303  * 64K for the old parity, and 64K for the new parity, for a total
    304  * of 192K (if the parity buffer is not re-used immediately).
    305  * Even it if is used immediately, that's still 128K, which when multiplied
    306  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    307  *
    308  * Now in degraded mode, for example, a 64K read on the above setup may
    309  * require data reconstruction, which will require *all* of the 4 remaining
    310  * disks to participate -- 4 * 32K/disk == 128K again.
    311  */
    312 
    313 #ifndef RAIDOUTSTANDING
    314 #define RAIDOUTSTANDING   6
    315 #endif
    316 
    317 #define RAIDLABELDEV(dev)	\
    318 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    319 
    320 /* declared here, and made public, for the benefit of KVM stuff.. */
    321 
    322 static int raidlock(struct raid_softc *);
    323 static void raidunlock(struct raid_softc *);
    324 
    325 static int raid_detach_unlocked(struct raid_softc *);
    326 
    327 static void rf_markalldirty(RF_Raid_t *);
    328 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    329 
    330 void rf_ReconThread(struct rf_recon_req_internal *);
    331 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    332 void rf_CopybackThread(RF_Raid_t *raidPtr);
    333 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    334 int rf_autoconfig(device_t);
    335 void rf_buildroothack(RF_ConfigSet_t *);
    336 
    337 RF_AutoConfig_t *rf_find_raid_components(void);
    338 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    339 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    340 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    341 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    342 int rf_set_autoconfig(RF_Raid_t *, int);
    343 int rf_set_rootpartition(RF_Raid_t *, int);
    344 void rf_release_all_vps(RF_ConfigSet_t *);
    345 void rf_cleanup_config_set(RF_ConfigSet_t *);
    346 int rf_have_enough_components(RF_ConfigSet_t *);
    347 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    348 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    349 
    350 /*
    351  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    352  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    353  * in the kernel config file.
    354  */
    355 #ifdef RAID_AUTOCONFIG
    356 int raidautoconfig = 1;
    357 #else
    358 int raidautoconfig = 0;
    359 #endif
    360 static bool raidautoconfigdone = false;
    361 
    362 struct RF_Pools_s rf_pools;
    363 
    364 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    365 static kmutex_t raid_lock;
    366 
    367 static struct raid_softc *
    368 raidcreate(int unit) {
    369 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    370 	sc->sc_unit = unit;
    371 	cv_init(&sc->sc_cv, "raidunit");
    372 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    373 	return sc;
    374 }
    375 
    376 static void
    377 raiddestroy(struct raid_softc *sc) {
    378 	cv_destroy(&sc->sc_cv);
    379 	mutex_destroy(&sc->sc_mutex);
    380 	kmem_free(sc, sizeof(*sc));
    381 }
    382 
    383 static struct raid_softc *
    384 raidget(int unit, bool create) {
    385 	struct raid_softc *sc;
    386 	if (unit < 0) {
    387 #ifdef DIAGNOSTIC
    388 		panic("%s: unit %d!", __func__, unit);
    389 #endif
    390 		return NULL;
    391 	}
    392 	mutex_enter(&raid_lock);
    393 	LIST_FOREACH(sc, &raids, sc_link) {
    394 		if (sc->sc_unit == unit) {
    395 			mutex_exit(&raid_lock);
    396 			return sc;
    397 		}
    398 	}
    399 	mutex_exit(&raid_lock);
    400 	if (!create)
    401 		return NULL;
    402 	if ((sc = raidcreate(unit)) == NULL)
    403 		return NULL;
    404 	mutex_enter(&raid_lock);
    405 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    406 	mutex_exit(&raid_lock);
    407 	return sc;
    408 }
    409 
/* Unlink a softc from the global "raids" list and destroy it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    417 
/*
 * Legacy pseudo-device attach hook; "num" is the unit count from the
 * kernel configuration.  Intentionally empty, see comment below.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    427 
    428 int
    429 rf_autoconfig(device_t self)
    430 {
    431 	RF_AutoConfig_t *ac_list;
    432 	RF_ConfigSet_t *config_sets;
    433 
    434 	if (!raidautoconfig || raidautoconfigdone == true)
    435 		return (0);
    436 
    437 	/* XXX This code can only be run once. */
    438 	raidautoconfigdone = true;
    439 
    440 #ifdef __HAVE_CPU_BOOTCONF
    441 	/*
    442 	 * 0. find the boot device if needed first so we can use it later
    443 	 * this needs to be done before we autoconfigure any raid sets,
    444 	 * because if we use wedges we are not going to be able to open
    445 	 * the boot device later
    446 	 */
    447 	if (booted_device == NULL)
    448 		cpu_bootconf();
    449 #endif
    450 	/* 1. locate all RAID components on the system */
    451 	aprint_debug("Searching for RAID components...\n");
    452 	ac_list = rf_find_raid_components();
    453 
    454 	/* 2. Sort them into their respective sets. */
    455 	config_sets = rf_create_auto_sets(ac_list);
    456 
    457 	/*
    458 	 * 3. Evaluate each set and configure the valid ones.
    459 	 * This gets done in rf_buildroothack().
    460 	 */
    461 	rf_buildroothack(config_sets);
    462 
    463 	return 1;
    464 }
    465 
    466 static int
    467 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    468 	const char *bootname;
    469 	size_t len;
    470 
    471 	/* if bdv is NULL, the set can't contain it. exit early. */
    472 	if (bdv == NULL)
    473 		return 0;
    474 
    475 	bootname = device_xname(bdv);
    476 	len = strlen(bootname);
    477 
    478 	for (int col = 0; col < r->numCol; col++) {
    479 		const char *devname = r->Disks[col].devname;
    480 		devname += sizeof("/dev/") - 1;
    481 		if (strncmp(devname, "dk", 2) == 0) {
    482 			const char *parent =
    483 			    dkwedge_get_parent_name(r->Disks[col].dev);
    484 			if (parent != NULL)
    485 				devname = parent;
    486 		}
    487 		if (strncmp(devname, bootname, len) == 0) {
    488 			struct raid_softc *sc = r->softc;
    489 			aprint_debug("raid%d includes boot device %s\n",
    490 			    sc->sc_unit, devname);
    491 			return 1;
    492 		}
    493 	}
    494 	return 0;
    495 }
    496 
    497 void
    498 rf_buildroothack(RF_ConfigSet_t *config_sets)
    499 {
    500 	RF_ConfigSet_t *cset;
    501 	RF_ConfigSet_t *next_cset;
    502 	int num_root;
    503 	struct raid_softc *sc, *rsc;
    504 	struct dk_softc *dksc;
    505 
    506 	sc = rsc = NULL;
    507 	num_root = 0;
    508 	cset = config_sets;
    509 	while (cset != NULL) {
    510 		next_cset = cset->next;
    511 		if (rf_have_enough_components(cset) &&
    512 		    cset->ac->clabel->autoconfigure == 1) {
    513 			sc = rf_auto_config_set(cset);
    514 			if (sc != NULL) {
    515 				aprint_debug("raid%d: configured ok, rootable %d\n",
    516 				    sc->sc_unit, cset->rootable);
    517 				if (cset->rootable) {
    518 					rsc = sc;
    519 					num_root++;
    520 				}
    521 			} else {
    522 				/* The autoconfig didn't work :( */
    523 				aprint_debug("Autoconfig failed\n");
    524 				rf_release_all_vps(cset);
    525 			}
    526 		} else {
    527 			/* we're not autoconfiguring this set...
    528 			   release the associated resources */
    529 			rf_release_all_vps(cset);
    530 		}
    531 		/* cleanup */
    532 		rf_cleanup_config_set(cset);
    533 		cset = next_cset;
    534 	}
    535 	dksc = &rsc->sc_dksc;
    536 
    537 	/* if the user has specified what the root device should be
    538 	   then we don't touch booted_device or boothowto... */
    539 
    540 	if (rootspec != NULL) {
    541 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    542 		return;
    543 	}
    544 
    545 	/* we found something bootable... */
    546 
    547 	/*
    548 	 * XXX: The following code assumes that the root raid
    549 	 * is the first ('a') partition. This is about the best
    550 	 * we can do with a BSD disklabel, but we might be able
    551 	 * to do better with a GPT label, by setting a specified
    552 	 * attribute to indicate the root partition. We can then
    553 	 * stash the partition number in the r->root_partition
    554 	 * high bits (the bottom 2 bits are already used). For
    555 	 * now we just set booted_partition to 0 when we override
    556 	 * root.
    557 	 */
    558 	if (num_root == 1) {
    559 		device_t candidate_root;
    560 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    561 			char cname[sizeof(cset->ac->devname)];
    562 			/* XXX: assume partition 'a' first */
    563 			snprintf(cname, sizeof(cname), "%s%c",
    564 			    device_xname(dksc->sc_dev), 'a');
    565 			candidate_root = dkwedge_find_by_wname(cname);
    566 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    567 			    cname);
    568 			if (candidate_root == NULL) {
    569 				/*
    570 				 * If that is not found, because we don't use
    571 				 * disklabel, return the first dk child
    572 				 * XXX: we can skip the 'a' check above
    573 				 * and always do this...
    574 				 */
    575 				size_t i = 0;
    576 				candidate_root = dkwedge_find_by_parent(
    577 				    device_xname(dksc->sc_dev), &i);
    578 			}
    579 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    580 			    candidate_root);
    581 		} else
    582 			candidate_root = dksc->sc_dev;
    583 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    584 		DPRINTF("%s: booted_device=%p root_partition=%d "
    585 			"contains_boot=%d",
    586 		    __func__, booted_device, rsc->sc_r.root_partition,
    587 			   rf_containsboot(&rsc->sc_r, booted_device));
    588 		/* XXX the check for booted_device == NULL can probably be
    589 		 * dropped, now that rf_containsboot handles that case.
    590 		 */
    591 		if (booted_device == NULL ||
    592 		    rsc->sc_r.root_partition == 1 ||
    593 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    594 			booted_device = candidate_root;
    595 			booted_method = "raidframe/single";
    596 			booted_partition = 0;	/* XXX assume 'a' */
    597 		}
    598 	} else if (num_root > 1) {
    599 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    600 		    booted_device);
    601 
    602 		/*
    603 		 * Maybe the MD code can help. If it cannot, then
    604 		 * setroot() will discover that we have no
    605 		 * booted_device and will ask the user if nothing was
    606 		 * hardwired in the kernel config file
    607 		 */
    608 		if (booted_device == NULL)
    609 			return;
    610 
    611 		num_root = 0;
    612 		mutex_enter(&raid_lock);
    613 		LIST_FOREACH(sc, &raids, sc_link) {
    614 			RF_Raid_t *r = &sc->sc_r;
    615 			if (r->valid == 0)
    616 				continue;
    617 
    618 			if (r->root_partition == 0)
    619 				continue;
    620 
    621 			if (rf_containsboot(r, booted_device)) {
    622 				num_root++;
    623 				rsc = sc;
    624 				dksc = &rsc->sc_dksc;
    625 			}
    626 		}
    627 		mutex_exit(&raid_lock);
    628 
    629 		if (num_root == 1) {
    630 			booted_device = dksc->sc_dev;
    631 			booted_method = "raidframe/multi";
    632 			booted_partition = 0;	/* XXX assume 'a' */
    633 		} else {
    634 			/* we can't guess.. require the user to answer... */
    635 			boothowto |= RB_ASKNAME;
    636 		}
    637 	}
    638 }
    639 
    640 static int
    641 raidsize(dev_t dev)
    642 {
    643 	struct raid_softc *rs;
    644 	struct dk_softc *dksc;
    645 	unsigned int unit;
    646 
    647 	unit = raidunit(dev);
    648 	if ((rs = raidget(unit, false)) == NULL)
    649 		return -1;
    650 	dksc = &rs->sc_dksc;
    651 
    652 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    653 		return -1;
    654 
    655 	return dk_size(dksc, dev);
    656 }
    657 
    658 static int
    659 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    660 {
    661 	unsigned int unit;
    662 	struct raid_softc *rs;
    663 	struct dk_softc *dksc;
    664 
    665 	unit = raidunit(dev);
    666 	if ((rs = raidget(unit, false)) == NULL)
    667 		return ENXIO;
    668 	dksc = &rs->sc_dksc;
    669 
    670 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    671 		return ENODEV;
    672 
    673         /*
    674            Note that blkno is relative to this particular partition.
    675            By adding adding RF_PROTECTED_SECTORS, we get a value that
    676 	   is relative to the partition used for the underlying component.
    677         */
    678 	blkno += RF_PROTECTED_SECTORS;
    679 
    680 	return dk_dump(dksc, dev, blkno, va, size);
    681 }
    682 
/*
 * dk(9) dumpblocks hook: write "nblk" blocks from "va" at block
 * "blkno" directly to one live component of the set.  Only RAID 1
 * layouts (one data column, one parity column) are supported.
 * Component preference: master, spared master, slave, spared slave.
 * Returns 0 on success or an errno value.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: the lowest-numbered optimal component wins. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump straight to the chosen component's driver. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    788 
    789 /* ARGSUSED */
    790 static int
    791 raidopen(dev_t dev, int flags, int fmt,
    792     struct lwp *l)
    793 {
    794 	int     unit = raidunit(dev);
    795 	struct raid_softc *rs;
    796 	struct dk_softc *dksc;
    797 	int     error = 0;
    798 	int     part, pmask;
    799 
    800 	if ((rs = raidget(unit, true)) == NULL)
    801 		return ENXIO;
    802 	if ((error = raidlock(rs)) != 0)
    803 		return (error);
    804 
    805 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    806 		error = EBUSY;
    807 		goto bad;
    808 	}
    809 
    810 	dksc = &rs->sc_dksc;
    811 
    812 	part = DISKPART(dev);
    813 	pmask = (1 << part);
    814 
    815 	if (!DK_BUSY(dksc, pmask) &&
    816 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    817 		/* First one... mark things as dirty... Note that we *MUST*
    818 		 have done a configure before this.  I DO NOT WANT TO BE
    819 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    820 		 THAT THEY BELONG TOGETHER!!!!! */
    821 		/* XXX should check to see if we're only open for reading
    822 		   here... If so, we needn't do this, but then need some
    823 		   other way of keeping track of what's happened.. */
    824 
    825 		rf_markalldirty(&rs->sc_r);
    826 	}
    827 
    828 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    829 		error = dk_open(dksc, dev, flags, fmt, l);
    830 
    831 bad:
    832 	raidunlock(rs);
    833 
    834 	return (error);
    835 
    836 
    837 }
    838 
    839 static int
    840 raid_lastclose(device_t self)
    841 {
    842 	struct raid_softc *rs = raidsoftc(self);
    843 
    844 	/* Last one... device is not unconfigured yet.
    845 	   Device shutdown has taken care of setting the
    846 	   clean bits if RAIDF_INITED is not set
    847 	   mark things as clean... */
    848 
    849 	rf_update_component_labels(&rs->sc_r,
    850 	    RF_FINAL_COMPONENT_UPDATE);
    851 
    852 	/* pass to unlocked code */
    853 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    854 		rs->sc_flags |= RAIDF_DETACH;
    855 
    856 	return 0;
    857 }
    858 
    859 /* ARGSUSED */
    860 static int
    861 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    862 {
    863 	int     unit = raidunit(dev);
    864 	struct raid_softc *rs;
    865 	struct dk_softc *dksc;
    866 	cfdata_t cf;
    867 	int     error = 0, do_detach = 0, do_put = 0;
    868 
    869 	if ((rs = raidget(unit, false)) == NULL)
    870 		return ENXIO;
    871 	dksc = &rs->sc_dksc;
    872 
    873 	if ((error = raidlock(rs)) != 0)
    874 		return (error);
    875 
    876 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    877 		error = dk_close(dksc, dev, flags, fmt, l);
    878 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    879 			do_detach = 1;
    880 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    881 		do_put = 1;
    882 
    883 	raidunlock(rs);
    884 
    885 	if (do_detach) {
    886 		/* free the pseudo device attach bits */
    887 		cf = device_cfdata(dksc->sc_dev);
    888 		error = config_detach(dksc->sc_dev, 0);
    889 		if (error == 0)
    890 			free(cf, M_RAIDFRAME);
    891 	} else if (do_put) {
    892 		raidput(rs);
    893 	}
    894 
    895 	return (error);
    896 
    897 }
    898 
/*
 * Signal the iodone condition variable under its lock.  Callers use
 * this to prod the thread waiting on iodone_cv so that queued I/O
 * gets (re)scheduled.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    906 
    907 static void
    908 raidstrategy(struct buf *bp)
    909 {
    910 	unsigned int unit;
    911 	struct raid_softc *rs;
    912 	struct dk_softc *dksc;
    913 	RF_Raid_t *raidPtr;
    914 
    915 	unit = raidunit(bp->b_dev);
    916 	if ((rs = raidget(unit, false)) == NULL) {
    917 		bp->b_error = ENXIO;
    918 		goto fail;
    919 	}
    920 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    921 		bp->b_error = ENXIO;
    922 		goto fail;
    923 	}
    924 	dksc = &rs->sc_dksc;
    925 	raidPtr = &rs->sc_r;
    926 
    927 	/* Queue IO only */
    928 	if (dk_strategy_defer(dksc, bp))
    929 		goto done;
    930 
    931 	/* schedule the IO to happen at the next convenient time */
    932 	raid_wakeup(raidPtr);
    933 
    934 done:
    935 	return;
    936 
    937 fail:
    938 	bp->b_resid = bp->b_bcount;
    939 	biodone(bp);
    940 }
    941 
    942 static int
    943 raid_diskstart(device_t dev, struct buf *bp)
    944 {
    945 	struct raid_softc *rs = raidsoftc(dev);
    946 	RF_Raid_t *raidPtr;
    947 
    948 	raidPtr = &rs->sc_r;
    949 	if (!raidPtr->valid) {
    950 		db1_printf(("raid is not valid..\n"));
    951 		return ENODEV;
    952 	}
    953 
    954 	/* XXX */
    955 	bp->b_resid = 0;
    956 
    957 	return raiddoaccess(raidPtr, bp);
    958 }
    959 
    960 void
    961 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    962 {
    963 	struct raid_softc *rs;
    964 	struct dk_softc *dksc;
    965 
    966 	rs = raidPtr->softc;
    967 	dksc = &rs->sc_dksc;
    968 
    969 	dk_done(dksc, bp);
    970 
    971 	rf_lock_mutex2(raidPtr->mutex);
    972 	raidPtr->openings++;
    973 	rf_unlock_mutex2(raidPtr->mutex);
    974 
    975 	/* schedule more IO */
    976 	raid_wakeup(raidPtr);
    977 }
    978 
    979 /* ARGSUSED */
    980 static int
    981 raidread(dev_t dev, struct uio *uio, int flags)
    982 {
    983 	int     unit = raidunit(dev);
    984 	struct raid_softc *rs;
    985 
    986 	if ((rs = raidget(unit, false)) == NULL)
    987 		return ENXIO;
    988 
    989 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    990 		return (ENXIO);
    991 
    992 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    993 
    994 }
    995 
    996 /* ARGSUSED */
    997 static int
    998 raidwrite(dev_t dev, struct uio *uio, int flags)
    999 {
   1000 	int     unit = raidunit(dev);
   1001 	struct raid_softc *rs;
   1002 
   1003 	if ((rs = raidget(unit, false)) == NULL)
   1004 		return ENXIO;
   1005 
   1006 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1007 		return (ENXIO);
   1008 
   1009 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1010 
   1011 }
   1012 
/*
 * Tear down a configured RAID set without holding the softc lock.
 * Returns EBUSY if the unit is still open or a recon/parity-rewrite/
 * copyback operation is in flight; 0 on success or if the unit was
 * never configured.  The teardown order below (rf_Shutdown, drain,
 * bufq_free, then wedge/disk/dk detach) is significant.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while the device is busy or reconfiguring. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to do if the set was never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Shutdown is now being handled here; clear the request flag. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1050 
   1051 static int
   1052 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1053 {
   1054 	int     unit = raidunit(dev);
   1055 	int     error = 0;
   1056 	int     part, pmask;
   1057 	struct raid_softc *rs;
   1058 	struct dk_softc *dksc;
   1059 	RF_Config_t *k_cfg, *u_cfg;
   1060 	RF_Raid_t *raidPtr;
   1061 	RF_RaidDisk_t *diskPtr;
   1062 	RF_AccTotals_t *totals;
   1063 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1064 	u_char *specific_buf;
   1065 	int retcode = 0;
   1066 	int column;
   1067 /*	int raidid; */
   1068 	struct rf_recon_req *rr;
   1069 	struct rf_recon_req_internal *rrint;
   1070 	RF_ComponentLabel_t *clabel;
   1071 	RF_ComponentLabel_t *ci_label;
   1072 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1073 	RF_SingleComponent_t component;
   1074 	int d;
   1075 
   1076 	if ((rs = raidget(unit, false)) == NULL)
   1077 		return ENXIO;
   1078 	dksc = &rs->sc_dksc;
   1079 	raidPtr = &rs->sc_r;
   1080 
   1081 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1082 		(int) DISKPART(dev), (int) unit, cmd));
   1083 
   1084 	/* Must be initialized for these... */
   1085 	switch (cmd) {
   1086 	case RAIDFRAME_REWRITEPARITY:
   1087 	case RAIDFRAME_GET_INFO:
   1088 	case RAIDFRAME_RESET_ACCTOTALS:
   1089 	case RAIDFRAME_GET_ACCTOTALS:
   1090 	case RAIDFRAME_KEEP_ACCTOTALS:
   1091 	case RAIDFRAME_GET_SIZE:
   1092 	case RAIDFRAME_FAIL_DISK:
   1093 	case RAIDFRAME_COPYBACK:
   1094 	case RAIDFRAME_CHECK_RECON_STATUS:
   1095 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1096 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1097 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1098 	case RAIDFRAME_ADD_HOT_SPARE:
   1099 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1100 	case RAIDFRAME_INIT_LABELS:
   1101 	case RAIDFRAME_REBUILD_IN_PLACE:
   1102 	case RAIDFRAME_CHECK_PARITY:
   1103 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1104 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1105 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1106 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1107 	case RAIDFRAME_SET_AUTOCONFIG:
   1108 	case RAIDFRAME_SET_ROOT:
   1109 	case RAIDFRAME_DELETE_COMPONENT:
   1110 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1111 	case RAIDFRAME_PARITYMAP_STATUS:
   1112 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1113 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1114 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1115 #ifdef _LP64
   1116 	case RAIDFRAME_GET_INFO32:
   1117 #endif
   1118 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1119 			return (ENXIO);
   1120 	}
   1121 
   1122 	/*
   1123 	 * Handle compat ioctl calls
   1124 	 *
   1125 	 * * If compat code is not loaded, stub returns ENOSYS and we just
   1126 	 *   check the "native" cmd's
   1127 	 * * If compat code is loaded but does not recognize the cmd, it
   1128 	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
   1129 	 * * If compat code returns EAGAIN, we need to finish via config
   1130 	 * * Otherwise the cmd has been handled and we just return
   1131 	 */
   1132 	module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1133 	MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
   1134 	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
   1135 	    enosys(), retcode);
   1136 	if (retcode == ENOSYS)
   1137 		retcode = 0;
   1138 	else if (retcode == EAGAIN)
   1139 		goto config;
   1140 	else if (retcode != EPASSTHROUGH)
   1141 		return retcode;
   1142 
   1143 	module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1144 	MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
   1145 	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
   1146 	    enosys(), retcode);
   1147 	if (retcode == ENOSYS)
   1148 		retcode = 0;
   1149 	else if (retcode == EAGAIN)
   1150 		goto config;
   1151 	else if (retcode != EPASSTHROUGH)
   1152 		return retcode;
   1153 
   1154 	/*
   1155 	 * XXX
   1156 	 * Handling of FAIL_DISK80 command requires us to retain retcode's
   1157 	 * value of EPASSTHROUGH.  If you add more compat code later, make
   1158 	 * sure you don't overwrite retcode and break this!
   1159 	 */
   1160 
   1161 	switch (cmd) {
   1162 
   1163 		/* configure the system */
   1164 	case RAIDFRAME_CONFIGURE:
   1165 #ifdef _LP64
   1166 	case RAIDFRAME_CONFIGURE32:
   1167 #endif
   1168 
   1169 		if (raidPtr->valid) {
   1170 			/* There is a valid RAID set running on this unit! */
   1171 			printf("raid%d: Device already configured!\n",unit);
   1172 			return(EINVAL);
   1173 		}
   1174 
   1175 		/* copy-in the configuration information */
   1176 		/* data points to a pointer to the configuration structure */
   1177 
   1178 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1179 		if (k_cfg == NULL) {
   1180 			return (ENOMEM);
   1181 		}
   1182 #ifdef _LP64
   1183 		if (cmd == RAIDFRAME_CONFIGURE32 &&
   1184 		    (l->l_proc->p_flag & PK_32) != 0)
   1185 			MODULE_CALL_HOOK(raidframe_netbsd32_config_hook,
   1186 			    (data, k_cfg), enosys(), retcode);
   1187 		else
   1188 #endif
   1189 		{
   1190 			u_cfg = *((RF_Config_t **) data);
   1191 			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1192 		}
   1193 		if (retcode) {
   1194 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1195 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1196 				retcode));
   1197 			goto no_config;
   1198 		}
   1199 		goto config;
   1200 	config:
   1201 		rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1202 
   1203 		/* allocate a buffer for the layout-specific data, and copy it
   1204 		 * in */
   1205 		if (k_cfg->layoutSpecificSize) {
   1206 			if (k_cfg->layoutSpecificSize > 10000) {
   1207 				/* sanity check */
   1208 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1209 				retcode = EINVAL;
   1210 				goto no_config;
   1211 			}
   1212 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1213 			    (u_char *));
   1214 			if (specific_buf == NULL) {
   1215 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1216 				retcode = ENOMEM;
   1217 				goto no_config;
   1218 			}
   1219 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1220 			    k_cfg->layoutSpecificSize);
   1221 			if (retcode) {
   1222 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1223 				RF_Free(specific_buf,
   1224 					k_cfg->layoutSpecificSize);
   1225 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1226 					retcode));
   1227 				goto no_config;
   1228 			}
   1229 		} else
   1230 			specific_buf = NULL;
   1231 		k_cfg->layoutSpecific = specific_buf;
   1232 
   1233 		/* should do some kind of sanity check on the configuration.
   1234 		 * Store the sum of all the bytes in the last byte? */
   1235 
   1236 		/* configure the system */
   1237 
   1238 		/*
   1239 		 * Clear the entire RAID descriptor, just to make sure
   1240 		 *  there is no stale data left in the case of a
   1241 		 *  reconfiguration
   1242 		 */
   1243 		memset(raidPtr, 0, sizeof(*raidPtr));
   1244 		raidPtr->softc = rs;
   1245 		raidPtr->raidid = unit;
   1246 
   1247 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1248 
   1249 		if (retcode == 0) {
   1250 
   1251 			/* allow this many simultaneous IO's to
   1252 			   this RAID device */
   1253 			raidPtr->openings = RAIDOUTSTANDING;
   1254 
   1255 			raidinit(rs);
   1256 			raid_wakeup(raidPtr);
   1257 			rf_markalldirty(raidPtr);
   1258 		}
   1259 		/* free the buffers.  No return code here. */
   1260 		if (k_cfg->layoutSpecificSize) {
   1261 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1262 		}
   1263 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1264 
   1265 	no_config:
   1266 		/*
   1267 		 * If configuration failed, set sc_flags so that we
   1268 		 * will detach the device when we close it.
   1269 		 */
   1270 		if (retcode != 0)
   1271 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1272 		return (retcode);
   1273 
   1274 		/* shutdown the system */
   1275 	case RAIDFRAME_SHUTDOWN:
   1276 
   1277 		part = DISKPART(dev);
   1278 		pmask = (1 << part);
   1279 
   1280 		if ((error = raidlock(rs)) != 0)
   1281 			return (error);
   1282 
   1283 		if (DK_BUSY(dksc, pmask) ||
   1284 		    raidPtr->recon_in_progress != 0 ||
   1285 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1286 		    raidPtr->copyback_in_progress != 0)
   1287 			retcode = EBUSY;
   1288 		else {
   1289 			/* detach and free on close */
   1290 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1291 			retcode = 0;
   1292 		}
   1293 
   1294 		raidunlock(rs);
   1295 
   1296 		return (retcode);
   1297 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1298 		return rf_get_component_label(raidPtr, data);
   1299 
   1300 #if 0
   1301 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1302 		clabel = (RF_ComponentLabel_t *) data;
   1303 
   1304 		/* XXX check the label for valid stuff... */
   1305 		/* Note that some things *should not* get modified --
   1306 		   the user should be re-initing the labels instead of
   1307 		   trying to patch things.
   1308 		   */
   1309 
   1310 		raidid = raidPtr->raidid;
   1311 #ifdef DEBUG
   1312 		printf("raid%d: Got component label:\n", raidid);
   1313 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1314 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1315 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1316 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1317 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1318 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1319 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1320 #endif	/* DEBUG */
   1321 		clabel->row = 0;
   1322 		column = clabel->column;
   1323 
   1324 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1325 			return(EINVAL);
   1326 		}
   1327 
   1328 		/* XXX this isn't allowed to do anything for now :-) */
   1329 
   1330 		/* XXX and before it is, we need to fill in the rest
   1331 		   of the fields!?!?!?! */
   1332 		memcpy(raidget_component_label(raidPtr, column),
   1333 		    clabel, sizeof(*clabel));
   1334 		raidflush_component_label(raidPtr, column);
   1335 		return (0);
   1336 #endif	/* 0 */
   1337 
   1338 	case RAIDFRAME_INIT_LABELS:
   1339 		clabel = (RF_ComponentLabel_t *) data;
   1340 		/*
   1341 		   we only want the serial number from
   1342 		   the above.  We get all the rest of the information
   1343 		   from the config that was used to create this RAID
   1344 		   set.
   1345 		   */
   1346 
   1347 		raidPtr->serial_number = clabel->serial_number;
   1348 
   1349 		for(column=0;column<raidPtr->numCol;column++) {
   1350 			diskPtr = &raidPtr->Disks[column];
   1351 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1352 				ci_label = raidget_component_label(raidPtr,
   1353 				    column);
   1354 				/* Zeroing this is important. */
   1355 				memset(ci_label, 0, sizeof(*ci_label));
   1356 				raid_init_component_label(raidPtr, ci_label);
   1357 				ci_label->serial_number =
   1358 				    raidPtr->serial_number;
   1359 				ci_label->row = 0; /* we dont' pretend to support more */
   1360 				rf_component_label_set_partitionsize(ci_label,
   1361 				    diskPtr->partitionSize);
   1362 				ci_label->column = column;
   1363 				raidflush_component_label(raidPtr, column);
   1364 			}
   1365 			/* XXXjld what about the spares? */
   1366 		}
   1367 
   1368 		return (retcode);
   1369 	case RAIDFRAME_SET_AUTOCONFIG:
   1370 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1371 		printf("raid%d: New autoconfig value is: %d\n",
   1372 		       raidPtr->raidid, d);
   1373 		*(int *) data = d;
   1374 		return (retcode);
   1375 
   1376 	case RAIDFRAME_SET_ROOT:
   1377 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1378 		printf("raid%d: New rootpartition value is: %d\n",
   1379 		       raidPtr->raidid, d);
   1380 		*(int *) data = d;
   1381 		return (retcode);
   1382 
   1383 		/* initialize all parity */
   1384 	case RAIDFRAME_REWRITEPARITY:
   1385 
   1386 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1387 			/* Parity for RAID 0 is trivially correct */
   1388 			raidPtr->parity_good = RF_RAID_CLEAN;
   1389 			return(0);
   1390 		}
   1391 
   1392 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1393 			/* Re-write is already in progress! */
   1394 			return(EINVAL);
   1395 		}
   1396 
   1397 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1398 					   rf_RewriteParityThread,
   1399 					   raidPtr,"raid_parity");
   1400 		return (retcode);
   1401 
   1402 
   1403 	case RAIDFRAME_ADD_HOT_SPARE:
   1404 		sparePtr = (RF_SingleComponent_t *) data;
   1405 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1406 		retcode = rf_add_hot_spare(raidPtr, &component);
   1407 		return(retcode);
   1408 
   1409 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1410 		return(retcode);
   1411 
   1412 	case RAIDFRAME_DELETE_COMPONENT:
   1413 		componentPtr = (RF_SingleComponent_t *)data;
   1414 		memcpy( &component, componentPtr,
   1415 			sizeof(RF_SingleComponent_t));
   1416 		retcode = rf_delete_component(raidPtr, &component);
   1417 		return(retcode);
   1418 
   1419 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1420 		componentPtr = (RF_SingleComponent_t *)data;
   1421 		memcpy( &component, componentPtr,
   1422 			sizeof(RF_SingleComponent_t));
   1423 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1424 		return(retcode);
   1425 
   1426 	case RAIDFRAME_REBUILD_IN_PLACE:
   1427 
   1428 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1429 			/* Can't do this on a RAID 0!! */
   1430 			return(EINVAL);
   1431 		}
   1432 
   1433 		if (raidPtr->recon_in_progress == 1) {
   1434 			/* a reconstruct is already in progress! */
   1435 			return(EINVAL);
   1436 		}
   1437 
   1438 		componentPtr = (RF_SingleComponent_t *) data;
   1439 		memcpy( &component, componentPtr,
   1440 			sizeof(RF_SingleComponent_t));
   1441 		component.row = 0; /* we don't support any more */
   1442 		column = component.column;
   1443 
   1444 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1445 			return(EINVAL);
   1446 		}
   1447 
   1448 		rf_lock_mutex2(raidPtr->mutex);
   1449 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1450 		    (raidPtr->numFailures > 0)) {
   1451 			/* XXX 0 above shouldn't be constant!!! */
   1452 			/* some component other than this has failed.
   1453 			   Let's not make things worse than they already
   1454 			   are... */
   1455 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1456 			       raidPtr->raidid);
   1457 			printf("raid%d:     Col: %d   Too many failures.\n",
   1458 			       raidPtr->raidid, column);
   1459 			rf_unlock_mutex2(raidPtr->mutex);
   1460 			return (EINVAL);
   1461 		}
   1462 		if (raidPtr->Disks[column].status ==
   1463 		    rf_ds_reconstructing) {
   1464 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1465 			       raidPtr->raidid);
   1466 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1467 
   1468 			rf_unlock_mutex2(raidPtr->mutex);
   1469 			return (EINVAL);
   1470 		}
   1471 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1472 			rf_unlock_mutex2(raidPtr->mutex);
   1473 			return (EINVAL);
   1474 		}
   1475 		rf_unlock_mutex2(raidPtr->mutex);
   1476 
   1477 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1478 		if (rrint == NULL)
   1479 			return(ENOMEM);
   1480 
   1481 		rrint->col = column;
   1482 		rrint->raidPtr = raidPtr;
   1483 
   1484 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1485 					   rf_ReconstructInPlaceThread,
   1486 					   rrint, "raid_reconip");
   1487 		return(retcode);
   1488 
   1489 	case RAIDFRAME_GET_INFO:
   1490 #ifdef _LP64
   1491 	case RAIDFRAME_GET_INFO32:
   1492 #endif	/* LP64 */
   1493 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1494 			  (RF_DeviceConfig_t *));
   1495 		if (d_cfg == NULL)
   1496 			return (ENOMEM);
   1497 		retcode = rf_get_info(raidPtr, d_cfg);
   1498 		if (retcode == 0) {
   1499 #ifdef _LP64
   1500 			if (raidframe_netbsd32_config_hook.hooked &&
   1501 			    cmd == RAIDFRAME_GET_INFO32)
   1502 				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
   1503 			else
   1504 #endif	/* _LP64 */
   1505 				ucfgp = *(RF_DeviceConfig_t **)data;
   1506 			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
   1507 		}
   1508 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1509 
   1510 		return (retcode);
   1511 
   1512 	case RAIDFRAME_CHECK_PARITY:
   1513 		*(int *) data = raidPtr->parity_good;
   1514 		return (0);
   1515 
   1516 	case RAIDFRAME_PARITYMAP_STATUS:
   1517 		if (rf_paritymap_ineligible(raidPtr))
   1518 			return EINVAL;
   1519 		rf_paritymap_status(raidPtr->parity_map,
   1520 		    (struct rf_pmstat *)data);
   1521 		return 0;
   1522 
   1523 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1524 		if (rf_paritymap_ineligible(raidPtr))
   1525 			return EINVAL;
   1526 		if (raidPtr->parity_map == NULL)
   1527 			return ENOENT; /* ??? */
   1528 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1529 			(struct rf_pmparams *)data, 1))
   1530 			return EINVAL;
   1531 		return 0;
   1532 
   1533 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1534 		if (rf_paritymap_ineligible(raidPtr))
   1535 			return EINVAL;
   1536 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1537 		return 0;
   1538 
   1539 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1540 		if (rf_paritymap_ineligible(raidPtr))
   1541 			return EINVAL;
   1542 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1543 		/* XXX should errors be passed up? */
   1544 		return 0;
   1545 
   1546 	case RAIDFRAME_RESET_ACCTOTALS:
   1547 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1548 		return (0);
   1549 
   1550 	case RAIDFRAME_GET_ACCTOTALS:
   1551 		totals = (RF_AccTotals_t *) data;
   1552 		*totals = raidPtr->acc_totals;
   1553 		return (0);
   1554 
   1555 	case RAIDFRAME_KEEP_ACCTOTALS:
   1556 		raidPtr->keep_acc_totals = *(int *)data;
   1557 		return (0);
   1558 
   1559 	case RAIDFRAME_GET_SIZE:
   1560 		*(int *) data = raidPtr->totalSectors;
   1561 		return (0);
   1562 
   1563 		/* fail a disk & optionally start reconstruction */
   1564 	case RAIDFRAME_FAIL_DISK80:
   1565 		/* Check if we called compat code for this cmd */
   1566 		if (retcode != EPASSTHROUGH)
   1567 			return EINVAL;
   1568 		/* FALLTHRU */
   1569 	case RAIDFRAME_FAIL_DISK:
   1570 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1571 			/* Can't do this on a RAID 0!! */
   1572 			return(EINVAL);
   1573 		}
   1574 
   1575 		rr = (struct rf_recon_req *) data;
   1576 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1577 			return (EINVAL);
   1578 
   1579 		rf_lock_mutex2(raidPtr->mutex);
   1580 		if (raidPtr->status == rf_rs_reconstructing) {
   1581 			/* you can't fail a disk while we're reconstructing! */
   1582 			/* XXX wrong for RAID6 */
   1583 			rf_unlock_mutex2(raidPtr->mutex);
   1584 			return (EINVAL);
   1585 		}
   1586 		if ((raidPtr->Disks[rr->col].status ==
   1587 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1588 			/* some other component has failed.  Let's not make
   1589 			   things worse. XXX wrong for RAID6 */
   1590 			rf_unlock_mutex2(raidPtr->mutex);
   1591 			return (EINVAL);
   1592 		}
   1593 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1594 			/* Can't fail a spared disk! */
   1595 			rf_unlock_mutex2(raidPtr->mutex);
   1596 			return (EINVAL);
   1597 		}
   1598 		rf_unlock_mutex2(raidPtr->mutex);
   1599 
   1600 		/* make a copy of the recon request so that we don't rely on
   1601 		 * the user's buffer */
   1602 		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
   1603 		if (rrint == NULL)
   1604 			return(ENOMEM);
   1605 		rrint->col = rr->col;
   1606 		rrint->flags = rr->flags;
   1607 		rrint->raidPtr = raidPtr;
   1608 
   1609 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1610 					   rf_ReconThread,
   1611 					   rrint, "raid_recon");
   1612 		return (0);
   1613 
   1614 		/* invoke a copyback operation after recon on whatever disk
   1615 		 * needs it, if any */
   1616 	case RAIDFRAME_COPYBACK:
   1617 
   1618 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1619 			/* This makes no sense on a RAID 0!! */
   1620 			return(EINVAL);
   1621 		}
   1622 
   1623 		if (raidPtr->copyback_in_progress == 1) {
   1624 			/* Copyback is already in progress! */
   1625 			return(EINVAL);
   1626 		}
   1627 
   1628 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1629 					   rf_CopybackThread,
   1630 					   raidPtr,"raid_copyback");
   1631 		return (retcode);
   1632 
   1633 		/* return the percentage completion of reconstruction */
   1634 	case RAIDFRAME_CHECK_RECON_STATUS:
   1635 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1636 			/* This makes no sense on a RAID 0, so tell the
   1637 			   user it's done. */
   1638 			*(int *) data = 100;
   1639 			return(0);
   1640 		}
   1641 		if (raidPtr->status != rf_rs_reconstructing)
   1642 			*(int *) data = 100;
   1643 		else {
   1644 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1645 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1646 			} else {
   1647 				*(int *) data = 0;
   1648 			}
   1649 		}
   1650 		return (0);
   1651 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1652 		rf_check_recon_status_ext(raidPtr, data);
   1653 		return (0);
   1654 
   1655 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1656 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1657 			/* This makes no sense on a RAID 0, so tell the
   1658 			   user it's done. */
   1659 			*(int *) data = 100;
   1660 			return(0);
   1661 		}
   1662 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1663 			*(int *) data = 100 *
   1664 				raidPtr->parity_rewrite_stripes_done /
   1665 				raidPtr->Layout.numStripe;
   1666 		} else {
   1667 			*(int *) data = 100;
   1668 		}
   1669 		return (0);
   1670 
   1671 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1672 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1673 		return (0);
   1674 
   1675 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1676 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1677 			/* This makes no sense on a RAID 0 */
   1678 			*(int *) data = 100;
   1679 			return(0);
   1680 		}
   1681 		if (raidPtr->copyback_in_progress == 1) {
   1682 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1683 				raidPtr->Layout.numStripe;
   1684 		} else {
   1685 			*(int *) data = 100;
   1686 		}
   1687 		return (0);
   1688 
   1689 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1690 		rf_check_copyback_status_ext(raidPtr, data);
   1691 		return 0;
   1692 
   1693 	case RAIDFRAME_SET_LAST_UNIT:
   1694 		for (column = 0; column < raidPtr->numCol; column++)
   1695 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1696 				return EBUSY;
   1697 
   1698 		for (column = 0; column < raidPtr->numCol; column++) {
   1699 			clabel = raidget_component_label(raidPtr, column);
   1700 			clabel->last_unit = *(int *)data;
   1701 			raidflush_component_label(raidPtr, column);
   1702 		}
   1703 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1704 		return 0;
   1705 
   1706 		/* the sparetable daemon calls this to wait for the kernel to
   1707 		 * need a spare table. this ioctl does not return until a
   1708 		 * spare table is needed. XXX -- calling mpsleep here in the
   1709 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1710 		 * -- I should either compute the spare table in the kernel,
   1711 		 * or have a different -- XXX XXX -- interface (a different
   1712 		 * character device) for delivering the table     -- XXX */
   1713 #if 0
   1714 	case RAIDFRAME_SPARET_WAIT:
   1715 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1716 		while (!rf_sparet_wait_queue)
   1717 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1718 		waitreq = rf_sparet_wait_queue;
   1719 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1720 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1721 
   1722 		/* structure assignment */
   1723 		*((RF_SparetWait_t *) data) = *waitreq;
   1724 
   1725 		RF_Free(waitreq, sizeof(*waitreq));
   1726 		return (0);
   1727 
   1728 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1729 		 * code in it that will cause the dameon to exit */
   1730 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1731 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1732 		waitreq->fcol = -1;
   1733 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1734 		waitreq->next = rf_sparet_wait_queue;
   1735 		rf_sparet_wait_queue = waitreq;
   1736 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1737 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1738 		return (0);
   1739 
   1740 		/* used by the spare table daemon to deliver a spare table
   1741 		 * into the kernel */
   1742 	case RAIDFRAME_SEND_SPARET:
   1743 
   1744 		/* install the spare table */
   1745 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1746 
   1747 		/* respond to the requestor.  the return status of the spare
   1748 		 * table installation is passed in the "fcol" field */
   1749 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1750 		waitreq->fcol = retcode;
   1751 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1752 		waitreq->next = rf_sparet_resp_queue;
   1753 		rf_sparet_resp_queue = waitreq;
   1754 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1755 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1756 
   1757 		return (retcode);
   1758 #endif
   1759 
   1760 	default:
   1761 		break; /* fall through to the os-specific code below */
   1762 
   1763 	}
   1764 
   1765 	if (!raidPtr->valid)
   1766 		return (EINVAL);
   1767 
   1768 	/*
   1769 	 * Add support for "regular" device ioctls here.
   1770 	 */
   1771 
   1772 	switch (cmd) {
   1773 	case DIOCGCACHE:
   1774 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1775 		break;
   1776 
   1777 	case DIOCCACHESYNC:
   1778 		retcode = rf_sync_component_caches(raidPtr);
   1779 		break;
   1780 
   1781 	default:
   1782 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1783 		break;
   1784 	}
   1785 
   1786 	return (retcode);
   1787 
   1788 }
   1789 
   1790 
   1791 /* raidinit -- complete the rest of the initialization for the
   1792    RAIDframe device.  */
   1793 
   1794 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* Attach failed: clean up and return with RAIDF_INITED
		 * left unset, so the unit stays unusable. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Finally, look for wedges on the new device. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1850 
   1851 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1852 /* wake up the daemon & tell it to get us a spare table
   1853  * XXX
   1854  * the entries in the queues should be tagged with the raidPtr
   1855  * so that in the extremely rare case that two recons happen at once,
   1856  * we know for which device were requesting a spare table
   1857  * XXX
   1858  *
   1859  * XXX This code is not currently used. GO
   1860  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the global wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Dequeue the response.  NOTE(review): as the XXX above says, the
	 * queues are not tagged with the raidPtr, so concurrent recons
	 * could in principle cross responses. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* fcol carries the daemon's return code in the response. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1884 #endif
   1885 
   1886 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1887  * bp & passes it down.
   1888  * any calls originating in the kernel must use non-blocking I/O
   1889  * do some extra sanity checking to return "appropriate" error values for
   1890  * certain conditions (to make some standard utilities work)
   1891  *
   1892  * Formerly known as: rf_DoAccessKernel
   1893  */
   1894 void
   1895 raidstart(RF_Raid_t *raidPtr)
   1896 {
   1897 	struct raid_softc *rs;
   1898 	struct dk_softc *dksc;
   1899 
   1900 	rs = raidPtr->softc;
   1901 	dksc = &rs->sc_dksc;
   1902 	/* quick check to see if anything has died recently */
   1903 	rf_lock_mutex2(raidPtr->mutex);
   1904 	if (raidPtr->numNewFailures > 0) {
   1905 		rf_unlock_mutex2(raidPtr->mutex);
   1906 		rf_update_component_labels(raidPtr,
   1907 					   RF_NORMAL_COMPONENT_UPDATE);
   1908 		rf_lock_mutex2(raidPtr->mutex);
   1909 		raidPtr->numNewFailures--;
   1910 	}
   1911 	rf_unlock_mutex2(raidPtr->mutex);
   1912 
   1913 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1914 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1915 		return;
   1916 	}
   1917 
   1918 	dk_start(dksc, NULL);
   1919 }
   1920 
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* No openings available: tell the caller to retry later. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* Convert bytes to sectors; 'pb' accounts for a trailing partial
	 * sector in the range check below. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): the "1 ||" makes this condition always true;
	 * db1_printf() is presumably compiled away unless debugging --
	 * confirm before cleaning this up. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Range check; the "sum < x" clauses also catch wraparound. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject transfers that are not a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening.  NOTE(review): presumably returned when
	 * the access completes -- not visible in this function. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1993 
   1994 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1995 
   1996 int
   1997 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1998 {
   1999 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2000 	struct buf *bp;
   2001 
   2002 	req->queue = queue;
   2003 	bp = req->bp;
   2004 
   2005 	switch (req->type) {
   2006 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2007 		/* XXX need to do something extra here.. */
   2008 		/* I'm leaving this in, as I've never actually seen it used,
   2009 		 * and I'd like folks to report it... GO */
   2010 		printf(("WAKEUP CALLED\n"));
   2011 		queue->numOutstanding++;
   2012 
   2013 		bp->b_flags = 0;
   2014 		bp->b_private = req;
   2015 
   2016 		KernelWakeupFunc(bp);
   2017 		break;
   2018 
   2019 	case RF_IO_TYPE_READ:
   2020 	case RF_IO_TYPE_WRITE:
   2021 #if RF_ACC_TRACE > 0
   2022 		if (req->tracerec) {
   2023 			RF_ETIMER_START(req->tracerec->timer);
   2024 		}
   2025 #endif
   2026 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2027 		    op, queue->rf_cinfo->ci_dev,
   2028 		    req->sectorOffset, req->numSector,
   2029 		    req->buf, KernelWakeupFunc, (void *) req,
   2030 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2031 
   2032 		if (rf_debugKernelAccess) {
   2033 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2034 				(long) bp->b_blkno));
   2035 		}
   2036 		queue->numOutstanding++;
   2037 		queue->last_deq_sector = req->sectorOffset;
   2038 		/* acc wouldn't have been let in if there were any pending
   2039 		 * reqs at any other priority */
   2040 		queue->curPriority = req->priority;
   2041 
   2042 		db1_printf(("Going for %c to unit %d col %d\n",
   2043 			    req->type, queue->raidPtr->raidid,
   2044 			    queue->col));
   2045 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2046 			(int) req->sectorOffset, (int) req->numSector,
   2047 			(int) (req->numSector <<
   2048 			    queue->raidPtr->logBytesPerSector),
   2049 			(int) queue->raidPtr->logBytesPerSector));
   2050 
   2051 		/*
   2052 		 * XXX: drop lock here since this can block at
   2053 		 * least with backing SCSI devices.  Retake it
   2054 		 * to minimize fuss with calling interfaces.
   2055 		 */
   2056 
   2057 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2058 		bdev_strategy(bp);
   2059 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2060 		break;
   2061 
   2062 	default:
   2063 		panic("bad req->type in rf_DispatchKernelIO");
   2064 	}
   2065 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2066 
   2067 	return (0);
   2068 }
   2069 /* this is the callback function associated with a I/O invoked from
   2070    kernel code.
   2071  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by rf_DispatchKernelIO(). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the time this physical I/O spent in flight. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what makes raidstart() rewrite
			 * the component labels later. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2138 
   2139 
   2140 /*
   2141  * initialize a buf structure for doing an I/O in the kernel.
   2142  */
   2143 static void
   2144 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2145        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2146        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2147        struct proc *b_proc)
   2148 {
   2149 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2150 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2151 	bp->b_oflags = 0;
   2152 	bp->b_cflags = 0;
   2153 	bp->b_bcount = numSect << logBytesPerSector;
   2154 	bp->b_bufsize = bp->b_bcount;
   2155 	bp->b_error = 0;
   2156 	bp->b_dev = dev;
   2157 	bp->b_data = bf;
   2158 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2159 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2160 	if (bp->b_bcount == 0) {
   2161 		panic("bp->b_bcount is zero in InitBP!!");
   2162 	}
   2163 	bp->b_proc = b_proc;
   2164 	bp->b_iodone = cbFunc;
   2165 	bp->b_private = cbArg;
   2166 }
   2167 
   2168 /*
   2169  * Wait interruptibly for an exclusive lock.
   2170  *
   2171  * XXX
   2172  * Several drivers do this; it should be abstracted and made MP-safe.
   2173  * (Hmm... where have we seen this warning before :->  GO )
   2174  */
   2175 static int
   2176 raidlock(struct raid_softc *rs)
   2177 {
   2178 	int     error;
   2179 
   2180 	error = 0;
   2181 	mutex_enter(&rs->sc_mutex);
   2182 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2183 		rs->sc_flags |= RAIDF_WANTED;
   2184 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2185 		if (error != 0)
   2186 			goto done;
   2187 	}
   2188 	rs->sc_flags |= RAIDF_LOCKED;
   2189 done:
   2190 	mutex_exit(&rs->sc_mutex);
   2191 	return (error);
   2192 }
   2193 /*
   2194  * Unlock and wake up any waiters.
   2195  */
   2196 static void
   2197 raidunlock(struct raid_softc *rs)
   2198 {
   2199 
   2200 	mutex_enter(&rs->sc_mutex);
   2201 	rs->sc_flags &= ~RAIDF_LOCKED;
   2202 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2203 		rs->sc_flags &= ~RAIDF_WANTED;
   2204 		cv_broadcast(&rs->sc_cv);
   2205 	}
   2206 	mutex_exit(&rs->sc_mutex);
   2207 }
   2208 
   2209 
   2210 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2211 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2212 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2213 
static daddr_t
rf_component_info_offset(void)
{

	/* Byte offset of the component-label area from the start of
	 * the component. */
	return RF_COMPONENT_INFO_OFFSET;
}
   2220 
   2221 static daddr_t
   2222 rf_component_info_size(unsigned secsize)
   2223 {
   2224 	daddr_t info_size;
   2225 
   2226 	KASSERT(secsize);
   2227 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2228 		info_size = secsize;
   2229 	else
   2230 		info_size = RF_COMPONENT_INFO_SIZE;
   2231 
   2232 	return info_size;
   2233 }
   2234 
   2235 static daddr_t
   2236 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2237 {
   2238 	daddr_t map_offset;
   2239 
   2240 	KASSERT(raidPtr->bytesPerSector);
   2241 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2242 		map_offset = raidPtr->bytesPerSector;
   2243 	else
   2244 		map_offset = RF_COMPONENT_INFO_SIZE;
   2245 	map_offset += rf_component_info_offset();
   2246 
   2247 	return map_offset;
   2248 }
   2249 
   2250 static daddr_t
   2251 rf_parity_map_size(RF_Raid_t *raidPtr)
   2252 {
   2253 	daddr_t map_size;
   2254 
   2255 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2256 		map_size = raidPtr->bytesPerSector;
   2257 	else
   2258 		map_size = RF_PARITY_MAP_SIZE;
   2259 
   2260 	return map_size;
   2261 }
   2262 
   2263 int
   2264 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2265 {
   2266 	RF_ComponentLabel_t *clabel;
   2267 
   2268 	clabel = raidget_component_label(raidPtr, col);
   2269 	clabel->clean = RF_RAID_CLEAN;
   2270 	raidflush_component_label(raidPtr, col);
   2271 	return(0);
   2272 }
   2273 
   2274 
   2275 int
   2276 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2277 {
   2278 	RF_ComponentLabel_t *clabel;
   2279 
   2280 	clabel = raidget_component_label(raidPtr, col);
   2281 	clabel->clean = RF_RAID_DIRTY;
   2282 	raidflush_component_label(raidPtr, col);
   2283 	return(0);
   2284 }
   2285 
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Read the on-disk component label for 'col' into its in-core
	 * copy (raid_cinfo[col].ci_label). */
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2295 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* Return the in-core component label for 'col'; no I/O here. */
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2301 
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	/* Stamp the in-core label with the set's current modification
	 * counter and write it back to the component. */
	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2316 
   2317 
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Thin wrapper: read the component-label area into 'clabel'. */
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2327 
   2328 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read synchronously and wait for it to finish. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Only the first 'msize' bytes of the area are wanted. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2365 
   2366 
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	/* Thin wrapper: write 'clabel' to the component-label area,
	 * synchronously (asyncp == 0). */
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2376 
   2377 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-pad the on-disk area beyond the 'msize' payload bytes. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	/* NOTE(review): in the async case we return immediately; the
	 * buffer is not brelse'd or error-checked here -- presumably
	 * released at biodone() time.  Confirm. */
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2411 
   2412 void
   2413 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2414 {
   2415 	int c;
   2416 
   2417 	for (c = 0; c < raidPtr->numCol; c++) {
   2418 		/* Skip dead disks. */
   2419 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2420 			continue;
   2421 		/* XXXjld: what if an error occurs here? */
   2422 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2423 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2424 		    RF_PARITYMAP_NBYTE,
   2425 		    rf_parity_map_offset(raidPtr),
   2426 		    rf_parity_map_size(raidPtr), 0);
   2427 	}
   2428 }
   2429 
   2430 void
   2431 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2432 {
   2433 	struct rf_paritymap_ondisk tmp;
   2434 	int c,first;
   2435 
   2436 	first=1;
   2437 	for (c = 0; c < raidPtr->numCol; c++) {
   2438 		/* Skip dead disks. */
   2439 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2440 			continue;
   2441 		raidread_component_area(raidPtr->Disks[c].dev,
   2442 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2443 		    RF_PARITYMAP_NBYTE,
   2444 		    rf_parity_map_offset(raidPtr),
   2445 		    rf_parity_map_size(raidPtr));
   2446 		if (first) {
   2447 			memcpy(map, &tmp, sizeof(*map));
   2448 			first = 0;
   2449 		} else {
   2450 			rf_paritymap_merge(map, &tmp);
   2451 		}
   2452 	}
   2453 }
   2454 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the mod counter, then mark every live component (and every
	 * in-use spare) dirty on disk. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column claims this spare,
			 * scol keeps its prior value (initially -1) --
			 * confirm that case cannot occur here. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2514 
   2515 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	/* Pass 1: rewrite the label on every optimal data column. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only the final (shutdown-time) update may set the
			 * clean bit, and only when parity is known good. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Pass 2: same treatment for spares that are in active use. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2593 
   2594 void
   2595 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2596 {
   2597 
   2598 	if (vp != NULL) {
   2599 		if (auto_configured == 1) {
   2600 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2601 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2602 			vput(vp);
   2603 
   2604 		} else {
   2605 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2606 		}
   2607 	}
   2608 }
   2609 
   2610 
   2611 void
   2612 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2613 {
   2614 	int r,c;
   2615 	struct vnode *vp;
   2616 	int acd;
   2617 
   2618 
   2619 	/* We take this opportunity to close the vnodes like we should.. */
   2620 
   2621 	for (c = 0; c < raidPtr->numCol; c++) {
   2622 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2623 		acd = raidPtr->Disks[c].auto_configured;
   2624 		rf_close_component(raidPtr, vp, acd);
   2625 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2626 		raidPtr->Disks[c].auto_configured = 0;
   2627 	}
   2628 
   2629 	for (r = 0; r < raidPtr->numSpare; r++) {
   2630 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2631 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2632 		rf_close_component(raidPtr, vp, acd);
   2633 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2634 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2635 	}
   2636 }
   2637 
   2638 
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Kernel-thread body: fail the requested component (and, with
	 * RF_FDFLAGS_RECON, trigger reconstruction), then exit. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was allocated by our creator; we own and free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2660 
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* Kernel-thread body: rewrite all parity, then mark the set
	 * clean on success. */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2693 
   2694 
   2695 void
   2696 rf_CopybackThread(RF_Raid_t *raidPtr)
   2697 {
   2698 	int s;
   2699 
   2700 	raidPtr->copyback_in_progress = 1;
   2701 	s = splbio();
   2702 	rf_CopybackReconstructedData(raidPtr);
   2703 	splx(s);
   2704 	raidPtr->copyback_in_progress = 0;
   2705 
   2706 	/* That's all... */
   2707 	kthread_exit(0);	/* does not return */
   2708 }
   2709 
   2710 
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	/* Kernel-thread body: rebuild the named column in place, free
	 * the request (which we own), and exit. */
	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2728 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down the entire list built so
		     * far and abandon autoconfiguration. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* Prepend to the list; 'ac' takes ownership of
			 * clabel and the vnode. */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* Not a RAID component: release the label memory and
		 * close/drop the vnode we were handed. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2786 
/*
 * Scan all disk-class devices in the system for RAIDframe components.
 *
 * Returns a linked list of RF_AutoConfig_t entries (NULL if none were
 * found).  Each entry carries an open, referenced vnode for the
 * component; rf_get_component() closes the vnode of anything it
 * rejects, and callers of this function own the vnodes of accepted
 * entries.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				/*
				 * NOTE(review): opened with FREAD only but
				 * closed with FREAD|FWRITE -- longstanding
				 * pattern throughout this file; confirm it
				 * is intentional.
				 */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: accept only wedges whose
				   partition type is raidframe. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes over the vnode. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			/* Probe each FS_RAID partition in the disklabel. */
			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2990 
   2991 
   2992 int
   2993 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2994 {
   2995 
   2996 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2997 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2998 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2999 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3000 	    clabel->row >=0 &&
   3001 	    clabel->column >= 0 &&
   3002 	    clabel->num_rows > 0 &&
   3003 	    clabel->num_columns > 0 &&
   3004 	    clabel->row < clabel->num_rows &&
   3005 	    clabel->column < clabel->num_columns &&
   3006 	    clabel->blockSize > 0 &&
   3007 	    /*
   3008 	     * numBlocksHi may contain garbage, but it is ok since
   3009 	     * the type is unsigned.  If it is really garbage,
   3010 	     * rf_fix_old_label_size() will fix it.
   3011 	     */
   3012 	    rf_component_label_numblocks(clabel) > 0) {
   3013 		/*
   3014 		 * label looks reasonable enough...
   3015 		 * let's make sure it has no old garbage.
   3016 		 */
   3017 		if (numsecs)
   3018 			rf_fix_old_label_size(clabel, numsecs);
   3019 		return(1);
   3020 	}
   3021 	return(0);
   3022 }
   3023 
   3024 
   3025 /*
   3026  * For reasons yet unknown, some old component labels have garbage in
   3027  * the newer numBlocksHi region, and this causes lossage.  Since those
   3028  * disks will also have numsecs set to less than 32 bits of sectors,
   3029  * we can determine when this corruption has occurred, and fix it.
   3030  *
   3031  * The exact same problem, with the same unknown reason, happens to
   3032  * the partitionSizeHi member as well.
   3033  */
   3034 static void
   3035 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3036 {
   3037 
   3038 	if (numsecs < ((uint64_t)1 << 32)) {
   3039 		if (clabel->numBlocksHi) {
   3040 			printf("WARNING: total sectors < 32 bits, yet "
   3041 			       "numBlocksHi set\n"
   3042 			       "WARNING: resetting numBlocksHi to zero.\n");
   3043 			clabel->numBlocksHi = 0;
   3044 		}
   3045 
   3046 		if (clabel->partitionSizeHi) {
   3047 			printf("WARNING: total sectors < 32 bits, yet "
   3048 			       "partitionSizeHi set\n"
   3049 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3050 			clabel->partitionSizeHi = 0;
   3051 		}
   3052 	}
   3053 }
   3054 
   3055 
#ifdef DEBUG
/*
 * Dump a component label to the console in human-readable form.
 * Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition values map to: 0=No, 1=Force, 2=Soft; 3 is invalid */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* mask to 2 bits so out-of-range values print as "*invalid*" */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3089 
   3090 RF_ConfigSet_t *
   3091 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3092 {
   3093 	RF_AutoConfig_t *ac;
   3094 	RF_ConfigSet_t *config_sets;
   3095 	RF_ConfigSet_t *cset;
   3096 	RF_AutoConfig_t *ac_next;
   3097 
   3098 
   3099 	config_sets = NULL;
   3100 
   3101 	/* Go through the AutoConfig list, and figure out which components
   3102 	   belong to what sets.  */
   3103 	ac = ac_list;
   3104 	while(ac!=NULL) {
   3105 		/* we're going to putz with ac->next, so save it here
   3106 		   for use at the end of the loop */
   3107 		ac_next = ac->next;
   3108 
   3109 		if (config_sets == NULL) {
   3110 			/* will need at least this one... */
   3111 			config_sets = (RF_ConfigSet_t *)
   3112 				malloc(sizeof(RF_ConfigSet_t),
   3113 				       M_RAIDFRAME, M_NOWAIT);
   3114 			if (config_sets == NULL) {
   3115 				panic("rf_create_auto_sets: No memory!");
   3116 			}
   3117 			/* this one is easy :) */
   3118 			config_sets->ac = ac;
   3119 			config_sets->next = NULL;
   3120 			config_sets->rootable = 0;
   3121 			ac->next = NULL;
   3122 		} else {
   3123 			/* which set does this component fit into? */
   3124 			cset = config_sets;
   3125 			while(cset!=NULL) {
   3126 				if (rf_does_it_fit(cset, ac)) {
   3127 					/* looks like it matches... */
   3128 					ac->next = cset->ac;
   3129 					cset->ac = ac;
   3130 					break;
   3131 				}
   3132 				cset = cset->next;
   3133 			}
   3134 			if (cset==NULL) {
   3135 				/* didn't find a match above... new set..*/
   3136 				cset = (RF_ConfigSet_t *)
   3137 					malloc(sizeof(RF_ConfigSet_t),
   3138 					       M_RAIDFRAME, M_NOWAIT);
   3139 				if (cset == NULL) {
   3140 					panic("rf_create_auto_sets: No memory!");
   3141 				}
   3142 				cset->ac = ac;
   3143 				ac->next = NULL;
   3144 				cset->next = config_sets;
   3145 				cset->rootable = 0;
   3146 				config_sets = cset;
   3147 			}
   3148 		}
   3149 		ac = ac_next;
   3150 	}
   3151 
   3152 
   3153 	return(config_sets);
   3154 }
   3155 
   3156 static int
   3157 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3158 {
   3159 	RF_ComponentLabel_t *clabel1, *clabel2;
   3160 
   3161 	/* If this one matches the *first* one in the set, that's good
   3162 	   enough, since the other members of the set would have been
   3163 	   through here too... */
   3164 	/* note that we are not checking partitionSize here..
   3165 
   3166 	   Note that we are also not checking the mod_counters here.
   3167 	   If everything else matches except the mod_counter, that's
   3168 	   good enough for this test.  We will deal with the mod_counters
   3169 	   a little later in the autoconfiguration process.
   3170 
   3171 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3172 
   3173 	   The reason we don't check for this is that failed disks
   3174 	   will have lower modification counts.  If those disks are
   3175 	   not added to the set they used to belong to, then they will
   3176 	   form their own set, which may result in 2 different sets,
   3177 	   for example, competing to be configured at raid0, and
   3178 	   perhaps competing to be the root filesystem set.  If the
   3179 	   wrong ones get configured, or both attempt to become /,
   3180 	   weird behaviour and or serious lossage will occur.  Thus we
   3181 	   need to bring them into the fold here, and kick them out at
   3182 	   a later point.
   3183 
   3184 	*/
   3185 
   3186 	clabel1 = cset->ac->clabel;
   3187 	clabel2 = ac->clabel;
   3188 	if ((clabel1->version == clabel2->version) &&
   3189 	    (clabel1->serial_number == clabel2->serial_number) &&
   3190 	    (clabel1->num_rows == clabel2->num_rows) &&
   3191 	    (clabel1->num_columns == clabel2->num_columns) &&
   3192 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3193 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3194 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3195 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3196 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3197 	    (clabel1->blockSize == clabel2->blockSize) &&
   3198 	    rf_component_label_numblocks(clabel1) ==
   3199 	    rf_component_label_numblocks(clabel2) &&
   3200 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3201 	    (clabel1->root_partition == clabel2->root_partition) &&
   3202 	    (clabel1->last_unit == clabel2->last_unit) &&
   3203 	    (clabel1->config_order == clabel2->config_order)) {
   3204 		/* if it get's here, it almost *has* to be a match */
   3205 	} else {
   3206 		/* it's not consistent with somebody in the set..
   3207 		   punt */
   3208 		return(0);
   3209 	}
   3210 	/* all was fine.. it must fit... */
   3211 	return(1);
   3212 }
   3213 
/*
 * Decide whether a configuration set has enough live components to be
 * configured.  Returns 1 if so, 0 if too many members are missing.
 *
 * The largest mod_counter found in the set identifies its current
 * members; components with stale counters do not count as present.
 * RAID 1 is special-cased: only the loss of both halves of an
 * even/odd column pair is fatal.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a current (mod_counter-matching)
	   component occupying it. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing members; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3316 
   3317 void
   3318 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3319 			RF_Raid_t *raidPtr)
   3320 {
   3321 	RF_ComponentLabel_t *clabel;
   3322 	int i;
   3323 
   3324 	clabel = ac->clabel;
   3325 
   3326 	/* 1. Fill in the common stuff */
   3327 	config->numCol = clabel->num_columns;
   3328 	config->numSpare = 0; /* XXX should this be set here? */
   3329 	config->sectPerSU = clabel->sectPerSU;
   3330 	config->SUsPerPU = clabel->SUsPerPU;
   3331 	config->SUsPerRU = clabel->SUsPerRU;
   3332 	config->parityConfig = clabel->parityConfig;
   3333 	/* XXX... */
   3334 	strcpy(config->diskQueueType,"fifo");
   3335 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3336 	config->layoutSpecificSize = 0; /* XXX ?? */
   3337 
   3338 	while(ac!=NULL) {
   3339 		/* row/col values will be in range due to the checks
   3340 		   in reasonable_label() */
   3341 		strcpy(config->devnames[0][ac->clabel->column],
   3342 		       ac->devname);
   3343 		ac = ac->next;
   3344 	}
   3345 
   3346 	for(i=0;i<RF_MAXDBGV;i++) {
   3347 		config->debugVars[i][0] = 0;
   3348 	}
   3349 }
   3350 
   3351 int
   3352 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3353 {
   3354 	RF_ComponentLabel_t *clabel;
   3355 	int column;
   3356 	int sparecol;
   3357 
   3358 	raidPtr->autoconfigure = new_value;
   3359 
   3360 	for(column=0; column<raidPtr->numCol; column++) {
   3361 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3362 			clabel = raidget_component_label(raidPtr, column);
   3363 			clabel->autoconfigure = new_value;
   3364 			raidflush_component_label(raidPtr, column);
   3365 		}
   3366 	}
   3367 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3368 		sparecol = raidPtr->numCol + column;
   3369 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3370 			clabel = raidget_component_label(raidPtr, sparecol);
   3371 			clabel->autoconfigure = new_value;
   3372 			raidflush_component_label(raidPtr, sparecol);
   3373 		}
   3374 	}
   3375 	return(new_value);
   3376 }
   3377 
   3378 int
   3379 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3380 {
   3381 	RF_ComponentLabel_t *clabel;
   3382 	int column;
   3383 	int sparecol;
   3384 
   3385 	raidPtr->root_partition = new_value;
   3386 	for(column=0; column<raidPtr->numCol; column++) {
   3387 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3388 			clabel = raidget_component_label(raidPtr, column);
   3389 			clabel->root_partition = new_value;
   3390 			raidflush_component_label(raidPtr, column);
   3391 		}
   3392 	}
   3393 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3394 		sparecol = raidPtr->numCol + column;
   3395 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3396 			clabel = raidget_component_label(raidPtr, sparecol);
   3397 			clabel->root_partition = new_value;
   3398 			raidflush_component_label(raidPtr, sparecol);
   3399 		}
   3400 	}
   3401 	return(new_value);
   3402 }
   3403 
   3404 void
   3405 rf_release_all_vps(RF_ConfigSet_t *cset)
   3406 {
   3407 	RF_AutoConfig_t *ac;
   3408 
   3409 	ac = cset->ac;
   3410 	while(ac!=NULL) {
   3411 		/* Close the vp, and give it back */
   3412 		if (ac->vp) {
   3413 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3414 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3415 			vput(ac->vp);
   3416 			ac->vp = NULL;
   3417 		}
   3418 		ac = ac->next;
   3419 	}
   3420 }
   3421 
   3422 
   3423 void
   3424 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3425 {
   3426 	RF_AutoConfig_t *ac;
   3427 	RF_AutoConfig_t *next_ac;
   3428 
   3429 	ac = cset->ac;
   3430 	while(ac!=NULL) {
   3431 		next_ac = ac->next;
   3432 		/* nuke the label */
   3433 		free(ac->clabel, M_RAIDFRAME);
   3434 		/* cleanup the config structure */
   3435 		free(ac, M_RAIDFRAME);
   3436 		/* "next.." */
   3437 		ac = next_ac;
   3438 	}
   3439 	/* and, finally, nuke the config set */
   3440 	free(cset, M_RAIDFRAME);
   3441 }
   3442 
   3443 
/*
 * Initialize a component label from the current parameters of the
 * RAID set.  Per-component fields not assigned here (e.g. column and
 * partition size) are left for the caller to fill in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are always 1 in this implementation */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3476 
/*
 * Configure a RAID set from its autoconfiguration data.
 *
 * Chooses a unit number -- preferring the label's last_unit, falling
 * through to the next unit whose softc is not already valid --
 * allocates the softc, builds an RF_Config_t from the component
 * labels and runs rf_Configure().  On success the set's rootability
 * is recorded; on failure the softc is released.  Returns the
 * configured softc, or NULL.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* walk forward until a free (or brand-new) unit is found */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give back the softc */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3560 
/*
 * Initialize one of RAIDframe's item pools.
 *
 * p      - pool to initialize
 * size   - size in bytes of each pool item
 * w_chan - wait-channel name for the pool
 * xmin   - number of items to preallocate; also the low-water mark
 * xmax   - high-water mark
 *
 * Panics if the initial pool_prime() allocation fails.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3573 
   3574 /*
   3575  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3576  * to see if there is IO pending and if that IO could possibly be done
   3577  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3578  * otherwise.
   3579  *
   3580  */
   3581 int
   3582 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3583 {
   3584 	struct raid_softc *rs;
   3585 	struct dk_softc *dksc;
   3586 
   3587 	rs = raidPtr->softc;
   3588 	dksc = &rs->sc_dksc;
   3589 
   3590 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3591 		return 1;
   3592 
   3593 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3594 		/* there is work to do */
   3595 		return 0;
   3596 	}
   3597 	/* default is nothing to do */
   3598 	return 1;
   3599 }
   3600 
   3601 int
   3602 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3603 {
   3604 	uint64_t numsecs;
   3605 	unsigned secsize;
   3606 	int error;
   3607 
   3608 	error = getdisksize(vp, &numsecs, &secsize);
   3609 	if (error == 0) {
   3610 		diskPtr->blockSize = secsize;
   3611 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3612 		diskPtr->partitionSize = numsecs;
   3613 		return 0;
   3614 	}
   3615 	return error;
   3616 }
   3617 
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3623 
/*
 * Autoconfiguration attach function: nothing to do at attach time;
 * the real setup happens when the raid device is configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3628 
   3629 
   3630 static int
   3631 raid_detach(device_t self, int flags)
   3632 {
   3633 	int error;
   3634 	struct raid_softc *rs = raidsoftc(self);
   3635 
   3636 	if (rs == NULL)
   3637 		return ENXIO;
   3638 
   3639 	if ((error = raidlock(rs)) != 0)
   3640 		return (error);
   3641 
   3642 	error = raid_detach_unlocked(rs);
   3643 
   3644 	raidunlock(rs);
   3645 
   3646 	/* XXX raid can be referenced here */
   3647 
   3648 	if (error)
   3649 		return error;
   3650 
   3651 	/* Free the softc */
   3652 	raidput(rs);
   3653 
   3654 	return 0;
   3655 }
   3656 
/*
 * Fill in the dk_softc's disk geometry from the RAID set's parameters
 * and publish it via disk_set_info().  The geometry is synthetic:
 * "tracks" has no physical meaning for a RAID volume.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* arbitrary track count; chosen value, not derived from hardware */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3672 
   3673 /*
   3674  * Get cache info for all the components (including spares).
   3675  * Returns intersection of all the cache flags of all disks, or first
   3676  * error if any encountered.
   3677  * XXXfua feature flags can change as spares are added - lock down somehow
   3678  */
   3679 static int
   3680 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3681 {
   3682 	int c;
   3683 	int error;
   3684 	int dkwhole = 0, dkpart;
   3685 
   3686 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3687 		/*
   3688 		 * Check any non-dead disk, even when currently being
   3689 		 * reconstructed.
   3690 		 */
   3691 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3692 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3693 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3694 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3695 			if (error) {
   3696 				if (error != ENODEV) {
   3697 					printf("raid%d: get cache for component %s failed\n",
   3698 					    raidPtr->raidid,
   3699 					    raidPtr->Disks[c].devname);
   3700 				}
   3701 
   3702 				return error;
   3703 			}
   3704 
   3705 			if (c == 0)
   3706 				dkwhole = dkpart;
   3707 			else
   3708 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3709 		}
   3710 	}
   3711 
   3712 	*data = dkwhole;
   3713 
   3714 	return 0;
   3715 }
   3716 
   3717 /*
   3718  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3719  * We end up returning whatever error was returned by the first cache flush
   3720  * that fails.
   3721  */
   3722 
   3723 int
   3724 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3725 {
   3726 	int c, sparecol;
   3727 	int e,error;
   3728 	int force = 1;
   3729 
   3730 	error = 0;
   3731 	for (c = 0; c < raidPtr->numCol; c++) {
   3732 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3733 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3734 					  &force, FWRITE, NOCRED);
   3735 			if (e) {
   3736 				if (e != ENODEV)
   3737 					printf("raid%d: cache flush to component %s failed.\n",
   3738 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3739 				if (error == 0) {
   3740 					error = e;
   3741 				}
   3742 			}
   3743 		}
   3744 	}
   3745 
   3746 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3747 		sparecol = raidPtr->numCol + c;
   3748 		/* Need to ensure that the reconstruct actually completed! */
   3749 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3750 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3751 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3752 			if (e) {
   3753 				if (e != ENODEV)
   3754 					printf("raid%d: cache flush to component %s failed.\n",
   3755 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3756 				if (error == 0) {
   3757 					error = e;
   3758 				}
   3759 			}
   3760 		}
   3761 	}
   3762 	return error;
   3763 }
   3764 
   3765 /* Fill in info with the current status */
   3766 void
   3767 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3768 {
   3769 
   3770 	if (raidPtr->status != rf_rs_reconstructing) {
   3771 		info->total = 100;
   3772 		info->completed = 100;
   3773 	} else {
   3774 		info->total = raidPtr->reconControl->numRUsTotal;
   3775 		info->completed = raidPtr->reconControl->numRUsComplete;
   3776 	}
   3777 	info->remaining = info->total - info->completed;
   3778 }
   3779 
   3780 /* Fill in info with the current status */
   3781 void
   3782 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3783 {
   3784 
   3785 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3786 		info->total = raidPtr->Layout.numStripe;
   3787 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3788 	} else {
   3789 		info->completed = 100;
   3790 		info->total = 100;
   3791 	}
   3792 	info->remaining = info->total - info->completed;
   3793 }
   3794 
   3795 /* Fill in info with the current status */
   3796 void
   3797 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3798 {
   3799 
   3800 	if (raidPtr->copyback_in_progress == 1) {
   3801 		info->total = raidPtr->Layout.numStripe;
   3802 		info->completed = raidPtr->copyback_stripes_done;
   3803 		info->remaining = info->total - info->completed;
   3804 	} else {
   3805 		info->remaining = 0;
   3806 		info->completed = 100;
   3807 		info->total = 100;
   3808 	}
   3809 }
   3810 
   3811 /* Fill in config with the current info */
   3812 int
   3813 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3814 {
   3815 	int	d, i, j;
   3816 
   3817 	if (!raidPtr->valid)
   3818 		return (ENODEV);
   3819 	config->cols = raidPtr->numCol;
   3820 	config->ndevs = raidPtr->numCol;
   3821 	if (config->ndevs >= RF_MAX_DISKS)
   3822 		return (ENOMEM);
   3823 	config->nspares = raidPtr->numSpare;
   3824 	if (config->nspares >= RF_MAX_DISKS)
   3825 		return (ENOMEM);
   3826 	config->maxqdepth = raidPtr->maxQueueDepth;
   3827 	d = 0;
   3828 	for (j = 0; j < config->cols; j++) {
   3829 		config->devs[d] = raidPtr->Disks[j];
   3830 		d++;
   3831 	}
   3832 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3833 		config->spares[i] = raidPtr->Disks[j];
   3834 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3835 			/* XXX: raidctl(8) expects to see this as a used spare */
   3836 			config->spares[i].status = rf_ds_used_spare;
   3837 		}
   3838 	}
   3839 	return 0;
   3840 }
   3841 
   3842 int
   3843 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3844 {
   3845 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3846 	RF_ComponentLabel_t *raid_clabel;
   3847 	int column = clabel->column;
   3848 
   3849 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3850 		return EINVAL;
   3851 	raid_clabel = raidget_component_label(raidPtr, column);
   3852 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3853 
   3854 	return 0;
   3855 }
   3856 
   3857 /*
   3858  * Module interface
   3859  */
   3860 
   3861 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3862 
   3863 #ifdef _MODULE
   3864 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3865 #endif
   3866 
   3867 static int raid_modcmd(modcmd_t, void *);
   3868 static int raid_modcmd_init(void);
   3869 static int raid_modcmd_fini(void);
   3870 
   3871 static int
   3872 raid_modcmd(modcmd_t cmd, void *data)
   3873 {
   3874 	int error;
   3875 
   3876 	error = 0;
   3877 	switch (cmd) {
   3878 	case MODULE_CMD_INIT:
   3879 		error = raid_modcmd_init();
   3880 		break;
   3881 	case MODULE_CMD_FINI:
   3882 		error = raid_modcmd_fini();
   3883 		break;
   3884 	default:
   3885 		error = ENOTTY;
   3886 		break;
   3887 	}
   3888 	return error;
   3889 }
   3890 
/*
 * Module initialization: create the driver lock, attach the block/char
 * devsw entries and autoconf glue, boot the RAIDframe core, and register
 * a config finalizer that auto-configures RAID sets.  On any failure,
 * everything attached so far is rolled back in reverse order and the
 * errno is returned.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for the spare-table request protocol. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to assign the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be built in. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back in reverse order of attachment. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is necessarily 0 here; all failures returned above. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3961 
/*
 * Module unload: refuse if any raid unit still exists, otherwise tear
 * down the autoconf glue, devsw entries, and the RAIDframe core in the
 * reverse order of raid_modcmd_init().  If a detach step fails partway
 * through, the steps already undone are re-attached so the module
 * remains in a consistent, usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detach steps. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core itself. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Destroy the spare-table protocol synchronization state. */
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	/* error is 0 on this path; all failures returned above. */
	return error;
}
   4011