Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.360
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.360 2019/01/29 09:28:50 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.360 2019/01/29 09:28:50 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 #include <sys/compat_stub.h>
    132 
    133 #include <prop/proplib.h>
    134 
    135 #include <dev/raidframe/raidframevar.h>
    136 #include <dev/raidframe/raidframeio.h>
    137 #include <dev/raidframe/rf_paritymap.h>
    138 
    139 #include "rf_raid.h"
    140 #include "rf_copyback.h"
    141 #include "rf_dag.h"
    142 #include "rf_dagflags.h"
    143 #include "rf_desc.h"
    144 #include "rf_diskqueue.h"
    145 #include "rf_etimer.h"
    146 #include "rf_general.h"
    147 #include "rf_kintf.h"
    148 #include "rf_options.h"
    149 #include "rf_driver.h"
    150 #include "rf_parityscan.h"
    151 #include "rf_threadstuff.h"
    152 
    153 #include "rf_compat50.h"
    154 
    155 #include "rf_compat80.h"
    156 
    157 #ifdef COMPAT_NETBSD32
    158 #include "rf_compat32.h"
    159 #endif
    160 
    161 #include "ioconf.h"
    162 
    163 #ifdef DEBUG
    164 int     rf_kdebug_level = 0;
    165 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    166 #else				/* DEBUG */
    167 #define db1_printf(a) { }
    168 #endif				/* DEBUG */
    169 
    170 #ifdef DEBUG_ROOT
    171 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    172 #else
    173 #define DPRINTF(a, ...)
    174 #endif
    175 
    176 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    177 static rf_declare_mutex2(rf_sparet_wait_mutex);
    178 static rf_declare_cond2(rf_sparet_wait_cv);
    179 static rf_declare_cond2(rf_sparet_resp_cv);
    180 
    181 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    182 						 * spare table */
    183 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    184 						 * installation process */
    185 #endif
    186 
    187 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    188 
    189 /* prototypes */
    190 static void KernelWakeupFunc(struct buf *);
    191 static void InitBP(struct buf *, struct vnode *, unsigned,
    192     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    193     void *, int, struct proc *);
    194 struct raid_softc;
    195 static void raidinit(struct raid_softc *);
    196 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    197 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    198 
    199 static int raid_match(device_t, cfdata_t, void *);
    200 static void raid_attach(device_t, device_t, void *);
    201 static int raid_detach(device_t, int);
    202 
    203 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    204     daddr_t, daddr_t);
    205 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    206     daddr_t, daddr_t, int);
    207 
    208 static int raidwrite_component_label(unsigned,
    209     dev_t, struct vnode *, RF_ComponentLabel_t *);
    210 static int raidread_component_label(unsigned,
    211     dev_t, struct vnode *, RF_ComponentLabel_t *);
    212 
    213 static int raid_diskstart(device_t, struct buf *bp);
    214 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    215 static int raid_lastclose(device_t);
    216 
    217 static dev_type_open(raidopen);
    218 static dev_type_close(raidclose);
    219 static dev_type_read(raidread);
    220 static dev_type_write(raidwrite);
    221 static dev_type_ioctl(raidioctl);
    222 static dev_type_strategy(raidstrategy);
    223 static dev_type_dump(raiddump);
    224 static dev_type_size(raidsize);
    225 
    226 const struct bdevsw raid_bdevsw = {
    227 	.d_open = raidopen,
    228 	.d_close = raidclose,
    229 	.d_strategy = raidstrategy,
    230 	.d_ioctl = raidioctl,
    231 	.d_dump = raiddump,
    232 	.d_psize = raidsize,
    233 	.d_discard = nodiscard,
    234 	.d_flag = D_DISK
    235 };
    236 
    237 const struct cdevsw raid_cdevsw = {
    238 	.d_open = raidopen,
    239 	.d_close = raidclose,
    240 	.d_read = raidread,
    241 	.d_write = raidwrite,
    242 	.d_ioctl = raidioctl,
    243 	.d_stop = nostop,
    244 	.d_tty = notty,
    245 	.d_poll = nopoll,
    246 	.d_mmap = nommap,
    247 	.d_kqfilter = nokqfilter,
    248 	.d_discard = nodiscard,
    249 	.d_flag = D_DISK
    250 };
    251 
    252 static struct dkdriver rf_dkdriver = {
    253 	.d_open = raidopen,
    254 	.d_close = raidclose,
    255 	.d_strategy = raidstrategy,
    256 	.d_diskstart = raid_diskstart,
    257 	.d_dumpblocks = raid_dumpblocks,
    258 	.d_lastclose = raid_lastclose,
    259 	.d_minphys = minphys
    260 };
    261 
    262 struct raid_softc {
    263 	struct dk_softc sc_dksc;
    264 	int	sc_unit;
    265 	int     sc_flags;	/* flags */
    266 	int     sc_cflags;	/* configuration flags */
    267 	kmutex_t sc_mutex;	/* interlock mutex */
    268 	kcondvar_t sc_cv;	/* and the condvar */
    269 	uint64_t sc_size;	/* size of the raid device */
    270 	char    sc_xname[20];	/* XXX external name */
    271 	RF_Raid_t sc_r;
    272 	LIST_ENTRY(raid_softc) sc_link;
    273 };
    274 /* sc_flags */
    275 #define RAIDF_INITED		0x01	/* unit has been initialized */
    276 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    277 #define RAIDF_DETACH  		0x04	/* detach after final close */
    278 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    279 #define RAIDF_LOCKED		0x10	/* unit is locked */
    280 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    281 
    282 #define	raidunit(x)	DISKUNIT(x)
    283 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    284 
    285 extern struct cfdriver raid_cd;
    286 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    287     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    288     DVF_DETACH_SHUTDOWN);
    289 
    290 /* Internal representation of a rf_recon_req */
    291 struct rf_recon_req_internal {
    292 	RF_RowCol_t col;
    293 	RF_ReconReqFlags_t flags;
    294 	void   *raidPtr;
    295 };
    296 
    297 /*
    298  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    299  * Be aware that large numbers can allow the driver to consume a lot of
    300  * kernel memory, especially on writes, and in degraded mode reads.
    301  *
    302  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    303  * a single 64K write will typically require 64K for the old data,
    304  * 64K for the old parity, and 64K for the new parity, for a total
    305  * of 192K (if the parity buffer is not re-used immediately).
    306  * Even it if is used immediately, that's still 128K, which when multiplied
    307  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    308  *
    309  * Now in degraded mode, for example, a 64K read on the above setup may
    310  * require data reconstruction, which will require *all* of the 4 remaining
    311  * disks to participate -- 4 * 32K/disk == 128K again.
    312  */
    313 
    314 #ifndef RAIDOUTSTANDING
    315 #define RAIDOUTSTANDING   6
    316 #endif
    317 
    318 #define RAIDLABELDEV(dev)	\
    319 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    320 
    321 /* declared here, and made public, for the benefit of KVM stuff.. */
    322 
    323 static int raidlock(struct raid_softc *);
    324 static void raidunlock(struct raid_softc *);
    325 
    326 static int raid_detach_unlocked(struct raid_softc *);
    327 
    328 static void rf_markalldirty(RF_Raid_t *);
    329 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    330 
    331 void rf_ReconThread(struct rf_recon_req_internal *);
    332 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    333 void rf_CopybackThread(RF_Raid_t *raidPtr);
    334 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    335 int rf_autoconfig(device_t);
    336 void rf_buildroothack(RF_ConfigSet_t *);
    337 
    338 RF_AutoConfig_t *rf_find_raid_components(void);
    339 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    340 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    341 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    342 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    343 int rf_set_autoconfig(RF_Raid_t *, int);
    344 int rf_set_rootpartition(RF_Raid_t *, int);
    345 void rf_release_all_vps(RF_ConfigSet_t *);
    346 void rf_cleanup_config_set(RF_ConfigSet_t *);
    347 int rf_have_enough_components(RF_ConfigSet_t *);
    348 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    349 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    350 
    351 /*
    352  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    353  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    354  * in the kernel config file.
    355  */
    356 #ifdef RAID_AUTOCONFIG
    357 int raidautoconfig = 1;
    358 #else
    359 int raidautoconfig = 0;
    360 #endif
    361 static bool raidautoconfigdone = false;
    362 
    363 struct RF_Pools_s rf_pools;
    364 
    365 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    366 static kmutex_t raid_lock;
    367 
    368 static struct raid_softc *
    369 raidcreate(int unit) {
    370 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    371 	sc->sc_unit = unit;
    372 	cv_init(&sc->sc_cv, "raidunit");
    373 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    374 	return sc;
    375 }
    376 
    377 static void
    378 raiddestroy(struct raid_softc *sc) {
    379 	cv_destroy(&sc->sc_cv);
    380 	mutex_destroy(&sc->sc_mutex);
    381 	kmem_free(sc, sizeof(*sc));
    382 }
    383 
    384 static struct raid_softc *
    385 raidget(int unit, bool create) {
    386 	struct raid_softc *sc;
    387 	if (unit < 0) {
    388 #ifdef DIAGNOSTIC
    389 		panic("%s: unit %d!", __func__, unit);
    390 #endif
    391 		return NULL;
    392 	}
    393 	mutex_enter(&raid_lock);
    394 	LIST_FOREACH(sc, &raids, sc_link) {
    395 		if (sc->sc_unit == unit) {
    396 			mutex_exit(&raid_lock);
    397 			return sc;
    398 		}
    399 	}
    400 	mutex_exit(&raid_lock);
    401 	if (!create)
    402 		return NULL;
    403 	if ((sc = raidcreate(unit)) == NULL)
    404 		return NULL;
    405 	mutex_enter(&raid_lock);
    406 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    407 	mutex_exit(&raid_lock);
    408 	return sc;
    409 }
    410 
    411 static void
    412 raidput(struct raid_softc *sc) {
    413 	mutex_enter(&raid_lock);
    414 	LIST_REMOVE(sc, sc_link);
    415 	mutex_exit(&raid_lock);
    416 	raiddestroy(sc);
    417 }
    418 
/*
 * Legacy pseudo-device attach hook; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    428 
    429 int
    430 rf_autoconfig(device_t self)
    431 {
    432 	RF_AutoConfig_t *ac_list;
    433 	RF_ConfigSet_t *config_sets;
    434 
    435 	if (!raidautoconfig || raidautoconfigdone == true)
    436 		return (0);
    437 
    438 	/* XXX This code can only be run once. */
    439 	raidautoconfigdone = true;
    440 
    441 #ifdef __HAVE_CPU_BOOTCONF
    442 	/*
    443 	 * 0. find the boot device if needed first so we can use it later
    444 	 * this needs to be done before we autoconfigure any raid sets,
    445 	 * because if we use wedges we are not going to be able to open
    446 	 * the boot device later
    447 	 */
    448 	if (booted_device == NULL)
    449 		cpu_bootconf();
    450 #endif
    451 	/* 1. locate all RAID components on the system */
    452 	aprint_debug("Searching for RAID components...\n");
    453 	ac_list = rf_find_raid_components();
    454 
    455 	/* 2. Sort them into their respective sets. */
    456 	config_sets = rf_create_auto_sets(ac_list);
    457 
    458 	/*
    459 	 * 3. Evaluate each set and configure the valid ones.
    460 	 * This gets done in rf_buildroothack().
    461 	 */
    462 	rf_buildroothack(config_sets);
    463 
    464 	return 1;
    465 }
    466 
/*
 * Return non-zero when the RAID set "r" contains the boot device "bdv"
 * as one of its components, zero otherwise.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" of the stored component path */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* component is a wedge: compare the parent disk name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		/*
		 * NOTE(review): this is a prefix match, so boot device "wd1"
		 * would also match a component on "wd10" -- presumably fine
		 * in practice, but worth confirming.
		 */
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    497 
    498 void
    499 rf_buildroothack(RF_ConfigSet_t *config_sets)
    500 {
    501 	RF_ConfigSet_t *cset;
    502 	RF_ConfigSet_t *next_cset;
    503 	int num_root;
    504 	struct raid_softc *sc, *rsc;
    505 	struct dk_softc *dksc;
    506 
    507 	sc = rsc = NULL;
    508 	num_root = 0;
    509 	cset = config_sets;
    510 	while (cset != NULL) {
    511 		next_cset = cset->next;
    512 		if (rf_have_enough_components(cset) &&
    513 		    cset->ac->clabel->autoconfigure == 1) {
    514 			sc = rf_auto_config_set(cset);
    515 			if (sc != NULL) {
    516 				aprint_debug("raid%d: configured ok, rootable %d\n",
    517 				    sc->sc_unit, cset->rootable);
    518 				if (cset->rootable) {
    519 					rsc = sc;
    520 					num_root++;
    521 				}
    522 			} else {
    523 				/* The autoconfig didn't work :( */
    524 				aprint_debug("Autoconfig failed\n");
    525 				rf_release_all_vps(cset);
    526 			}
    527 		} else {
    528 			/* we're not autoconfiguring this set...
    529 			   release the associated resources */
    530 			rf_release_all_vps(cset);
    531 		}
    532 		/* cleanup */
    533 		rf_cleanup_config_set(cset);
    534 		cset = next_cset;
    535 	}
    536 	dksc = &rsc->sc_dksc;
    537 
    538 	/* if the user has specified what the root device should be
    539 	   then we don't touch booted_device or boothowto... */
    540 
    541 	if (rootspec != NULL) {
    542 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    543 		return;
    544 	}
    545 
    546 	/* we found something bootable... */
    547 
    548 	/*
    549 	 * XXX: The following code assumes that the root raid
    550 	 * is the first ('a') partition. This is about the best
    551 	 * we can do with a BSD disklabel, but we might be able
    552 	 * to do better with a GPT label, by setting a specified
    553 	 * attribute to indicate the root partition. We can then
    554 	 * stash the partition number in the r->root_partition
    555 	 * high bits (the bottom 2 bits are already used). For
    556 	 * now we just set booted_partition to 0 when we override
    557 	 * root.
    558 	 */
    559 	if (num_root == 1) {
    560 		device_t candidate_root;
    561 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    562 			char cname[sizeof(cset->ac->devname)];
    563 			/* XXX: assume partition 'a' first */
    564 			snprintf(cname, sizeof(cname), "%s%c",
    565 			    device_xname(dksc->sc_dev), 'a');
    566 			candidate_root = dkwedge_find_by_wname(cname);
    567 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    568 			    cname);
    569 			if (candidate_root == NULL) {
    570 				/*
    571 				 * If that is not found, because we don't use
    572 				 * disklabel, return the first dk child
    573 				 * XXX: we can skip the 'a' check above
    574 				 * and always do this...
    575 				 */
    576 				size_t i = 0;
    577 				candidate_root = dkwedge_find_by_parent(
    578 				    device_xname(dksc->sc_dev), &i);
    579 			}
    580 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    581 			    candidate_root);
    582 		} else
    583 			candidate_root = dksc->sc_dev;
    584 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    585 		DPRINTF("%s: booted_device=%p root_partition=%d "
    586 			"contains_boot=%d",
    587 		    __func__, booted_device, rsc->sc_r.root_partition,
    588 			   rf_containsboot(&rsc->sc_r, booted_device));
    589 		/* XXX the check for booted_device == NULL can probably be
    590 		 * dropped, now that rf_containsboot handles that case.
    591 		 */
    592 		if (booted_device == NULL ||
    593 		    rsc->sc_r.root_partition == 1 ||
    594 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    595 			booted_device = candidate_root;
    596 			booted_method = "raidframe/single";
    597 			booted_partition = 0;	/* XXX assume 'a' */
    598 		}
    599 	} else if (num_root > 1) {
    600 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    601 		    booted_device);
    602 
    603 		/*
    604 		 * Maybe the MD code can help. If it cannot, then
    605 		 * setroot() will discover that we have no
    606 		 * booted_device and will ask the user if nothing was
    607 		 * hardwired in the kernel config file
    608 		 */
    609 		if (booted_device == NULL)
    610 			return;
    611 
    612 		num_root = 0;
    613 		mutex_enter(&raid_lock);
    614 		LIST_FOREACH(sc, &raids, sc_link) {
    615 			RF_Raid_t *r = &sc->sc_r;
    616 			if (r->valid == 0)
    617 				continue;
    618 
    619 			if (r->root_partition == 0)
    620 				continue;
    621 
    622 			if (rf_containsboot(r, booted_device)) {
    623 				num_root++;
    624 				rsc = sc;
    625 				dksc = &rsc->sc_dksc;
    626 			}
    627 		}
    628 		mutex_exit(&raid_lock);
    629 
    630 		if (num_root == 1) {
    631 			booted_device = dksc->sc_dev;
    632 			booted_method = "raidframe/multi";
    633 			booted_partition = 0;	/* XXX assume 'a' */
    634 		} else {
    635 			/* we can't guess.. require the user to answer... */
    636 			boothowto |= RB_ASKNAME;
    637 		}
    638 	}
    639 }
    640 
    641 static int
    642 raidsize(dev_t dev)
    643 {
    644 	struct raid_softc *rs;
    645 	struct dk_softc *dksc;
    646 	unsigned int unit;
    647 
    648 	unit = raidunit(dev);
    649 	if ((rs = raidget(unit, false)) == NULL)
    650 		return -1;
    651 	dksc = &rs->sc_dksc;
    652 
    653 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    654 		return -1;
    655 
    656 	return dk_size(dksc, dev);
    657 }
    658 
    659 static int
    660 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    661 {
    662 	unsigned int unit;
    663 	struct raid_softc *rs;
    664 	struct dk_softc *dksc;
    665 
    666 	unit = raidunit(dev);
    667 	if ((rs = raidget(unit, false)) == NULL)
    668 		return ENXIO;
    669 	dksc = &rs->sc_dksc;
    670 
    671 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    672 		return ENODEV;
    673 
    674         /*
    675            Note that blkno is relative to this particular partition.
    676            By adding adding RF_PROTECTED_SECTORS, we get a value that
    677 	   is relative to the partition used for the underlying component.
    678         */
    679 	blkno += RF_PROTECTED_SECTORS;
    680 
    681 	return dk_dump(dksc, dev, blkno, va, size);
    682 }
    683 
/*
 * dk(4) dumpblocks hook: write "nblk" blocks from "va", starting at
 * "blkno", directly to a single live component of the set.  Only
 * RAID 1 sets (one data column, one parity column) are supported.
 * Returns 0 on success or an errno value.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	/* First pass: prefer the lowest-numbered optimal component. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Dump through the component's block device entry point. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    789 
    790 /* ARGSUSED */
    791 static int
    792 raidopen(dev_t dev, int flags, int fmt,
    793     struct lwp *l)
    794 {
    795 	int     unit = raidunit(dev);
    796 	struct raid_softc *rs;
    797 	struct dk_softc *dksc;
    798 	int     error = 0;
    799 	int     part, pmask;
    800 
    801 	if ((rs = raidget(unit, true)) == NULL)
    802 		return ENXIO;
    803 	if ((error = raidlock(rs)) != 0)
    804 		return (error);
    805 
    806 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    807 		error = EBUSY;
    808 		goto bad;
    809 	}
    810 
    811 	dksc = &rs->sc_dksc;
    812 
    813 	part = DISKPART(dev);
    814 	pmask = (1 << part);
    815 
    816 	if (!DK_BUSY(dksc, pmask) &&
    817 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    818 		/* First one... mark things as dirty... Note that we *MUST*
    819 		 have done a configure before this.  I DO NOT WANT TO BE
    820 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    821 		 THAT THEY BELONG TOGETHER!!!!! */
    822 		/* XXX should check to see if we're only open for reading
    823 		   here... If so, we needn't do this, but then need some
    824 		   other way of keeping track of what's happened.. */
    825 
    826 		rf_markalldirty(&rs->sc_r);
    827 	}
    828 
    829 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    830 		error = dk_open(dksc, dev, flags, fmt, l);
    831 
    832 bad:
    833 	raidunlock(rs);
    834 
    835 	return (error);
    836 
    837 
    838 }
    839 
    840 static int
    841 raid_lastclose(device_t self)
    842 {
    843 	struct raid_softc *rs = raidsoftc(self);
    844 
    845 	/* Last one... device is not unconfigured yet.
    846 	   Device shutdown has taken care of setting the
    847 	   clean bits if RAIDF_INITED is not set
    848 	   mark things as clean... */
    849 
    850 	rf_update_component_labels(&rs->sc_r,
    851 	    RF_FINAL_COMPONENT_UPDATE);
    852 
    853 	/* pass to unlocked code */
    854 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    855 		rs->sc_flags |= RAIDF_DETACH;
    856 
    857 	return 0;
    858 }
    859 
    860 /* ARGSUSED */
    861 static int
    862 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    863 {
    864 	int     unit = raidunit(dev);
    865 	struct raid_softc *rs;
    866 	struct dk_softc *dksc;
    867 	cfdata_t cf;
    868 	int     error = 0, do_detach = 0, do_put = 0;
    869 
    870 	if ((rs = raidget(unit, false)) == NULL)
    871 		return ENXIO;
    872 	dksc = &rs->sc_dksc;
    873 
    874 	if ((error = raidlock(rs)) != 0)
    875 		return (error);
    876 
    877 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    878 		error = dk_close(dksc, dev, flags, fmt, l);
    879 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    880 			do_detach = 1;
    881 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    882 		do_put = 1;
    883 
    884 	raidunlock(rs);
    885 
    886 	if (do_detach) {
    887 		/* free the pseudo device attach bits */
    888 		cf = device_cfdata(dksc->sc_dev);
    889 		error = config_detach(dksc->sc_dev, 0);
    890 		if (error == 0)
    891 			free(cf, M_RAIDFRAME);
    892 	} else if (do_put) {
    893 		raidput(rs);
    894 	}
    895 
    896 	return (error);
    897 
    898 }
    899 
/*
 * Poke the RAIDframe iodone thread: take iodone_lock and signal the
 * iodone condition variable so queued work gets picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    907 
    908 static void
    909 raidstrategy(struct buf *bp)
    910 {
    911 	unsigned int unit;
    912 	struct raid_softc *rs;
    913 	struct dk_softc *dksc;
    914 	RF_Raid_t *raidPtr;
    915 
    916 	unit = raidunit(bp->b_dev);
    917 	if ((rs = raidget(unit, false)) == NULL) {
    918 		bp->b_error = ENXIO;
    919 		goto fail;
    920 	}
    921 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    922 		bp->b_error = ENXIO;
    923 		goto fail;
    924 	}
    925 	dksc = &rs->sc_dksc;
    926 	raidPtr = &rs->sc_r;
    927 
    928 	/* Queue IO only */
    929 	if (dk_strategy_defer(dksc, bp))
    930 		goto done;
    931 
    932 	/* schedule the IO to happen at the next convenient time */
    933 	raid_wakeup(raidPtr);
    934 
    935 done:
    936 	return;
    937 
    938 fail:
    939 	bp->b_resid = bp->b_bcount;
    940 	biodone(bp);
    941 }
    942 
    943 static int
    944 raid_diskstart(device_t dev, struct buf *bp)
    945 {
    946 	struct raid_softc *rs = raidsoftc(dev);
    947 	RF_Raid_t *raidPtr;
    948 
    949 	raidPtr = &rs->sc_r;
    950 	if (!raidPtr->valid) {
    951 		db1_printf(("raid is not valid..\n"));
    952 		return ENODEV;
    953 	}
    954 
    955 	/* XXX */
    956 	bp->b_resid = 0;
    957 
    958 	return raiddoaccess(raidPtr, bp);
    959 }
    960 
    961 void
    962 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    963 {
    964 	struct raid_softc *rs;
    965 	struct dk_softc *dksc;
    966 
    967 	rs = raidPtr->softc;
    968 	dksc = &rs->sc_dksc;
    969 
    970 	dk_done(dksc, bp);
    971 
    972 	rf_lock_mutex2(raidPtr->mutex);
    973 	raidPtr->openings++;
    974 	rf_unlock_mutex2(raidPtr->mutex);
    975 
    976 	/* schedule more IO */
    977 	raid_wakeup(raidPtr);
    978 }
    979 
    980 /* ARGSUSED */
    981 static int
    982 raidread(dev_t dev, struct uio *uio, int flags)
    983 {
    984 	int     unit = raidunit(dev);
    985 	struct raid_softc *rs;
    986 
    987 	if ((rs = raidget(unit, false)) == NULL)
    988 		return ENXIO;
    989 
    990 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    991 		return (ENXIO);
    992 
    993 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    994 
    995 }
    996 
    997 /* ARGSUSED */
    998 static int
    999 raidwrite(dev_t dev, struct uio *uio, int flags)
   1000 {
   1001 	int     unit = raidunit(dev);
   1002 	struct raid_softc *rs;
   1003 
   1004 	if ((rs = raidget(unit, false)) == NULL)
   1005 		return ENXIO;
   1006 
   1007 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1008 		return (ENXIO);
   1009 
   1010 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1011 
   1012 }
   1013 
/*
 * Tear down a RAID set.  The caller already holds the softc lock.
 * Returns EBUSY if the device is open or a background operation
 * (reconstruction, parity rewrite, copyback) is in progress, 0 if the
 * set was never configured, otherwise the rf_Shutdown() error code.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while busy or while worker threads run. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Shutdown is being carried out now; clear the request flag. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1051 
/*
 * raidioctl: ioctl entry point for raid(4) devices.  Handles set
 * configuration and shutdown, component label management, failing
 * disks, reconstruction/copyback control and their status queries.
 * Compat (COMPAT_50/COMPAT_80) commands are dispatched through module
 * hooks first; anything not recognized here falls through to
 * dk_ioctl() at the bottom.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rr;
	struct rf_recon_req_internal *rrint;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_GET_INFO32:
#endif
#endif
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	/*
	 * Handle compat ioctl calls
	 *
	 * * If compat code is not loaded, stub returns ENOSYS and we just
	 *   check the "native" cmd's
	 * * If compat code is loaded but does not recognize the cmd, it
	 *   returns EPASSTHROUGH, and we just check the "native" cmd's
	 * * If compat code returns EAGAIN, we need to finish via config
	 * * Otherwise the cmd has been handled and we just return
	 */
	MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
	    enosys(), retcode);
	if (retcode == ENOSYS)
		retcode = 0;
	else if (retcode == EAGAIN)
		goto config;
	else if (retcode != EPASSTHROUGH)
		return retcode;

	MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
	    (cmd, (rs->sc_flags & RAIDF_INITED),raidPtr, unit, data, &k_cfg),
	    enosys(), retcode);
	if (retcode == ENOSYS)
		retcode = 0;
	else if (retcode == EAGAIN)
		goto config;
	else if (retcode != EPASSTHROUGH)
		return retcode;

	/*
	 * XXX
	 * Handling of FAIL_DISK80 command requires us to retain retcode's
	 * value of EPASSTHROUGH.  If you add more compat code later, make
	 * sure you don't overwrite retcode and break this!
	 */

	switch (cmd) {

		/* configure the system */
	case RAIDFRAME_CONFIGURE:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_CONFIGURE32:
#endif
#endif

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
#ifdef COMPAT_NETBSD32
#ifdef _LP64
		if (cmd == RAIDFRAME_CONFIGURE32 &&
		    (l->l_proc->p_flag & PK_32) != 0)
			retcode = rf_config_netbsd32(data, k_cfg);
		else
#endif
#endif
		{
			u_cfg = *((RF_Config_t **) data);
			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		}
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		/* Common tail shared with the compat hooks' EAGAIN path;
		   k_cfg has been filled in by whoever jumped here. */
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* Rewrite runs asynchronously in its own kernel thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented: returns retcode unchanged. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Validate the component status under the raid mutex. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The request outlives this ioctl; the recon thread frees it. */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);

		rrint->col = column;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrint, "raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_GET_INFO32:
#endif
#endif
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
#ifdef COMPAT_NETBSD32
#ifdef _LP64
			if (cmd == RAIDFRAME_GET_INFO32)
				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
			else
#endif
#endif
				ucfgp = *(RF_DeviceConfig_t **)data;
			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK80:
		/* Check if we called compat code for this cmd */
		if (retcode != EPASSTHROUGH)
			return EINVAL;
		/* FALLTHRU */
	case RAIDFRAME_FAIL_DISK:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);

		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);
		rrint->col = rr->col;
		rrint->flags = rr->flags;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrint, "raid_recon");
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is optimal. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1796 
   1797 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, initialize the dk(4)
   and disk(9) layers, and mark the unit usable.  Called after a
   successful rf_Configure().  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		/* cf is ours to free on attach failure */
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Probe for wedges (GPT partitions, etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1857 
   1858 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1859 /* wake up the daemon & tell it to get us a spare table
   1860  * XXX
   1861  * the entries in the queues should be tagged with the raidPtr
   1862  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1864  * XXX
   1865  *
   1866  * XXX This code is not currently used. GO
   1867  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* post the request on the wait queue and wake the daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while we sleep.  Note that we
	 * dequeue whatever response is at the head of the queue -- it is
	 * not matched to this particular request (see XXX above). */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the response carries its result in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1891 #endif
   1892 
   1893 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1894  * bp & passes it down.
   1895  * any calls originating in the kernel must use non-blocking I/O
   1896  * do some extra sanity checking to return "appropriate" error values for
   1897  * certain conditions (to make some standard utilities work)
   1898  *
   1899  * Formerly known as: rf_DoAccessKernel
   1900  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* temporarily release the mutex around the label update,
		 * then retake it to decrement the failure count */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* refuse to start I/O on a unit that never finished raidinit() */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* hand any pending bufs to the dk(4) layer */
	dk_start(dksc, NULL);
}
   1927 
   1928 static int
   1929 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1930 {
   1931 	RF_SectorCount_t num_blocks, pb, sum;
   1932 	RF_RaidAddr_t raid_addr;
   1933 	daddr_t blocknum;
   1934 	int     do_async;
   1935 	int rc;
   1936 
   1937 	rf_lock_mutex2(raidPtr->mutex);
   1938 	if (raidPtr->openings == 0) {
   1939 		rf_unlock_mutex2(raidPtr->mutex);
   1940 		return EAGAIN;
   1941 	}
   1942 	rf_unlock_mutex2(raidPtr->mutex);
   1943 
   1944 	blocknum = bp->b_rawblkno;
   1945 
   1946 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1947 		    (int) blocknum));
   1948 
   1949 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1950 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1951 
   1952 	/* *THIS* is where we adjust what block we're going to...
   1953 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1954 	raid_addr = blocknum;
   1955 
   1956 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1957 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1958 	sum = raid_addr + num_blocks + pb;
   1959 	if (1 || rf_debugKernelAccess) {
   1960 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1961 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1962 			    (int) pb, (int) bp->b_resid));
   1963 	}
   1964 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1965 	    || (sum < num_blocks) || (sum < pb)) {
   1966 		rc = ENOSPC;
   1967 		goto done;
   1968 	}
   1969 	/*
   1970 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1971 	 */
   1972 
   1973 	if (bp->b_bcount & raidPtr->sectorMask) {
   1974 		rc = ENOSPC;
   1975 		goto done;
   1976 	}
   1977 	db1_printf(("Calling DoAccess..\n"));
   1978 
   1979 
   1980 	rf_lock_mutex2(raidPtr->mutex);
   1981 	raidPtr->openings--;
   1982 	rf_unlock_mutex2(raidPtr->mutex);
   1983 
   1984 	/*
   1985 	 * Everything is async.
   1986 	 */
   1987 	do_async = 1;
   1988 
   1989 	/* don't ever condition on bp->b_flags & B_WRITE.
   1990 	 * always condition on B_READ instead */
   1991 
   1992 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1993 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1994 			 do_async, raid_addr, num_blocks,
   1995 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1996 
   1997 done:
   1998 	return rc;
   1999 }
   2000 
   2001 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2002 
   2003 int
   2004 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2005 {
   2006 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2007 	struct buf *bp;
   2008 
   2009 	req->queue = queue;
   2010 	bp = req->bp;
   2011 
   2012 	switch (req->type) {
   2013 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2014 		/* XXX need to do something extra here.. */
   2015 		/* I'm leaving this in, as I've never actually seen it used,
   2016 		 * and I'd like folks to report it... GO */
   2017 		printf(("WAKEUP CALLED\n"));
   2018 		queue->numOutstanding++;
   2019 
   2020 		bp->b_flags = 0;
   2021 		bp->b_private = req;
   2022 
   2023 		KernelWakeupFunc(bp);
   2024 		break;
   2025 
   2026 	case RF_IO_TYPE_READ:
   2027 	case RF_IO_TYPE_WRITE:
   2028 #if RF_ACC_TRACE > 0
   2029 		if (req->tracerec) {
   2030 			RF_ETIMER_START(req->tracerec->timer);
   2031 		}
   2032 #endif
   2033 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2034 		    op, queue->rf_cinfo->ci_dev,
   2035 		    req->sectorOffset, req->numSector,
   2036 		    req->buf, KernelWakeupFunc, (void *) req,
   2037 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2038 
   2039 		if (rf_debugKernelAccess) {
   2040 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2041 				(long) bp->b_blkno));
   2042 		}
   2043 		queue->numOutstanding++;
   2044 		queue->last_deq_sector = req->sectorOffset;
   2045 		/* acc wouldn't have been let in if there were any pending
   2046 		 * reqs at any other priority */
   2047 		queue->curPriority = req->priority;
   2048 
   2049 		db1_printf(("Going for %c to unit %d col %d\n",
   2050 			    req->type, queue->raidPtr->raidid,
   2051 			    queue->col));
   2052 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2053 			(int) req->sectorOffset, (int) req->numSector,
   2054 			(int) (req->numSector <<
   2055 			    queue->raidPtr->logBytesPerSector),
   2056 			(int) queue->raidPtr->logBytesPerSector));
   2057 
   2058 		/*
   2059 		 * XXX: drop lock here since this can block at
   2060 		 * least with backing SCSI devices.  Retake it
   2061 		 * to minimize fuss with calling interfaces.
   2062 		 */
   2063 
   2064 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2065 		bdev_strategy(bp);
   2066 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2067 		break;
   2068 
   2069 	default:
   2070 		panic("bad req->type in rf_DispatchKernelIO");
   2071 	}
   2072 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2073 
   2074 	return (0);
   2075 }
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP() or the NOP
	 * path of rf_DispatchKernelIO() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* charge the elapsed time to the access trace record */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2145 
   2146 
   2147 /*
   2148  * initialize a buf structure for doing an I/O in the kernel.
   2149  */
   2150 static void
   2151 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2152        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2153        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2154        struct proc *b_proc)
   2155 {
   2156 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2157 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2158 	bp->b_oflags = 0;
   2159 	bp->b_cflags = 0;
   2160 	bp->b_bcount = numSect << logBytesPerSector;
   2161 	bp->b_bufsize = bp->b_bcount;
   2162 	bp->b_error = 0;
   2163 	bp->b_dev = dev;
   2164 	bp->b_data = bf;
   2165 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2166 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2167 	if (bp->b_bcount == 0) {
   2168 		panic("bp->b_bcount is zero in InitBP!!");
   2169 	}
   2170 	bp->b_proc = b_proc;
   2171 	bp->b_iodone = cbFunc;
   2172 	bp->b_private = cbArg;
   2173 }
   2174 
   2175 /*
   2176  * Wait interruptibly for an exclusive lock.
   2177  *
   2178  * XXX
   2179  * Several drivers do this; it should be abstracted and made MP-safe.
   2180  * (Hmm... where have we seen this warning before :->  GO )
   2181  */
   2182 static int
   2183 raidlock(struct raid_softc *rs)
   2184 {
   2185 	int     error;
   2186 
   2187 	error = 0;
   2188 	mutex_enter(&rs->sc_mutex);
   2189 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2190 		rs->sc_flags |= RAIDF_WANTED;
   2191 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2192 		if (error != 0)
   2193 			goto done;
   2194 	}
   2195 	rs->sc_flags |= RAIDF_LOCKED;
   2196 done:
   2197 	mutex_exit(&rs->sc_mutex);
   2198 	return (error);
   2199 }
   2200 /*
   2201  * Unlock and wake up any waiters.
   2202  */
static void
raidunlock(struct raid_softc *rs)
{

	mutex_enter(&rs->sc_mutex);
	rs->sc_flags &= ~RAIDF_LOCKED;
	/* only broadcast if somebody actually slept in raidlock() */
	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
		rs->sc_flags &= ~RAIDF_WANTED;
		cv_broadcast(&rs->sc_cv);
	}
	mutex_exit(&rs->sc_mutex);
}
   2215 
   2216 
   2217 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2218 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2219 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2220 
/*
 * Byte offset of the component label area from the start of the
 * component.  Fixed; kept as a function for symmetry with the other
 * offset/size helpers below.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2227 
   2228 static daddr_t
   2229 rf_component_info_size(unsigned secsize)
   2230 {
   2231 	daddr_t info_size;
   2232 
   2233 	KASSERT(secsize);
   2234 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2235 		info_size = secsize;
   2236 	else
   2237 		info_size = RF_COMPONENT_INFO_SIZE;
   2238 
   2239 	return info_size;
   2240 }
   2241 
   2242 static daddr_t
   2243 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2244 {
   2245 	daddr_t map_offset;
   2246 
   2247 	KASSERT(raidPtr->bytesPerSector);
   2248 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2249 		map_offset = raidPtr->bytesPerSector;
   2250 	else
   2251 		map_offset = RF_COMPONENT_INFO_SIZE;
   2252 	map_offset += rf_component_info_offset();
   2253 
   2254 	return map_offset;
   2255 }
   2256 
   2257 static daddr_t
   2258 rf_parity_map_size(RF_Raid_t *raidPtr)
   2259 {
   2260 	daddr_t map_size;
   2261 
   2262 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2263 		map_size = raidPtr->bytesPerSector;
   2264 	else
   2265 		map_size = RF_PARITY_MAP_SIZE;
   2266 
   2267 	return map_size;
   2268 }
   2269 
   2270 int
   2271 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2272 {
   2273 	RF_ComponentLabel_t *clabel;
   2274 
   2275 	clabel = raidget_component_label(raidPtr, col);
   2276 	clabel->clean = RF_RAID_CLEAN;
   2277 	raidflush_component_label(raidPtr, col);
   2278 	return(0);
   2279 }
   2280 
   2281 
   2282 int
   2283 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2284 {
   2285 	RF_ComponentLabel_t *clabel;
   2286 
   2287 	clabel = raidget_component_label(raidPtr, col);
   2288 	clabel->clean = RF_RAID_DIRTY;
   2289 	raidflush_component_label(raidPtr, col);
   2290 	return(0);
   2291 }
   2292 
/*
 * Read column `col's component label from disk into its in-core copy
 * (raid_cinfo[col].ci_label).  Returns the error from
 * raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2302 
/* Return a pointer to column `col's in-core component label. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2308 
/*
 * Stamp column `col's in-core label with the array's current mod
 * counter and write it to disk.  Returns the error from
 * raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's mod count in sync with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2323 
   2324 
/*
 * Read a component label: a thin wrapper around
 * raidread_component_area() that supplies the standard label offset
 * and (sector-size dependent) label area size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2334 
/*
 * Read `dsize' bytes at byte offset `offset' from the raw component
 * device and copy the first `msize' bytes into `data'.  Returns 0 on
 * success, EINVAL if the component has no vnode, or the error from
 * biowait().
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read synchronously through the block device */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2372 
   2373 
/*
 * Write a component label: a thin wrapper around
 * raidwrite_component_area() that supplies the standard label offset
 * and size, and writes synchronously (asyncp == 0).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2383 
/*
 * Write `msize' bytes from `data', zero-padded to `dsize', at byte
 * offset `offset' on the raw component device.  If `asyncp' is set the
 * write is issued B_ASYNC and 0 is returned immediately; otherwise we
 * wait for completion and return any error (after logging it).
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-pad so the unwritten tail of the area is deterministic */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): the async path never reports an error and
		 * relies on B_ASYNC completion releasing the buffer --
		 * confirm against biodone() semantics. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2418 
/*
 * Write the in-core parity map to the parity map area of every live
 * component.  Write errors are not propagated (see XXXjld below).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
   2436 
/*
 * Read the on-disk parity map from every live component into `map',
 * merging the copies: the first one read is taken verbatim, each later
 * one is folded in with rf_paritymap_merge().
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
   2461 
/*
 * Bump the array's mod counter and mark the component label of every
 * live component -- data columns first, then in-use spares -- dirty
 * on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column claims this spare, scol
			   keeps its previous value (-1 initially); presumably
			   a used spare always has an owner -- confirm. */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2521 
   2522 
/*
 * Refresh the component labels of all optimal data columns and all
 * in-use spares: bump the mod counter, record status and (unless the
 * unit number changed) last_unit, and flush each label to disk.  On a
 * RF_FINAL_COMPONENT_UPDATE with clean parity, also set the clean bit.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the data column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2600 
   2601 void
   2602 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2603 {
   2604 
   2605 	if (vp != NULL) {
   2606 		if (auto_configured == 1) {
   2607 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2608 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2609 			vput(vp);
   2610 
   2611 		} else {
   2612 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2613 		}
   2614 	}
   2615 }
   2616 
   2617 
   2618 void
   2619 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2620 {
   2621 	int r,c;
   2622 	struct vnode *vp;
   2623 	int acd;
   2624 
   2625 
   2626 	/* We take this opportunity to close the vnodes like we should.. */
   2627 
   2628 	for (c = 0; c < raidPtr->numCol; c++) {
   2629 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2630 		acd = raidPtr->Disks[c].auto_configured;
   2631 		rf_close_component(raidPtr, vp, acd);
   2632 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2633 		raidPtr->Disks[c].auto_configured = 0;
   2634 	}
   2635 
   2636 	for (r = 0; r < raidPtr->numSpare; r++) {
   2637 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2638 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2639 		rf_close_component(raidPtr, vp, acd);
   2640 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2641 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2642 	}
   2643 }
   2644 
   2645 
/*
 * Kernel thread body: fail component `req->col' (optionally kicking
 * off reconstruction, per RF_FDFLAGS_RECON), free the request, and
 * exit.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* RF_FDFLAGS_RECON selects fail-with-reconstruction */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2667 
/*
 * Kernel thread body: rewrite all parity on the array.  On success the
 * in-core parity state is marked clean; either way, anyone blocked in
 * shutdown waiting on parity_rewrite_cv is woken before the thread
 * exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2700 
   2701 
/*
 * Kernel thread body: copy reconstructed data back to its proper
 * component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2716 
   2717 
/*
 * Kernel thread body: reconstruct component `req->col' in place, free
 * the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2735 
/*
 * Read and validate the component label on device `dev'/`vp'.  If the
 * label looks reasonable (and its recorded partition size fits within
 * `size'), a new autoconfig entry is prepended to `ac_list' and the
 * updated list head is returned; otherwise the vnode is closed and the
 * list is returned unchanged.  On allocation failure the entire list
 * is freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down everything collected so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: drop the label and close/release the vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2793 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a linked list of RF_AutoConfig_t records for all that are
 * found.  The scan is made in two passes (wedges first, then everything
 * else) so that a wedge covering a whole disk is preferred over that
 * disk's raw partition.  Returns the accumulated list (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			/* wedges are addressed by unit alone; other disks
			   use the (unit, RAW_PART) disk-device encoding */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedges: accept only those typed as
				   RAIDframe, then hand off to
				   rf_get_component() which takes over
				   ownership of vp on success */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2997 
   2998 
   2999 int
   3000 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3001 {
   3002 
   3003 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3004 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3005 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3006 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3007 	    clabel->row >=0 &&
   3008 	    clabel->column >= 0 &&
   3009 	    clabel->num_rows > 0 &&
   3010 	    clabel->num_columns > 0 &&
   3011 	    clabel->row < clabel->num_rows &&
   3012 	    clabel->column < clabel->num_columns &&
   3013 	    clabel->blockSize > 0 &&
   3014 	    /*
   3015 	     * numBlocksHi may contain garbage, but it is ok since
   3016 	     * the type is unsigned.  If it is really garbage,
   3017 	     * rf_fix_old_label_size() will fix it.
   3018 	     */
   3019 	    rf_component_label_numblocks(clabel) > 0) {
   3020 		/*
   3021 		 * label looks reasonable enough...
   3022 		 * let's make sure it has no old garbage.
   3023 		 */
   3024 		if (numsecs)
   3025 			rf_fix_old_label_size(clabel, numsecs);
   3026 		return(1);
   3027 	}
   3028 	return(0);
   3029 }
   3030 
   3031 
   3032 /*
   3033  * For reasons yet unknown, some old component labels have garbage in
   3034  * the newer numBlocksHi region, and this causes lossage.  Since those
   3035  * disks will also have numsecs set to less than 32 bits of sectors,
   3036  * we can determine when this corruption has occurred, and fix it.
   3037  *
   3038  * The exact same problem, with the same unknown reason, happens to
   3039  * the partitionSizeHi member as well.
   3040  */
   3041 static void
   3042 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3043 {
   3044 
   3045 	if (numsecs < ((uint64_t)1 << 32)) {
   3046 		if (clabel->numBlocksHi) {
   3047 			printf("WARNING: total sectors < 32 bits, yet "
   3048 			       "numBlocksHi set\n"
   3049 			       "WARNING: resetting numBlocksHi to zero.\n");
   3050 			clabel->numBlocksHi = 0;
   3051 		}
   3052 
   3053 		if (clabel->partitionSizeHi) {
   3054 			printf("WARNING: total sectors < 32 bits, yet "
   3055 			       "partitionSizeHi set\n"
   3056 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3057 			clabel->partitionSizeHi = 0;
   3058 		}
   3059 	}
   3060 }
   3061 
   3062 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled in only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Indexed by root_partition & 3; "*invalid*" covers value 3. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3096 
   3097 RF_ConfigSet_t *
   3098 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3099 {
   3100 	RF_AutoConfig_t *ac;
   3101 	RF_ConfigSet_t *config_sets;
   3102 	RF_ConfigSet_t *cset;
   3103 	RF_AutoConfig_t *ac_next;
   3104 
   3105 
   3106 	config_sets = NULL;
   3107 
   3108 	/* Go through the AutoConfig list, and figure out which components
   3109 	   belong to what sets.  */
   3110 	ac = ac_list;
   3111 	while(ac!=NULL) {
   3112 		/* we're going to putz with ac->next, so save it here
   3113 		   for use at the end of the loop */
   3114 		ac_next = ac->next;
   3115 
   3116 		if (config_sets == NULL) {
   3117 			/* will need at least this one... */
   3118 			config_sets = (RF_ConfigSet_t *)
   3119 				malloc(sizeof(RF_ConfigSet_t),
   3120 				       M_RAIDFRAME, M_NOWAIT);
   3121 			if (config_sets == NULL) {
   3122 				panic("rf_create_auto_sets: No memory!");
   3123 			}
   3124 			/* this one is easy :) */
   3125 			config_sets->ac = ac;
   3126 			config_sets->next = NULL;
   3127 			config_sets->rootable = 0;
   3128 			ac->next = NULL;
   3129 		} else {
   3130 			/* which set does this component fit into? */
   3131 			cset = config_sets;
   3132 			while(cset!=NULL) {
   3133 				if (rf_does_it_fit(cset, ac)) {
   3134 					/* looks like it matches... */
   3135 					ac->next = cset->ac;
   3136 					cset->ac = ac;
   3137 					break;
   3138 				}
   3139 				cset = cset->next;
   3140 			}
   3141 			if (cset==NULL) {
   3142 				/* didn't find a match above... new set..*/
   3143 				cset = (RF_ConfigSet_t *)
   3144 					malloc(sizeof(RF_ConfigSet_t),
   3145 					       M_RAIDFRAME, M_NOWAIT);
   3146 				if (cset == NULL) {
   3147 					panic("rf_create_auto_sets: No memory!");
   3148 				}
   3149 				cset->ac = ac;
   3150 				ac->next = NULL;
   3151 				cset->next = config_sets;
   3152 				cset->rootable = 0;
   3153 				config_sets = cset;
   3154 			}
   3155 		}
   3156 		ac = ac_next;
   3157 	}
   3158 
   3159 
   3160 	return(config_sets);
   3161 }
   3162 
   3163 static int
   3164 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3165 {
   3166 	RF_ComponentLabel_t *clabel1, *clabel2;
   3167 
   3168 	/* If this one matches the *first* one in the set, that's good
   3169 	   enough, since the other members of the set would have been
   3170 	   through here too... */
   3171 	/* note that we are not checking partitionSize here..
   3172 
   3173 	   Note that we are also not checking the mod_counters here.
   3174 	   If everything else matches except the mod_counter, that's
   3175 	   good enough for this test.  We will deal with the mod_counters
   3176 	   a little later in the autoconfiguration process.
   3177 
   3178 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3179 
   3180 	   The reason we don't check for this is that failed disks
   3181 	   will have lower modification counts.  If those disks are
   3182 	   not added to the set they used to belong to, then they will
   3183 	   form their own set, which may result in 2 different sets,
   3184 	   for example, competing to be configured at raid0, and
   3185 	   perhaps competing to be the root filesystem set.  If the
   3186 	   wrong ones get configured, or both attempt to become /,
   3187 	   weird behaviour and or serious lossage will occur.  Thus we
   3188 	   need to bring them into the fold here, and kick them out at
   3189 	   a later point.
   3190 
   3191 	*/
   3192 
   3193 	clabel1 = cset->ac->clabel;
   3194 	clabel2 = ac->clabel;
   3195 	if ((clabel1->version == clabel2->version) &&
   3196 	    (clabel1->serial_number == clabel2->serial_number) &&
   3197 	    (clabel1->num_rows == clabel2->num_rows) &&
   3198 	    (clabel1->num_columns == clabel2->num_columns) &&
   3199 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3200 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3201 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3202 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3203 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3204 	    (clabel1->blockSize == clabel2->blockSize) &&
   3205 	    rf_component_label_numblocks(clabel1) ==
   3206 	    rf_component_label_numblocks(clabel2) &&
   3207 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3208 	    (clabel1->root_partition == clabel2->root_partition) &&
   3209 	    (clabel1->last_unit == clabel2->last_unit) &&
   3210 	    (clabel1->config_order == clabel2->config_order)) {
   3211 		/* if it get's here, it almost *has* to be a match */
   3212 	} else {
   3213 		/* it's not consistent with somebody in the set..
   3214 		   punt */
   3215 		return(0);
   3216 	}
   3217 	/* all was fine.. it must fit... */
   3218 	return(1);
   3219 }
   3220 
/*
 * Determine whether configuration set 'cset' has enough live
 * components (those carrying the set's highest mod_counter) to be
 * configured.  Returns 1 if so, 0 if too many components are missing
 * for the set's parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The authoritative mod_counter is the maximum over all members;
	   components with a lower one are stale (e.g. previously failed). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a component at that column whose
	   mod_counter matches the set's current one. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Non-mirror levels: RAID 0 tolerates no missing components,
	   RAID 4/5 tolerate at most one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3323 
   3324 void
   3325 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3326 			RF_Raid_t *raidPtr)
   3327 {
   3328 	RF_ComponentLabel_t *clabel;
   3329 	int i;
   3330 
   3331 	clabel = ac->clabel;
   3332 
   3333 	/* 1. Fill in the common stuff */
   3334 	config->numCol = clabel->num_columns;
   3335 	config->numSpare = 0; /* XXX should this be set here? */
   3336 	config->sectPerSU = clabel->sectPerSU;
   3337 	config->SUsPerPU = clabel->SUsPerPU;
   3338 	config->SUsPerRU = clabel->SUsPerRU;
   3339 	config->parityConfig = clabel->parityConfig;
   3340 	/* XXX... */
   3341 	strcpy(config->diskQueueType,"fifo");
   3342 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3343 	config->layoutSpecificSize = 0; /* XXX ?? */
   3344 
   3345 	while(ac!=NULL) {
   3346 		/* row/col values will be in range due to the checks
   3347 		   in reasonable_label() */
   3348 		strcpy(config->devnames[0][ac->clabel->column],
   3349 		       ac->devname);
   3350 		ac = ac->next;
   3351 	}
   3352 
   3353 	for(i=0;i<RF_MAXDBGV;i++) {
   3354 		config->debugVars[i][0] = 0;
   3355 	}
   3356 }
   3357 
   3358 int
   3359 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3360 {
   3361 	RF_ComponentLabel_t *clabel;
   3362 	int column;
   3363 	int sparecol;
   3364 
   3365 	raidPtr->autoconfigure = new_value;
   3366 
   3367 	for(column=0; column<raidPtr->numCol; column++) {
   3368 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3369 			clabel = raidget_component_label(raidPtr, column);
   3370 			clabel->autoconfigure = new_value;
   3371 			raidflush_component_label(raidPtr, column);
   3372 		}
   3373 	}
   3374 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3375 		sparecol = raidPtr->numCol + column;
   3376 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3377 			clabel = raidget_component_label(raidPtr, sparecol);
   3378 			clabel->autoconfigure = new_value;
   3379 			raidflush_component_label(raidPtr, sparecol);
   3380 		}
   3381 	}
   3382 	return(new_value);
   3383 }
   3384 
   3385 int
   3386 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3387 {
   3388 	RF_ComponentLabel_t *clabel;
   3389 	int column;
   3390 	int sparecol;
   3391 
   3392 	raidPtr->root_partition = new_value;
   3393 	for(column=0; column<raidPtr->numCol; column++) {
   3394 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3395 			clabel = raidget_component_label(raidPtr, column);
   3396 			clabel->root_partition = new_value;
   3397 			raidflush_component_label(raidPtr, column);
   3398 		}
   3399 	}
   3400 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3401 		sparecol = raidPtr->numCol + column;
   3402 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3403 			clabel = raidget_component_label(raidPtr, sparecol);
   3404 			clabel->root_partition = new_value;
   3405 			raidflush_component_label(raidPtr, sparecol);
   3406 		}
   3407 	}
   3408 	return(new_value);
   3409 }
   3410 
   3411 void
   3412 rf_release_all_vps(RF_ConfigSet_t *cset)
   3413 {
   3414 	RF_AutoConfig_t *ac;
   3415 
   3416 	ac = cset->ac;
   3417 	while(ac!=NULL) {
   3418 		/* Close the vp, and give it back */
   3419 		if (ac->vp) {
   3420 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3421 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3422 			vput(ac->vp);
   3423 			ac->vp = NULL;
   3424 		}
   3425 		ac = ac->next;
   3426 	}
   3427 }
   3428 
   3429 
   3430 void
   3431 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3432 {
   3433 	RF_AutoConfig_t *ac;
   3434 	RF_AutoConfig_t *next_ac;
   3435 
   3436 	ac = cset->ac;
   3437 	while(ac!=NULL) {
   3438 		next_ac = ac->next;
   3439 		/* nuke the label */
   3440 		free(ac->clabel, M_RAIDFRAME);
   3441 		/* cleanup the config structure */
   3442 		free(ac, M_RAIDFRAME);
   3443 		/* "next.." */
   3444 		ac = next_ac;
   3445 	}
   3446 	/* and, finally, nuke the config set */
   3447 	free(cset, M_RAIDFRAME);
   3448 }
   3449 
   3450 
/*
 * Populate a component label from the current state of the RAID set:
 * version, serial/mod counters, geometry, layout parameters and
 * autoconfiguration settings.  The caller supplies the label storage.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are always 1 in this implementation */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3483 
/*
 * Autoconfigure one configuration set: build an RF_Config_t from the
 * set's components, find a free raid unit (preferring the unit the set
 * was last configured on), and configure it.  Returns the configured
 * softc, or NULL on failure (out of memory or rf_Configure() error).
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until a free (or absent) unit is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no existing softc at raidID: create one now */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the softc we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3567 
/*
 * Initialize a RAIDframe memory pool at IPL_BIO: prime it with xmin
 * items (panicking if that fails), and set the low/high watermarks to
 * xmin/xmax respectively.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3580 
   3581 /*
   3582  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3583  * to see if there is IO pending and if that IO could possibly be done
   3584  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3585  * otherwise.
   3586  *
   3587  */
   3588 int
   3589 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3590 {
   3591 	struct raid_softc *rs;
   3592 	struct dk_softc *dksc;
   3593 
   3594 	rs = raidPtr->softc;
   3595 	dksc = &rs->sc_dksc;
   3596 
   3597 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3598 		return 1;
   3599 
   3600 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3601 		/* there is work to do */
   3602 		return 0;
   3603 	}
   3604 	/* default is nothing to do */
   3605 	return 1;
   3606 }
   3607 
   3608 int
   3609 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3610 {
   3611 	uint64_t numsecs;
   3612 	unsigned secsize;
   3613 	int error;
   3614 
   3615 	error = getdisksize(vp, &numsecs, &secsize);
   3616 	if (error == 0) {
   3617 		diskPtr->blockSize = secsize;
   3618 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3619 		diskPtr->partitionSize = numsecs;
   3620 		return 0;
   3621 	}
   3622 	return error;
   3623 }
   3624 
/*
 * Autoconf match hook: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3630 
/*
 * Autoconf attach hook: intentionally empty; no per-device state is
 * set up here (the softc is created and initialized elsewhere).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3635 
   3636 
   3637 static int
   3638 raid_detach(device_t self, int flags)
   3639 {
   3640 	int error;
   3641 	struct raid_softc *rs = raidsoftc(self);
   3642 
   3643 	if (rs == NULL)
   3644 		return ENXIO;
   3645 
   3646 	if ((error = raidlock(rs)) != 0)
   3647 		return (error);
   3648 
   3649 	error = raid_detach_unlocked(rs);
   3650 
   3651 	raidunlock(rs);
   3652 
   3653 	/* XXX raid can be referenced here */
   3654 
   3655 	if (error)
   3656 		return error;
   3657 
   3658 	/* Free the softc */
   3659 	raidput(rs);
   3660 
   3661 	return 0;
   3662 }
   3663 
/*
 * Publish a synthetic disk geometry for the RAID unit to the disk(9)
 * layer, derived from the array's total size and layout.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): 4 tracks per column appears to be an arbitrary
	   fiction for geometry consumers — confirm before relying on it */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3679 
   3680 /*
   3681  * Get cache info for all the components (including spares).
   3682  * Returns intersection of all the cache flags of all disks, or first
   3683  * error if any encountered.
   3684  * XXXfua feature flags can change as spares are added - lock down somehow
   3685  */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	/* data components first, then spares (they share the Disks array) */
	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl) is not worth
				   logging, but still aborts the query */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* accumulate the intersection of all flags */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}
   3723 
   3724 /*
   3725  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3726  * We end up returning whatever error was returned by the first cache flush
   3727  * that fails.
   3728  */
   3729 
   3730 int
   3731 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3732 {
   3733 	int c, sparecol;
   3734 	int e,error;
   3735 	int force = 1;
   3736 
   3737 	error = 0;
   3738 	for (c = 0; c < raidPtr->numCol; c++) {
   3739 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3740 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3741 					  &force, FWRITE, NOCRED);
   3742 			if (e) {
   3743 				if (e != ENODEV)
   3744 					printf("raid%d: cache flush to component %s failed.\n",
   3745 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3746 				if (error == 0) {
   3747 					error = e;
   3748 				}
   3749 			}
   3750 		}
   3751 	}
   3752 
   3753 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3754 		sparecol = raidPtr->numCol + c;
   3755 		/* Need to ensure that the reconstruct actually completed! */
   3756 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3757 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3758 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3759 			if (e) {
   3760 				if (e != ENODEV)
   3761 					printf("raid%d: cache flush to component %s failed.\n",
   3762 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3763 				if (error == 0) {
   3764 					error = e;
   3765 				}
   3766 			}
   3767 		}
   3768 	}
   3769 	return error;
   3770 }
   3771 
   3772 /* Fill in info with the current status */
   3773 void
   3774 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3775 {
   3776 
   3777 	if (raidPtr->status != rf_rs_reconstructing) {
   3778 		info->total = 100;
   3779 		info->completed = 100;
   3780 	} else {
   3781 		info->total = raidPtr->reconControl->numRUsTotal;
   3782 		info->completed = raidPtr->reconControl->numRUsComplete;
   3783 	}
   3784 	info->remaining = info->total - info->completed;
   3785 }
   3786 
   3787 /* Fill in info with the current status */
   3788 void
   3789 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3790 {
   3791 
   3792 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3793 		info->total = raidPtr->Layout.numStripe;
   3794 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3795 	} else {
   3796 		info->completed = 100;
   3797 		info->total = 100;
   3798 	}
   3799 	info->remaining = info->total - info->completed;
   3800 }
   3801 
   3802 /* Fill in info with the current status */
   3803 void
   3804 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3805 {
   3806 
   3807 	if (raidPtr->copyback_in_progress == 1) {
   3808 		info->total = raidPtr->Layout.numStripe;
   3809 		info->completed = raidPtr->copyback_stripes_done;
   3810 		info->remaining = info->total - info->completed;
   3811 	} else {
   3812 		info->remaining = 0;
   3813 		info->completed = 100;
   3814 		info->total = 100;
   3815 	}
   3816 }
   3817 
   3818 /* Fill in config with the current info */
   3819 int
   3820 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3821 {
   3822 	int	d, i, j;
   3823 
   3824 	if (!raidPtr->valid)
   3825 		return (ENODEV);
   3826 	config->cols = raidPtr->numCol;
   3827 	config->ndevs = raidPtr->numCol;
   3828 	if (config->ndevs >= RF_MAX_DISKS)
   3829 		return (ENOMEM);
   3830 	config->nspares = raidPtr->numSpare;
   3831 	if (config->nspares >= RF_MAX_DISKS)
   3832 		return (ENOMEM);
   3833 	config->maxqdepth = raidPtr->maxQueueDepth;
   3834 	d = 0;
   3835 	for (j = 0; j < config->cols; j++) {
   3836 		config->devs[d] = raidPtr->Disks[j];
   3837 		d++;
   3838 	}
   3839 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3840 		config->spares[i] = raidPtr->Disks[j];
   3841 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3842 			/* XXX: raidctl(8) expects to see this as a used spare */
   3843 			config->spares[i].status = rf_ds_used_spare;
   3844 		}
   3845 	}
   3846 	return 0;
   3847 }
   3848 
   3849 int
   3850 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3851 {
   3852 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3853 	RF_ComponentLabel_t *raid_clabel;
   3854 	int column = clabel->column;
   3855 
   3856 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3857 		return EINVAL;
   3858 	raid_clabel = raidget_component_label(raidPtr, column);
   3859 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3860 
   3861 	return 0;
   3862 }
   3863 
   3864 /*
   3865  * Module interface
   3866  */
   3867 
   3868 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3869 
   3870 #ifdef _MODULE
   3871 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3872 #endif
   3873 
   3874 static int raid_modcmd(modcmd_t, void *);
   3875 static int raid_modcmd_init(void);
   3876 static int raid_modcmd_fini(void);
   3877 
   3878 static int
   3879 raid_modcmd(modcmd_t cmd, void *data)
   3880 {
   3881 	int error;
   3882 
   3883 	error = 0;
   3884 	switch (cmd) {
   3885 	case MODULE_CMD_INIT:
   3886 		error = raid_modcmd_init();
   3887 		break;
   3888 	case MODULE_CMD_FINI:
   3889 		error = raid_modcmd_fini();
   3890 		break;
   3891 	default:
   3892 		error = ENOTTY;
   3893 		break;
   3894 	}
   3895 	return error;
   3896 }
   3897 
/*
 * Module initialization: create the global raid_lock, attach the raid
 * block/character devsw entries and the autoconf(9) cfdriver/cfattach
 * glue, boot the RAIDframe core, and register a finalizer that will
 * auto-configure RAID sets once all real hardware has attached.  Each
 * failing step rolls back the attachments made before it.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table requests (declustered
	 * parity support). */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to pick free major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the driver may already be built in. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attachment. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back in reverse order of attachment. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is necessarily 0 here; all failure paths returned above. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3968 
/*
 * Module teardown: refuse to unload while any raid device exists, then
 * detach the autoconf glue and devsw entries in reverse order of
 * attachment, shut down the RAIDframe core and destroy the global
 * synchronization state.  If a detach step fails, the previously
 * detached pieces are re-attached so the module stays usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach so the driver keeps working. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach everything detached so far, in attach order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core (false == tear down). */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4018