Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.372
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.372 2019/02/06 23:00:16 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.372 2019/02/06 23:00:16 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    179 
    180 /* prototypes */
    181 static void KernelWakeupFunc(struct buf *);
    182 static void InitBP(struct buf *, struct vnode *, unsigned,
    183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    184     void *, int, struct proc *);
    185 static void raidinit(struct raid_softc *);
    186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    188 
    189 static int raid_match(device_t, cfdata_t, void *);
    190 static void raid_attach(device_t, device_t, void *);
    191 static int raid_detach(device_t, int);
    192 
    193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    194     daddr_t, daddr_t);
    195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t, int);
    197 
    198 static int raidwrite_component_label(unsigned,
    199     dev_t, struct vnode *, RF_ComponentLabel_t *);
    200 static int raidread_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 
    203 static int raid_diskstart(device_t, struct buf *bp);
    204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    205 static int raid_lastclose(device_t);
    206 
    207 static dev_type_open(raidopen);
    208 static dev_type_close(raidclose);
    209 static dev_type_read(raidread);
    210 static dev_type_write(raidwrite);
    211 static dev_type_ioctl(raidioctl);
    212 static dev_type_strategy(raidstrategy);
    213 static dev_type_dump(raiddump);
    214 static dev_type_size(raidsize);
    215 
    216 const struct bdevsw raid_bdevsw = {
    217 	.d_open = raidopen,
    218 	.d_close = raidclose,
    219 	.d_strategy = raidstrategy,
    220 	.d_ioctl = raidioctl,
    221 	.d_dump = raiddump,
    222 	.d_psize = raidsize,
    223 	.d_discard = nodiscard,
    224 	.d_flag = D_DISK
    225 };
    226 
    227 const struct cdevsw raid_cdevsw = {
    228 	.d_open = raidopen,
    229 	.d_close = raidclose,
    230 	.d_read = raidread,
    231 	.d_write = raidwrite,
    232 	.d_ioctl = raidioctl,
    233 	.d_stop = nostop,
    234 	.d_tty = notty,
    235 	.d_poll = nopoll,
    236 	.d_mmap = nommap,
    237 	.d_kqfilter = nokqfilter,
    238 	.d_discard = nodiscard,
    239 	.d_flag = D_DISK
    240 };
    241 
    242 static struct dkdriver rf_dkdriver = {
    243 	.d_open = raidopen,
    244 	.d_close = raidclose,
    245 	.d_strategy = raidstrategy,
    246 	.d_diskstart = raid_diskstart,
    247 	.d_dumpblocks = raid_dumpblocks,
    248 	.d_lastclose = raid_lastclose,
    249 	.d_minphys = minphys
    250 };
    251 
    252 #define	raidunit(x)	DISKUNIT(x)
    253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
    260 /* Internal representation of a rf_recon_req */
    261 struct rf_recon_req_internal {
    262 	RF_RowCol_t col;
    263 	RF_ReconReqFlags_t flags;
    264 	void   *raidPtr;
    265 };
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
    276  * Even it if is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
    284 #ifndef RAIDOUTSTANDING
    285 #define RAIDOUTSTANDING   6
    286 #endif
    287 
    288 #define RAIDLABELDEV(dev)	\
    289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req_internal *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	sc->sc_unit = unit;
    342 	cv_init(&sc->sc_cv, "raidunit");
    343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    344 	return sc;
    345 }
    346 
    347 static void
    348 raiddestroy(struct raid_softc *sc) {
    349 	cv_destroy(&sc->sc_cv);
    350 	mutex_destroy(&sc->sc_mutex);
    351 	kmem_free(sc, sizeof(*sc));
    352 }
    353 
    354 static struct raid_softc *
    355 raidget(int unit, bool create) {
    356 	struct raid_softc *sc;
    357 	if (unit < 0) {
    358 #ifdef DIAGNOSTIC
    359 		panic("%s: unit %d!", __func__, unit);
    360 #endif
    361 		return NULL;
    362 	}
    363 	mutex_enter(&raid_lock);
    364 	LIST_FOREACH(sc, &raids, sc_link) {
    365 		if (sc->sc_unit == unit) {
    366 			mutex_exit(&raid_lock);
    367 			return sc;
    368 		}
    369 	}
    370 	mutex_exit(&raid_lock);
    371 	if (!create)
    372 		return NULL;
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
/*
 * Legacy pseudo-device attach hook.  "num" (the count from the kernel
 * config) is ignored; the body is intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    398 
    399 int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return (0);
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
    442 RF_Raid_t *
    443 rf_get_raid(struct raid_softc *rs) {
    444 	return &rs->sc_r;
    445 }
    446 
    447 int
    448 rf_get_unit(const struct raid_softc *rs) {
    449 	return rs->sc_unit;
    450 }
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname;
    455 	size_t len;
    456 
    457 	/* if bdv is NULL, the set can't contain it. exit early. */
    458 	if (bdv == NULL)
    459 		return 0;
    460 
    461 	bootname = device_xname(bdv);
    462 	len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
    483 void
    484 rf_buildroothack(RF_ConfigSet_t *config_sets)
    485 {
    486 	RF_ConfigSet_t *cset;
    487 	RF_ConfigSet_t *next_cset;
    488 	int num_root;
    489 	struct raid_softc *sc, *rsc;
    490 	struct dk_softc *dksc;
    491 
    492 	sc = rsc = NULL;
    493 	num_root = 0;
    494 	cset = config_sets;
    495 	while (cset != NULL) {
    496 		next_cset = cset->next;
    497 		if (rf_have_enough_components(cset) &&
    498 		    cset->ac->clabel->autoconfigure == 1) {
    499 			sc = rf_auto_config_set(cset);
    500 			if (sc != NULL) {
    501 				aprint_debug("raid%d: configured ok, rootable %d\n",
    502 				    sc->sc_unit, cset->rootable);
    503 				if (cset->rootable) {
    504 					rsc = sc;
    505 					num_root++;
    506 				}
    507 			} else {
    508 				/* The autoconfig didn't work :( */
    509 				aprint_debug("Autoconfig failed\n");
    510 				rf_release_all_vps(cset);
    511 			}
    512 		} else {
    513 			/* we're not autoconfiguring this set...
    514 			   release the associated resources */
    515 			rf_release_all_vps(cset);
    516 		}
    517 		/* cleanup */
    518 		rf_cleanup_config_set(cset);
    519 		cset = next_cset;
    520 	}
    521 	dksc = &rsc->sc_dksc;
    522 
    523 	/* if the user has specified what the root device should be
    524 	   then we don't touch booted_device or boothowto... */
    525 
    526 	if (rootspec != NULL) {
    527 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    528 		return;
    529 	}
    530 
    531 	/* we found something bootable... */
    532 
    533 	/*
    534 	 * XXX: The following code assumes that the root raid
    535 	 * is the first ('a') partition. This is about the best
    536 	 * we can do with a BSD disklabel, but we might be able
    537 	 * to do better with a GPT label, by setting a specified
    538 	 * attribute to indicate the root partition. We can then
    539 	 * stash the partition number in the r->root_partition
    540 	 * high bits (the bottom 2 bits are already used). For
    541 	 * now we just set booted_partition to 0 when we override
    542 	 * root.
    543 	 */
    544 	if (num_root == 1) {
    545 		device_t candidate_root;
    546 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    547 			char cname[sizeof(cset->ac->devname)];
    548 			/* XXX: assume partition 'a' first */
    549 			snprintf(cname, sizeof(cname), "%s%c",
    550 			    device_xname(dksc->sc_dev), 'a');
    551 			candidate_root = dkwedge_find_by_wname(cname);
    552 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    553 			    cname);
    554 			if (candidate_root == NULL) {
    555 				/*
    556 				 * If that is not found, because we don't use
    557 				 * disklabel, return the first dk child
    558 				 * XXX: we can skip the 'a' check above
    559 				 * and always do this...
    560 				 */
    561 				size_t i = 0;
    562 				candidate_root = dkwedge_find_by_parent(
    563 				    device_xname(dksc->sc_dev), &i);
    564 			}
    565 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    566 			    candidate_root);
    567 		} else
    568 			candidate_root = dksc->sc_dev;
    569 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    570 		DPRINTF("%s: booted_device=%p root_partition=%d "
    571 			"contains_boot=%d",
    572 		    __func__, booted_device, rsc->sc_r.root_partition,
    573 			   rf_containsboot(&rsc->sc_r, booted_device));
    574 		/* XXX the check for booted_device == NULL can probably be
    575 		 * dropped, now that rf_containsboot handles that case.
    576 		 */
    577 		if (booted_device == NULL ||
    578 		    rsc->sc_r.root_partition == 1 ||
    579 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    580 			booted_device = candidate_root;
    581 			booted_method = "raidframe/single";
    582 			booted_partition = 0;	/* XXX assume 'a' */
    583 		}
    584 	} else if (num_root > 1) {
    585 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    586 		    booted_device);
    587 
    588 		/*
    589 		 * Maybe the MD code can help. If it cannot, then
    590 		 * setroot() will discover that we have no
    591 		 * booted_device and will ask the user if nothing was
    592 		 * hardwired in the kernel config file
    593 		 */
    594 		if (booted_device == NULL)
    595 			return;
    596 
    597 		num_root = 0;
    598 		mutex_enter(&raid_lock);
    599 		LIST_FOREACH(sc, &raids, sc_link) {
    600 			RF_Raid_t *r = &sc->sc_r;
    601 			if (r->valid == 0)
    602 				continue;
    603 
    604 			if (r->root_partition == 0)
    605 				continue;
    606 
    607 			if (rf_containsboot(r, booted_device)) {
    608 				num_root++;
    609 				rsc = sc;
    610 				dksc = &rsc->sc_dksc;
    611 			}
    612 		}
    613 		mutex_exit(&raid_lock);
    614 
    615 		if (num_root == 1) {
    616 			booted_device = dksc->sc_dev;
    617 			booted_method = "raidframe/multi";
    618 			booted_partition = 0;	/* XXX assume 'a' */
    619 		} else {
    620 			/* we can't guess.. require the user to answer... */
    621 			boothowto |= RB_ASKNAME;
    622 		}
    623 	}
    624 }
    625 
    626 static int
    627 raidsize(dev_t dev)
    628 {
    629 	struct raid_softc *rs;
    630 	struct dk_softc *dksc;
    631 	unsigned int unit;
    632 
    633 	unit = raidunit(dev);
    634 	if ((rs = raidget(unit, false)) == NULL)
    635 		return -1;
    636 	dksc = &rs->sc_dksc;
    637 
    638 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    639 		return -1;
    640 
    641 	return dk_size(dksc, dev);
    642 }
    643 
    644 static int
    645 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    646 {
    647 	unsigned int unit;
    648 	struct raid_softc *rs;
    649 	struct dk_softc *dksc;
    650 
    651 	unit = raidunit(dev);
    652 	if ((rs = raidget(unit, false)) == NULL)
    653 		return ENXIO;
    654 	dksc = &rs->sc_dksc;
    655 
    656 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    657 		return ENODEV;
    658 
    659         /*
    660            Note that blkno is relative to this particular partition.
    661            By adding adding RF_PROTECTED_SECTORS, we get a value that
    662 	   is relative to the partition used for the underlying component.
    663         */
    664 	blkno += RF_PROTECTED_SECTORS;
    665 
    666 	return dk_dump(dksc, dev, blkno, va, size);
    667 }
    668 
/*
 * dk(4) dumpblocks hook: write "nblk" sectors from "va" starting at
 * "blkno" directly through one live component's block device.  Only
 * RAID 1 layouts (exactly one data column and one parity column) are
 * accepted; anything else returns EINVAL.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: the first optimal component wins. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare stands in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the chosen component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    774 
    775 /* ARGSUSED */
    776 static int
    777 raidopen(dev_t dev, int flags, int fmt,
    778     struct lwp *l)
    779 {
    780 	int     unit = raidunit(dev);
    781 	struct raid_softc *rs;
    782 	struct dk_softc *dksc;
    783 	int     error = 0;
    784 	int     part, pmask;
    785 
    786 	if ((rs = raidget(unit, true)) == NULL)
    787 		return ENXIO;
    788 	if ((error = raidlock(rs)) != 0)
    789 		return (error);
    790 
    791 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    792 		error = EBUSY;
    793 		goto bad;
    794 	}
    795 
    796 	dksc = &rs->sc_dksc;
    797 
    798 	part = DISKPART(dev);
    799 	pmask = (1 << part);
    800 
    801 	if (!DK_BUSY(dksc, pmask) &&
    802 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    803 		/* First one... mark things as dirty... Note that we *MUST*
    804 		 have done a configure before this.  I DO NOT WANT TO BE
    805 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    806 		 THAT THEY BELONG TOGETHER!!!!! */
    807 		/* XXX should check to see if we're only open for reading
    808 		   here... If so, we needn't do this, but then need some
    809 		   other way of keeping track of what's happened.. */
    810 
    811 		rf_markalldirty(&rs->sc_r);
    812 	}
    813 
    814 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    815 		error = dk_open(dksc, dev, flags, fmt, l);
    816 
    817 bad:
    818 	raidunlock(rs);
    819 
    820 	return (error);
    821 
    822 
    823 }
    824 
    825 static int
    826 raid_lastclose(device_t self)
    827 {
    828 	struct raid_softc *rs = raidsoftc(self);
    829 
    830 	/* Last one... device is not unconfigured yet.
    831 	   Device shutdown has taken care of setting the
    832 	   clean bits if RAIDF_INITED is not set
    833 	   mark things as clean... */
    834 
    835 	rf_update_component_labels(&rs->sc_r,
    836 	    RF_FINAL_COMPONENT_UPDATE);
    837 
    838 	/* pass to unlocked code */
    839 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    840 		rs->sc_flags |= RAIDF_DETACH;
    841 
    842 	return 0;
    843 }
    844 
/*
 * Close the raid device.  For a configured set the close goes through
 * dk_close(); if raid_lastclose() flagged RAIDF_DETACH on the final
 * close, the device is config_detach()ed afterwards.  An unconfigured
 * unit that is shutting down just has its softc freed.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* RAIDF_DETACH is set by raid_lastclose() on final close. */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* The detach/free work below happens with the lock dropped. */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    884 
/*
 * raid_wakeup: signal iodone_cv under iodone_lock to wake whatever is
 * sleeping on it (presumably the RAIDframe I/O thread) so it
 * re-examines the work queues.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    892 
/*
 * raidstrategy: block-device strategy entry point.  Validates the unit,
 * hands the buf to dk(9), and wakes the RAIDframe thread to process the
 * queued work.  On error the buf is completed immediately with b_error
 * set and b_resid = b_bcount (nothing transferred).
 */
static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	/* NOTE(review): when dk_strategy_defer() takes the buf the wakeup
	   below is skipped — presumably the deferred path triggers its own
	   processing; confirm against dk(9). */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    927 
    928 static int
    929 raid_diskstart(device_t dev, struct buf *bp)
    930 {
    931 	struct raid_softc *rs = raidsoftc(dev);
    932 	RF_Raid_t *raidPtr;
    933 
    934 	raidPtr = &rs->sc_r;
    935 	if (!raidPtr->valid) {
    936 		db1_printf(("raid is not valid..\n"));
    937 		return ENODEV;
    938 	}
    939 
    940 	/* XXX */
    941 	bp->b_resid = 0;
    942 
    943 	return raiddoaccess(raidPtr, bp);
    944 }
    945 
/*
 * raiddone: completion callback for a RAIDframe access.  Finishes the
 * buf through dk(9), returns the I/O slot to the `openings' counter
 * (under the RAID mutex), and wakes the RAIDframe thread so further
 * queued I/O can be scheduled.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    964 
    965 /* ARGSUSED */
    966 static int
    967 raidread(dev_t dev, struct uio *uio, int flags)
    968 {
    969 	int     unit = raidunit(dev);
    970 	struct raid_softc *rs;
    971 
    972 	if ((rs = raidget(unit, false)) == NULL)
    973 		return ENXIO;
    974 
    975 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    976 		return (ENXIO);
    977 
    978 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    979 
    980 }
    981 
    982 /* ARGSUSED */
    983 static int
    984 raidwrite(dev_t dev, struct uio *uio, int flags)
    985 {
    986 	int     unit = raidunit(dev);
    987 	struct raid_softc *rs;
    988 
    989 	if ((rs = raidget(unit, false)) == NULL)
    990 		return ENXIO;
    991 
    992 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    993 		return (ENXIO);
    994 
    995 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    996 
    997 }
    998 
/*
 * raid_detach_unlocked: tear down a configured RAID set.
 *
 * Fails with EBUSY while the device is open or a reconstruction,
 * parity rewrite, or copyback is in progress.  A unit that was never
 * initialized detaches trivially.  Otherwise: shut RAIDframe down,
 * drain and free the buffer queue, and detach the dk(9)/disk(9)
 * plumbing.  (Name suggests the softc lock is NOT held here —
 * confirm against callers.)
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1036 
/*
 * rf_must_be_initialized: returns true when `cmd' is one of the ioctls
 * that require a configured RAID set but this unit is NOT initialized
 * (RAIDF_INITED clear) — i.e. true means "reject with ENXIO".
 * Commands not in the list never require initialization and always
 * yield false.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
   1074 
/*
 * rf_fail_disk: mark component `rr->col' as failed and spawn a
 * reconstruction thread (rf_ReconThread) to handle it.
 *
 * Returns EINVAL when the request makes no sense: RAID 0 (no fault
 * tolerance), bad column, reconstruction already running, another
 * component already failed, or the target disk is spared.  Returns
 * ENOMEM if the internal request copy cannot be allocated.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	/* NOTE(review): the mutex is dropped here, so the state checked
	   above can change before the recon thread starts — presumably
	   acceptable; verify. */
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1123 
   1124 static int
   1125 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1126 {
   1127 	/* allocate a buffer for the layout-specific data, and copy it in */
   1128 	if (k_cfg->layoutSpecificSize == 0)
   1129 		return 0;
   1130 
   1131 	if (k_cfg->layoutSpecificSize > 10000) {
   1132 	    /* sanity check */
   1133 	    return EINVAL;
   1134 	}
   1135 
   1136 	u_char *specific_buf;
   1137 	RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, (u_char *));
   1138 	if (specific_buf == NULL)
   1139 		return ENOMEM;
   1140 
   1141 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1142 	    k_cfg->layoutSpecificSize);
   1143 	if (retcode) {
   1144 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1145 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1146 		return retcode;
   1147 	}
   1148 
   1149 	k_cfg->layoutSpecific = specific_buf;
   1150 	return 0;
   1151 }
   1152 
   1153 static int
   1154 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1155 {
   1156 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1157 
   1158 	if (rs->sc_r.valid) {
   1159 		/* There is a valid RAID set running on this unit! */
   1160 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1161 		return EINVAL;
   1162 	}
   1163 
   1164 	/* copy-in the configuration information */
   1165 	/* data points to a pointer to the configuration structure */
   1166 	RF_Malloc(*k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1167 	if (*k_cfg == NULL) {
   1168 		return ENOMEM;
   1169 	}
   1170 	int retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1171 	if (retcode == 0)
   1172 		return 0;
   1173 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1174 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1175 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1176 	return retcode;
   1177 }
   1178 
/*
 * rf_construct: configure a RAID set from an in-kernel RF_Config_t.
 *
 * Consumes k_cfg unconditionally: the layout-specific buffer (if any)
 * and k_cfg itself are freed before returning.  On success the unit is
 * attached via raidinit(), the I/O thread is woken, and all components
 * are marked dirty.  On any failure RAIDF_SHUTDOWN is set so the unit
 * is detached when closed.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* rf_copyinspecificbuf frees its own buffer on failure */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1231 
#if RF_DISABLED
/*
 * rf_set_component_label: copy a user-supplied component label over the
 * in-core label for its column and flush it to disk.  Compiled out
 * (RF_DISABLED): the label contents are not validated, so enabling this
 * as-is would let userland write arbitrary label data.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1270 
/*
 * rf_init_component_label: (re)initialize the component labels of all
 * live components.  Only the serial number is taken from the caller's
 * label; everything else is regenerated from the current configuration
 * via raid_init_component_label().  Dead disks are skipped.  Always
 * returns 0.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
   1303 
/*
 * rf_rebuild_in_place: reconstruct the contents of component
 * `componentPtr->column' in place by spawning
 * rf_ReconstructInPlaceThread.
 *
 * Returns EINVAL on RAID 0, while another reconstruction is running,
 * when a different component has already failed, or when the target is
 * already reconstructing or spared; ENOMEM if the request copy cannot
 * be allocated.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* copy the request so we don't depend on the user's buffer */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1371 
   1372 static int
   1373 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1374 {
   1375 	/*
   1376 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1377 	 * so tell the user it's done.
   1378 	 */
   1379 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1380 	    raidPtr->status != rf_rs_reconstructing) {
   1381 		*data = 100;
   1382 		return 0;
   1383 	}
   1384 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1385 		*data = 0;
   1386 		return 0;
   1387 	}
   1388 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1389 	    / raidPtr->reconControl->numRUsTotal);
   1390 	return 0;
   1391 }
   1392 
/*
 * raidioctl: ioctl entry point for the raid device.
 *
 * Dispatches RAIDframe-specific commands first (configure/shutdown,
 * label management, rebuild/copyback control, status queries), then
 * tries the compat-module hooks for unrecognized 'r'-group commands,
 * and finally falls through to DIOC cache ioctls and the generic
 * dk(9) ioctl handler.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		/* refuse while open or any background thread is running */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr,"raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		/* copy the request so we don't rely on the user buffer */
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* not implemented; reports success */
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		/* only allowed when every component is healthy */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_CALL_HOOK(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	/* the generic disk ioctls below require a configured set */
	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1768 
   1769 
   1770 /* raidinit -- complete the rest of the initialization for the
   1771    RAIDframe device.  */
   1772 
   1773 
/*
 * raidinit: attach a pseudo-device instance for a freshly configured
 * set, hook up the dk(9)/disk(9) plumbing, record the size and
 * geometry, allocate the buffer queue, mark the unit usable
 * (RAIDF_INITED) and kick off wedge discovery.  Failure of
 * config_attach_pseudo() is reported but not propagated (void return).
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1829 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* hand the request to the daemon via the wait queue */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* block until the daemon posts a response on the resp queue */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon's status is passed back in the fcol field */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
#endif
   1864 
   1865 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1866  * bp & passes it down.
   1867  * any calls originating in the kernel must use non-blocking I/O
   1868  * do some extra sanity checking to return "appropriate" error values for
   1869  * certain conditions (to make some standard utilities work)
   1870  *
   1871  * Formerly known as: rf_DoAccessKernel
   1872  */
   1873 void
   1874 raidstart(RF_Raid_t *raidPtr)
   1875 {
   1876 	struct raid_softc *rs;
   1877 	struct dk_softc *dksc;
   1878 
   1879 	rs = raidPtr->softc;
   1880 	dksc = &rs->sc_dksc;
   1881 	/* quick check to see if anything has died recently */
   1882 	rf_lock_mutex2(raidPtr->mutex);
   1883 	if (raidPtr->numNewFailures > 0) {
   1884 		rf_unlock_mutex2(raidPtr->mutex);
   1885 		rf_update_component_labels(raidPtr,
   1886 					   RF_NORMAL_COMPONENT_UPDATE);
   1887 		rf_lock_mutex2(raidPtr->mutex);
   1888 		raidPtr->numNewFailures--;
   1889 	}
   1890 	rf_unlock_mutex2(raidPtr->mutex);
   1891 
   1892 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1893 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1894 		return;
   1895 	}
   1896 
   1897 	dk_start(dksc, NULL);
   1898 }
   1899 
/*
 * Validate one buf against the array geometry and, if acceptable,
 * consume an opening and launch a non-blocking RAIDframe access.
 * Returns EAGAIN when no openings are available, ENOSPC on a
 * range/alignment violation, otherwise the result of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* no openings left: tell the caller to retry later */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* length in full sectors; pb flags a partial trailing sector */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* reject accesses past the end of the array; the `sum <' tests
	 * also catch arithmetic wrap-around in the addition above */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject transfers that are not a sector-size multiple */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* consume one opening for this access */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1972 
   1973 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1974 
   1975 int
   1976 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1977 {
   1978 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   1979 	struct buf *bp;
   1980 
   1981 	req->queue = queue;
   1982 	bp = req->bp;
   1983 
   1984 	switch (req->type) {
   1985 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   1986 		/* XXX need to do something extra here.. */
   1987 		/* I'm leaving this in, as I've never actually seen it used,
   1988 		 * and I'd like folks to report it... GO */
   1989 		printf(("WAKEUP CALLED\n"));
   1990 		queue->numOutstanding++;
   1991 
   1992 		bp->b_flags = 0;
   1993 		bp->b_private = req;
   1994 
   1995 		KernelWakeupFunc(bp);
   1996 		break;
   1997 
   1998 	case RF_IO_TYPE_READ:
   1999 	case RF_IO_TYPE_WRITE:
   2000 #if RF_ACC_TRACE > 0
   2001 		if (req->tracerec) {
   2002 			RF_ETIMER_START(req->tracerec->timer);
   2003 		}
   2004 #endif
   2005 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2006 		    op, queue->rf_cinfo->ci_dev,
   2007 		    req->sectorOffset, req->numSector,
   2008 		    req->buf, KernelWakeupFunc, (void *) req,
   2009 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2010 
   2011 		if (rf_debugKernelAccess) {
   2012 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2013 				(long) bp->b_blkno));
   2014 		}
   2015 		queue->numOutstanding++;
   2016 		queue->last_deq_sector = req->sectorOffset;
   2017 		/* acc wouldn't have been let in if there were any pending
   2018 		 * reqs at any other priority */
   2019 		queue->curPriority = req->priority;
   2020 
   2021 		db1_printf(("Going for %c to unit %d col %d\n",
   2022 			    req->type, queue->raidPtr->raidid,
   2023 			    queue->col));
   2024 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2025 			(int) req->sectorOffset, (int) req->numSector,
   2026 			(int) (req->numSector <<
   2027 			    queue->raidPtr->logBytesPerSector),
   2028 			(int) queue->raidPtr->logBytesPerSector));
   2029 
   2030 		/*
   2031 		 * XXX: drop lock here since this can block at
   2032 		 * least with backing SCSI devices.  Retake it
   2033 		 * to minimize fuss with calling interfaces.
   2034 		 */
   2035 
   2036 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2037 		bdev_strategy(bp);
   2038 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2039 		break;
   2040 
   2041 	default:
   2042 		panic("bad req->type in rf_DispatchKernelIO");
   2043 	}
   2044 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2045 
   2046 	return (0);
   2047 }
/*
 * Completion callback (b_iodone) for a component I/O invoked from
 * kernel code via rf_DispatchKernelIO().  Records the I/O error (if
 * any), possibly marks the component failed, and hands the request to
 * the raidio thread through the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* account the physical I/O time for access tracing */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2117 
   2118 
   2119 /*
   2120  * initialize a buf structure for doing an I/O in the kernel.
   2121  */
   2122 static void
   2123 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2124        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2125        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2126        struct proc *b_proc)
   2127 {
   2128 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2129 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2130 	bp->b_oflags = 0;
   2131 	bp->b_cflags = 0;
   2132 	bp->b_bcount = numSect << logBytesPerSector;
   2133 	bp->b_bufsize = bp->b_bcount;
   2134 	bp->b_error = 0;
   2135 	bp->b_dev = dev;
   2136 	bp->b_data = bf;
   2137 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2138 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2139 	if (bp->b_bcount == 0) {
   2140 		panic("bp->b_bcount is zero in InitBP!!");
   2141 	}
   2142 	bp->b_proc = b_proc;
   2143 	bp->b_iodone = cbFunc;
   2144 	bp->b_private = cbArg;
   2145 }
   2146 
   2147 /*
   2148  * Wait interruptibly for an exclusive lock.
   2149  *
   2150  * XXX
   2151  * Several drivers do this; it should be abstracted and made MP-safe.
   2152  * (Hmm... where have we seen this warning before :->  GO )
   2153  */
   2154 static int
   2155 raidlock(struct raid_softc *rs)
   2156 {
   2157 	int     error;
   2158 
   2159 	error = 0;
   2160 	mutex_enter(&rs->sc_mutex);
   2161 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2162 		rs->sc_flags |= RAIDF_WANTED;
   2163 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2164 		if (error != 0)
   2165 			goto done;
   2166 	}
   2167 	rs->sc_flags |= RAIDF_LOCKED;
   2168 done:
   2169 	mutex_exit(&rs->sc_mutex);
   2170 	return (error);
   2171 }
   2172 /*
   2173  * Unlock and wake up any waiters.
   2174  */
   2175 static void
   2176 raidunlock(struct raid_softc *rs)
   2177 {
   2178 
   2179 	mutex_enter(&rs->sc_mutex);
   2180 	rs->sc_flags &= ~RAIDF_LOCKED;
   2181 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2182 		rs->sc_flags &= ~RAIDF_WANTED;
   2183 		cv_broadcast(&rs->sc_cv);
   2184 	}
   2185 	mutex_exit(&rs->sc_mutex);
   2186 }
   2187 
   2188 
   2189 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2190 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2191 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2192 
   2193 static daddr_t
   2194 rf_component_info_offset(void)
   2195 {
   2196 
   2197 	return RF_COMPONENT_INFO_OFFSET;
   2198 }
   2199 
   2200 static daddr_t
   2201 rf_component_info_size(unsigned secsize)
   2202 {
   2203 	daddr_t info_size;
   2204 
   2205 	KASSERT(secsize);
   2206 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2207 		info_size = secsize;
   2208 	else
   2209 		info_size = RF_COMPONENT_INFO_SIZE;
   2210 
   2211 	return info_size;
   2212 }
   2213 
   2214 static daddr_t
   2215 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2216 {
   2217 	daddr_t map_offset;
   2218 
   2219 	KASSERT(raidPtr->bytesPerSector);
   2220 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2221 		map_offset = raidPtr->bytesPerSector;
   2222 	else
   2223 		map_offset = RF_COMPONENT_INFO_SIZE;
   2224 	map_offset += rf_component_info_offset();
   2225 
   2226 	return map_offset;
   2227 }
   2228 
   2229 static daddr_t
   2230 rf_parity_map_size(RF_Raid_t *raidPtr)
   2231 {
   2232 	daddr_t map_size;
   2233 
   2234 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2235 		map_size = raidPtr->bytesPerSector;
   2236 	else
   2237 		map_size = RF_PARITY_MAP_SIZE;
   2238 
   2239 	return map_size;
   2240 }
   2241 
   2242 int
   2243 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2244 {
   2245 	RF_ComponentLabel_t *clabel;
   2246 
   2247 	clabel = raidget_component_label(raidPtr, col);
   2248 	clabel->clean = RF_RAID_CLEAN;
   2249 	raidflush_component_label(raidPtr, col);
   2250 	return(0);
   2251 }
   2252 
   2253 
   2254 int
   2255 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2256 {
   2257 	RF_ComponentLabel_t *clabel;
   2258 
   2259 	clabel = raidget_component_label(raidPtr, col);
   2260 	clabel->clean = RF_RAID_DIRTY;
   2261 	raidflush_component_label(raidPtr, col);
   2262 	return(0);
   2263 }
   2264 
   2265 int
   2266 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2267 {
   2268 	KASSERT(raidPtr->bytesPerSector);
   2269 	return raidread_component_label(raidPtr->bytesPerSector,
   2270 	    raidPtr->Disks[col].dev,
   2271 	    raidPtr->raid_cinfo[col].ci_vp,
   2272 	    &raidPtr->raid_cinfo[col].ci_label);
   2273 }
   2274 
   2275 RF_ComponentLabel_t *
   2276 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2277 {
   2278 	return &raidPtr->raid_cinfo[col].ci_label;
   2279 }
   2280 
   2281 int
   2282 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2283 {
   2284 	RF_ComponentLabel_t *label;
   2285 
   2286 	label = &raidPtr->raid_cinfo[col].ci_label;
   2287 	label->mod_counter = raidPtr->mod_counter;
   2288 #ifndef RF_NO_PARITY_MAP
   2289 	label->parity_map_modcount = label->mod_counter;
   2290 #endif
   2291 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2292 	    raidPtr->Disks[col].dev,
   2293 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2294 }
   2295 
   2296 
   2297 static int
   2298 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2299     RF_ComponentLabel_t *clabel)
   2300 {
   2301 	return raidread_component_area(dev, b_vp, clabel,
   2302 	    sizeof(RF_ComponentLabel_t),
   2303 	    rf_component_info_offset(),
   2304 	    rf_component_info_size(secsize));
   2305 }
   2306 
/*
 * Read `dsize' bytes at byte offset `offset' from `dev' and copy the
 * first `msize' bytes of the result into `data'.  Returns 0 on
 * success, EINVAL for a NULL vnode, or the biowait() error.
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read synchronously */
	bdev_strategy(bp);
	error = biowait(bp);

	/* copy out only on success; the buffer contents are undefined
	   after a failed read */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2344 
   2345 
   2346 static int
   2347 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2348     RF_ComponentLabel_t *clabel)
   2349 {
   2350 	return raidwrite_component_area(dev, b_vp, clabel,
   2351 	    sizeof(RF_ComponentLabel_t),
   2352 	    rf_component_info_offset(),
   2353 	    rf_component_info_size(secsize), 0);
   2354 }
   2355 
   2356 /* ARGSUSED */
   2357 static int
   2358 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2359     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2360 {
   2361 	struct buf *bp;
   2362 	int error;
   2363 
   2364 	/* get a block of the appropriate size... */
   2365 	bp = geteblk((int)dsize);
   2366 	bp->b_dev = dev;
   2367 
   2368 	/* get our ducks in a row for the write */
   2369 	bp->b_blkno = offset / DEV_BSIZE;
   2370 	bp->b_bcount = dsize;
   2371 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2372  	bp->b_resid = dsize;
   2373 
   2374 	memset(bp->b_data, 0, dsize);
   2375 	memcpy(bp->b_data, data, msize);
   2376 
   2377 	bdev_strategy(bp);
   2378 	if (asyncp)
   2379 		return 0;
   2380 	error = biowait(bp);
   2381 	brelse(bp, 0);
   2382 	if (error) {
   2383 #if 1
   2384 		printf("Failed to write RAID component info!\n");
   2385 #endif
   2386 	}
   2387 
   2388 	return(error);
   2389 }
   2390 
   2391 void
   2392 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2393 {
   2394 	int c;
   2395 
   2396 	for (c = 0; c < raidPtr->numCol; c++) {
   2397 		/* Skip dead disks. */
   2398 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2399 			continue;
   2400 		/* XXXjld: what if an error occurs here? */
   2401 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2402 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2403 		    RF_PARITYMAP_NBYTE,
   2404 		    rf_parity_map_offset(raidPtr),
   2405 		    rf_parity_map_size(raidPtr), 0);
   2406 	}
   2407 }
   2408 
   2409 void
   2410 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2411 {
   2412 	struct rf_paritymap_ondisk tmp;
   2413 	int c,first;
   2414 
   2415 	first=1;
   2416 	for (c = 0; c < raidPtr->numCol; c++) {
   2417 		/* Skip dead disks. */
   2418 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2419 			continue;
   2420 		raidread_component_area(raidPtr->Disks[c].dev,
   2421 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2422 		    RF_PARITYMAP_NBYTE,
   2423 		    rf_parity_map_offset(raidPtr),
   2424 		    rf_parity_map_size(raidPtr));
   2425 		if (first) {
   2426 			memcpy(map, &tmp, sizeof(*map));
   2427 			first = 0;
   2428 		} else {
   2429 			rf_paritymap_merge(map, &tmp);
   2430 		}
   2431 	}
   2432 }
   2433 
/*
 * Bump the modification counter and mark the component labels of all
 * live components (and in-use spares) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matches, scol keeps
			   its previous value (-1 initially) -- confirm a
			   used spare always has a matching spareCol */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2493 
   2494 
/*
 * Bump the modification counter and write fresh component labels to
 * all optimal components and in-use spares.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also set the
 * clean bit on each label.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matches, scol keeps
			   its previous value (-1 initially) -- confirm a
			   used spare always has a matching spareCol */

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2572 
   2573 void
   2574 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2575 {
   2576 
   2577 	if (vp != NULL) {
   2578 		if (auto_configured == 1) {
   2579 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2580 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2581 			vput(vp);
   2582 
   2583 		} else {
   2584 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2585 		}
   2586 	}
   2587 }
   2588 
   2589 
   2590 void
   2591 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2592 {
   2593 	int r,c;
   2594 	struct vnode *vp;
   2595 	int acd;
   2596 
   2597 
   2598 	/* We take this opportunity to close the vnodes like we should.. */
   2599 
   2600 	for (c = 0; c < raidPtr->numCol; c++) {
   2601 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2602 		acd = raidPtr->Disks[c].auto_configured;
   2603 		rf_close_component(raidPtr, vp, acd);
   2604 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2605 		raidPtr->Disks[c].auto_configured = 0;
   2606 	}
   2607 
   2608 	for (r = 0; r < raidPtr->numSpare; r++) {
   2609 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2610 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2611 		rf_close_component(raidPtr, vp, acd);
   2612 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2613 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2614 	}
   2615 }
   2616 
   2617 
   2618 void
   2619 rf_ReconThread(struct rf_recon_req_internal *req)
   2620 {
   2621 	int     s;
   2622 	RF_Raid_t *raidPtr;
   2623 
   2624 	s = splbio();
   2625 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2626 	raidPtr->recon_in_progress = 1;
   2627 
   2628 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2629 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2630 
   2631 	RF_Free(req, sizeof(*req));
   2632 
   2633 	raidPtr->recon_in_progress = 0;
   2634 	splx(s);
   2635 
   2636 	/* That's all... */
   2637 	kthread_exit(0);	/* does not return */
   2638 }
   2639 
/*
 * Kernel thread body: re-write all parity on the set and, on success,
 * mark the in-core parity state clean.  Notifies any shutdown waiter
 * before exiting.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2672 
   2673 
   2674 void
   2675 rf_CopybackThread(RF_Raid_t *raidPtr)
   2676 {
   2677 	int s;
   2678 
   2679 	raidPtr->copyback_in_progress = 1;
   2680 	s = splbio();
   2681 	rf_CopybackReconstructedData(raidPtr);
   2682 	splx(s);
   2683 	raidPtr->copyback_in_progress = 0;
   2684 
   2685 	/* That's all... */
   2686 	kthread_exit(0);	/* does not return */
   2687 }
   2688 
   2689 
   2690 void
   2691 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2692 {
   2693 	int s;
   2694 	RF_Raid_t *raidPtr;
   2695 
   2696 	s = splbio();
   2697 	raidPtr = req->raidPtr;
   2698 	raidPtr->recon_in_progress = 1;
   2699 	rf_ReconstructInPlace(raidPtr, req->col);
   2700 	RF_Free(req, sizeof(*req));
   2701 	raidPtr->recon_in_progress = 0;
   2702 	splx(s);
   2703 
   2704 	/* That's all... */
   2705 	kthread_exit(0);	/* does not return */
   2706 }
   2707 
/*
 * Try to read a component label from `vp'/`dev'.  If the label looks
 * reasonable, prepend a new RF_AutoConfig_t for it to `ac_list' and
 * return the new list head; otherwise close the vnode and return the
 * list unchanged.  On memory exhaustion the entire list is freed and
 * NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: free the whole accumulated list
		       (labels and nodes) and give up */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			/* prepend the new entry to the list */
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: the vnode is only kept for accepted components */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2765 
   2766 RF_AutoConfig_t *
   2767 rf_find_raid_components(void)
   2768 {
   2769 	struct vnode *vp;
   2770 	struct disklabel label;
   2771 	device_t dv;
   2772 	deviter_t di;
   2773 	dev_t dev;
   2774 	int bmajor, bminor, wedge, rf_part_found;
   2775 	int error;
   2776 	int i;
   2777 	RF_AutoConfig_t *ac_list;
   2778 	uint64_t numsecs;
   2779 	unsigned secsize;
   2780 	int dowedges;
   2781 
   2782 	/* initialize the AutoConfig list */
   2783 	ac_list = NULL;
   2784 
   2785 	/*
   2786 	 * we begin by trolling through *all* the devices on the system *twice*
   2787 	 * first we scan for wedges, second for other devices. This avoids
   2788 	 * using a raw partition instead of a wedge that covers the whole disk
   2789 	 */
   2790 
   2791 	for (dowedges=1; dowedges>=0; --dowedges) {
   2792 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2793 		     dv = deviter_next(&di)) {
   2794 
   2795 			/* we are only interested in disks... */
   2796 			if (device_class(dv) != DV_DISK)
   2797 				continue;
   2798 
   2799 			/* we don't care about floppies... */
   2800 			if (device_is_a(dv, "fd")) {
   2801 				continue;
   2802 			}
   2803 
   2804 			/* we don't care about CD's... */
   2805 			if (device_is_a(dv, "cd")) {
   2806 				continue;
   2807 			}
   2808 
   2809 			/* we don't care about md's... */
   2810 			if (device_is_a(dv, "md")) {
   2811 				continue;
   2812 			}
   2813 
   2814 			/* hdfd is the Atari/Hades floppy driver */
   2815 			if (device_is_a(dv, "hdfd")) {
   2816 				continue;
   2817 			}
   2818 
   2819 			/* fdisa is the Atari/Milan floppy driver */
   2820 			if (device_is_a(dv, "fdisa")) {
   2821 				continue;
   2822 			}
   2823 
   2824 			/* are we in the wedges pass ? */
   2825 			wedge = device_is_a(dv, "dk");
   2826 			if (wedge != dowedges) {
   2827 				continue;
   2828 			}
   2829 
   2830 			/* need to find the device_name_to_block_device_major stuff */
   2831 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2832 
   2833 			rf_part_found = 0; /*No raid partition as yet*/
   2834 
   2835 			/* get a vnode for the raw partition of this disk */
   2836 			bminor = minor(device_unit(dv));
   2837 			dev = wedge ? makedev(bmajor, bminor) :
   2838 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2839 			if (bdevvp(dev, &vp))
   2840 				panic("RAID can't alloc vnode");
   2841 
   2842 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2843 
   2844 			if (error) {
   2845 				/* "Who cares."  Continue looking
   2846 				   for something that exists*/
   2847 				vput(vp);
   2848 				continue;
   2849 			}
   2850 
   2851 			error = getdisksize(vp, &numsecs, &secsize);
   2852 			if (error) {
   2853 				/*
   2854 				 * Pseudo devices like vnd and cgd can be
   2855 				 * opened but may still need some configuration.
   2856 				 * Ignore these quietly.
   2857 				 */
   2858 				if (error != ENXIO)
   2859 					printf("RAIDframe: can't get disk size"
   2860 					    " for dev %s (%d)\n",
   2861 					    device_xname(dv), error);
   2862 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2863 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2864 				vput(vp);
   2865 				continue;
   2866 			}
   2867 			if (wedge) {
   2868 				struct dkwedge_info dkw;
   2869 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2870 				    NOCRED);
   2871 				if (error) {
   2872 					printf("RAIDframe: can't get wedge info for "
   2873 					    "dev %s (%d)\n", device_xname(dv), error);
   2874 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2875 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2876 					vput(vp);
   2877 					continue;
   2878 				}
   2879 
   2880 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2881 					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2882 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2883 					vput(vp);
   2884 					continue;
   2885 				}
   2886 
   2887 				ac_list = rf_get_component(ac_list, dev, vp,
   2888 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2889 				rf_part_found = 1; /*There is a raid component on this disk*/
   2890 				continue;
   2891 			}
   2892 
   2893 			/* Ok, the disk exists.  Go get the disklabel. */
   2894 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2895 			if (error) {
   2896 				/*
   2897 				 * XXX can't happen - open() would
   2898 				 * have errored out (or faked up one)
   2899 				 */
   2900 				if (error != ENOTTY)
   2901 					printf("RAIDframe: can't get label for dev "
   2902 					    "%s (%d)\n", device_xname(dv), error);
   2903 			}
   2904 
   2905 			/* don't need this any more.  We'll allocate it again
   2906 			   a little later if we really do... */
   2907 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2908 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2909 			vput(vp);
   2910 
   2911 			if (error)
   2912 				continue;
   2913 
   2914 			rf_part_found = 0; /*No raid partitions yet*/
   2915 			for (i = 0; i < label.d_npartitions; i++) {
   2916 				char cname[sizeof(ac_list->devname)];
   2917 
   2918 				/* We only support partitions marked as RAID */
   2919 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2920 					continue;
   2921 
   2922 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2923 				if (bdevvp(dev, &vp))
   2924 					panic("RAID can't alloc vnode");
   2925 
   2926 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2927 				if (error) {
   2928 					/* Whatever... */
   2929 					vput(vp);
   2930 					continue;
   2931 				}
   2932 				snprintf(cname, sizeof(cname), "%s%c",
   2933 				    device_xname(dv), 'a' + i);
   2934 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2935 					label.d_partitions[i].p_size, numsecs, secsize);
   2936 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2937 			}
   2938 
   2939 			/*
   2940 			 *If there is no raid component on this disk, either in a
   2941 			 *disklabel or inside a wedge, check the raw partition as well,
   2942 			 *as it is possible to configure raid components on raw disk
   2943 			 *devices.
   2944 			 */
   2945 
   2946 			if (!rf_part_found) {
   2947 				char cname[sizeof(ac_list->devname)];
   2948 
   2949 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2950 				if (bdevvp(dev, &vp))
   2951 					panic("RAID can't alloc vnode");
   2952 
   2953 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2954 				if (error) {
   2955 					/* Whatever... */
   2956 					vput(vp);
   2957 					continue;
   2958 				}
   2959 				snprintf(cname, sizeof(cname), "%s%c",
   2960 				    device_xname(dv), 'a' + RAW_PART);
   2961 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2962 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2963 			}
   2964 		}
   2965 		deviter_release(&di);
   2966 	}
   2967 	return ac_list;
   2968 }
   2969 
   2970 
   2971 int
   2972 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2973 {
   2974 
   2975 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2976 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2977 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2978 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2979 	    clabel->row >=0 &&
   2980 	    clabel->column >= 0 &&
   2981 	    clabel->num_rows > 0 &&
   2982 	    clabel->num_columns > 0 &&
   2983 	    clabel->row < clabel->num_rows &&
   2984 	    clabel->column < clabel->num_columns &&
   2985 	    clabel->blockSize > 0 &&
   2986 	    /*
   2987 	     * numBlocksHi may contain garbage, but it is ok since
   2988 	     * the type is unsigned.  If it is really garbage,
   2989 	     * rf_fix_old_label_size() will fix it.
   2990 	     */
   2991 	    rf_component_label_numblocks(clabel) > 0) {
   2992 		/*
   2993 		 * label looks reasonable enough...
   2994 		 * let's make sure it has no old garbage.
   2995 		 */
   2996 		if (numsecs)
   2997 			rf_fix_old_label_size(clabel, numsecs);
   2998 		return(1);
   2999 	}
   3000 	return(0);
   3001 }
   3002 
   3003 
   3004 /*
   3005  * For reasons yet unknown, some old component labels have garbage in
   3006  * the newer numBlocksHi region, and this causes lossage.  Since those
   3007  * disks will also have numsecs set to less than 32 bits of sectors,
   3008  * we can determine when this corruption has occurred, and fix it.
   3009  *
   3010  * The exact same problem, with the same unknown reason, happens to
   3011  * the partitionSizeHi member as well.
   3012  */
   3013 static void
   3014 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3015 {
   3016 
   3017 	if (numsecs < ((uint64_t)1 << 32)) {
   3018 		if (clabel->numBlocksHi) {
   3019 			printf("WARNING: total sectors < 32 bits, yet "
   3020 			       "numBlocksHi set\n"
   3021 			       "WARNING: resetting numBlocksHi to zero.\n");
   3022 			clabel->numBlocksHi = 0;
   3023 		}
   3024 
   3025 		if (clabel->partitionSizeHi) {
   3026 			printf("WARNING: total sectors < 32 bits, yet "
   3027 			       "partitionSizeHi set\n"
   3028 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3029 			clabel->partitionSizeHi = 0;
   3030 		}
   3031 	}
   3032 }
   3033 
   3034 
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Debug kernels only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblk = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, nblk);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
   3068 
   3069 RF_ConfigSet_t *
   3070 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3071 {
   3072 	RF_AutoConfig_t *ac;
   3073 	RF_ConfigSet_t *config_sets;
   3074 	RF_ConfigSet_t *cset;
   3075 	RF_AutoConfig_t *ac_next;
   3076 
   3077 
   3078 	config_sets = NULL;
   3079 
   3080 	/* Go through the AutoConfig list, and figure out which components
   3081 	   belong to what sets.  */
   3082 	ac = ac_list;
   3083 	while(ac!=NULL) {
   3084 		/* we're going to putz with ac->next, so save it here
   3085 		   for use at the end of the loop */
   3086 		ac_next = ac->next;
   3087 
   3088 		if (config_sets == NULL) {
   3089 			/* will need at least this one... */
   3090 			config_sets = (RF_ConfigSet_t *)
   3091 				malloc(sizeof(RF_ConfigSet_t),
   3092 				       M_RAIDFRAME, M_NOWAIT);
   3093 			if (config_sets == NULL) {
   3094 				panic("rf_create_auto_sets: No memory!");
   3095 			}
   3096 			/* this one is easy :) */
   3097 			config_sets->ac = ac;
   3098 			config_sets->next = NULL;
   3099 			config_sets->rootable = 0;
   3100 			ac->next = NULL;
   3101 		} else {
   3102 			/* which set does this component fit into? */
   3103 			cset = config_sets;
   3104 			while(cset!=NULL) {
   3105 				if (rf_does_it_fit(cset, ac)) {
   3106 					/* looks like it matches... */
   3107 					ac->next = cset->ac;
   3108 					cset->ac = ac;
   3109 					break;
   3110 				}
   3111 				cset = cset->next;
   3112 			}
   3113 			if (cset==NULL) {
   3114 				/* didn't find a match above... new set..*/
   3115 				cset = (RF_ConfigSet_t *)
   3116 					malloc(sizeof(RF_ConfigSet_t),
   3117 					       M_RAIDFRAME, M_NOWAIT);
   3118 				if (cset == NULL) {
   3119 					panic("rf_create_auto_sets: No memory!");
   3120 				}
   3121 				cset->ac = ac;
   3122 				ac->next = NULL;
   3123 				cset->next = config_sets;
   3124 				cset->rootable = 0;
   3125 				config_sets = cset;
   3126 			}
   3127 		}
   3128 		ac = ac_next;
   3129 	}
   3130 
   3131 
   3132 	return(config_sets);
   3133 }
   3134 
   3135 static int
   3136 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3137 {
   3138 	RF_ComponentLabel_t *clabel1, *clabel2;
   3139 
   3140 	/* If this one matches the *first* one in the set, that's good
   3141 	   enough, since the other members of the set would have been
   3142 	   through here too... */
   3143 	/* note that we are not checking partitionSize here..
   3144 
   3145 	   Note that we are also not checking the mod_counters here.
   3146 	   If everything else matches except the mod_counter, that's
   3147 	   good enough for this test.  We will deal with the mod_counters
   3148 	   a little later in the autoconfiguration process.
   3149 
   3150 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3151 
   3152 	   The reason we don't check for this is that failed disks
   3153 	   will have lower modification counts.  If those disks are
   3154 	   not added to the set they used to belong to, then they will
   3155 	   form their own set, which may result in 2 different sets,
   3156 	   for example, competing to be configured at raid0, and
   3157 	   perhaps competing to be the root filesystem set.  If the
   3158 	   wrong ones get configured, or both attempt to become /,
   3159 	   weird behaviour and or serious lossage will occur.  Thus we
   3160 	   need to bring them into the fold here, and kick them out at
   3161 	   a later point.
   3162 
   3163 	*/
   3164 
   3165 	clabel1 = cset->ac->clabel;
   3166 	clabel2 = ac->clabel;
   3167 	if ((clabel1->version == clabel2->version) &&
   3168 	    (clabel1->serial_number == clabel2->serial_number) &&
   3169 	    (clabel1->num_rows == clabel2->num_rows) &&
   3170 	    (clabel1->num_columns == clabel2->num_columns) &&
   3171 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3172 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3173 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3174 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3175 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3176 	    (clabel1->blockSize == clabel2->blockSize) &&
   3177 	    rf_component_label_numblocks(clabel1) ==
   3178 	    rf_component_label_numblocks(clabel2) &&
   3179 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3180 	    (clabel1->root_partition == clabel2->root_partition) &&
   3181 	    (clabel1->last_unit == clabel2->last_unit) &&
   3182 	    (clabel1->config_order == clabel2->config_order)) {
   3183 		/* if it get's here, it almost *has* to be a match */
   3184 	} else {
   3185 		/* it's not consistent with somebody in the set..
   3186 		   punt */
   3187 		return(0);
   3188 	}
   3189 	/* all was fine.. it must fit... */
   3190 	return(1);
   3191 }
   3192 
/*
 * Decide whether a configuration set has enough current components
 * to be brought up.  Returns 1 if the set can be (auto)configured,
 * 0 if too many columns are missing or only stale members remain.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The highest value found wins: components carrying a lower
	   mod_counter were failed out earlier and are stale. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   occupying column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did the odd component of a
				   mirror pair, and we didn't bail..
				   reset the even_pair_failed flag,
				   and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3295 
   3296 void
   3297 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3298 			RF_Raid_t *raidPtr)
   3299 {
   3300 	RF_ComponentLabel_t *clabel;
   3301 	int i;
   3302 
   3303 	clabel = ac->clabel;
   3304 
   3305 	/* 1. Fill in the common stuff */
   3306 	config->numCol = clabel->num_columns;
   3307 	config->numSpare = 0; /* XXX should this be set here? */
   3308 	config->sectPerSU = clabel->sectPerSU;
   3309 	config->SUsPerPU = clabel->SUsPerPU;
   3310 	config->SUsPerRU = clabel->SUsPerRU;
   3311 	config->parityConfig = clabel->parityConfig;
   3312 	/* XXX... */
   3313 	strcpy(config->diskQueueType,"fifo");
   3314 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3315 	config->layoutSpecificSize = 0; /* XXX ?? */
   3316 
   3317 	while(ac!=NULL) {
   3318 		/* row/col values will be in range due to the checks
   3319 		   in reasonable_label() */
   3320 		strcpy(config->devnames[0][ac->clabel->column],
   3321 		       ac->devname);
   3322 		ac = ac->next;
   3323 	}
   3324 
   3325 	for(i=0;i<RF_MAXDBGV;i++) {
   3326 		config->debugVars[i][0] = 0;
   3327 	}
   3328 }
   3329 
   3330 int
   3331 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3332 {
   3333 	RF_ComponentLabel_t *clabel;
   3334 	int column;
   3335 	int sparecol;
   3336 
   3337 	raidPtr->autoconfigure = new_value;
   3338 
   3339 	for(column=0; column<raidPtr->numCol; column++) {
   3340 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3341 			clabel = raidget_component_label(raidPtr, column);
   3342 			clabel->autoconfigure = new_value;
   3343 			raidflush_component_label(raidPtr, column);
   3344 		}
   3345 	}
   3346 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3347 		sparecol = raidPtr->numCol + column;
   3348 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3349 			clabel = raidget_component_label(raidPtr, sparecol);
   3350 			clabel->autoconfigure = new_value;
   3351 			raidflush_component_label(raidPtr, sparecol);
   3352 		}
   3353 	}
   3354 	return(new_value);
   3355 }
   3356 
   3357 int
   3358 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3359 {
   3360 	RF_ComponentLabel_t *clabel;
   3361 	int column;
   3362 	int sparecol;
   3363 
   3364 	raidPtr->root_partition = new_value;
   3365 	for(column=0; column<raidPtr->numCol; column++) {
   3366 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3367 			clabel = raidget_component_label(raidPtr, column);
   3368 			clabel->root_partition = new_value;
   3369 			raidflush_component_label(raidPtr, column);
   3370 		}
   3371 	}
   3372 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3373 		sparecol = raidPtr->numCol + column;
   3374 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3375 			clabel = raidget_component_label(raidPtr, sparecol);
   3376 			clabel->root_partition = new_value;
   3377 			raidflush_component_label(raidPtr, sparecol);
   3378 		}
   3379 	}
   3380 	return(new_value);
   3381 }
   3382 
   3383 void
   3384 rf_release_all_vps(RF_ConfigSet_t *cset)
   3385 {
   3386 	RF_AutoConfig_t *ac;
   3387 
   3388 	ac = cset->ac;
   3389 	while(ac!=NULL) {
   3390 		/* Close the vp, and give it back */
   3391 		if (ac->vp) {
   3392 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3393 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3394 			vput(ac->vp);
   3395 			ac->vp = NULL;
   3396 		}
   3397 		ac = ac->next;
   3398 	}
   3399 }
   3400 
   3401 
   3402 void
   3403 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3404 {
   3405 	RF_AutoConfig_t *ac;
   3406 	RF_AutoConfig_t *next_ac;
   3407 
   3408 	ac = cset->ac;
   3409 	while(ac!=NULL) {
   3410 		next_ac = ac->next;
   3411 		/* nuke the label */
   3412 		free(ac->clabel, M_RAIDFRAME);
   3413 		/* cleanup the config structure */
   3414 		free(ac, M_RAIDFRAME);
   3415 		/* "next.." */
   3416 		ac = next_ac;
   3417 	}
   3418 	/* and, finally, nuke the config set */
   3419 	free(cset, M_RAIDFRAME);
   3420 }
   3421 
   3422 
/*
 * Initialize a component label from the current state of the RAID
 * set: identity counters, geometry, layout parameters and the
 * autoconfiguration settings.  The label is marked dirty/optimal.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* only single-row sets are produced */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the lo/hi label fields */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3455 
/*
 * Bring up a RAID set from a configuration set: build an RF_Config_t
 * from the component labels, find a free raid unit (preferring the
 * unit the set was last configured as) and configure it.  Returns
 * the softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* scan upward from last_unit until a unit that is free
	   (invalid) or not yet instantiated is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* instantiate the unit if it did not exist yet */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3539 
   3540 void
   3541 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3542 	     size_t xmin, size_t xmax)
   3543 {
   3544 	int error;
   3545 
   3546 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3547 	pool_sethiwat(p, xmax);
   3548 	if ((error = pool_prime(p, xmin)) != 0)
   3549 		panic("%s: failed to prime pool: %d", __func__, error);
   3550 	pool_setlowat(p, xmin);
   3551 }
   3552 
   3553 /*
   3554  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3555  * to see if there is IO pending and if that IO could possibly be done
   3556  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3557  * otherwise.
   3558  *
   3559  */
   3560 int
   3561 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3562 {
   3563 	struct raid_softc *rs;
   3564 	struct dk_softc *dksc;
   3565 
   3566 	rs = raidPtr->softc;
   3567 	dksc = &rs->sc_dksc;
   3568 
   3569 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3570 		return 1;
   3571 
   3572 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3573 		/* there is work to do */
   3574 		return 0;
   3575 	}
   3576 	/* default is nothing to do */
   3577 	return 1;
   3578 }
   3579 
   3580 int
   3581 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3582 {
   3583 	uint64_t numsecs;
   3584 	unsigned secsize;
   3585 	int error;
   3586 
   3587 	error = getdisksize(vp, &numsecs, &secsize);
   3588 	if (error == 0) {
   3589 		diskPtr->blockSize = secsize;
   3590 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3591 		diskPtr->partitionSize = numsecs;
   3592 		return 0;
   3593 	}
   3594 	return error;
   3595 }
   3596 
/*
 * Autoconf match hook: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3602 
/*
 * Autoconf attach hook: intentionally empty; per-set setup happens
 * elsewhere (e.g. raidinit() when a set is configured).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3607 
   3608 
   3609 static int
   3610 raid_detach(device_t self, int flags)
   3611 {
   3612 	int error;
   3613 	struct raid_softc *rs = raidsoftc(self);
   3614 
   3615 	if (rs == NULL)
   3616 		return ENXIO;
   3617 
   3618 	if ((error = raidlock(rs)) != 0)
   3619 		return (error);
   3620 
   3621 	error = raid_detach_unlocked(rs);
   3622 
   3623 	raidunlock(rs);
   3624 
   3625 	/* XXX raid can be referenced here */
   3626 
   3627 	if (error)
   3628 		return error;
   3629 
   3630 	/* Free the softc */
   3631 	raidput(rs);
   3632 
   3633 	return 0;
   3634 }
   3635 
/*
 * Publish the (synthetic) disk geometry of a configured RAID set to
 * the disk(9) layer.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count; the geometry is synthetic anyway */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3651 
   3652 /*
   3653  * Get cache info for all the components (including spares).
   3654  * Returns intersection of all the cache flags of all disks, or first
   3655  * error if any encountered.
   3656  * XXXfua feature flags can change as spares are added - lock down somehow
   3657  */
   3658 static int
   3659 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3660 {
   3661 	int c;
   3662 	int error;
   3663 	int dkwhole = 0, dkpart;
   3664 
   3665 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3666 		/*
   3667 		 * Check any non-dead disk, even when currently being
   3668 		 * reconstructed.
   3669 		 */
   3670 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3671 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3672 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3673 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3674 			if (error) {
   3675 				if (error != ENODEV) {
   3676 					printf("raid%d: get cache for component %s failed\n",
   3677 					    raidPtr->raidid,
   3678 					    raidPtr->Disks[c].devname);
   3679 				}
   3680 
   3681 				return error;
   3682 			}
   3683 
   3684 			if (c == 0)
   3685 				dkwhole = dkpart;
   3686 			else
   3687 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3688 		}
   3689 	}
   3690 
   3691 	*data = dkwhole;
   3692 
   3693 	return 0;
   3694 }
   3695 
   3696 /*
   3697  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3698  * We end up returning whatever error was returned by the first cache flush
   3699  * that fails.
   3700  */
   3701 
   3702 int
   3703 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3704 {
   3705 	int c, sparecol;
   3706 	int e,error;
   3707 	int force = 1;
   3708 
   3709 	error = 0;
   3710 	for (c = 0; c < raidPtr->numCol; c++) {
   3711 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3712 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3713 					  &force, FWRITE, NOCRED);
   3714 			if (e) {
   3715 				if (e != ENODEV)
   3716 					printf("raid%d: cache flush to component %s failed.\n",
   3717 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3718 				if (error == 0) {
   3719 					error = e;
   3720 				}
   3721 			}
   3722 		}
   3723 	}
   3724 
   3725 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3726 		sparecol = raidPtr->numCol + c;
   3727 		/* Need to ensure that the reconstruct actually completed! */
   3728 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3729 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3730 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3731 			if (e) {
   3732 				if (e != ENODEV)
   3733 					printf("raid%d: cache flush to component %s failed.\n",
   3734 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3735 				if (error == 0) {
   3736 					error = e;
   3737 				}
   3738 			}
   3739 		}
   3740 	}
   3741 	return error;
   3742 }
   3743 
   3744 /* Fill in info with the current status */
   3745 void
   3746 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3747 {
   3748 
   3749 	if (raidPtr->status != rf_rs_reconstructing) {
   3750 		info->total = 100;
   3751 		info->completed = 100;
   3752 	} else {
   3753 		info->total = raidPtr->reconControl->numRUsTotal;
   3754 		info->completed = raidPtr->reconControl->numRUsComplete;
   3755 	}
   3756 	info->remaining = info->total - info->completed;
   3757 }
   3758 
   3759 /* Fill in info with the current status */
   3760 void
   3761 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3762 {
   3763 
   3764 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3765 		info->total = raidPtr->Layout.numStripe;
   3766 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3767 	} else {
   3768 		info->completed = 100;
   3769 		info->total = 100;
   3770 	}
   3771 	info->remaining = info->total - info->completed;
   3772 }
   3773 
   3774 /* Fill in info with the current status */
   3775 void
   3776 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3777 {
   3778 
   3779 	if (raidPtr->copyback_in_progress == 1) {
   3780 		info->total = raidPtr->Layout.numStripe;
   3781 		info->completed = raidPtr->copyback_stripes_done;
   3782 		info->remaining = info->total - info->completed;
   3783 	} else {
   3784 		info->remaining = 0;
   3785 		info->completed = 100;
   3786 		info->total = 100;
   3787 	}
   3788 }
   3789 
   3790 /* Fill in config with the current info */
   3791 int
   3792 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3793 {
   3794 	int	d, i, j;
   3795 
   3796 	if (!raidPtr->valid)
   3797 		return (ENODEV);
   3798 	config->cols = raidPtr->numCol;
   3799 	config->ndevs = raidPtr->numCol;
   3800 	if (config->ndevs >= RF_MAX_DISKS)
   3801 		return (ENOMEM);
   3802 	config->nspares = raidPtr->numSpare;
   3803 	if (config->nspares >= RF_MAX_DISKS)
   3804 		return (ENOMEM);
   3805 	config->maxqdepth = raidPtr->maxQueueDepth;
   3806 	d = 0;
   3807 	for (j = 0; j < config->cols; j++) {
   3808 		config->devs[d] = raidPtr->Disks[j];
   3809 		d++;
   3810 	}
   3811 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3812 		config->spares[i] = raidPtr->Disks[j];
   3813 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3814 			/* XXX: raidctl(8) expects to see this as a used spare */
   3815 			config->spares[i].status = rf_ds_used_spare;
   3816 		}
   3817 	}
   3818 	return 0;
   3819 }
   3820 
   3821 int
   3822 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3823 {
   3824 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3825 	RF_ComponentLabel_t *raid_clabel;
   3826 	int column = clabel->column;
   3827 
   3828 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3829 		return EINVAL;
   3830 	raid_clabel = raidget_component_label(raidPtr, column);
   3831 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3832 
   3833 	return 0;
   3834 }
   3835 
/*
 * Module interface
 */

/* This driver depends on the dk_subr and FCFS bufq modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a loadable module, declare the cfdriver ourselves. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3849 
   3850 static int
   3851 raid_modcmd(modcmd_t cmd, void *data)
   3852 {
   3853 	int error;
   3854 
   3855 	error = 0;
   3856 	switch (cmd) {
   3857 	case MODULE_CMD_INIT:
   3858 		error = raid_modcmd_init();
   3859 		break;
   3860 	case MODULE_CMD_FINI:
   3861 		error = raid_modcmd_fini();
   3862 		break;
   3863 	default:
   3864 		error = ENOTTY;
   3865 		break;
   3866 	}
   3867 	return error;
   3868 }
   3869 
/*
 * Module initialization: register the device switch, cfdriver and
 * cfattach, boot the RAIDframe core, and arrange for auto-configuration
 * once all real hardware has been found.  On any registration failure
 * everything attached so far is rolled back in reverse order.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	/* Global driver lock; held across the whole attach sequence. */
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for the spare-table request protocol. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 requests dynamically allocated major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST means the devsw is already present (built-in case). */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw registration done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back in reverse order of attachment. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is always 0 here; every failure path returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: auto-configuration simply won't happen. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3940 
/*
 * Module finalization: refuse to unload while any raid device exists,
 * then detach cfattach, cfdriver and devsw in the reverse of the order
 * used by raid_modcmd_init().  If a later detach fails, the earlier
 * detaches are re-attached so the module stays in a usable state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Restore everything detached above, in reverse order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Tear down the RAIDframe core state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3990