Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.382
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.382 2020/04/13 00:27:17 chs Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.382 2020/04/13 00:27:17 chs Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    179 
    180 /* prototypes */
    181 static void KernelWakeupFunc(struct buf *);
    182 static void InitBP(struct buf *, struct vnode *, unsigned,
    183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    184     void *, int, struct proc *);
    185 static void raidinit(struct raid_softc *);
    186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    188 
    189 static int raid_match(device_t, cfdata_t, void *);
    190 static void raid_attach(device_t, device_t, void *);
    191 static int raid_detach(device_t, int);
    192 
    193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    194     daddr_t, daddr_t);
    195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t, int);
    197 
    198 static int raidwrite_component_label(unsigned,
    199     dev_t, struct vnode *, RF_ComponentLabel_t *);
    200 static int raidread_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 
    203 static int raid_diskstart(device_t, struct buf *bp);
    204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    205 static int raid_lastclose(device_t);
    206 
    207 static dev_type_open(raidopen);
    208 static dev_type_close(raidclose);
    209 static dev_type_read(raidread);
    210 static dev_type_write(raidwrite);
    211 static dev_type_ioctl(raidioctl);
    212 static dev_type_strategy(raidstrategy);
    213 static dev_type_dump(raiddump);
    214 static dev_type_size(raidsize);
    215 
/*
 * Block device switch for /dev/raidN: raidsize and raiddump
 * additionally support swap sizing and crash dumps.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    226 
/*
 * Character (raw) device switch for /dev/rraidN; unsupported
 * operations use the standard no-op stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    241 
/*
 * Hooks handed to the common dk(4) disk framework; the dk layer
 * calls back into these for open/close, I/O start, dumping, and
 * last-close processing.
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    251 
    252 #define	raidunit(x)	DISKUNIT(x)
    253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* component column the request targets */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* RF_Raid_t the request applies to */
};
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
    276  * Even it if is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
    284 #ifndef RAIDOUTSTANDING
    285 #define RAIDOUTSTANDING   6
    286 #endif
    287 
    288 #define RAIDLABELDEV(dev)	\
    289 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req_internal *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	sc->sc_unit = unit;
    342 	cv_init(&sc->sc_cv, "raidunit");
    343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    344 	return sc;
    345 }
    346 
    347 static void
    348 raiddestroy(struct raid_softc *sc) {
    349 	cv_destroy(&sc->sc_cv);
    350 	mutex_destroy(&sc->sc_mutex);
    351 	kmem_free(sc, sizeof(*sc));
    352 }
    353 
    354 static struct raid_softc *
    355 raidget(int unit, bool create) {
    356 	struct raid_softc *sc;
    357 	if (unit < 0) {
    358 #ifdef DIAGNOSTIC
    359 		panic("%s: unit %d!", __func__, unit);
    360 #endif
    361 		return NULL;
    362 	}
    363 	mutex_enter(&raid_lock);
    364 	LIST_FOREACH(sc, &raids, sc_link) {
    365 		if (sc->sc_unit == unit) {
    366 			mutex_exit(&raid_lock);
    367 			return sc;
    368 		}
    369 	}
    370 	mutex_exit(&raid_lock);
    371 	if (!create)
    372 		return NULL;
    373 	sc = raidcreate(unit);
    374 	mutex_enter(&raid_lock);
    375 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    376 	mutex_exit(&raid_lock);
    377 	return sc;
    378 }
    379 
    380 static void
    381 raidput(struct raid_softc *sc) {
    382 	mutex_enter(&raid_lock);
    383 	LIST_REMOVE(sc, sc_link);
    384 	mutex_exit(&raid_lock);
    385 	raiddestroy(sc);
    386 }
    387 
/*
 * Pseudo-device attach entry point; intentionally an empty stub
 * (the "num" hint is unused).
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    397 
/*
 * One-shot autoconfiguration: scan the system for RAID components,
 * group them into sets, and configure the valid ones (done inside
 * rf_buildroothack()).  Returns 1 when a scan was performed, 0 when
 * autoconfiguration is disabled or has already run.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    435 
    436 int
    437 rf_inited(const struct raid_softc *rs) {
    438 	return (rs->sc_flags & RAIDF_INITED) != 0;
    439 }
    440 
/* Accessor: the RF_Raid_t embedded in a raid_softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    445 
/* Accessor: the unit number of a raid_softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    450 
/*
 * Return non-zero if the RAID set "r" includes "bdv" (typically the
 * boot device) as one of its components.  Wedge ("dkN") components
 * are translated to their parent device's name before comparing.
 * NOTE(review): the comparison is a prefix match on the boot device
 * name, so e.g. "wd1" would also match a component on "wd10".
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix stored in the component name */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    481 
/*
 * Walk every auto-detected config set: configure the ones that have
 * enough components and autoconfigure enabled, releasing resources
 * for the rest.  Then, unless the user hardwired a root device via
 * "rootspec", try to elect one configured, rootable set as the root
 * device, updating booted_device/booted_method/booted_partition.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				/* remember the last rootable set seen */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		/* with wedges, look for the wedge named <raidN>a first */
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* re-count, keeping only sets that contain the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    624 
    625 static int
    626 raidsize(dev_t dev)
    627 {
    628 	struct raid_softc *rs;
    629 	struct dk_softc *dksc;
    630 	unsigned int unit;
    631 
    632 	unit = raidunit(dev);
    633 	if ((rs = raidget(unit, false)) == NULL)
    634 		return -1;
    635 	dksc = &rs->sc_dksc;
    636 
    637 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    638 		return -1;
    639 
    640 	return dk_size(dksc, dev);
    641 }
    642 
/*
 * bdevsw d_dump hook: forward a crash dump to the dk layer,
 * offsetting past RAIDframe's reserved component sectors.
 */
static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

        /*
           Note that blkno is relative to this particular partition.
           By adding RF_PROTECTED_SECTORS, we get a value that is
	   relative to the partition used for the underlying component.
        */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
    667 
/*
 * dkdriver dumpblocks hook: write "nblk" blocks from "va" at "blkno"
 * directly to a single live component.  Only RAID 1 sets (one data
 * column, one parity column) are supported; the target component is
 * chosen in preference order master, spared master, slave, spared
 * slave.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column it spares. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand the dump straight to the chosen component's driver */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    773 
/*
 * Open entry point: take the per-unit lock, refuse units flagged for
 * shutdown, mark components dirty on the first open of a configured
 * set, then hand off to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	/* note: create the softc on demand if it does not exist yet */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    823 
/*
 * dkdriver lastclose hook: on the final close of a still-configured
 * set, write final component labels; if a shutdown was requested,
 * flag the unit so raidclose() performs the detach.
 */
static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}
    843 
/*
 * Close entry point: let dk_close() do the bookkeeping, then — with
 * the unit lock dropped — finish any pending detach (RAIDF_DETACH,
 * set by raid_lastclose()) or release the softc of a never-configured
 * unit that is shutting down.
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    883 
/*
 * Kick the RAIDframe I/O thread: signal iodone_cv (under its lock)
 * so queued or completed work gets processed.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    891 
    892 static void
    893 raidstrategy(struct buf *bp)
    894 {
    895 	unsigned int unit;
    896 	struct raid_softc *rs;
    897 	struct dk_softc *dksc;
    898 	RF_Raid_t *raidPtr;
    899 
    900 	unit = raidunit(bp->b_dev);
    901 	if ((rs = raidget(unit, false)) == NULL) {
    902 		bp->b_error = ENXIO;
    903 		goto fail;
    904 	}
    905 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    906 		bp->b_error = ENXIO;
    907 		goto fail;
    908 	}
    909 	dksc = &rs->sc_dksc;
    910 	raidPtr = &rs->sc_r;
    911 
    912 	/* Queue IO only */
    913 	if (dk_strategy_defer(dksc, bp))
    914 		goto done;
    915 
    916 	/* schedule the IO to happen at the next convenient time */
    917 	raid_wakeup(raidPtr);
    918 
    919 done:
    920 	return;
    921 
    922 fail:
    923 	bp->b_resid = bp->b_bcount;
    924 	biodone(bp);
    925 }
    926 
    927 static int
    928 raid_diskstart(device_t dev, struct buf *bp)
    929 {
    930 	struct raid_softc *rs = raidsoftc(dev);
    931 	RF_Raid_t *raidPtr;
    932 
    933 	raidPtr = &rs->sc_r;
    934 	if (!raidPtr->valid) {
    935 		db1_printf(("raid is not valid..\n"));
    936 		return ENODEV;
    937 	}
    938 
    939 	/* XXX */
    940 	bp->b_resid = 0;
    941 
    942 	return raiddoaccess(raidPtr, bp);
    943 }
    944 
    945 void
    946 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    947 {
    948 	struct raid_softc *rs;
    949 	struct dk_softc *dksc;
    950 
    951 	rs = raidPtr->softc;
    952 	dksc = &rs->sc_dksc;
    953 
    954 	dk_done(dksc, bp);
    955 
    956 	rf_lock_mutex2(raidPtr->mutex);
    957 	raidPtr->openings++;
    958 	rf_unlock_mutex2(raidPtr->mutex);
    959 
    960 	/* schedule more IO */
    961 	raid_wakeup(raidPtr);
    962 }
    963 
    964 /* ARGSUSED */
    965 static int
    966 raidread(dev_t dev, struct uio *uio, int flags)
    967 {
    968 	int     unit = raidunit(dev);
    969 	struct raid_softc *rs;
    970 
    971 	if ((rs = raidget(unit, false)) == NULL)
    972 		return ENXIO;
    973 
    974 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    975 		return (ENXIO);
    976 
    977 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    978 
    979 }
    980 
    981 /* ARGSUSED */
    982 static int
    983 raidwrite(dev_t dev, struct uio *uio, int flags)
    984 {
    985 	int     unit = raidunit(dev);
    986 	struct raid_softc *rs;
    987 
    988 	if ((rs = raidget(unit, false)) == NULL)
    989 		return ENXIO;
    990 
    991 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    992 		return (ENXIO);
    993 
    994 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    995 
    996 }
    997 
/*
 * raid_detach_unlocked: tear down a configured RAID set.  Caller holds
 * the softc lock (hence "unlocked" refers to the RAIDframe internals).
 * Returns EBUSY while the device is open or any background operation
 * (reconstruction, parity rewrite, copyback) is running; 0 on success
 * or if the set was never configured.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* refuse to detach a busy set */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* never configured: nothing to tear down */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* stop the RAIDframe engine first; abort if it cannot shut down */
	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1035 
   1036 static bool
   1037 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1038 {
   1039 	switch (cmd) {
   1040 	case RAIDFRAME_ADD_HOT_SPARE:
   1041 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1042 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1043 	case RAIDFRAME_CHECK_PARITY:
   1044 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1045 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1046 	case RAIDFRAME_CHECK_RECON_STATUS:
   1047 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1048 	case RAIDFRAME_COPYBACK:
   1049 	case RAIDFRAME_DELETE_COMPONENT:
   1050 	case RAIDFRAME_FAIL_DISK:
   1051 	case RAIDFRAME_GET_ACCTOTALS:
   1052 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1053 	case RAIDFRAME_GET_INFO:
   1054 	case RAIDFRAME_GET_SIZE:
   1055 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1056 	case RAIDFRAME_INIT_LABELS:
   1057 	case RAIDFRAME_KEEP_ACCTOTALS:
   1058 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1059 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1060 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1061 	case RAIDFRAME_PARITYMAP_STATUS:
   1062 	case RAIDFRAME_REBUILD_IN_PLACE:
   1063 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1064 	case RAIDFRAME_RESET_ACCTOTALS:
   1065 	case RAIDFRAME_REWRITEPARITY:
   1066 	case RAIDFRAME_SET_AUTOCONFIG:
   1067 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1068 	case RAIDFRAME_SET_ROOT:
   1069 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1070 	}
   1071 	return false;
   1072 }
   1073 
/*
 * rf_fail_disk: mark the component in column rr->col as failed and
 * spawn a reconstruction thread for it.  Returns EINVAL when the
 * request cannot be honoured (RAID 0, bad column, reconstruction in
 * progress, other failures present, or the disk is spared), ENOMEM on
 * allocation failure, otherwise the thread-creation result.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* state checks are done under the set's mutex */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint ownership passes to the recon thread —
	 * presumably rf_ReconThread frees it; confirm there. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1122 
   1123 static int
   1124 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1125 {
   1126 	/* allocate a buffer for the layout-specific data, and copy it in */
   1127 	if (k_cfg->layoutSpecificSize == 0)
   1128 		return 0;
   1129 
   1130 	if (k_cfg->layoutSpecificSize > 10000) {
   1131 	    /* sanity check */
   1132 	    return EINVAL;
   1133 	}
   1134 
   1135 	u_char *specific_buf;
   1136 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1137 	if (specific_buf == NULL)
   1138 		return ENOMEM;
   1139 
   1140 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1141 	    k_cfg->layoutSpecificSize);
   1142 	if (retcode) {
   1143 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1144 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1145 		return retcode;
   1146 	}
   1147 
   1148 	k_cfg->layoutSpecific = specific_buf;
   1149 	return 0;
   1150 }
   1151 
   1152 static int
   1153 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1154 {
   1155 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1156 
   1157 	if (rs->sc_r.valid) {
   1158 		/* There is a valid RAID set running on this unit! */
   1159 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1160 		return EINVAL;
   1161 	}
   1162 
   1163 	/* copy-in the configuration information */
   1164 	/* data points to a pointer to the configuration structure */
   1165 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1166 	if (*k_cfg == NULL) {
   1167 		return ENOMEM;
   1168 	}
   1169 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1170 	if (retcode == 0)
   1171 		return 0;
   1172 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1173 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1174 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1175 	return retcode;
   1176 }
   1177 
/*
 * rf_construct: configure the RAID set from k_cfg and bring the unit
 * up.  Always consumes k_cfg (and its layout-specific buffer); on any
 * failure the unit is flagged RAIDF_SHUTDOWN so it is detached on the
 * next close.  Returns 0 on success or the configuration error.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* pull the layout-specific data in from userland first */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1230 
#if RF_DISABLED
/*
 * rf_set_component_label: copy a user-supplied component label over the
 * in-core one and flush it.  Compiled out (RF_DISABLED); kept only for
 * reference — users should re-init labels rather than patch them.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	/* reject out-of-range columns */
	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
   1269 
   1270 static int
   1271 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1272 {
   1273 	/*
   1274 	   we only want the serial number from
   1275 	   the above.  We get all the rest of the information
   1276 	   from the config that was used to create this RAID
   1277 	   set.
   1278 	   */
   1279 
   1280 	raidPtr->serial_number = clabel->serial_number;
   1281 
   1282 	for (int column = 0; column < raidPtr->numCol; column++) {
   1283 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1284 		if (RF_DEAD_DISK(diskPtr->status))
   1285 			continue;
   1286 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1287 		    raidPtr, column);
   1288 		/* Zeroing this is important. */
   1289 		memset(ci_label, 0, sizeof(*ci_label));
   1290 		raid_init_component_label(raidPtr, ci_label);
   1291 		ci_label->serial_number = raidPtr->serial_number;
   1292 		ci_label->row = 0; /* we dont' pretend to support more */
   1293 		rf_component_label_set_partitionsize(ci_label,
   1294 		    diskPtr->partitionSize);
   1295 		ci_label->column = column;
   1296 		raidflush_component_label(raidPtr, column);
   1297 		/* XXXjld what about the spares? */
   1298 	}
   1299 
   1300 	return 0;
   1301 }
   1302 
/*
 * rf_rebuild_in_place: reconstruct a component's data back onto the
 * same disk, in place, via a dedicated kernel thread.  Returns EINVAL
 * when the operation makes no sense (RAID 0, recon already running,
 * bad column, too many failures, disk spared) and ENOMEM on
 * allocation failure; otherwise the thread-creation result.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* copy the request so we don't depend on the caller's buffer */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* validate the target component's state under the set mutex */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): rrint ownership passes to the thread —
	 * presumably rf_ReconstructInPlaceThread frees it; confirm. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1370 
   1371 static int
   1372 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1373 {
   1374 	/*
   1375 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1376 	 * so tell the user it's done.
   1377 	 */
   1378 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1379 	    raidPtr->status != rf_rs_reconstructing) {
   1380 		*data = 100;
   1381 		return 0;
   1382 	}
   1383 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1384 		*data = 0;
   1385 		return 0;
   1386 	}
   1387 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1388 	    / raidPtr->reconControl->numRUsTotal);
   1389 	return 0;
   1390 }
   1391 
   1392 static int
   1393 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1394 {
   1395 	int     unit = raidunit(dev);
   1396 	int     part, pmask;
   1397 	struct raid_softc *rs;
   1398 	struct dk_softc *dksc;
   1399 	RF_Config_t *k_cfg;
   1400 	RF_Raid_t *raidPtr;
   1401 	RF_AccTotals_t *totals;
   1402 	RF_SingleComponent_t component;
   1403 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1404 	int retcode = 0;
   1405 	int column;
   1406 	RF_ComponentLabel_t *clabel;
   1407 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1408 	int d;
   1409 
   1410 	if ((rs = raidget(unit, false)) == NULL)
   1411 		return ENXIO;
   1412 
   1413 	dksc = &rs->sc_dksc;
   1414 	raidPtr = &rs->sc_r;
   1415 
   1416 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1417 	    (int) DISKPART(dev), (int) unit, cmd));
   1418 
   1419 	/* Must be initialized for these... */
   1420 	if (rf_must_be_initialized(rs, cmd))
   1421 		return ENXIO;
   1422 
   1423 	switch (cmd) {
   1424 		/* configure the system */
   1425 	case RAIDFRAME_CONFIGURE:
   1426 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1427 			return retcode;
   1428 		return rf_construct(rs, k_cfg);
   1429 
   1430 		/* shutdown the system */
   1431 	case RAIDFRAME_SHUTDOWN:
   1432 
   1433 		part = DISKPART(dev);
   1434 		pmask = (1 << part);
   1435 
   1436 		if ((retcode = raidlock(rs)) != 0)
   1437 			return retcode;
   1438 
   1439 		if (DK_BUSY(dksc, pmask) ||
   1440 		    raidPtr->recon_in_progress != 0 ||
   1441 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1442 		    raidPtr->copyback_in_progress != 0)
   1443 			retcode = EBUSY;
   1444 		else {
   1445 			/* detach and free on close */
   1446 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1447 			retcode = 0;
   1448 		}
   1449 
   1450 		raidunlock(rs);
   1451 
   1452 		return retcode;
   1453 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1454 		return rf_get_component_label(raidPtr, data);
   1455 
   1456 #if RF_DISABLED
   1457 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1458 		return rf_set_component_label(raidPtr, data);
   1459 #endif
   1460 
   1461 	case RAIDFRAME_INIT_LABELS:
   1462 		return rf_init_component_label(raidPtr, data);
   1463 
   1464 	case RAIDFRAME_SET_AUTOCONFIG:
   1465 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1466 		printf("raid%d: New autoconfig value is: %d\n",
   1467 		       raidPtr->raidid, d);
   1468 		*(int *) data = d;
   1469 		return retcode;
   1470 
   1471 	case RAIDFRAME_SET_ROOT:
   1472 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1473 		printf("raid%d: New rootpartition value is: %d\n",
   1474 		       raidPtr->raidid, d);
   1475 		*(int *) data = d;
   1476 		return retcode;
   1477 
   1478 		/* initialize all parity */
   1479 	case RAIDFRAME_REWRITEPARITY:
   1480 
   1481 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1482 			/* Parity for RAID 0 is trivially correct */
   1483 			raidPtr->parity_good = RF_RAID_CLEAN;
   1484 			return 0;
   1485 		}
   1486 
   1487 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1488 			/* Re-write is already in progress! */
   1489 			return EINVAL;
   1490 		}
   1491 
   1492 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1493 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1494 
   1495 	case RAIDFRAME_ADD_HOT_SPARE:
   1496 		sparePtr = (RF_SingleComponent_t *) data;
   1497 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1498 		return rf_add_hot_spare(raidPtr, &component);
   1499 
   1500 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1501 		return retcode;
   1502 
   1503 	case RAIDFRAME_DELETE_COMPONENT:
   1504 		componentPtr = (RF_SingleComponent_t *)data;
   1505 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1506 		return rf_delete_component(raidPtr, &component);
   1507 
   1508 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1509 		componentPtr = (RF_SingleComponent_t *)data;
   1510 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1511 		return rf_incorporate_hot_spare(raidPtr, &component);
   1512 
   1513 	case RAIDFRAME_REBUILD_IN_PLACE:
   1514 		return rf_rebuild_in_place(raidPtr, data);
   1515 
   1516 	case RAIDFRAME_GET_INFO:
   1517 		ucfgp = *(RF_DeviceConfig_t **)data;
   1518 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1519 		if (d_cfg == NULL)
   1520 			return ENOMEM;
   1521 		retcode = rf_get_info(raidPtr, d_cfg);
   1522 		if (retcode == 0) {
   1523 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1524 		}
   1525 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1526 		return retcode;
   1527 
   1528 	case RAIDFRAME_CHECK_PARITY:
   1529 		*(int *) data = raidPtr->parity_good;
   1530 		return 0;
   1531 
   1532 	case RAIDFRAME_PARITYMAP_STATUS:
   1533 		if (rf_paritymap_ineligible(raidPtr))
   1534 			return EINVAL;
   1535 		rf_paritymap_status(raidPtr->parity_map, data);
   1536 		return 0;
   1537 
   1538 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1539 		if (rf_paritymap_ineligible(raidPtr))
   1540 			return EINVAL;
   1541 		if (raidPtr->parity_map == NULL)
   1542 			return ENOENT; /* ??? */
   1543 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1544 			return EINVAL;
   1545 		return 0;
   1546 
   1547 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1548 		if (rf_paritymap_ineligible(raidPtr))
   1549 			return EINVAL;
   1550 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1551 		return 0;
   1552 
   1553 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1554 		if (rf_paritymap_ineligible(raidPtr))
   1555 			return EINVAL;
   1556 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1557 		/* XXX should errors be passed up? */
   1558 		return 0;
   1559 
   1560 	case RAIDFRAME_RESET_ACCTOTALS:
   1561 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1562 		return 0;
   1563 
   1564 	case RAIDFRAME_GET_ACCTOTALS:
   1565 		totals = (RF_AccTotals_t *) data;
   1566 		*totals = raidPtr->acc_totals;
   1567 		return 0;
   1568 
   1569 	case RAIDFRAME_KEEP_ACCTOTALS:
   1570 		raidPtr->keep_acc_totals = *(int *)data;
   1571 		return 0;
   1572 
   1573 	case RAIDFRAME_GET_SIZE:
   1574 		*(int *) data = raidPtr->totalSectors;
   1575 		return 0;
   1576 
   1577 	case RAIDFRAME_FAIL_DISK:
   1578 		return rf_fail_disk(raidPtr, data);
   1579 
   1580 		/* invoke a copyback operation after recon on whatever disk
   1581 		 * needs it, if any */
   1582 	case RAIDFRAME_COPYBACK:
   1583 
   1584 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1585 			/* This makes no sense on a RAID 0!! */
   1586 			return EINVAL;
   1587 		}
   1588 
   1589 		if (raidPtr->copyback_in_progress == 1) {
   1590 			/* Copyback is already in progress! */
   1591 			return EINVAL;
   1592 		}
   1593 
   1594 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1595 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1596 
   1597 		/* return the percentage completion of reconstruction */
   1598 	case RAIDFRAME_CHECK_RECON_STATUS:
   1599 		return rf_check_recon_status(raidPtr, data);
   1600 
   1601 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1602 		rf_check_recon_status_ext(raidPtr, data);
   1603 		return 0;
   1604 
   1605 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1606 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1607 			/* This makes no sense on a RAID 0, so tell the
   1608 			   user it's done. */
   1609 			*(int *) data = 100;
   1610 			return 0;
   1611 		}
   1612 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1613 			*(int *) data = 100 *
   1614 				raidPtr->parity_rewrite_stripes_done /
   1615 				raidPtr->Layout.numStripe;
   1616 		} else {
   1617 			*(int *) data = 100;
   1618 		}
   1619 		return 0;
   1620 
   1621 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1622 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1623 		return 0;
   1624 
   1625 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1626 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1627 			/* This makes no sense on a RAID 0 */
   1628 			*(int *) data = 100;
   1629 			return 0;
   1630 		}
   1631 		if (raidPtr->copyback_in_progress == 1) {
   1632 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1633 				raidPtr->Layout.numStripe;
   1634 		} else {
   1635 			*(int *) data = 100;
   1636 		}
   1637 		return 0;
   1638 
   1639 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1640 		rf_check_copyback_status_ext(raidPtr, data);
   1641 		return 0;
   1642 
   1643 	case RAIDFRAME_SET_LAST_UNIT:
   1644 		for (column = 0; column < raidPtr->numCol; column++)
   1645 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1646 				return EBUSY;
   1647 
   1648 		for (column = 0; column < raidPtr->numCol; column++) {
   1649 			clabel = raidget_component_label(raidPtr, column);
   1650 			clabel->last_unit = *(int *)data;
   1651 			raidflush_component_label(raidPtr, column);
   1652 		}
   1653 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1654 		return 0;
   1655 
   1656 		/* the sparetable daemon calls this to wait for the kernel to
   1657 		 * need a spare table. this ioctl does not return until a
   1658 		 * spare table is needed. XXX -- calling mpsleep here in the
   1659 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1660 		 * -- I should either compute the spare table in the kernel,
   1661 		 * or have a different -- XXX XXX -- interface (a different
   1662 		 * character device) for delivering the table     -- XXX */
   1663 #if RF_DISABLED
   1664 	case RAIDFRAME_SPARET_WAIT:
   1665 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1666 		while (!rf_sparet_wait_queue)
   1667 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1668 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1669 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1670 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1671 
   1672 		/* structure assignment */
   1673 		*((RF_SparetWait_t *) data) = *waitreq;
   1674 
   1675 		RF_Free(waitreq, sizeof(*waitreq));
   1676 		return 0;
   1677 
   1678 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1679 		 * code in it that will cause the dameon to exit */
   1680 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1681 		waitreq = RF_Malloc(sizeof(*waitreq));
   1682 		waitreq->fcol = -1;
   1683 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1684 		waitreq->next = rf_sparet_wait_queue;
   1685 		rf_sparet_wait_queue = waitreq;
   1686 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1687 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1688 		return 0;
   1689 
   1690 		/* used by the spare table daemon to deliver a spare table
   1691 		 * into the kernel */
   1692 	case RAIDFRAME_SEND_SPARET:
   1693 
   1694 		/* install the spare table */
   1695 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1696 
   1697 		/* respond to the requestor.  the return status of the spare
   1698 		 * table installation is passed in the "fcol" field */
   1699 		waitred = RF_Malloc(sizeof(*waitreq));
   1700 		waitreq->fcol = retcode;
   1701 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1702 		waitreq->next = rf_sparet_resp_queue;
   1703 		rf_sparet_resp_queue = waitreq;
   1704 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1705 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1706 
   1707 		return retcode;
   1708 #endif
   1709 	default:
   1710 		/*
   1711 		 * Don't bother trying to load compat modules
   1712 		 * if it is not our ioctl. This is more efficient
   1713 		 * and makes rump tests not depend on compat code
   1714 		 */
   1715 		if (IOCGROUP(cmd) != 'r')
   1716 			break;
   1717 #ifdef _LP64
   1718 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1719 			module_autoload("compat_netbsd32_raid",
   1720 			    MODULE_CLASS_EXEC);
   1721 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1722 			    (rs, cmd, data), enosys(), retcode);
   1723 			if (retcode != EPASSTHROUGH)
   1724 				return retcode;
   1725 		}
   1726 #endif
   1727 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1728 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1729 		    (rs, cmd, data), enosys(), retcode);
   1730 		if (retcode != EPASSTHROUGH)
   1731 			return retcode;
   1732 
   1733 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1734 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1735 		    (rs, cmd, data), enosys(), retcode);
   1736 		if (retcode != EPASSTHROUGH)
   1737 			return retcode;
   1738 		break; /* fall through to the os-specific code below */
   1739 
   1740 	}
   1741 
   1742 	if (!raidPtr->valid)
   1743 		return (EINVAL);
   1744 
   1745 	/*
   1746 	 * Add support for "regular" device ioctls here.
   1747 	 */
   1748 
   1749 	switch (cmd) {
   1750 	case DIOCGCACHE:
   1751 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1752 		break;
   1753 
   1754 	case DIOCCACHESYNC:
   1755 		retcode = rf_sync_component_caches(raidPtr);
   1756 		break;
   1757 
   1758 	default:
   1759 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1760 		break;
   1761 	}
   1762 
   1763 	return (retcode);
   1764 
   1765 }
   1766 
   1767 
   1768 /* raidinit -- complete the rest of the initialization for the
   1769    RAIDframe device.  */
   1770 
   1771 
/*
 * raidinit: complete device bring-up after a successful rf_Configure.
 * Attaches the pseudo-device, initializes the dk(4)/disk(9) layers,
 * allocates the buffer queue, marks the unit RAIDF_INITED and kicks off
 * wedge discovery.  On config_attach_pseudo failure the unit is left
 * un-inited (RAIDF_INITED stays clear).
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* first-come-first-served queue, sorted by raw block number */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* look for wedges (partitions) on the new disk */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1827 
   1828 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1829 /* wake up the daemon & tell it to get us a spare table
   1830  * XXX
   1831  * the entries in the queues should be tagged with the raidPtr
   1832  * so that in the extremely rare case that two recons happen at once,
   1833  * we know for which device were requesting a spare table
   1834  * XXX
   1835  *
   1836  * XXX This code is not currently used. GO
   1837  */
/*
 * rf_GetSpareTableFromDaemon: queue a spare-table request for the
 * user-space daemon and sleep until a response arrives on the response
 * queue.  Returns the "fcol" status from the daemon's reply.  The req
 * passed in is handed to the daemon path; the response entry (a
 * different allocation) is freed here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* publish the request and wake any waiting daemon */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1861 #endif
   1862 
   1863 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1864  * bp & passes it down.
   1865  * any calls originating in the kernel must use non-blocking I/O
   1866  * do some extra sanity checking to return "appropriate" error values for
   1867  * certain conditions (to make some standard utilities work)
   1868  *
   1869  * Formerly known as: rf_DoAccessKernel
   1870  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex across the label update (which does its
		 * own locking/I/O), then retake it to adjust the count. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O until the unit is fully configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk(4) layer to start issuing any queued buffers. */
	dk_start(dksc, NULL);
}
   1897 
   1898 static int
   1899 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1900 {
   1901 	RF_SectorCount_t num_blocks, pb, sum;
   1902 	RF_RaidAddr_t raid_addr;
   1903 	daddr_t blocknum;
   1904 	int     do_async;
   1905 	int rc;
   1906 
   1907 	rf_lock_mutex2(raidPtr->mutex);
   1908 	if (raidPtr->openings == 0) {
   1909 		rf_unlock_mutex2(raidPtr->mutex);
   1910 		return EAGAIN;
   1911 	}
   1912 	rf_unlock_mutex2(raidPtr->mutex);
   1913 
   1914 	blocknum = bp->b_rawblkno;
   1915 
   1916 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1917 		    (int) blocknum));
   1918 
   1919 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1920 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1921 
   1922 	/* *THIS* is where we adjust what block we're going to...
   1923 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1924 	raid_addr = blocknum;
   1925 
   1926 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1927 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1928 	sum = raid_addr + num_blocks + pb;
   1929 	if (1 || rf_debugKernelAccess) {
   1930 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1931 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1932 			    (int) pb, (int) bp->b_resid));
   1933 	}
   1934 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1935 	    || (sum < num_blocks) || (sum < pb)) {
   1936 		rc = ENOSPC;
   1937 		goto done;
   1938 	}
   1939 	/*
   1940 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1941 	 */
   1942 
   1943 	if (bp->b_bcount & raidPtr->sectorMask) {
   1944 		rc = ENOSPC;
   1945 		goto done;
   1946 	}
   1947 	db1_printf(("Calling DoAccess..\n"));
   1948 
   1949 
   1950 	rf_lock_mutex2(raidPtr->mutex);
   1951 	raidPtr->openings--;
   1952 	rf_unlock_mutex2(raidPtr->mutex);
   1953 
   1954 	/*
   1955 	 * Everything is async.
   1956 	 */
   1957 	do_async = 1;
   1958 
   1959 	/* don't ever condition on bp->b_flags & B_WRITE.
   1960 	 * always condition on B_READ instead */
   1961 
   1962 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1963 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1964 			 do_async, raid_addr, num_blocks,
   1965 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1966 
   1967 done:
   1968 	return rc;
   1969 }
   1970 
   1971 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1972 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* Remember which queue this request came from so that the
	 * completion callback can find it via bp->b_private. */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start timing the physical I/O for tracing. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up the buf for the component transfer; completion
		 * runs through KernelWakeupFunc(). */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2046 /* this is the callback function associated with a I/O invoked from
   2047    kernel code.
   2048  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by the dispatch path. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Account the elapsed physical-I/O time against the trace. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers the label
			 * update in raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2115 
   2116 
   2117 /*
   2118  * initialize a buf structure for doing an I/O in the kernel.
   2119  */
   2120 static void
   2121 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2122        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2123        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2124        struct proc *b_proc)
   2125 {
   2126 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2127 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2128 	bp->b_oflags = 0;
   2129 	bp->b_cflags = 0;
   2130 	bp->b_bcount = numSect << logBytesPerSector;
   2131 	bp->b_bufsize = bp->b_bcount;
   2132 	bp->b_error = 0;
   2133 	bp->b_dev = dev;
   2134 	bp->b_data = bf;
   2135 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2136 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2137 	if (bp->b_bcount == 0) {
   2138 		panic("bp->b_bcount is zero in InitBP!!");
   2139 	}
   2140 	bp->b_proc = b_proc;
   2141 	bp->b_iodone = cbFunc;
   2142 	bp->b_private = cbArg;
   2143 }
   2144 
   2145 /*
   2146  * Wait interruptibly for an exclusive lock.
   2147  *
   2148  * XXX
   2149  * Several drivers do this; it should be abstracted and made MP-safe.
   2150  * (Hmm... where have we seen this warning before :->  GO )
   2151  */
   2152 static int
   2153 raidlock(struct raid_softc *rs)
   2154 {
   2155 	int     error;
   2156 
   2157 	error = 0;
   2158 	mutex_enter(&rs->sc_mutex);
   2159 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2160 		rs->sc_flags |= RAIDF_WANTED;
   2161 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2162 		if (error != 0)
   2163 			goto done;
   2164 	}
   2165 	rs->sc_flags |= RAIDF_LOCKED;
   2166 done:
   2167 	mutex_exit(&rs->sc_mutex);
   2168 	return (error);
   2169 }
   2170 /*
   2171  * Unlock and wake up any waiters.
   2172  */
   2173 static void
   2174 raidunlock(struct raid_softc *rs)
   2175 {
   2176 
   2177 	mutex_enter(&rs->sc_mutex);
   2178 	rs->sc_flags &= ~RAIDF_LOCKED;
   2179 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2180 		rs->sc_flags &= ~RAIDF_WANTED;
   2181 		cv_broadcast(&rs->sc_cv);
   2182 	}
   2183 	mutex_exit(&rs->sc_mutex);
   2184 }
   2185 
   2186 
   2187 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2188 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2189 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2190 
   2191 static daddr_t
   2192 rf_component_info_offset(void)
   2193 {
   2194 
   2195 	return RF_COMPONENT_INFO_OFFSET;
   2196 }
   2197 
   2198 static daddr_t
   2199 rf_component_info_size(unsigned secsize)
   2200 {
   2201 	daddr_t info_size;
   2202 
   2203 	KASSERT(secsize);
   2204 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2205 		info_size = secsize;
   2206 	else
   2207 		info_size = RF_COMPONENT_INFO_SIZE;
   2208 
   2209 	return info_size;
   2210 }
   2211 
   2212 static daddr_t
   2213 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2214 {
   2215 	daddr_t map_offset;
   2216 
   2217 	KASSERT(raidPtr->bytesPerSector);
   2218 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2219 		map_offset = raidPtr->bytesPerSector;
   2220 	else
   2221 		map_offset = RF_COMPONENT_INFO_SIZE;
   2222 	map_offset += rf_component_info_offset();
   2223 
   2224 	return map_offset;
   2225 }
   2226 
   2227 static daddr_t
   2228 rf_parity_map_size(RF_Raid_t *raidPtr)
   2229 {
   2230 	daddr_t map_size;
   2231 
   2232 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2233 		map_size = raidPtr->bytesPerSector;
   2234 	else
   2235 		map_size = RF_PARITY_MAP_SIZE;
   2236 
   2237 	return map_size;
   2238 }
   2239 
   2240 int
   2241 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2242 {
   2243 	RF_ComponentLabel_t *clabel;
   2244 
   2245 	clabel = raidget_component_label(raidPtr, col);
   2246 	clabel->clean = RF_RAID_CLEAN;
   2247 	raidflush_component_label(raidPtr, col);
   2248 	return(0);
   2249 }
   2250 
   2251 
   2252 int
   2253 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2254 {
   2255 	RF_ComponentLabel_t *clabel;
   2256 
   2257 	clabel = raidget_component_label(raidPtr, col);
   2258 	clabel->clean = RF_RAID_DIRTY;
   2259 	raidflush_component_label(raidPtr, col);
   2260 	return(0);
   2261 }
   2262 
   2263 int
   2264 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2265 {
   2266 	KASSERT(raidPtr->bytesPerSector);
   2267 	return raidread_component_label(raidPtr->bytesPerSector,
   2268 	    raidPtr->Disks[col].dev,
   2269 	    raidPtr->raid_cinfo[col].ci_vp,
   2270 	    &raidPtr->raid_cinfo[col].ci_label);
   2271 }
   2272 
   2273 RF_ComponentLabel_t *
   2274 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2275 {
   2276 	return &raidPtr->raid_cinfo[col].ci_label;
   2277 }
   2278 
   2279 int
   2280 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2281 {
   2282 	RF_ComponentLabel_t *label;
   2283 
   2284 	label = &raidPtr->raid_cinfo[col].ci_label;
   2285 	label->mod_counter = raidPtr->mod_counter;
   2286 #ifndef RF_NO_PARITY_MAP
   2287 	label->parity_map_modcount = label->mod_counter;
   2288 #endif
   2289 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2290 	    raidPtr->Disks[col].dev,
   2291 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2292 }
   2293 
   2294 
   2295 static int
   2296 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2297     RF_ComponentLabel_t *clabel)
   2298 {
   2299 	return raidread_component_area(dev, b_vp, clabel,
   2300 	    sizeof(RF_ComponentLabel_t),
   2301 	    rf_component_info_offset(),
   2302 	    rf_component_info_size(secsize));
   2303 }
   2304 
   2305 /* ARGSUSED */
/*
 * Synchronously read dsize bytes at byte offset `offset' from the raw
 * component and copy the first msize bytes into `data'.  Used for the
 * component label and the parity map.  Returns 0 on success or the
 * error from biowait(); EINVAL if the component has no vnode.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Issue the read directly to the block device and wait. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2342 
   2343 
   2344 static int
   2345 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2346     RF_ComponentLabel_t *clabel)
   2347 {
   2348 	return raidwrite_component_area(dev, b_vp, clabel,
   2349 	    sizeof(RF_ComponentLabel_t),
   2350 	    rf_component_info_offset(),
   2351 	    rf_component_info_size(secsize), 0);
   2352 }
   2353 
   2354 /* ARGSUSED */
/*
 * Write msize bytes from `data' into the dsize-byte component area at
 * byte offset `offset'; the remainder of the area is zero-filled.
 * With asyncp set, the write is issued B_ASYNC and 0 is returned
 * immediately (NOTE(review): the buffer is not brelse'd here in that
 * case -- presumably released on completion; confirm).  Otherwise we
 * wait for the write and return the biowait() status.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-pad, then place the payload at the front of the area. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2388 
   2389 void
   2390 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2391 {
   2392 	int c;
   2393 
   2394 	for (c = 0; c < raidPtr->numCol; c++) {
   2395 		/* Skip dead disks. */
   2396 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2397 			continue;
   2398 		/* XXXjld: what if an error occurs here? */
   2399 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2400 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2401 		    RF_PARITYMAP_NBYTE,
   2402 		    rf_parity_map_offset(raidPtr),
   2403 		    rf_parity_map_size(raidPtr), 0);
   2404 	}
   2405 }
   2406 
   2407 void
   2408 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2409 {
   2410 	struct rf_paritymap_ondisk tmp;
   2411 	int c,first;
   2412 
   2413 	first=1;
   2414 	for (c = 0; c < raidPtr->numCol; c++) {
   2415 		/* Skip dead disks. */
   2416 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2417 			continue;
   2418 		raidread_component_area(raidPtr->Disks[c].dev,
   2419 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2420 		    RF_PARITYMAP_NBYTE,
   2421 		    rf_parity_map_offset(raidPtr),
   2422 		    rf_parity_map_size(raidPtr));
   2423 		if (first) {
   2424 			memcpy(map, &tmp, sizeof(*map));
   2425 			first = 0;
   2426 		} else {
   2427 			rf_paritymap_merge(map, &tmp);
   2428 		}
   2429 	}
   2430 }
   2431 
/*
 * Bump the mod counter and mark the label of every usable component
 * (and every in-use spare) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Now handle any spares that are standing in for failed disks. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for.
			 * NOTE(review): if no Disks[j].spareCol matches,
			 * scol stays -1 and is stored in clabel->column
			 * below -- confirm that cannot happen for a
			 * rf_ds_used_spare disk. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2491 
   2492 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares with the current mod counter.  If `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark the
 * labels clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for.
			 * NOTE(review): scol remains -1 if no match is
			 * found -- confirm that cannot happen here. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2570 
   2571 void
   2572 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2573 {
   2574 
   2575 	if (vp != NULL) {
   2576 		if (auto_configured == 1) {
   2577 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2578 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2579 			vput(vp);
   2580 
   2581 		} else {
   2582 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2583 		}
   2584 	}
   2585 }
   2586 
   2587 
   2588 void
   2589 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2590 {
   2591 	int r,c;
   2592 	struct vnode *vp;
   2593 	int acd;
   2594 
   2595 
   2596 	/* We take this opportunity to close the vnodes like we should.. */
   2597 
   2598 	for (c = 0; c < raidPtr->numCol; c++) {
   2599 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2600 		acd = raidPtr->Disks[c].auto_configured;
   2601 		rf_close_component(raidPtr, vp, acd);
   2602 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2603 		raidPtr->Disks[c].auto_configured = 0;
   2604 	}
   2605 
   2606 	for (r = 0; r < raidPtr->numSpare; r++) {
   2607 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2608 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2609 		rf_close_component(raidPtr, vp, acd);
   2610 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2611 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2612 	}
   2613 }
   2614 
   2615 
/*
 * Kernel thread body: mark the requested component failed (and
 * reconstruct to a spare when RF_FDFLAGS_RECON is set), then exit.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* Ownership of the request passed to this thread; free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2637 
/*
 * Kernel thread body: rewrite all parity for the array, record the
 * outcome in parity_good, wake any shutdown waiter, and exit.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2670 
   2671 
/*
 * Kernel thread body: copy reconstructed data back from the spare to
 * the replaced component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2686 
   2687 
/*
 * Kernel thread body: reconstruct the given column in place (onto the
 * same component slot), free the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* Ownership of the request passed to this thread; free it. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2705 
/*
 * Probe one candidate component: read its component label and, if the
 * label looks reasonable and its partition size fits within `size'
 * sectors, prepend a new RF_AutoConfig_t to ac_list (the open vnode
 * and the label buffer are then owned by the list entry).  Otherwise
 * free the label and close/release the vnode.  Returns the (possibly
 * new) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2747 
   2748 RF_AutoConfig_t *
   2749 rf_find_raid_components(void)
   2750 {
   2751 	struct vnode *vp;
   2752 	struct disklabel label;
   2753 	device_t dv;
   2754 	deviter_t di;
   2755 	dev_t dev;
   2756 	int bmajor, bminor, wedge, rf_part_found;
   2757 	int error;
   2758 	int i;
   2759 	RF_AutoConfig_t *ac_list;
   2760 	uint64_t numsecs;
   2761 	unsigned secsize;
   2762 	int dowedges;
   2763 
   2764 	/* initialize the AutoConfig list */
   2765 	ac_list = NULL;
   2766 
   2767 	/*
   2768 	 * we begin by trolling through *all* the devices on the system *twice*
   2769 	 * first we scan for wedges, second for other devices. This avoids
   2770 	 * using a raw partition instead of a wedge that covers the whole disk
   2771 	 */
   2772 
   2773 	for (dowedges=1; dowedges>=0; --dowedges) {
   2774 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2775 		     dv = deviter_next(&di)) {
   2776 
   2777 			/* we are only interested in disks... */
   2778 			if (device_class(dv) != DV_DISK)
   2779 				continue;
   2780 
   2781 			/* we don't care about floppies... */
   2782 			if (device_is_a(dv, "fd")) {
   2783 				continue;
   2784 			}
   2785 
   2786 			/* we don't care about CD's... */
   2787 			if (device_is_a(dv, "cd")) {
   2788 				continue;
   2789 			}
   2790 
   2791 			/* we don't care about md's... */
   2792 			if (device_is_a(dv, "md")) {
   2793 				continue;
   2794 			}
   2795 
   2796 			/* hdfd is the Atari/Hades floppy driver */
   2797 			if (device_is_a(dv, "hdfd")) {
   2798 				continue;
   2799 			}
   2800 
   2801 			/* fdisa is the Atari/Milan floppy driver */
   2802 			if (device_is_a(dv, "fdisa")) {
   2803 				continue;
   2804 			}
   2805 
   2806 			/* are we in the wedges pass ? */
   2807 			wedge = device_is_a(dv, "dk");
   2808 			if (wedge != dowedges) {
   2809 				continue;
   2810 			}
   2811 
   2812 			/* need to find the device_name_to_block_device_major stuff */
   2813 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2814 
   2815 			rf_part_found = 0; /*No raid partition as yet*/
   2816 
   2817 			/* get a vnode for the raw partition of this disk */
   2818 			bminor = minor(device_unit(dv));
   2819 			dev = wedge ? makedev(bmajor, bminor) :
   2820 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2821 			if (bdevvp(dev, &vp))
   2822 				panic("RAID can't alloc vnode");
   2823 
   2824 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2825 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2826 
   2827 			if (error) {
   2828 				/* "Who cares."  Continue looking
   2829 				   for something that exists*/
   2830 				vput(vp);
   2831 				continue;
   2832 			}
   2833 
   2834 			error = getdisksize(vp, &numsecs, &secsize);
   2835 			if (error) {
   2836 				/*
   2837 				 * Pseudo devices like vnd and cgd can be
   2838 				 * opened but may still need some configuration.
   2839 				 * Ignore these quietly.
   2840 				 */
   2841 				if (error != ENXIO)
   2842 					printf("RAIDframe: can't get disk size"
   2843 					    " for dev %s (%d)\n",
   2844 					    device_xname(dv), error);
   2845 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2846 				vput(vp);
   2847 				continue;
   2848 			}
   2849 			if (wedge) {
   2850 				struct dkwedge_info dkw;
   2851 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2852 				    NOCRED);
   2853 				if (error) {
   2854 					printf("RAIDframe: can't get wedge info for "
   2855 					    "dev %s (%d)\n", device_xname(dv), error);
   2856 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2857 					vput(vp);
   2858 					continue;
   2859 				}
   2860 
   2861 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2862 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2863 					vput(vp);
   2864 					continue;
   2865 				}
   2866 
   2867 				VOP_UNLOCK(vp);
   2868 				ac_list = rf_get_component(ac_list, dev, vp,
   2869 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2870 				rf_part_found = 1; /*There is a raid component on this disk*/
   2871 				continue;
   2872 			}
   2873 
   2874 			/* Ok, the disk exists.  Go get the disklabel. */
   2875 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2876 			if (error) {
   2877 				/*
   2878 				 * XXX can't happen - open() would
   2879 				 * have errored out (or faked up one)
   2880 				 */
   2881 				if (error != ENOTTY)
   2882 					printf("RAIDframe: can't get label for dev "
   2883 					    "%s (%d)\n", device_xname(dv), error);
   2884 			}
   2885 
   2886 			/* don't need this any more.  We'll allocate it again
   2887 			   a little later if we really do... */
   2888 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2889 			vput(vp);
   2890 
   2891 			if (error)
   2892 				continue;
   2893 
   2894 			rf_part_found = 0; /*No raid partitions yet*/
   2895 			for (i = 0; i < label.d_npartitions; i++) {
   2896 				char cname[sizeof(ac_list->devname)];
   2897 
   2898 				/* We only support partitions marked as RAID */
   2899 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2900 					continue;
   2901 
   2902 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2903 				if (bdevvp(dev, &vp))
   2904 					panic("RAID can't alloc vnode");
   2905 
   2906 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2907 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2908 				if (error) {
   2909 					/* Whatever... */
   2910 					vput(vp);
   2911 					continue;
   2912 				}
   2913 				VOP_UNLOCK(vp);
   2914 				snprintf(cname, sizeof(cname), "%s%c",
   2915 				    device_xname(dv), 'a' + i);
   2916 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2917 					label.d_partitions[i].p_size, numsecs, secsize);
   2918 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2919 			}
   2920 
   2921 			/*
   2922 			 *If there is no raid component on this disk, either in a
   2923 			 *disklabel or inside a wedge, check the raw partition as well,
   2924 			 *as it is possible to configure raid components on raw disk
   2925 			 *devices.
   2926 			 */
   2927 
   2928 			if (!rf_part_found) {
   2929 				char cname[sizeof(ac_list->devname)];
   2930 
   2931 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2932 				if (bdevvp(dev, &vp))
   2933 					panic("RAID can't alloc vnode");
   2934 
   2935 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2936 
   2937 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2938 				if (error) {
   2939 					/* Whatever... */
   2940 					vput(vp);
   2941 					continue;
   2942 				}
   2943 				VOP_UNLOCK(vp);
   2944 				snprintf(cname, sizeof(cname), "%s%c",
   2945 				    device_xname(dv), 'a' + RAW_PART);
   2946 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2947 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2948 			}
   2949 		}
   2950 		deviter_release(&di);
   2951 	}
   2952 	return ac_list;
   2953 }
   2954 
   2955 
   2956 int
   2957 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2958 {
   2959 
   2960 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2961 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2962 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2963 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2964 	    clabel->row >=0 &&
   2965 	    clabel->column >= 0 &&
   2966 	    clabel->num_rows > 0 &&
   2967 	    clabel->num_columns > 0 &&
   2968 	    clabel->row < clabel->num_rows &&
   2969 	    clabel->column < clabel->num_columns &&
   2970 	    clabel->blockSize > 0 &&
   2971 	    /*
   2972 	     * numBlocksHi may contain garbage, but it is ok since
   2973 	     * the type is unsigned.  If it is really garbage,
   2974 	     * rf_fix_old_label_size() will fix it.
   2975 	     */
   2976 	    rf_component_label_numblocks(clabel) > 0) {
   2977 		/*
   2978 		 * label looks reasonable enough...
   2979 		 * let's make sure it has no old garbage.
   2980 		 */
   2981 		if (numsecs)
   2982 			rf_fix_old_label_size(clabel, numsecs);
   2983 		return(1);
   2984 	}
   2985 	return(0);
   2986 }
   2987 
   2988 
   2989 /*
   2990  * For reasons yet unknown, some old component labels have garbage in
   2991  * the newer numBlocksHi region, and this causes lossage.  Since those
   2992  * disks will also have numsecs set to less than 32 bits of sectors,
   2993  * we can determine when this corruption has occurred, and fix it.
   2994  *
   2995  * The exact same problem, with the same unknown reason, happens to
   2996  * the partitionSizeHi member as well.
   2997  */
   2998 static void
   2999 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3000 {
   3001 
   3002 	if (numsecs < ((uint64_t)1 << 32)) {
   3003 		if (clabel->numBlocksHi) {
   3004 			printf("WARNING: total sectors < 32 bits, yet "
   3005 			       "numBlocksHi set\n"
   3006 			       "WARNING: resetting numBlocksHi to zero.\n");
   3007 			clabel->numBlocksHi = 0;
   3008 		}
   3009 
   3010 		if (clabel->partitionSizeHi) {
   3011 			printf("WARNING: total sectors < 32 bits, yet "
   3012 			       "partitionSizeHi set\n"
   3013 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3014 			clabel->partitionSizeHi = 0;
   3015 		}
   3016 	}
   3017 }
   3018 
   3019 
   3020 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console.
 * Compiled only into DEBUG kernels.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* indexed by root_partition; masked to 2 bits below, so out-of-range
	   values print "*invalid*" instead of overrunning the table */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3052 #endif
   3053 
   3054 RF_ConfigSet_t *
   3055 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3056 {
   3057 	RF_AutoConfig_t *ac;
   3058 	RF_ConfigSet_t *config_sets;
   3059 	RF_ConfigSet_t *cset;
   3060 	RF_AutoConfig_t *ac_next;
   3061 
   3062 
   3063 	config_sets = NULL;
   3064 
   3065 	/* Go through the AutoConfig list, and figure out which components
   3066 	   belong to what sets.  */
   3067 	ac = ac_list;
   3068 	while(ac!=NULL) {
   3069 		/* we're going to putz with ac->next, so save it here
   3070 		   for use at the end of the loop */
   3071 		ac_next = ac->next;
   3072 
   3073 		if (config_sets == NULL) {
   3074 			/* will need at least this one... */
   3075 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3076 				       M_RAIDFRAME, M_WAITOK);
   3077 			/* this one is easy :) */
   3078 			config_sets->ac = ac;
   3079 			config_sets->next = NULL;
   3080 			config_sets->rootable = 0;
   3081 			ac->next = NULL;
   3082 		} else {
   3083 			/* which set does this component fit into? */
   3084 			cset = config_sets;
   3085 			while(cset!=NULL) {
   3086 				if (rf_does_it_fit(cset, ac)) {
   3087 					/* looks like it matches... */
   3088 					ac->next = cset->ac;
   3089 					cset->ac = ac;
   3090 					break;
   3091 				}
   3092 				cset = cset->next;
   3093 			}
   3094 			if (cset==NULL) {
   3095 				/* didn't find a match above... new set..*/
   3096 				cset = malloc(sizeof(RF_ConfigSet_t),
   3097 					       M_RAIDFRAME, M_WAITOK);
   3098 				cset->ac = ac;
   3099 				ac->next = NULL;
   3100 				cset->next = config_sets;
   3101 				cset->rootable = 0;
   3102 				config_sets = cset;
   3103 			}
   3104 		}
   3105 		ac = ac_next;
   3106 	}
   3107 
   3108 
   3109 	return(config_sets);
   3110 }
   3111 
   3112 static int
   3113 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3114 {
   3115 	RF_ComponentLabel_t *clabel1, *clabel2;
   3116 
   3117 	/* If this one matches the *first* one in the set, that's good
   3118 	   enough, since the other members of the set would have been
   3119 	   through here too... */
   3120 	/* note that we are not checking partitionSize here..
   3121 
   3122 	   Note that we are also not checking the mod_counters here.
   3123 	   If everything else matches except the mod_counter, that's
   3124 	   good enough for this test.  We will deal with the mod_counters
   3125 	   a little later in the autoconfiguration process.
   3126 
   3127 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3128 
   3129 	   The reason we don't check for this is that failed disks
   3130 	   will have lower modification counts.  If those disks are
   3131 	   not added to the set they used to belong to, then they will
   3132 	   form their own set, which may result in 2 different sets,
   3133 	   for example, competing to be configured at raid0, and
   3134 	   perhaps competing to be the root filesystem set.  If the
   3135 	   wrong ones get configured, or both attempt to become /,
   3136 	   weird behaviour and or serious lossage will occur.  Thus we
   3137 	   need to bring them into the fold here, and kick them out at
   3138 	   a later point.
   3139 
   3140 	*/
   3141 
   3142 	clabel1 = cset->ac->clabel;
   3143 	clabel2 = ac->clabel;
   3144 	if ((clabel1->version == clabel2->version) &&
   3145 	    (clabel1->serial_number == clabel2->serial_number) &&
   3146 	    (clabel1->num_rows == clabel2->num_rows) &&
   3147 	    (clabel1->num_columns == clabel2->num_columns) &&
   3148 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3149 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3150 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3151 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3152 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3153 	    (clabel1->blockSize == clabel2->blockSize) &&
   3154 	    rf_component_label_numblocks(clabel1) ==
   3155 	    rf_component_label_numblocks(clabel2) &&
   3156 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3157 	    (clabel1->root_partition == clabel2->root_partition) &&
   3158 	    (clabel1->last_unit == clabel2->last_unit) &&
   3159 	    (clabel1->config_order == clabel2->config_order)) {
   3160 		/* if it get's here, it almost *has* to be a match */
   3161 	} else {
   3162 		/* it's not consistent with somebody in the set..
   3163 		   punt */
   3164 		return(0);
   3165 	}
   3166 	/* all was fine.. it must fit... */
   3167 	return(1);
   3168 }
   3169 
/*
 * Decide whether the config set has enough live components to be
 * configured.  Returns 1 if so, 0 if too many components are missing
 * or out of date for the set's RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;		/* columns the set is supposed to have */
	int num_missing;	/* columns with no up-to-date component */
	int mod_counter;	/* highest mod_counter seen in the set */
	int mod_counter_found;
	int even_pair_failed;	/* RAID 1: even half of current pair missing */
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The "correct" mod_counter is the maximum over all members;
	   components with a lower count are stale (e.g. failed earlier). */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each expected column, look for a current component */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd component of a
				   mirror pair without bailing out..
				   reset the even_pair_failed flag,
				   and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* per-level tolerance: RAID 0 survives no loss, RAID 4/5 one disk */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3272 
   3273 void
   3274 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3275 			RF_Raid_t *raidPtr)
   3276 {
   3277 	RF_ComponentLabel_t *clabel;
   3278 	int i;
   3279 
   3280 	clabel = ac->clabel;
   3281 
   3282 	/* 1. Fill in the common stuff */
   3283 	config->numCol = clabel->num_columns;
   3284 	config->numSpare = 0; /* XXX should this be set here? */
   3285 	config->sectPerSU = clabel->sectPerSU;
   3286 	config->SUsPerPU = clabel->SUsPerPU;
   3287 	config->SUsPerRU = clabel->SUsPerRU;
   3288 	config->parityConfig = clabel->parityConfig;
   3289 	/* XXX... */
   3290 	strcpy(config->diskQueueType,"fifo");
   3291 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3292 	config->layoutSpecificSize = 0; /* XXX ?? */
   3293 
   3294 	while(ac!=NULL) {
   3295 		/* row/col values will be in range due to the checks
   3296 		   in reasonable_label() */
   3297 		strcpy(config->devnames[0][ac->clabel->column],
   3298 		       ac->devname);
   3299 		ac = ac->next;
   3300 	}
   3301 
   3302 	for(i=0;i<RF_MAXDBGV;i++) {
   3303 		config->debugVars[i][0] = 0;
   3304 	}
   3305 }
   3306 
   3307 int
   3308 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3309 {
   3310 	RF_ComponentLabel_t *clabel;
   3311 	int column;
   3312 	int sparecol;
   3313 
   3314 	raidPtr->autoconfigure = new_value;
   3315 
   3316 	for(column=0; column<raidPtr->numCol; column++) {
   3317 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3318 			clabel = raidget_component_label(raidPtr, column);
   3319 			clabel->autoconfigure = new_value;
   3320 			raidflush_component_label(raidPtr, column);
   3321 		}
   3322 	}
   3323 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3324 		sparecol = raidPtr->numCol + column;
   3325 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3326 			clabel = raidget_component_label(raidPtr, sparecol);
   3327 			clabel->autoconfigure = new_value;
   3328 			raidflush_component_label(raidPtr, sparecol);
   3329 		}
   3330 	}
   3331 	return(new_value);
   3332 }
   3333 
   3334 int
   3335 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3336 {
   3337 	RF_ComponentLabel_t *clabel;
   3338 	int column;
   3339 	int sparecol;
   3340 
   3341 	raidPtr->root_partition = new_value;
   3342 	for(column=0; column<raidPtr->numCol; column++) {
   3343 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3344 			clabel = raidget_component_label(raidPtr, column);
   3345 			clabel->root_partition = new_value;
   3346 			raidflush_component_label(raidPtr, column);
   3347 		}
   3348 	}
   3349 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3350 		sparecol = raidPtr->numCol + column;
   3351 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3352 			clabel = raidget_component_label(raidPtr, sparecol);
   3353 			clabel->root_partition = new_value;
   3354 			raidflush_component_label(raidPtr, sparecol);
   3355 		}
   3356 	}
   3357 	return(new_value);
   3358 }
   3359 
   3360 void
   3361 rf_release_all_vps(RF_ConfigSet_t *cset)
   3362 {
   3363 	RF_AutoConfig_t *ac;
   3364 
   3365 	ac = cset->ac;
   3366 	while(ac!=NULL) {
   3367 		/* Close the vp, and give it back */
   3368 		if (ac->vp) {
   3369 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3370 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3371 			vput(ac->vp);
   3372 			ac->vp = NULL;
   3373 		}
   3374 		ac = ac->next;
   3375 	}
   3376 }
   3377 
   3378 
   3379 void
   3380 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3381 {
   3382 	RF_AutoConfig_t *ac;
   3383 	RF_AutoConfig_t *next_ac;
   3384 
   3385 	ac = cset->ac;
   3386 	while(ac!=NULL) {
   3387 		next_ac = ac->next;
   3388 		/* nuke the label */
   3389 		free(ac->clabel, M_RAIDFRAME);
   3390 		/* cleanup the config structure */
   3391 		free(ac, M_RAIDFRAME);
   3392 		/* "next.." */
   3393 		ac = next_ac;
   3394 	}
   3395 	/* and, finally, nuke the config set */
   3396 	free(cset, M_RAIDFRAME);
   3397 }
   3398 
   3399 
/*
 * Initialize a component label from the current state of the array.
 * The caller is responsible for any per-component fields (e.g. column)
 * and for writing the label out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* rows are a historical artifact; always 1 now */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the label's Lo/Hi words */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3432 
/*
 * Autoconfigure one config set: pick a raid unit number, build an
 * RF_Config_t from the component labels, and configure the array.
 * Returns the configured softc, or NULL if configuration failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* start at the unit recorded in the label; if that unit is
	   already configured, walk upward to the first free one */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* no softc exists yet for this unit; force its creation */
	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit we grabbed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3504 
/*
 * Initialize a fixed-size item pool at IPL_BIO: pre-allocate xmin
 * items and cap the free list at xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3514 
   3515 /*
   3516  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3517  * to see if there is IO pending and if that IO could possibly be done
   3518  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3519  * otherwise.
   3520  *
   3521  */
   3522 int
   3523 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3524 {
   3525 	struct raid_softc *rs;
   3526 	struct dk_softc *dksc;
   3527 
   3528 	rs = raidPtr->softc;
   3529 	dksc = &rs->sc_dksc;
   3530 
   3531 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3532 		return 1;
   3533 
   3534 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3535 		/* there is work to do */
   3536 		return 0;
   3537 	}
   3538 	/* default is nothing to do */
   3539 	return 1;
   3540 }
   3541 
   3542 int
   3543 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3544 {
   3545 	uint64_t numsecs;
   3546 	unsigned secsize;
   3547 	int error;
   3548 
   3549 	error = getdisksize(vp, &numsecs, &secsize);
   3550 	if (error == 0) {
   3551 		diskPtr->blockSize = secsize;
   3552 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3553 		diskPtr->partitionSize = numsecs;
   3554 		return 0;
   3555 	}
   3556 	return error;
   3557 }
   3558 
/* Autoconf match function: raid is a pseudo-device, so always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3564 
/* Autoconf attach function: nothing to do here; unit state is set up
   elsewhere (e.g. via raidget()/raidinit()). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3569 
   3570 
   3571 static int
   3572 raid_detach(device_t self, int flags)
   3573 {
   3574 	int error;
   3575 	struct raid_softc *rs = raidsoftc(self);
   3576 
   3577 	if (rs == NULL)
   3578 		return ENXIO;
   3579 
   3580 	if ((error = raidlock(rs)) != 0)
   3581 		return (error);
   3582 
   3583 	error = raid_detach_unlocked(rs);
   3584 
   3585 	raidunlock(rs);
   3586 
   3587 	/* XXX raid can be referenced here */
   3588 
   3589 	if (error)
   3590 		return error;
   3591 
   3592 	/* Free the softc */
   3593 	raidput(rs);
   3594 
   3595 	return 0;
   3596 }
   3597 
/*
 * Publish a synthetic disk geometry for the raid unit, derived from
 * the array's layout, and register it with the disk subsystem.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count; there is no physical geometry here */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3613 
   3614 /*
   3615  * Get cache info for all the components (including spares).
   3616  * Returns intersection of all the cache flags of all disks, or first
   3617  * error if any encountered.
   3618  * XXXfua feature flags can change as spares are added - lock down somehow
   3619  */
   3620 static int
   3621 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3622 {
   3623 	int c;
   3624 	int error;
   3625 	int dkwhole = 0, dkpart;
   3626 
   3627 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3628 		/*
   3629 		 * Check any non-dead disk, even when currently being
   3630 		 * reconstructed.
   3631 		 */
   3632 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3633 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3634 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3635 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3636 			if (error) {
   3637 				if (error != ENODEV) {
   3638 					printf("raid%d: get cache for component %s failed\n",
   3639 					    raidPtr->raidid,
   3640 					    raidPtr->Disks[c].devname);
   3641 				}
   3642 
   3643 				return error;
   3644 			}
   3645 
   3646 			if (c == 0)
   3647 				dkwhole = dkpart;
   3648 			else
   3649 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3650 		}
   3651 	}
   3652 
   3653 	*data = dkwhole;
   3654 
   3655 	return 0;
   3656 }
   3657 
   3658 /*
   3659  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3660  * We end up returning whatever error was returned by the first cache flush
   3661  * that fails.
   3662  */
   3663 
   3664 int
   3665 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3666 {
   3667 	int c, sparecol;
   3668 	int e,error;
   3669 	int force = 1;
   3670 
   3671 	error = 0;
   3672 	for (c = 0; c < raidPtr->numCol; c++) {
   3673 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3674 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3675 					  &force, FWRITE, NOCRED);
   3676 			if (e) {
   3677 				if (e != ENODEV)
   3678 					printf("raid%d: cache flush to component %s failed.\n",
   3679 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3680 				if (error == 0) {
   3681 					error = e;
   3682 				}
   3683 			}
   3684 		}
   3685 	}
   3686 
   3687 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3688 		sparecol = raidPtr->numCol + c;
   3689 		/* Need to ensure that the reconstruct actually completed! */
   3690 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3691 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3692 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3693 			if (e) {
   3694 				if (e != ENODEV)
   3695 					printf("raid%d: cache flush to component %s failed.\n",
   3696 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3697 				if (error == 0) {
   3698 					error = e;
   3699 				}
   3700 			}
   3701 		}
   3702 	}
   3703 	return error;
   3704 }
   3705 
   3706 /* Fill in info with the current status */
   3707 void
   3708 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3709 {
   3710 
   3711 	if (raidPtr->status != rf_rs_reconstructing) {
   3712 		info->total = 100;
   3713 		info->completed = 100;
   3714 	} else {
   3715 		info->total = raidPtr->reconControl->numRUsTotal;
   3716 		info->completed = raidPtr->reconControl->numRUsComplete;
   3717 	}
   3718 	info->remaining = info->total - info->completed;
   3719 }
   3720 
   3721 /* Fill in info with the current status */
   3722 void
   3723 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3724 {
   3725 
   3726 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3727 		info->total = raidPtr->Layout.numStripe;
   3728 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3729 	} else {
   3730 		info->completed = 100;
   3731 		info->total = 100;
   3732 	}
   3733 	info->remaining = info->total - info->completed;
   3734 }
   3735 
   3736 /* Fill in info with the current status */
   3737 void
   3738 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3739 {
   3740 
   3741 	if (raidPtr->copyback_in_progress == 1) {
   3742 		info->total = raidPtr->Layout.numStripe;
   3743 		info->completed = raidPtr->copyback_stripes_done;
   3744 		info->remaining = info->total - info->completed;
   3745 	} else {
   3746 		info->remaining = 0;
   3747 		info->completed = 100;
   3748 		info->total = 100;
   3749 	}
   3750 }
   3751 
   3752 /* Fill in config with the current info */
   3753 int
   3754 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3755 {
   3756 	int	d, i, j;
   3757 
   3758 	if (!raidPtr->valid)
   3759 		return (ENODEV);
   3760 	config->cols = raidPtr->numCol;
   3761 	config->ndevs = raidPtr->numCol;
   3762 	if (config->ndevs >= RF_MAX_DISKS)
   3763 		return (ENOMEM);
   3764 	config->nspares = raidPtr->numSpare;
   3765 	if (config->nspares >= RF_MAX_DISKS)
   3766 		return (ENOMEM);
   3767 	config->maxqdepth = raidPtr->maxQueueDepth;
   3768 	d = 0;
   3769 	for (j = 0; j < config->cols; j++) {
   3770 		config->devs[d] = raidPtr->Disks[j];
   3771 		d++;
   3772 	}
   3773 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3774 		config->spares[i] = raidPtr->Disks[j];
   3775 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3776 			/* XXX: raidctl(8) expects to see this as a used spare */
   3777 			config->spares[i].status = rf_ds_used_spare;
   3778 		}
   3779 	}
   3780 	return 0;
   3781 }
   3782 
   3783 int
   3784 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3785 {
   3786 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3787 	RF_ComponentLabel_t *raid_clabel;
   3788 	int column = clabel->column;
   3789 
   3790 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3791 		return EINVAL;
   3792 	raid_clabel = raidget_component_label(raidPtr, column);
   3793 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3794 
   3795 	return 0;
   3796 }
   3797 
   3798 /*
   3799  * Module interface
   3800  */
   3801 
   3802 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3803 
   3804 #ifdef _MODULE
   3805 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3806 #endif
   3807 
   3808 static int raid_modcmd(modcmd_t, void *);
   3809 static int raid_modcmd_init(void);
   3810 static int raid_modcmd_fini(void);
   3811 
   3812 static int
   3813 raid_modcmd(modcmd_t cmd, void *data)
   3814 {
   3815 	int error;
   3816 
   3817 	error = 0;
   3818 	switch (cmd) {
   3819 	case MODULE_CMD_INIT:
   3820 		error = raid_modcmd_init();
   3821 		break;
   3822 	case MODULE_CMD_FINI:
   3823 		error = raid_modcmd_fini();
   3824 		break;
   3825 	default:
   3826 		error = ENOTTY;
   3827 		break;
   3828 	}
   3829 	return error;
   3830 }
   3831 
/*
 * Module initialization: set up global locks, attach the device
 * switch entries and autoconf glue, boot the RAIDframe core, and
 * register a finalizer that will auto-configure RAID sets once all
 * real hardware has been found.  On any attach failure the steps
 * already taken are rolled back in reverse order before returning.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* raid_lock is held across the whole attach sequence below. */
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/*
	 * EEXIST is tolerated: the devsw may already be present when
	 * the driver is also compiled into the kernel.
	 */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver (module case) and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is always 0 here (every failure path
	 * above returned early), so this check looks redundant but is
	 * kept for safety/readability.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: autoconfig simply won't run automatically. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3902 
/*
 * Module teardown: detach the autoconf glue and device switch, shut
 * down the RAIDframe core, and destroy the global locks.  Refuses
 * with EBUSY while any raid device still exists.  Each detach failure
 * re-attaches whatever was already detached so the module is left in
 * a consistent, still-loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach so the module stays usable. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Undo both earlier detaches, in reverse order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All glue detached: shut down the RAIDframe core itself. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3952