Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.384
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.384 2020/06/19 19:29:39 jdolecek Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.384 2020/06/19 19:29:39 jdolecek Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
    218 const struct bdevsw raid_bdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_strategy = raidstrategy,
    222 	.d_ioctl = raidioctl,
    223 	.d_dump = raiddump,
    224 	.d_psize = raidsize,
    225 	.d_discard = nodiscard,
    226 	.d_flag = D_DISK
    227 };
    228 
    229 const struct cdevsw raid_cdevsw = {
    230 	.d_open = raidopen,
    231 	.d_close = raidclose,
    232 	.d_read = raidread,
    233 	.d_write = raidwrite,
    234 	.d_ioctl = raidioctl,
    235 	.d_stop = nostop,
    236 	.d_tty = notty,
    237 	.d_poll = nopoll,
    238 	.d_mmap = nommap,
    239 	.d_kqfilter = nokqfilter,
    240 	.d_discard = nodiscard,
    241 	.d_flag = D_DISK
    242 };
    243 
    244 static struct dkdriver rf_dkdriver = {
    245 	.d_open = raidopen,
    246 	.d_close = raidclose,
    247 	.d_strategy = raidstrategy,
    248 	.d_diskstart = raid_diskstart,
    249 	.d_dumpblocks = raid_dumpblocks,
    250 	.d_lastclose = raid_lastclose,
    251 	.d_minphys = minphys
    252 };
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /* Internal representation of a rf_recon_req */
    263 struct rf_recon_req_internal {
    264 	RF_RowCol_t col;
    265 	RF_ReconReqFlags_t flags;
    266 	void   *raidPtr;
    267 };
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 void rf_ReconThread(struct rf_recon_req_internal *);
    304 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 int rf_autoconfig(device_t);
    308 void rf_buildroothack(RF_ConfigSet_t *);
    309 
    310 RF_AutoConfig_t *rf_find_raid_components(void);
    311 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    312 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    313 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    314 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    315 int rf_set_autoconfig(RF_Raid_t *, int);
    316 int rf_set_rootpartition(RF_Raid_t *, int);
    317 void rf_release_all_vps(RF_ConfigSet_t *);
    318 void rf_cleanup_config_set(RF_ConfigSet_t *);
    319 int rf_have_enough_components(RF_ConfigSet_t *);
    320 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    322 
    323 /*
    324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    326  * in the kernel config file.
    327  */
    328 #ifdef RAID_AUTOCONFIG
    329 int raidautoconfig = 1;
    330 #else
    331 int raidautoconfig = 0;
    332 #endif
    333 static bool raidautoconfigdone = false;
    334 
    335 struct RF_Pools_s rf_pools;
    336 
    337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    338 static kmutex_t raid_lock;
    339 
    340 static struct raid_softc *
    341 raidcreate(int unit) {
    342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    343 	sc->sc_unit = unit;
    344 	cv_init(&sc->sc_cv, "raidunit");
    345 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    346 	return sc;
    347 }
    348 
    349 static void
    350 raiddestroy(struct raid_softc *sc) {
    351 	cv_destroy(&sc->sc_cv);
    352 	mutex_destroy(&sc->sc_mutex);
    353 	kmem_free(sc, sizeof(*sc));
    354 }
    355 
    356 static struct raid_softc *
    357 raidget(int unit, bool create) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if (!create)
    374 		return NULL;
    375 	sc = raidcreate(unit);
    376 	mutex_enter(&raid_lock);
    377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    378 	mutex_exit(&raid_lock);
    379 	return sc;
    380 }
    381 
/*
 * Unlink a softc from the global "raids" list and free it.
 * Counterpart of raidget(unit, true) for a unit that never configured
 * or has been shut down.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    389 
/*
 * Legacy pseudo-device attach entry point; intentionally a no-op.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    399 
    400 int
    401 rf_autoconfig(device_t self)
    402 {
    403 	RF_AutoConfig_t *ac_list;
    404 	RF_ConfigSet_t *config_sets;
    405 
    406 	if (!raidautoconfig || raidautoconfigdone == true)
    407 		return (0);
    408 
    409 	/* XXX This code can only be run once. */
    410 	raidautoconfigdone = true;
    411 
    412 #ifdef __HAVE_CPU_BOOTCONF
    413 	/*
    414 	 * 0. find the boot device if needed first so we can use it later
    415 	 * this needs to be done before we autoconfigure any raid sets,
    416 	 * because if we use wedges we are not going to be able to open
    417 	 * the boot device later
    418 	 */
    419 	if (booted_device == NULL)
    420 		cpu_bootconf();
    421 #endif
    422 	/* 1. locate all RAID components on the system */
    423 	aprint_debug("Searching for RAID components...\n");
    424 	ac_list = rf_find_raid_components();
    425 
    426 	/* 2. Sort them into their respective sets. */
    427 	config_sets = rf_create_auto_sets(ac_list);
    428 
    429 	/*
    430 	 * 3. Evaluate each set and configure the valid ones.
    431 	 * This gets done in rf_buildroothack().
    432 	 */
    433 	rf_buildroothack(config_sets);
    434 
    435 	return 1;
    436 }
    437 
    438 int
    439 rf_inited(const struct raid_softc *rs) {
    440 	return (rs->sc_flags & RAIDF_INITED) != 0;
    441 }
    442 
/*
 * Return a pointer to the RAIDframe state embedded in the softc.
 */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    447 
/*
 * Return the unit number of this RAID softc.
 */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    452 
/*
 * Decide whether RAID set "r" contains the boot device "bdv".
 *
 * Each component's /dev/-prefixed name is compared against the boot
 * device's xname; for dk(4) wedge components the wedge's parent disk
 * name is used instead, since booted_device refers to the physical
 * disk.  Returns 1 when a component matches, 0 otherwise (including
 * when bdv is NULL).
 *
 * NOTE(review): the strncmp() prefix match is meant to ignore a
 * trailing partition letter (e.g. component "sd0a" vs boot "sd0"),
 * but it can false-positive across unit numbers: boot "sd1" also
 * prefix-matches component "sd10a".  Confirm whether an explicit
 * check that devname[len] is not a digit is wanted here.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the "/dev/" prefix stored in the component label */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge: compare against the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    483 
/*
 * Configure every eligible autoconfig set and, if appropriate, point
 * the system root at a RAID volume.
 *
 * Pass 1 walks the config sets: sets with enough components and the
 * autoconfigure flag get configured via rf_auto_config_set(); sets
 * marked rootable are counted and the last one remembered in "rsc".
 * Sets that are skipped or fail to configure have their vnode
 * references released.  Every set is cleaned up afterwards.
 *
 * Root selection then depends on how many rootable sets were found:
 * exactly one -> override booted_device with that set (or its first
 * wedge); more than one -> use booted_device to disambiguate, and if
 * that fails, fall back to RB_ASKNAME.  If the user hardwired a root
 * (rootspec != NULL), booted_device/boothowto are left untouched.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* rf_cleanup_config_set() frees cset, so save the link now */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* recount: only sets that actually contain the boot device */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    626 
    627 static int
    628 raidsize(dev_t dev)
    629 {
    630 	struct raid_softc *rs;
    631 	struct dk_softc *dksc;
    632 	unsigned int unit;
    633 
    634 	unit = raidunit(dev);
    635 	if ((rs = raidget(unit, false)) == NULL)
    636 		return -1;
    637 	dksc = &rs->sc_dksc;
    638 
    639 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    640 		return -1;
    641 
    642 	return dk_size(dksc, dev);
    643 }
    644 
    645 static int
    646 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    647 {
    648 	unsigned int unit;
    649 	struct raid_softc *rs;
    650 	struct dk_softc *dksc;
    651 
    652 	unit = raidunit(dev);
    653 	if ((rs = raidget(unit, false)) == NULL)
    654 		return ENXIO;
    655 	dksc = &rs->sc_dksc;
    656 
    657 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    658 		return ENODEV;
    659 
    660         /*
    661            Note that blkno is relative to this particular partition.
    662            By adding adding RF_PROTECTED_SECTORS, we get a value that
    663 	   is relative to the partition used for the underlying component.
    664         */
    665 	blkno += RF_PROTECTED_SECTORS;
    666 
    667 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    668 }
    669 
/*
 * Write "nblk" blocks from "va" at block "blkno" directly to one live
 * component of the set, for crash dumps.  Only RAID 1 sets (one data
 * column, one parity column) are supported, since only there does a
 * single component hold a complete copy of the data.
 *
 * Component preference: first component, then a used_spare standing in
 * for the first component, then the second component, then its spare.
 * Returns 0 on success, EINVAL for unsupported layouts or when no live
 * component exists, ENXIO if the chosen component has no bdevsw entry,
 * or whatever the component's d_dump returns.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which original column
			   (if any) this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    775 
    776 /* ARGSUSED */
    777 static int
    778 raidopen(dev_t dev, int flags, int fmt,
    779     struct lwp *l)
    780 {
    781 	int     unit = raidunit(dev);
    782 	struct raid_softc *rs;
    783 	struct dk_softc *dksc;
    784 	int     error = 0;
    785 	int     part, pmask;
    786 
    787 	if ((rs = raidget(unit, true)) == NULL)
    788 		return ENXIO;
    789 	if ((error = raidlock(rs)) != 0)
    790 		return (error);
    791 
    792 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    793 		error = EBUSY;
    794 		goto bad;
    795 	}
    796 
    797 	dksc = &rs->sc_dksc;
    798 
    799 	part = DISKPART(dev);
    800 	pmask = (1 << part);
    801 
    802 	if (!DK_BUSY(dksc, pmask) &&
    803 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    804 		/* First one... mark things as dirty... Note that we *MUST*
    805 		 have done a configure before this.  I DO NOT WANT TO BE
    806 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    807 		 THAT THEY BELONG TOGETHER!!!!! */
    808 		/* XXX should check to see if we're only open for reading
    809 		   here... If so, we needn't do this, but then need some
    810 		   other way of keeping track of what's happened.. */
    811 
    812 		rf_markalldirty(&rs->sc_r);
    813 	}
    814 
    815 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    816 		error = dk_open(dksc, dev, flags, fmt, l);
    817 
    818 bad:
    819 	raidunlock(rs);
    820 
    821 	return (error);
    822 
    823 
    824 }
    825 
    826 static int
    827 raid_lastclose(device_t self)
    828 {
    829 	struct raid_softc *rs = raidsoftc(self);
    830 
    831 	/* Last one... device is not unconfigured yet.
    832 	   Device shutdown has taken care of setting the
    833 	   clean bits if RAIDF_INITED is not set
    834 	   mark things as clean... */
    835 
    836 	rf_update_component_labels(&rs->sc_r,
    837 	    RF_FINAL_COMPONENT_UPDATE);
    838 
    839 	/* pass to unlocked code */
    840 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    841 		rs->sc_flags |= RAIDF_DETACH;
    842 
    843 	return 0;
    844 }
    845 
    846 /* ARGSUSED */
    847 static int
    848 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    849 {
    850 	int     unit = raidunit(dev);
    851 	struct raid_softc *rs;
    852 	struct dk_softc *dksc;
    853 	cfdata_t cf;
    854 	int     error = 0, do_detach = 0, do_put = 0;
    855 
    856 	if ((rs = raidget(unit, false)) == NULL)
    857 		return ENXIO;
    858 	dksc = &rs->sc_dksc;
    859 
    860 	if ((error = raidlock(rs)) != 0)
    861 		return (error);
    862 
    863 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    864 		error = dk_close(dksc, dev, flags, fmt, l);
    865 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    866 			do_detach = 1;
    867 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    868 		do_put = 1;
    869 
    870 	raidunlock(rs);
    871 
    872 	if (do_detach) {
    873 		/* free the pseudo device attach bits */
    874 		cf = device_cfdata(dksc->sc_dev);
    875 		error = config_detach(dksc->sc_dev, 0);
    876 		if (error == 0)
    877 			free(cf, M_RAIDFRAME);
    878 	} else if (do_put) {
    879 		raidput(rs);
    880 	}
    881 
    882 	return (error);
    883 
    884 }
    885 
/*
 * Poke the per-set I/O thread: signal iodone_cv so queued work is
 * picked up at the next convenient time.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    893 
/*
 * raidstrategy: block I/O entry point.
 *
 * Hands the buffer to the dk layer and pokes the I/O logic via
 * raid_wakeup().  Buffers for unknown or unconfigured units are
 * failed immediately with ENXIO.
 */
static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	/* error: nothing was transferred; complete the buffer now */
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    928 
    929 static int
    930 raid_diskstart(device_t dev, struct buf *bp)
    931 {
    932 	struct raid_softc *rs = raidsoftc(dev);
    933 	RF_Raid_t *raidPtr;
    934 
    935 	raidPtr = &rs->sc_r;
    936 	if (!raidPtr->valid) {
    937 		db1_printf(("raid is not valid..\n"));
    938 		return ENODEV;
    939 	}
    940 
    941 	/* XXX */
    942 	bp->b_resid = 0;
    943 
    944 	return raiddoaccess(raidPtr, bp);
    945 }
    946 
/*
 * raiddone: per-buffer I/O completion.
 *
 * Finishes "bp" through dk_done(), returns the "opening" this access
 * consumed, and wakes the I/O logic so more queued work can start.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	/* give back the outstanding-I/O slot under the raid mutex */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}
    965 
    966 /* ARGSUSED */
    967 static int
    968 raidread(dev_t dev, struct uio *uio, int flags)
    969 {
    970 	int     unit = raidunit(dev);
    971 	struct raid_softc *rs;
    972 
    973 	if ((rs = raidget(unit, false)) == NULL)
    974 		return ENXIO;
    975 
    976 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    977 		return (ENXIO);
    978 
    979 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    980 
    981 }
    982 
    983 /* ARGSUSED */
    984 static int
    985 raidwrite(dev_t dev, struct uio *uio, int flags)
    986 {
    987 	int     unit = raidunit(dev);
    988 	struct raid_softc *rs;
    989 
    990 	if ((rs = raidget(unit, false)) == NULL)
    991 		return ENXIO;
    992 
    993 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    994 		return (ENXIO);
    995 
    996 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    997 
    998 }
    999 
/*
 * raid_detach_unlocked: shut down a configured RAID set and tear down
 * its dk/disk state.
 *
 * Returns EBUSY while the device is open or a reconstruction, parity
 * rewrite or copyback is still running; 0 if the set was not
 * configured (nothing to do) or was shut down cleanly.
 * NOTE(review): "unlocked" presumably refers to the raidlock being
 * managed by the caller -- confirm against callers.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* shutdown is being handled now; clear the pending request */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1037 
   1038 static bool
   1039 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1040 {
   1041 	switch (cmd) {
   1042 	case RAIDFRAME_ADD_HOT_SPARE:
   1043 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1044 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1045 	case RAIDFRAME_CHECK_PARITY:
   1046 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1047 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1048 	case RAIDFRAME_CHECK_RECON_STATUS:
   1049 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1050 	case RAIDFRAME_COPYBACK:
   1051 	case RAIDFRAME_DELETE_COMPONENT:
   1052 	case RAIDFRAME_FAIL_DISK:
   1053 	case RAIDFRAME_GET_ACCTOTALS:
   1054 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1055 	case RAIDFRAME_GET_INFO:
   1056 	case RAIDFRAME_GET_SIZE:
   1057 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1058 	case RAIDFRAME_INIT_LABELS:
   1059 	case RAIDFRAME_KEEP_ACCTOTALS:
   1060 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1061 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1062 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1063 	case RAIDFRAME_PARITYMAP_STATUS:
   1064 	case RAIDFRAME_REBUILD_IN_PLACE:
   1065 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1066 	case RAIDFRAME_RESET_ACCTOTALS:
   1067 	case RAIDFRAME_REWRITEPARITY:
   1068 	case RAIDFRAME_SET_AUTOCONFIG:
   1069 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1070 	case RAIDFRAME_SET_ROOT:
   1071 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1072 	}
   1073 	return false;
   1074 }
   1075 
/*
 * rf_fail_disk: mark the component in rr->col as failed and start
 * reconstruction in a separate kernel thread.
 *
 * Returns EINVAL for RAID 0 sets, out-of-range columns, or states in
 * which failing the disk would be unsafe; ENOMEM if the request copy
 * cannot be allocated; otherwise the result of thread creation.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): ownership of rrint passes to the recon thread;
	   presumably rf_ReconThread frees it -- confirm there. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* one of the state checks above failed: drop the lock, reject */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1124 
   1125 static int
   1126 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1127 {
   1128 	/* allocate a buffer for the layout-specific data, and copy it in */
   1129 	if (k_cfg->layoutSpecificSize == 0)
   1130 		return 0;
   1131 
   1132 	if (k_cfg->layoutSpecificSize > 10000) {
   1133 	    /* sanity check */
   1134 	    return EINVAL;
   1135 	}
   1136 
   1137 	u_char *specific_buf;
   1138 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1139 	if (specific_buf == NULL)
   1140 		return ENOMEM;
   1141 
   1142 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1143 	    k_cfg->layoutSpecificSize);
   1144 	if (retcode) {
   1145 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1146 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1147 		return retcode;
   1148 	}
   1149 
   1150 	k_cfg->layoutSpecific = specific_buf;
   1151 	return 0;
   1152 }
   1153 
   1154 static int
   1155 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1156 {
   1157 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1158 
   1159 	if (rs->sc_r.valid) {
   1160 		/* There is a valid RAID set running on this unit! */
   1161 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1162 		return EINVAL;
   1163 	}
   1164 
   1165 	/* copy-in the configuration information */
   1166 	/* data points to a pointer to the configuration structure */
   1167 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1168 	if (*k_cfg == NULL) {
   1169 		return ENOMEM;
   1170 	}
   1171 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1172 	if (retcode == 0)
   1173 		return 0;
   1174 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1175 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1176 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1177 	return retcode;
   1178 }
   1179 
/*
 * rf_construct: configure a RAID set from the kernel copy of the
 * user's configuration.
 *
 * Consumes k_cfg: both it and any layout-specific buffer attached by
 * rf_copyinspecificbuf() are freed before returning, on success and
 * failure alike.  On failure RAIDF_SHUTDOWN is set so the unit is
 * detached when it is next closed.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* bring in the layout-specific data, if any */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 *  there is no stale data left in the case of a
	 *  reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		/* attach the pseudo device and disk/dk state */
		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
   1232 
   1233 #if RF_DISABLED
/*
 * rf_set_component_label: overwrite a component label with
 * user-supplied contents.  Compiled out (RF_DISABLED): users should
 * re-init labels rather than patch them (see comments below).
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1270 #endif
   1271 
/*
 * rf_init_component_label: (re)write the component labels of all live
 * components.  Only the serial number is taken from the label the
 * caller supplies; everything else is regenerated from the current
 * configuration.  Dead components are skipped.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we dont' pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		/* write the rebuilt label back to the component */
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
   1304 
/*
 * rf_rebuild_in_place: rebuild a single component onto the same disk
 * slot, in a separate kernel thread.
 *
 * Returns EINVAL for RAID 0 sets, while another reconstruction is in
 * progress, for out-of-range columns, or for disk states in which a
 * rebuild would be unsafe; ENOMEM if the request cannot be allocated;
 * otherwise the result of thread creation.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* copy the request locally so we don't rely on the caller's buffer */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	/* NOTE(review): ownership of rrint passes to the recon thread;
	   presumably rf_ReconstructInPlaceThread frees it -- confirm. */
	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1372 
   1373 static int
   1374 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1375 {
   1376 	/*
   1377 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1378 	 * so tell the user it's done.
   1379 	 */
   1380 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1381 	    raidPtr->status != rf_rs_reconstructing) {
   1382 		*data = 100;
   1383 		return 0;
   1384 	}
   1385 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1386 		*data = 0;
   1387 		return 0;
   1388 	}
   1389 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1390 	    / raidPtr->reconControl->numRUsTotal);
   1391 	return 0;
   1392 }
   1393 
   1394 static int
   1395 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1396 {
   1397 	int     unit = raidunit(dev);
   1398 	int     part, pmask;
   1399 	struct raid_softc *rs;
   1400 	struct dk_softc *dksc;
   1401 	RF_Config_t *k_cfg;
   1402 	RF_Raid_t *raidPtr;
   1403 	RF_AccTotals_t *totals;
   1404 	RF_SingleComponent_t component;
   1405 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1406 	int retcode = 0;
   1407 	int column;
   1408 	RF_ComponentLabel_t *clabel;
   1409 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1410 	int d;
   1411 
   1412 	if ((rs = raidget(unit, false)) == NULL)
   1413 		return ENXIO;
   1414 
   1415 	dksc = &rs->sc_dksc;
   1416 	raidPtr = &rs->sc_r;
   1417 
   1418 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1419 	    (int) DISKPART(dev), (int) unit, cmd));
   1420 
   1421 	/* Must be initialized for these... */
   1422 	if (rf_must_be_initialized(rs, cmd))
   1423 		return ENXIO;
   1424 
   1425 	switch (cmd) {
   1426 		/* configure the system */
   1427 	case RAIDFRAME_CONFIGURE:
   1428 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1429 			return retcode;
   1430 		return rf_construct(rs, k_cfg);
   1431 
   1432 		/* shutdown the system */
   1433 	case RAIDFRAME_SHUTDOWN:
   1434 
   1435 		part = DISKPART(dev);
   1436 		pmask = (1 << part);
   1437 
   1438 		if ((retcode = raidlock(rs)) != 0)
   1439 			return retcode;
   1440 
   1441 		if (DK_BUSY(dksc, pmask) ||
   1442 		    raidPtr->recon_in_progress != 0 ||
   1443 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1444 		    raidPtr->copyback_in_progress != 0)
   1445 			retcode = EBUSY;
   1446 		else {
   1447 			/* detach and free on close */
   1448 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1449 			retcode = 0;
   1450 		}
   1451 
   1452 		raidunlock(rs);
   1453 
   1454 		return retcode;
   1455 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1456 		return rf_get_component_label(raidPtr, data);
   1457 
   1458 #if RF_DISABLED
   1459 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1460 		return rf_set_component_label(raidPtr, data);
   1461 #endif
   1462 
   1463 	case RAIDFRAME_INIT_LABELS:
   1464 		return rf_init_component_label(raidPtr, data);
   1465 
   1466 	case RAIDFRAME_SET_AUTOCONFIG:
   1467 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1468 		printf("raid%d: New autoconfig value is: %d\n",
   1469 		       raidPtr->raidid, d);
   1470 		*(int *) data = d;
   1471 		return retcode;
   1472 
   1473 	case RAIDFRAME_SET_ROOT:
   1474 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1475 		printf("raid%d: New rootpartition value is: %d\n",
   1476 		       raidPtr->raidid, d);
   1477 		*(int *) data = d;
   1478 		return retcode;
   1479 
   1480 		/* initialize all parity */
   1481 	case RAIDFRAME_REWRITEPARITY:
   1482 
   1483 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1484 			/* Parity for RAID 0 is trivially correct */
   1485 			raidPtr->parity_good = RF_RAID_CLEAN;
   1486 			return 0;
   1487 		}
   1488 
   1489 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1490 			/* Re-write is already in progress! */
   1491 			return EINVAL;
   1492 		}
   1493 
   1494 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1495 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1496 
   1497 	case RAIDFRAME_ADD_HOT_SPARE:
   1498 		sparePtr = (RF_SingleComponent_t *) data;
   1499 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1500 		return rf_add_hot_spare(raidPtr, &component);
   1501 
   1502 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1503 		return retcode;
   1504 
   1505 	case RAIDFRAME_DELETE_COMPONENT:
   1506 		componentPtr = (RF_SingleComponent_t *)data;
   1507 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1508 		return rf_delete_component(raidPtr, &component);
   1509 
   1510 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1511 		componentPtr = (RF_SingleComponent_t *)data;
   1512 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1513 		return rf_incorporate_hot_spare(raidPtr, &component);
   1514 
   1515 	case RAIDFRAME_REBUILD_IN_PLACE:
   1516 		return rf_rebuild_in_place(raidPtr, data);
   1517 
   1518 	case RAIDFRAME_GET_INFO:
   1519 		ucfgp = *(RF_DeviceConfig_t **)data;
   1520 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1521 		if (d_cfg == NULL)
   1522 			return ENOMEM;
   1523 		retcode = rf_get_info(raidPtr, d_cfg);
   1524 		if (retcode == 0) {
   1525 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1526 		}
   1527 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1528 		return retcode;
   1529 
   1530 	case RAIDFRAME_CHECK_PARITY:
   1531 		*(int *) data = raidPtr->parity_good;
   1532 		return 0;
   1533 
   1534 	case RAIDFRAME_PARITYMAP_STATUS:
   1535 		if (rf_paritymap_ineligible(raidPtr))
   1536 			return EINVAL;
   1537 		rf_paritymap_status(raidPtr->parity_map, data);
   1538 		return 0;
   1539 
   1540 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1541 		if (rf_paritymap_ineligible(raidPtr))
   1542 			return EINVAL;
   1543 		if (raidPtr->parity_map == NULL)
   1544 			return ENOENT; /* ??? */
   1545 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1546 			return EINVAL;
   1547 		return 0;
   1548 
   1549 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1550 		if (rf_paritymap_ineligible(raidPtr))
   1551 			return EINVAL;
   1552 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1553 		return 0;
   1554 
   1555 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1556 		if (rf_paritymap_ineligible(raidPtr))
   1557 			return EINVAL;
   1558 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1559 		/* XXX should errors be passed up? */
   1560 		return 0;
   1561 
   1562 	case RAIDFRAME_RESET_ACCTOTALS:
   1563 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1564 		return 0;
   1565 
   1566 	case RAIDFRAME_GET_ACCTOTALS:
   1567 		totals = (RF_AccTotals_t *) data;
   1568 		*totals = raidPtr->acc_totals;
   1569 		return 0;
   1570 
   1571 	case RAIDFRAME_KEEP_ACCTOTALS:
   1572 		raidPtr->keep_acc_totals = *(int *)data;
   1573 		return 0;
   1574 
   1575 	case RAIDFRAME_GET_SIZE:
   1576 		*(int *) data = raidPtr->totalSectors;
   1577 		return 0;
   1578 
   1579 	case RAIDFRAME_FAIL_DISK:
   1580 		return rf_fail_disk(raidPtr, data);
   1581 
   1582 		/* invoke a copyback operation after recon on whatever disk
   1583 		 * needs it, if any */
   1584 	case RAIDFRAME_COPYBACK:
   1585 
   1586 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1587 			/* This makes no sense on a RAID 0!! */
   1588 			return EINVAL;
   1589 		}
   1590 
   1591 		if (raidPtr->copyback_in_progress == 1) {
   1592 			/* Copyback is already in progress! */
   1593 			return EINVAL;
   1594 		}
   1595 
   1596 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1597 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1598 
   1599 		/* return the percentage completion of reconstruction */
   1600 	case RAIDFRAME_CHECK_RECON_STATUS:
   1601 		return rf_check_recon_status(raidPtr, data);
   1602 
   1603 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1604 		rf_check_recon_status_ext(raidPtr, data);
   1605 		return 0;
   1606 
   1607 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1608 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1609 			/* This makes no sense on a RAID 0, so tell the
   1610 			   user it's done. */
   1611 			*(int *) data = 100;
   1612 			return 0;
   1613 		}
   1614 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1615 			*(int *) data = 100 *
   1616 				raidPtr->parity_rewrite_stripes_done /
   1617 				raidPtr->Layout.numStripe;
   1618 		} else {
   1619 			*(int *) data = 100;
   1620 		}
   1621 		return 0;
   1622 
   1623 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1624 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1625 		return 0;
   1626 
   1627 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1628 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1629 			/* This makes no sense on a RAID 0 */
   1630 			*(int *) data = 100;
   1631 			return 0;
   1632 		}
   1633 		if (raidPtr->copyback_in_progress == 1) {
   1634 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1635 				raidPtr->Layout.numStripe;
   1636 		} else {
   1637 			*(int *) data = 100;
   1638 		}
   1639 		return 0;
   1640 
   1641 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1642 		rf_check_copyback_status_ext(raidPtr, data);
   1643 		return 0;
   1644 
   1645 	case RAIDFRAME_SET_LAST_UNIT:
   1646 		for (column = 0; column < raidPtr->numCol; column++)
   1647 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1648 				return EBUSY;
   1649 
   1650 		for (column = 0; column < raidPtr->numCol; column++) {
   1651 			clabel = raidget_component_label(raidPtr, column);
   1652 			clabel->last_unit = *(int *)data;
   1653 			raidflush_component_label(raidPtr, column);
   1654 		}
   1655 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1656 		return 0;
   1657 
   1658 		/* the sparetable daemon calls this to wait for the kernel to
   1659 		 * need a spare table. this ioctl does not return until a
   1660 		 * spare table is needed. XXX -- calling mpsleep here in the
   1661 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1662 		 * -- I should either compute the spare table in the kernel,
   1663 		 * or have a different -- XXX XXX -- interface (a different
   1664 		 * character device) for delivering the table     -- XXX */
   1665 #if RF_DISABLED
   1666 	case RAIDFRAME_SPARET_WAIT:
   1667 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1668 		while (!rf_sparet_wait_queue)
   1669 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1670 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1671 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1672 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1673 
   1674 		/* structure assignment */
   1675 		*((RF_SparetWait_t *) data) = *waitreq;
   1676 
   1677 		RF_Free(waitreq, sizeof(*waitreq));
   1678 		return 0;
   1679 
   1680 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1681 		 * code in it that will cause the dameon to exit */
   1682 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1683 		waitreq = RF_Malloc(sizeof(*waitreq));
   1684 		waitreq->fcol = -1;
   1685 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1686 		waitreq->next = rf_sparet_wait_queue;
   1687 		rf_sparet_wait_queue = waitreq;
   1688 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1689 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1690 		return 0;
   1691 
   1692 		/* used by the spare table daemon to deliver a spare table
   1693 		 * into the kernel */
   1694 	case RAIDFRAME_SEND_SPARET:
   1695 
   1696 		/* install the spare table */
   1697 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1698 
   1699 		/* respond to the requestor.  the return status of the spare
   1700 		 * table installation is passed in the "fcol" field */
   1701 		waitred = RF_Malloc(sizeof(*waitreq));
   1702 		waitreq->fcol = retcode;
   1703 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1704 		waitreq->next = rf_sparet_resp_queue;
   1705 		rf_sparet_resp_queue = waitreq;
   1706 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1707 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1708 
   1709 		return retcode;
   1710 #endif
   1711 	default:
   1712 		/*
   1713 		 * Don't bother trying to load compat modules
   1714 		 * if it is not our ioctl. This is more efficient
   1715 		 * and makes rump tests not depend on compat code
   1716 		 */
   1717 		if (IOCGROUP(cmd) != 'r')
   1718 			break;
   1719 #ifdef _LP64
   1720 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1721 			module_autoload("compat_netbsd32_raid",
   1722 			    MODULE_CLASS_EXEC);
   1723 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1724 			    (rs, cmd, data), enosys(), retcode);
   1725 			if (retcode != EPASSTHROUGH)
   1726 				return retcode;
   1727 		}
   1728 #endif
   1729 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1730 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1731 		    (rs, cmd, data), enosys(), retcode);
   1732 		if (retcode != EPASSTHROUGH)
   1733 			return retcode;
   1734 
   1735 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1736 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1737 		    (rs, cmd, data), enosys(), retcode);
   1738 		if (retcode != EPASSTHROUGH)
   1739 			return retcode;
   1740 		break; /* fall through to the os-specific code below */
   1741 
   1742 	}
   1743 
   1744 	if (!raidPtr->valid)
   1745 		return (EINVAL);
   1746 
   1747 	/*
   1748 	 * Add support for "regular" device ioctls here.
   1749 	 */
   1750 
   1751 	switch (cmd) {
   1752 	case DIOCGCACHE:
   1753 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1754 		break;
   1755 
   1756 	case DIOCCACHESYNC:
   1757 		retcode = rf_sync_component_caches(raidPtr);
   1758 		break;
   1759 
   1760 	default:
   1761 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1762 		break;
   1763 	}
   1764 
   1765 	return (retcode);
   1766 
   1767 }
   1768 
   1769 
   1770 /* raidinit -- complete the rest of the initialization for the
   1771    RAIDframe device.  */
   1772 
   1773 
/*
 * Attach the pseudo-device for an already-configured RAID set, and
 * wire up the dk/disk subsystem state.  On config_attach_pseudo
 * failure the function returns without setting RAIDF_INITED.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check for truncation (snprintf bounds the write). */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		/* note: RAIDF_INITED is never set on this failure path */
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan the new disk for wedges */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1829 
   1830 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1831 /* wake up the daemon & tell it to get us a spare table
   1832  * XXX
   1833  * the entries in the queues should be tagged with the raidPtr
   1834  * so that in the extremely rare case that two recons happen at once,
   1835  * we know for which device were requesting a spare table
   1836  * XXX
   1837  *
   1838  * XXX This code is not currently used. GO
   1839  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* post the request and wake the daemon blocked in SPARET_WAIT */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* dequeue the response the daemon delivered (via SEND_SPARET) */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* fcol carries the installation status back from the daemon */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1863 #endif
   1864 
   1865 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1866  * bp & passes it down.
   1867  * any calls originating in the kernel must use non-blocking I/O
   1868  * do some extra sanity checking to return "appropriate" error values for
   1869  * certain conditions (to make some standard utilities work)
   1870  *
   1871  * Formerly known as: rf_DoAccessKernel
   1872  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/*
		 * Drop the raid mutex across the label update (it takes
		 * its own locks and does component I/O), then retake it
		 * to decrement the failure count.
		 */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to issue I/O until the unit is fully configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Kick the dk(4) layer to start draining the buffer queue. */
	dk_start(dksc, NULL);
}
   1899 
   1900 static int
   1901 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1902 {
   1903 	RF_SectorCount_t num_blocks, pb, sum;
   1904 	RF_RaidAddr_t raid_addr;
   1905 	daddr_t blocknum;
   1906 	int     do_async;
   1907 	int rc;
   1908 
   1909 	rf_lock_mutex2(raidPtr->mutex);
   1910 	if (raidPtr->openings == 0) {
   1911 		rf_unlock_mutex2(raidPtr->mutex);
   1912 		return EAGAIN;
   1913 	}
   1914 	rf_unlock_mutex2(raidPtr->mutex);
   1915 
   1916 	blocknum = bp->b_rawblkno;
   1917 
   1918 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1919 		    (int) blocknum));
   1920 
   1921 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1922 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1923 
   1924 	/* *THIS* is where we adjust what block we're going to...
   1925 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1926 	raid_addr = blocknum;
   1927 
   1928 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1929 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1930 	sum = raid_addr + num_blocks + pb;
   1931 	if (1 || rf_debugKernelAccess) {
   1932 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1933 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1934 			    (int) pb, (int) bp->b_resid));
   1935 	}
   1936 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1937 	    || (sum < num_blocks) || (sum < pb)) {
   1938 		rc = ENOSPC;
   1939 		goto done;
   1940 	}
   1941 	/*
   1942 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1943 	 */
   1944 
   1945 	if (bp->b_bcount & raidPtr->sectorMask) {
   1946 		rc = ENOSPC;
   1947 		goto done;
   1948 	}
   1949 	db1_printf(("Calling DoAccess..\n"));
   1950 
   1951 
   1952 	rf_lock_mutex2(raidPtr->mutex);
   1953 	raidPtr->openings--;
   1954 	rf_unlock_mutex2(raidPtr->mutex);
   1955 
   1956 	/*
   1957 	 * Everything is async.
   1958 	 */
   1959 	do_async = 1;
   1960 
   1961 	/* don't ever condition on bp->b_flags & B_WRITE.
   1962 	 * always condition on B_READ instead */
   1963 
   1964 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1965 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1966 			 do_async, raid_addr, num_blocks,
   1967 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1968 
   1969 done:
   1970 	return rc;
   1971 }
   1972 
   1973 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1974 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* Map the RAIDframe I/O type onto buf flags. */
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* NOP still goes through the completion path so the
		 * queue accounting stays balanced. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		/* Start timing the physical I/O; stopped in the callback. */
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Build the buf for this component access; completion is
		 * routed to KernelWakeupFunc with req as the cookie. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP / the NOP path. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* All completion bookkeeping happens under iodone_lock. */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Stop the per-request timer started in rf_DispatchKernelIO and
	 * fold the elapsed time into the tracing counters. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is consumed by raidstart(), which
			 * triggers a component-label update. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2117 
   2118 
   2119 /*
   2120  * initialize a buf structure for doing an I/O in the kernel.
   2121  */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
{
	/* NOTE(review): b_vp is not referenced in this body — presumably
	 * kept for interface symmetry; confirm before removing. */
	/* Preserve only the rf_b_pass bits of the previous flags. */
	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* b_blkno is in DEV_BSIZE units; convert sectors -> bytes -> blocks. */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	/* Completion runs cbFunc(bp) with cbArg in b_private. */
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2143 
   2144 /*
   2145  * Wait interruptibly for an exclusive lock.
   2146  *
   2147  * XXX
   2148  * Several drivers do this; it should be abstracted and made MP-safe.
   2149  * (Hmm... where have we seen this warning before :->  GO )
   2150  */
   2151 static int
   2152 raidlock(struct raid_softc *rs)
   2153 {
   2154 	int     error;
   2155 
   2156 	error = 0;
   2157 	mutex_enter(&rs->sc_mutex);
   2158 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2159 		rs->sc_flags |= RAIDF_WANTED;
   2160 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2161 		if (error != 0)
   2162 			goto done;
   2163 	}
   2164 	rs->sc_flags |= RAIDF_LOCKED;
   2165 done:
   2166 	mutex_exit(&rs->sc_mutex);
   2167 	return (error);
   2168 }
   2169 /*
   2170  * Unlock and wake up any waiters.
   2171  */
   2172 static void
   2173 raidunlock(struct raid_softc *rs)
   2174 {
   2175 
   2176 	mutex_enter(&rs->sc_mutex);
   2177 	rs->sc_flags &= ~RAIDF_LOCKED;
   2178 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2179 		rs->sc_flags &= ~RAIDF_WANTED;
   2180 		cv_broadcast(&rs->sc_cv);
   2181 	}
   2182 	mutex_exit(&rs->sc_mutex);
   2183 }
   2184 
   2185 
   2186 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2187 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2188 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2189 
   2190 static daddr_t
   2191 rf_component_info_offset(void)
   2192 {
   2193 
   2194 	return RF_COMPONENT_INFO_OFFSET;
   2195 }
   2196 
   2197 static daddr_t
   2198 rf_component_info_size(unsigned secsize)
   2199 {
   2200 	daddr_t info_size;
   2201 
   2202 	KASSERT(secsize);
   2203 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2204 		info_size = secsize;
   2205 	else
   2206 		info_size = RF_COMPONENT_INFO_SIZE;
   2207 
   2208 	return info_size;
   2209 }
   2210 
   2211 static daddr_t
   2212 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2213 {
   2214 	daddr_t map_offset;
   2215 
   2216 	KASSERT(raidPtr->bytesPerSector);
   2217 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2218 		map_offset = raidPtr->bytesPerSector;
   2219 	else
   2220 		map_offset = RF_COMPONENT_INFO_SIZE;
   2221 	map_offset += rf_component_info_offset();
   2222 
   2223 	return map_offset;
   2224 }
   2225 
   2226 static daddr_t
   2227 rf_parity_map_size(RF_Raid_t *raidPtr)
   2228 {
   2229 	daddr_t map_size;
   2230 
   2231 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2232 		map_size = raidPtr->bytesPerSector;
   2233 	else
   2234 		map_size = RF_PARITY_MAP_SIZE;
   2235 
   2236 	return map_size;
   2237 }
   2238 
   2239 int
   2240 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2241 {
   2242 	RF_ComponentLabel_t *clabel;
   2243 
   2244 	clabel = raidget_component_label(raidPtr, col);
   2245 	clabel->clean = RF_RAID_CLEAN;
   2246 	raidflush_component_label(raidPtr, col);
   2247 	return(0);
   2248 }
   2249 
   2250 
   2251 int
   2252 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2253 {
   2254 	RF_ComponentLabel_t *clabel;
   2255 
   2256 	clabel = raidget_component_label(raidPtr, col);
   2257 	clabel->clean = RF_RAID_DIRTY;
   2258 	raidflush_component_label(raidPtr, col);
   2259 	return(0);
   2260 }
   2261 
   2262 int
   2263 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2264 {
   2265 	KASSERT(raidPtr->bytesPerSector);
   2266 	return raidread_component_label(raidPtr->bytesPerSector,
   2267 	    raidPtr->Disks[col].dev,
   2268 	    raidPtr->raid_cinfo[col].ci_vp,
   2269 	    &raidPtr->raid_cinfo[col].ci_label);
   2270 }
   2271 
   2272 RF_ComponentLabel_t *
   2273 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2274 {
   2275 	return &raidPtr->raid_cinfo[col].ci_label;
   2276 }
   2277 
   2278 int
   2279 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2280 {
   2281 	RF_ComponentLabel_t *label;
   2282 
   2283 	label = &raidPtr->raid_cinfo[col].ci_label;
   2284 	label->mod_counter = raidPtr->mod_counter;
   2285 #ifndef RF_NO_PARITY_MAP
   2286 	label->parity_map_modcount = label->mod_counter;
   2287 #endif
   2288 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2289 	    raidPtr->Disks[col].dev,
   2290 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2291 }
   2292 
   2293 
   2294 static int
   2295 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2296     RF_ComponentLabel_t *clabel)
   2297 {
   2298 	return raidread_component_area(dev, b_vp, clabel,
   2299 	    sizeof(RF_ComponentLabel_t),
   2300 	    rf_component_info_offset(),
   2301 	    rf_component_info_size(secsize));
   2302 }
   2303 
   2304 /* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* Synchronous read of the whole area. */
	bdev_strategy(bp);
	error = biowait(bp);

	/* Only the first msize bytes of the area are meaningful. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	/* Release the scratch buffer in both the success and error case. */
	brelse(bp, 0);
	return(error);
}
   2341 
   2342 
   2343 static int
   2344 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2345     RF_ComponentLabel_t *clabel)
   2346 {
   2347 	return raidwrite_component_area(dev, b_vp, clabel,
   2348 	    sizeof(RF_ComponentLabel_t),
   2349 	    rf_component_info_offset(),
   2350 	    rf_component_info_size(secsize), 0);
   2351 }
   2352 
   2353 /* ARGSUSED */
   2354 static int
   2355 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2356     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2357 {
   2358 	struct buf *bp;
   2359 	int error;
   2360 
   2361 	/* get a block of the appropriate size... */
   2362 	bp = geteblk((int)dsize);
   2363 	bp->b_dev = dev;
   2364 
   2365 	/* get our ducks in a row for the write */
   2366 	bp->b_blkno = offset / DEV_BSIZE;
   2367 	bp->b_bcount = dsize;
   2368 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2369  	bp->b_resid = dsize;
   2370 
   2371 	memset(bp->b_data, 0, dsize);
   2372 	memcpy(bp->b_data, data, msize);
   2373 
   2374 	bdev_strategy(bp);
   2375 	if (asyncp)
   2376 		return 0;
   2377 	error = biowait(bp);
   2378 	brelse(bp, 0);
   2379 	if (error) {
   2380 #if 1
   2381 		printf("Failed to write RAID component info!\n");
   2382 #endif
   2383 	}
   2384 
   2385 	return(error);
   2386 }
   2387 
   2388 void
   2389 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2390 {
   2391 	int c;
   2392 
   2393 	for (c = 0; c < raidPtr->numCol; c++) {
   2394 		/* Skip dead disks. */
   2395 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2396 			continue;
   2397 		/* XXXjld: what if an error occurs here? */
   2398 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2399 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2400 		    RF_PARITYMAP_NBYTE,
   2401 		    rf_parity_map_offset(raidPtr),
   2402 		    rf_parity_map_size(raidPtr), 0);
   2403 	}
   2404 }
   2405 
   2406 void
   2407 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2408 {
   2409 	struct rf_paritymap_ondisk tmp;
   2410 	int c,first;
   2411 
   2412 	first=1;
   2413 	for (c = 0; c < raidPtr->numCol; c++) {
   2414 		/* Skip dead disks. */
   2415 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2416 			continue;
   2417 		raidread_component_area(raidPtr->Disks[c].dev,
   2418 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2419 		    RF_PARITYMAP_NBYTE,
   2420 		    rf_parity_map_offset(raidPtr),
   2421 		    rf_parity_map_size(raidPtr));
   2422 		if (first) {
   2423 			memcpy(map, &tmp, sizeof(*map));
   2424 			first = 0;
   2425 		} else {
   2426 			rf_paritymap_merge(map, &tmp);
   2427 		}
   2428 	}
   2429 }
   2430 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* Bump the mod counter so these label writes supersede older ones. */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* Now handle in-use spares: re-init their labels to point at the
	 * column they substitute for, and mark them dirty too. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column matches, scol keeps its
			 * previous value (-1 initially) — presumably a
			 * used_spare always has a matching column; confirm. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2490 
   2491 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* Newer mod counter makes these labels win over stale copies. */
	raidPtr->mod_counter++;

	/* First pass: refresh the label on every optimal data column. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only the final (shutdown) update may set the
			 * clean bit, and only when parity is known good. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	/* Second pass: refresh labels on spares that are in active use. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the data column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2569 
   2570 void
   2571 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2572 {
   2573 
   2574 	if (vp != NULL) {
   2575 		if (auto_configured == 1) {
   2576 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2577 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2578 			vput(vp);
   2579 
   2580 		} else {
   2581 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2582 		}
   2583 	}
   2584 }
   2585 
   2586 
   2587 void
   2588 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2589 {
   2590 	int r,c;
   2591 	struct vnode *vp;
   2592 	int acd;
   2593 
   2594 
   2595 	/* We take this opportunity to close the vnodes like we should.. */
   2596 
   2597 	for (c = 0; c < raidPtr->numCol; c++) {
   2598 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2599 		acd = raidPtr->Disks[c].auto_configured;
   2600 		rf_close_component(raidPtr, vp, acd);
   2601 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2602 		raidPtr->Disks[c].auto_configured = 0;
   2603 	}
   2604 
   2605 	for (r = 0; r < raidPtr->numSpare; r++) {
   2606 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2607 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2608 		rf_close_component(raidPtr, vp, acd);
   2609 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2610 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2611 	}
   2612 }
   2613 
   2614 
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	/* Run the whole reconstruction at splbio. */
	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Fail the component; optionally start reconstruction onto a
	 * spare when RF_FDFLAGS_RECON is set in the request. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* This thread owns the request and must free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2636 
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	/* The rewrite itself runs at splbio. */
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2669 
   2670 
   2671 void
   2672 rf_CopybackThread(RF_Raid_t *raidPtr)
   2673 {
   2674 	int s;
   2675 
   2676 	raidPtr->copyback_in_progress = 1;
   2677 	s = splbio();
   2678 	rf_CopybackReconstructedData(raidPtr);
   2679 	splx(s);
   2680 	raidPtr->copyback_in_progress = 0;
   2681 
   2682 	/* That's all... */
   2683 	kthread_exit(0);	/* does not return */
   2684 }
   2685 
   2686 
   2687 void
   2688 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2689 {
   2690 	int s;
   2691 	RF_Raid_t *raidPtr;
   2692 
   2693 	s = splbio();
   2694 	raidPtr = req->raidPtr;
   2695 	raidPtr->recon_in_progress = 1;
   2696 	rf_ReconstructInPlace(raidPtr, req->col);
   2697 	RF_Free(req, sizeof(*req));
   2698 	raidPtr->recon_in_progress = 0;
   2699 	splx(s);
   2700 
   2701 	/* That's all... */
   2702 	kthread_exit(0);	/* does not return */
   2703 }
   2704 
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			/* The new entry takes ownership of clabel and vp. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		/* Not a usable component: free the label and release the
		 * vnode the caller handed us. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2746 
   2747 RF_AutoConfig_t *
   2748 rf_find_raid_components(void)
   2749 {
   2750 	struct vnode *vp;
   2751 	struct disklabel label;
   2752 	device_t dv;
   2753 	deviter_t di;
   2754 	dev_t dev;
   2755 	int bmajor, bminor, wedge, rf_part_found;
   2756 	int error;
   2757 	int i;
   2758 	RF_AutoConfig_t *ac_list;
   2759 	uint64_t numsecs;
   2760 	unsigned secsize;
   2761 	int dowedges;
   2762 
   2763 	/* initialize the AutoConfig list */
   2764 	ac_list = NULL;
   2765 
   2766 	/*
   2767 	 * we begin by trolling through *all* the devices on the system *twice*
   2768 	 * first we scan for wedges, second for other devices. This avoids
   2769 	 * using a raw partition instead of a wedge that covers the whole disk
   2770 	 */
   2771 
   2772 	for (dowedges=1; dowedges>=0; --dowedges) {
   2773 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2774 		     dv = deviter_next(&di)) {
   2775 
   2776 			/* we are only interested in disks... */
   2777 			if (device_class(dv) != DV_DISK)
   2778 				continue;
   2779 
   2780 			/* we don't care about floppies... */
   2781 			if (device_is_a(dv, "fd")) {
   2782 				continue;
   2783 			}
   2784 
   2785 			/* we don't care about CD's... */
   2786 			if (device_is_a(dv, "cd")) {
   2787 				continue;
   2788 			}
   2789 
   2790 			/* we don't care about md's... */
   2791 			if (device_is_a(dv, "md")) {
   2792 				continue;
   2793 			}
   2794 
   2795 			/* hdfd is the Atari/Hades floppy driver */
   2796 			if (device_is_a(dv, "hdfd")) {
   2797 				continue;
   2798 			}
   2799 
   2800 			/* fdisa is the Atari/Milan floppy driver */
   2801 			if (device_is_a(dv, "fdisa")) {
   2802 				continue;
   2803 			}
   2804 
   2805 			/* are we in the wedges pass ? */
   2806 			wedge = device_is_a(dv, "dk");
   2807 			if (wedge != dowedges) {
   2808 				continue;
   2809 			}
   2810 
   2811 			/* need to find the device_name_to_block_device_major stuff */
   2812 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2813 
   2814 			rf_part_found = 0; /*No raid partition as yet*/
   2815 
   2816 			/* get a vnode for the raw partition of this disk */
   2817 			bminor = minor(device_unit(dv));
   2818 			dev = wedge ? makedev(bmajor, bminor) :
   2819 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2820 			if (bdevvp(dev, &vp))
   2821 				panic("RAID can't alloc vnode");
   2822 
   2823 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2824 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2825 
   2826 			if (error) {
   2827 				/* "Who cares."  Continue looking
   2828 				   for something that exists*/
   2829 				vput(vp);
   2830 				continue;
   2831 			}
   2832 
   2833 			error = getdisksize(vp, &numsecs, &secsize);
   2834 			if (error) {
   2835 				/*
   2836 				 * Pseudo devices like vnd and cgd can be
   2837 				 * opened but may still need some configuration.
   2838 				 * Ignore these quietly.
   2839 				 */
   2840 				if (error != ENXIO)
   2841 					printf("RAIDframe: can't get disk size"
   2842 					    " for dev %s (%d)\n",
   2843 					    device_xname(dv), error);
   2844 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2845 				vput(vp);
   2846 				continue;
   2847 			}
   2848 			if (wedge) {
   2849 				struct dkwedge_info dkw;
   2850 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2851 				    NOCRED);
   2852 				if (error) {
   2853 					printf("RAIDframe: can't get wedge info for "
   2854 					    "dev %s (%d)\n", device_xname(dv), error);
   2855 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2856 					vput(vp);
   2857 					continue;
   2858 				}
   2859 
   2860 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2861 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2862 					vput(vp);
   2863 					continue;
   2864 				}
   2865 
   2866 				VOP_UNLOCK(vp);
   2867 				ac_list = rf_get_component(ac_list, dev, vp,
   2868 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2869 				rf_part_found = 1; /*There is a raid component on this disk*/
   2870 				continue;
   2871 			}
   2872 
   2873 			/* Ok, the disk exists.  Go get the disklabel. */
   2874 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2875 			if (error) {
   2876 				/*
   2877 				 * XXX can't happen - open() would
   2878 				 * have errored out (or faked up one)
   2879 				 */
   2880 				if (error != ENOTTY)
   2881 					printf("RAIDframe: can't get label for dev "
   2882 					    "%s (%d)\n", device_xname(dv), error);
   2883 			}
   2884 
   2885 			/* don't need this any more.  We'll allocate it again
   2886 			   a little later if we really do... */
   2887 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2888 			vput(vp);
   2889 
   2890 			if (error)
   2891 				continue;
   2892 
   2893 			rf_part_found = 0; /*No raid partitions yet*/
   2894 			for (i = 0; i < label.d_npartitions; i++) {
   2895 				char cname[sizeof(ac_list->devname)];
   2896 
   2897 				/* We only support partitions marked as RAID */
   2898 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2899 					continue;
   2900 
   2901 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2902 				if (bdevvp(dev, &vp))
   2903 					panic("RAID can't alloc vnode");
   2904 
   2905 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2906 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2907 				if (error) {
   2908 					/* Whatever... */
   2909 					vput(vp);
   2910 					continue;
   2911 				}
   2912 				VOP_UNLOCK(vp);
   2913 				snprintf(cname, sizeof(cname), "%s%c",
   2914 				    device_xname(dv), 'a' + i);
   2915 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2916 					label.d_partitions[i].p_size, numsecs, secsize);
   2917 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2918 			}
   2919 
   2920 			/*
   2921 			 *If there is no raid component on this disk, either in a
   2922 			 *disklabel or inside a wedge, check the raw partition as well,
   2923 			 *as it is possible to configure raid components on raw disk
   2924 			 *devices.
   2925 			 */
   2926 
   2927 			if (!rf_part_found) {
   2928 				char cname[sizeof(ac_list->devname)];
   2929 
   2930 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2931 				if (bdevvp(dev, &vp))
   2932 					panic("RAID can't alloc vnode");
   2933 
   2934 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2935 
   2936 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2937 				if (error) {
   2938 					/* Whatever... */
   2939 					vput(vp);
   2940 					continue;
   2941 				}
   2942 				VOP_UNLOCK(vp);
   2943 				snprintf(cname, sizeof(cname), "%s%c",
   2944 				    device_xname(dv), 'a' + RAW_PART);
   2945 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2946 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2947 			}
   2948 		}
   2949 		deviter_release(&di);
   2950 	}
   2951 	return ac_list;
   2952 }
   2953 
   2954 
   2955 int
   2956 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2957 {
   2958 
   2959 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2960 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2961 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2962 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2963 	    clabel->row >=0 &&
   2964 	    clabel->column >= 0 &&
   2965 	    clabel->num_rows > 0 &&
   2966 	    clabel->num_columns > 0 &&
   2967 	    clabel->row < clabel->num_rows &&
   2968 	    clabel->column < clabel->num_columns &&
   2969 	    clabel->blockSize > 0 &&
   2970 	    /*
   2971 	     * numBlocksHi may contain garbage, but it is ok since
   2972 	     * the type is unsigned.  If it is really garbage,
   2973 	     * rf_fix_old_label_size() will fix it.
   2974 	     */
   2975 	    rf_component_label_numblocks(clabel) > 0) {
   2976 		/*
   2977 		 * label looks reasonable enough...
   2978 		 * let's make sure it has no old garbage.
   2979 		 */
   2980 		if (numsecs)
   2981 			rf_fix_old_label_size(clabel, numsecs);
   2982 		return(1);
   2983 	}
   2984 	return(0);
   2985 }
   2986 
   2987 
   2988 /*
   2989  * For reasons yet unknown, some old component labels have garbage in
   2990  * the newer numBlocksHi region, and this causes lossage.  Since those
   2991  * disks will also have numsecs set to less than 32 bits of sectors,
   2992  * we can determine when this corruption has occurred, and fix it.
   2993  *
   2994  * The exact same problem, with the same unknown reason, happens to
   2995  * the partitionSizeHi member as well.
   2996  */
   2997 static void
   2998 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2999 {
   3000 
   3001 	if (numsecs < ((uint64_t)1 << 32)) {
   3002 		if (clabel->numBlocksHi) {
   3003 			printf("WARNING: total sectors < 32 bits, yet "
   3004 			       "numBlocksHi set\n"
   3005 			       "WARNING: resetting numBlocksHi to zero.\n");
   3006 			clabel->numBlocksHi = 0;
   3007 		}
   3008 
   3009 		if (clabel->partitionSizeHi) {
   3010 			printf("WARNING: total sectors < 32 bits, yet "
   3011 			       "partitionSizeHi set\n"
   3012 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3013 			clabel->partitionSizeHi = 0;
   3014 		}
   3015 	}
   3016 }
   3017 
   3018 
   3019 #ifdef DEBUG
/*
 * Dump the contents of a component label to the console.
 * Debug aid only (compiled in under DEBUG).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Printable names for the root_partition field (0..2 valid). */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	/* "& 3" clamps out-of-range values onto the "*invalid*" entry */
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3051 #endif
   3052 
   3053 RF_ConfigSet_t *
   3054 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3055 {
   3056 	RF_AutoConfig_t *ac;
   3057 	RF_ConfigSet_t *config_sets;
   3058 	RF_ConfigSet_t *cset;
   3059 	RF_AutoConfig_t *ac_next;
   3060 
   3061 
   3062 	config_sets = NULL;
   3063 
   3064 	/* Go through the AutoConfig list, and figure out which components
   3065 	   belong to what sets.  */
   3066 	ac = ac_list;
   3067 	while(ac!=NULL) {
   3068 		/* we're going to putz with ac->next, so save it here
   3069 		   for use at the end of the loop */
   3070 		ac_next = ac->next;
   3071 
   3072 		if (config_sets == NULL) {
   3073 			/* will need at least this one... */
   3074 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3075 				       M_RAIDFRAME, M_WAITOK);
   3076 			/* this one is easy :) */
   3077 			config_sets->ac = ac;
   3078 			config_sets->next = NULL;
   3079 			config_sets->rootable = 0;
   3080 			ac->next = NULL;
   3081 		} else {
   3082 			/* which set does this component fit into? */
   3083 			cset = config_sets;
   3084 			while(cset!=NULL) {
   3085 				if (rf_does_it_fit(cset, ac)) {
   3086 					/* looks like it matches... */
   3087 					ac->next = cset->ac;
   3088 					cset->ac = ac;
   3089 					break;
   3090 				}
   3091 				cset = cset->next;
   3092 			}
   3093 			if (cset==NULL) {
   3094 				/* didn't find a match above... new set..*/
   3095 				cset = malloc(sizeof(RF_ConfigSet_t),
   3096 					       M_RAIDFRAME, M_WAITOK);
   3097 				cset->ac = ac;
   3098 				ac->next = NULL;
   3099 				cset->next = config_sets;
   3100 				cset->rootable = 0;
   3101 				config_sets = cset;
   3102 			}
   3103 		}
   3104 		ac = ac_next;
   3105 	}
   3106 
   3107 
   3108 	return(config_sets);
   3109 }
   3110 
   3111 static int
   3112 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3113 {
   3114 	RF_ComponentLabel_t *clabel1, *clabel2;
   3115 
   3116 	/* If this one matches the *first* one in the set, that's good
   3117 	   enough, since the other members of the set would have been
   3118 	   through here too... */
   3119 	/* note that we are not checking partitionSize here..
   3120 
   3121 	   Note that we are also not checking the mod_counters here.
   3122 	   If everything else matches except the mod_counter, that's
   3123 	   good enough for this test.  We will deal with the mod_counters
   3124 	   a little later in the autoconfiguration process.
   3125 
   3126 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3127 
   3128 	   The reason we don't check for this is that failed disks
   3129 	   will have lower modification counts.  If those disks are
   3130 	   not added to the set they used to belong to, then they will
   3131 	   form their own set, which may result in 2 different sets,
   3132 	   for example, competing to be configured at raid0, and
   3133 	   perhaps competing to be the root filesystem set.  If the
   3134 	   wrong ones get configured, or both attempt to become /,
   3135 	   weird behaviour and or serious lossage will occur.  Thus we
   3136 	   need to bring them into the fold here, and kick them out at
   3137 	   a later point.
   3138 
   3139 	*/
   3140 
   3141 	clabel1 = cset->ac->clabel;
   3142 	clabel2 = ac->clabel;
   3143 	if ((clabel1->version == clabel2->version) &&
   3144 	    (clabel1->serial_number == clabel2->serial_number) &&
   3145 	    (clabel1->num_rows == clabel2->num_rows) &&
   3146 	    (clabel1->num_columns == clabel2->num_columns) &&
   3147 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3148 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3149 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3150 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3151 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3152 	    (clabel1->blockSize == clabel2->blockSize) &&
   3153 	    rf_component_label_numblocks(clabel1) ==
   3154 	    rf_component_label_numblocks(clabel2) &&
   3155 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3156 	    (clabel1->root_partition == clabel2->root_partition) &&
   3157 	    (clabel1->last_unit == clabel2->last_unit) &&
   3158 	    (clabel1->config_order == clabel2->config_order)) {
   3159 		/* if it get's here, it almost *has* to be a match */
   3160 	} else {
   3161 		/* it's not consistent with somebody in the set..
   3162 		   punt */
   3163 		return(0);
   3164 	}
   3165 	/* all was fine.. it must fit... */
   3166 	return(1);
   3167 }
   3168 
/*
 * Check whether configuration set 'cset' has enough live components
 * (with up-to-date mod counters) to be configured.  Returns 1 if the
 * set can be brought up, 0 if too many components are missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The highest mod_counter seen among the components is taken as
	   current; components with lower counters are stale (failed). */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each expected column, look for a component with a current
	   mod_counter. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				/* RAID 1 mirrors columns in even/odd
				   pairs; losing both halves of a pair is
				   fatal, losing one half is tolerable. */
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3271 
   3272 void
   3273 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3274 			RF_Raid_t *raidPtr)
   3275 {
   3276 	RF_ComponentLabel_t *clabel;
   3277 	int i;
   3278 
   3279 	clabel = ac->clabel;
   3280 
   3281 	/* 1. Fill in the common stuff */
   3282 	config->numCol = clabel->num_columns;
   3283 	config->numSpare = 0; /* XXX should this be set here? */
   3284 	config->sectPerSU = clabel->sectPerSU;
   3285 	config->SUsPerPU = clabel->SUsPerPU;
   3286 	config->SUsPerRU = clabel->SUsPerRU;
   3287 	config->parityConfig = clabel->parityConfig;
   3288 	/* XXX... */
   3289 	strcpy(config->diskQueueType,"fifo");
   3290 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3291 	config->layoutSpecificSize = 0; /* XXX ?? */
   3292 
   3293 	while(ac!=NULL) {
   3294 		/* row/col values will be in range due to the checks
   3295 		   in reasonable_label() */
   3296 		strcpy(config->devnames[0][ac->clabel->column],
   3297 		       ac->devname);
   3298 		ac = ac->next;
   3299 	}
   3300 
   3301 	for(i=0;i<RF_MAXDBGV;i++) {
   3302 		config->debugVars[i][0] = 0;
   3303 	}
   3304 }
   3305 
   3306 int
   3307 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3308 {
   3309 	RF_ComponentLabel_t *clabel;
   3310 	int column;
   3311 	int sparecol;
   3312 
   3313 	raidPtr->autoconfigure = new_value;
   3314 
   3315 	for(column=0; column<raidPtr->numCol; column++) {
   3316 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3317 			clabel = raidget_component_label(raidPtr, column);
   3318 			clabel->autoconfigure = new_value;
   3319 			raidflush_component_label(raidPtr, column);
   3320 		}
   3321 	}
   3322 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3323 		sparecol = raidPtr->numCol + column;
   3324 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3325 			clabel = raidget_component_label(raidPtr, sparecol);
   3326 			clabel->autoconfigure = new_value;
   3327 			raidflush_component_label(raidPtr, sparecol);
   3328 		}
   3329 	}
   3330 	return(new_value);
   3331 }
   3332 
   3333 int
   3334 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3335 {
   3336 	RF_ComponentLabel_t *clabel;
   3337 	int column;
   3338 	int sparecol;
   3339 
   3340 	raidPtr->root_partition = new_value;
   3341 	for(column=0; column<raidPtr->numCol; column++) {
   3342 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3343 			clabel = raidget_component_label(raidPtr, column);
   3344 			clabel->root_partition = new_value;
   3345 			raidflush_component_label(raidPtr, column);
   3346 		}
   3347 	}
   3348 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3349 		sparecol = raidPtr->numCol + column;
   3350 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3351 			clabel = raidget_component_label(raidPtr, sparecol);
   3352 			clabel->root_partition = new_value;
   3353 			raidflush_component_label(raidPtr, sparecol);
   3354 		}
   3355 	}
   3356 	return(new_value);
   3357 }
   3358 
   3359 void
   3360 rf_release_all_vps(RF_ConfigSet_t *cset)
   3361 {
   3362 	RF_AutoConfig_t *ac;
   3363 
   3364 	ac = cset->ac;
   3365 	while(ac!=NULL) {
   3366 		/* Close the vp, and give it back */
   3367 		if (ac->vp) {
   3368 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3369 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3370 			vput(ac->vp);
   3371 			ac->vp = NULL;
   3372 		}
   3373 		ac = ac->next;
   3374 	}
   3375 }
   3376 
   3377 
   3378 void
   3379 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3380 {
   3381 	RF_AutoConfig_t *ac;
   3382 	RF_AutoConfig_t *next_ac;
   3383 
   3384 	ac = cset->ac;
   3385 	while(ac!=NULL) {
   3386 		next_ac = ac->next;
   3387 		/* nuke the label */
   3388 		free(ac->clabel, M_RAIDFRAME);
   3389 		/* cleanup the config structure */
   3390 		free(ac, M_RAIDFRAME);
   3391 		/* "next.." */
   3392 		ac = next_ac;
   3393 	}
   3394 	/* and, finally, nuke the config set */
   3395 	free(cset, M_RAIDFRAME);
   3396 }
   3397 
   3398 
/*
 * Populate a component label from the current in-core state of the
 * RAID set.  The caller is responsible for writing the label out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3431 
/*
 * Configure one autoconfiguration set: find a free raid unit
 * (preferring the unit it was last configured as), build an
 * RF_Config_t from the component labels, and configure the set.
 * Returns the softc on success, NULL if rf_Configure() failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from last_unit until we find a unit that is not
	   already configured (valid == 0) or does not exist yet */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3503 
/*
 * Initialize a pool of 'size'-byte items at IPL_BIO: set its
 * high-water mark to xmax and pre-allocate xmin items up front.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
   3513 
   3514 /*
   3515  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3516  * to see if there is IO pending and if that IO could possibly be done
   3517  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3518  * otherwise.
   3519  *
   3520  */
   3521 int
   3522 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3523 {
   3524 	struct raid_softc *rs;
   3525 	struct dk_softc *dksc;
   3526 
   3527 	rs = raidPtr->softc;
   3528 	dksc = &rs->sc_dksc;
   3529 
   3530 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3531 		return 1;
   3532 
   3533 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3534 		/* there is work to do */
   3535 		return 0;
   3536 	}
   3537 	/* default is nothing to do */
   3538 	return 1;
   3539 }
   3540 
   3541 int
   3542 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3543 {
   3544 	uint64_t numsecs;
   3545 	unsigned secsize;
   3546 	int error;
   3547 
   3548 	error = getdisksize(vp, &numsecs, &secsize);
   3549 	if (error == 0) {
   3550 		diskPtr->blockSize = secsize;
   3551 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3552 		diskPtr->partitionSize = numsecs;
   3553 		return 0;
   3554 	}
   3555 	return error;
   3556 }
   3557 
/* Autoconf match function: always matches (raid is a pseudo-device). */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3563 
/* Autoconf attach function: no attach-time initialization needed. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3568 
   3569 
   3570 static int
   3571 raid_detach(device_t self, int flags)
   3572 {
   3573 	int error;
   3574 	struct raid_softc *rs = raidsoftc(self);
   3575 
   3576 	if (rs == NULL)
   3577 		return ENXIO;
   3578 
   3579 	if ((error = raidlock(rs)) != 0)
   3580 		return (error);
   3581 
   3582 	error = raid_detach_unlocked(rs);
   3583 
   3584 	raidunlock(rs);
   3585 
   3586 	/* XXX raid can be referenced here */
   3587 
   3588 	if (error)
   3589 		return error;
   3590 
   3591 	/* Free the softc */
   3592 	raidput(rs);
   3593 
   3594 	return 0;
   3595 }
   3596 
/*
 * Fill in the (synthetic) disk geometry for the RAID device and
 * publish it via disk_set_info().
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count; the geometry is synthetic anyway */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3612 
   3613 /*
   3614  * Get cache info for all the components (including spares).
   3615  * Returns intersection of all the cache flags of all disks, or first
   3616  * error if any encountered.
   3617  * XXXfua feature flags can change as spares are added - lock down somehow
   3618  */
   3619 static int
   3620 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3621 {
   3622 	int c;
   3623 	int error;
   3624 	int dkwhole = 0, dkpart;
   3625 
   3626 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3627 		/*
   3628 		 * Check any non-dead disk, even when currently being
   3629 		 * reconstructed.
   3630 		 */
   3631 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3632 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3633 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3634 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3635 			if (error) {
   3636 				if (error != ENODEV) {
   3637 					printf("raid%d: get cache for component %s failed\n",
   3638 					    raidPtr->raidid,
   3639 					    raidPtr->Disks[c].devname);
   3640 				}
   3641 
   3642 				return error;
   3643 			}
   3644 
   3645 			if (c == 0)
   3646 				dkwhole = dkpart;
   3647 			else
   3648 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3649 		}
   3650 	}
   3651 
   3652 	*data = dkwhole;
   3653 
   3654 	return 0;
   3655 }
   3656 
   3657 /*
   3658  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3659  * We end up returning whatever error was returned by the first cache flush
   3660  * that fails.
   3661  */
   3662 
   3663 int
   3664 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3665 {
   3666 	int c, sparecol;
   3667 	int e,error;
   3668 	int force = 1;
   3669 
   3670 	error = 0;
   3671 	for (c = 0; c < raidPtr->numCol; c++) {
   3672 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3673 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3674 					  &force, FWRITE, NOCRED);
   3675 			if (e) {
   3676 				if (e != ENODEV)
   3677 					printf("raid%d: cache flush to component %s failed.\n",
   3678 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3679 				if (error == 0) {
   3680 					error = e;
   3681 				}
   3682 			}
   3683 		}
   3684 	}
   3685 
   3686 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3687 		sparecol = raidPtr->numCol + c;
   3688 		/* Need to ensure that the reconstruct actually completed! */
   3689 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3690 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3691 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3692 			if (e) {
   3693 				if (e != ENODEV)
   3694 					printf("raid%d: cache flush to component %s failed.\n",
   3695 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3696 				if (error == 0) {
   3697 					error = e;
   3698 				}
   3699 			}
   3700 		}
   3701 	}
   3702 	return error;
   3703 }
   3704 
   3705 /* Fill in info with the current status */
   3706 void
   3707 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3708 {
   3709 
   3710 	if (raidPtr->status != rf_rs_reconstructing) {
   3711 		info->total = 100;
   3712 		info->completed = 100;
   3713 	} else {
   3714 		info->total = raidPtr->reconControl->numRUsTotal;
   3715 		info->completed = raidPtr->reconControl->numRUsComplete;
   3716 	}
   3717 	info->remaining = info->total - info->completed;
   3718 }
   3719 
   3720 /* Fill in info with the current status */
   3721 void
   3722 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3723 {
   3724 
   3725 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3726 		info->total = raidPtr->Layout.numStripe;
   3727 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3728 	} else {
   3729 		info->completed = 100;
   3730 		info->total = 100;
   3731 	}
   3732 	info->remaining = info->total - info->completed;
   3733 }
   3734 
   3735 /* Fill in info with the current status */
   3736 void
   3737 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3738 {
   3739 
   3740 	if (raidPtr->copyback_in_progress == 1) {
   3741 		info->total = raidPtr->Layout.numStripe;
   3742 		info->completed = raidPtr->copyback_stripes_done;
   3743 		info->remaining = info->total - info->completed;
   3744 	} else {
   3745 		info->remaining = 0;
   3746 		info->completed = 100;
   3747 		info->total = 100;
   3748 	}
   3749 }
   3750 
   3751 /* Fill in config with the current info */
   3752 int
   3753 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3754 {
   3755 	int	d, i, j;
   3756 
   3757 	if (!raidPtr->valid)
   3758 		return (ENODEV);
   3759 	config->cols = raidPtr->numCol;
   3760 	config->ndevs = raidPtr->numCol;
   3761 	if (config->ndevs >= RF_MAX_DISKS)
   3762 		return (ENOMEM);
   3763 	config->nspares = raidPtr->numSpare;
   3764 	if (config->nspares >= RF_MAX_DISKS)
   3765 		return (ENOMEM);
   3766 	config->maxqdepth = raidPtr->maxQueueDepth;
   3767 	d = 0;
   3768 	for (j = 0; j < config->cols; j++) {
   3769 		config->devs[d] = raidPtr->Disks[j];
   3770 		d++;
   3771 	}
   3772 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3773 		config->spares[i] = raidPtr->Disks[j];
   3774 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3775 			/* XXX: raidctl(8) expects to see this as a used spare */
   3776 			config->spares[i].status = rf_ds_used_spare;
   3777 		}
   3778 	}
   3779 	return 0;
   3780 }
   3781 
   3782 int
   3783 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3784 {
   3785 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3786 	RF_ComponentLabel_t *raid_clabel;
   3787 	int column = clabel->column;
   3788 
   3789 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3790 		return EINVAL;
   3791 	raid_clabel = raidget_component_label(raidPtr, column);
   3792 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3793 
   3794 	return 0;
   3795 }
   3796 
   3797 /*
   3798  * Module interface
   3799  */
   3800 
   3801 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3802 
   3803 #ifdef _MODULE
   3804 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3805 #endif
   3806 
   3807 static int raid_modcmd(modcmd_t, void *);
   3808 static int raid_modcmd_init(void);
   3809 static int raid_modcmd_fini(void);
   3810 
   3811 static int
   3812 raid_modcmd(modcmd_t cmd, void *data)
   3813 {
   3814 	int error;
   3815 
   3816 	error = 0;
   3817 	switch (cmd) {
   3818 	case MODULE_CMD_INIT:
   3819 		error = raid_modcmd_init();
   3820 		break;
   3821 	case MODULE_CMD_FINI:
   3822 		error = raid_modcmd_fini();
   3823 		break;
   3824 	default:
   3825 		error = ENOTTY;
   3826 		break;
   3827 	}
   3828 	return error;
   3829 }
   3830 
/*
 * Module initialization: set up the driver's global lock, attach the
 * device switch and autoconf glue, boot the RAIDframe core, and register
 * the autoconfiguration finalizer.  On any attach failure the steps
 * already taken are rolled back in reverse order before returning.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table support: locks, cvs, and empty request queues. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to pick the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST (devsw already present, e.g. built in) is tolerated. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	/* Modular build: the cfdriver must be attached by hand. */
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver (module case) and devsw attaches. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	/* Auto-configuration has not yet run; the finalizer will do it. */
	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		/* Bring up the RAIDframe core itself. */
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Not fatal: we just lose auto-configuration. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3901 
/*
 * Module teardown: refuse to unload while any raid device exists, then
 * detach the autoconf glue and devsw in reverse order of attachment and
 * shut down the RAIDframe core.  If a later detach step fails, the
 * earlier ones are re-attached so the module stays usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach we just removed. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back cfdriver (module case) and cfattach. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core and free the spare-table state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   3951