Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.376
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.376 2019/03/01 11:06:56 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.376 2019/03/01 11:06:56 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
/*
 * db1_printf((fmt, args...)) -- debug printf taking a doubly-parenthesized
 * argument list.  With DEBUG defined it prints only when rf_kdebug_level
 * is greater than zero; without DEBUG it expands to an empty statement.
 *
 * Both variants are wrapped in do { } while (0) so that the macro is
 * always exactly one statement: it cannot capture a following "else",
 * and "db1_printf((...));" is legal as the body of an unbraced if/else.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */

/*
 * DPRINTF(fmt, args...) -- tracing that is compiled in only when
 * DEBUG_ROOT is defined.  At least one argument after the format is
 * required, because __VA_ARGS__ is expanded after a comma.
 */
#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
    166 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * State used only when parity declustering support is compiled in:
 * two queues of RF_SparetWait_t entries plus the mutex and condition
 * variables declared alongside them.
 */

/* requests to install a spare table */
static RF_SparetWait_t *rf_sparet_wait_queue;
/* responses from the installation process */
static RF_SparetWait_t *rf_sparet_resp_queue;

static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);
#endif
    177 
    178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    179 
    180 /* prototypes */
    181 static void KernelWakeupFunc(struct buf *);
    182 static void InitBP(struct buf *, struct vnode *, unsigned,
    183     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    184     void *, int, struct proc *);
    185 static void raidinit(struct raid_softc *);
    186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    188 
    189 static int raid_match(device_t, cfdata_t, void *);
    190 static void raid_attach(device_t, device_t, void *);
    191 static int raid_detach(device_t, int);
    192 
    193 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    194     daddr_t, daddr_t);
    195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t, int);
    197 
    198 static int raidwrite_component_label(unsigned,
    199     dev_t, struct vnode *, RF_ComponentLabel_t *);
    200 static int raidread_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 
    203 static int raid_diskstart(device_t, struct buf *bp);
    204 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    205 static int raid_lastclose(device_t);
    206 
    207 static dev_type_open(raidopen);
    208 static dev_type_close(raidclose);
    209 static dev_type_read(raidread);
    210 static dev_type_write(raidwrite);
    211 static dev_type_ioctl(raidioctl);
    212 static dev_type_strategy(raidstrategy);
    213 static dev_type_dump(raiddump);
    214 static dev_type_size(raidsize);
    215 
    216 const struct bdevsw raid_bdevsw = {
    217 	.d_open = raidopen,
    218 	.d_close = raidclose,
    219 	.d_strategy = raidstrategy,
    220 	.d_ioctl = raidioctl,
    221 	.d_dump = raiddump,
    222 	.d_psize = raidsize,
    223 	.d_discard = nodiscard,
    224 	.d_flag = D_DISK
    225 };
    226 
    227 const struct cdevsw raid_cdevsw = {
    228 	.d_open = raidopen,
    229 	.d_close = raidclose,
    230 	.d_read = raidread,
    231 	.d_write = raidwrite,
    232 	.d_ioctl = raidioctl,
    233 	.d_stop = nostop,
    234 	.d_tty = notty,
    235 	.d_poll = nopoll,
    236 	.d_mmap = nommap,
    237 	.d_kqfilter = nokqfilter,
    238 	.d_discard = nodiscard,
    239 	.d_flag = D_DISK
    240 };
    241 
    242 static struct dkdriver rf_dkdriver = {
    243 	.d_open = raidopen,
    244 	.d_close = raidclose,
    245 	.d_strategy = raidstrategy,
    246 	.d_diskstart = raid_diskstart,
    247 	.d_dumpblocks = raid_dumpblocks,
    248 	.d_lastclose = raid_lastclose,
    249 	.d_minphys = minphys
    250 };
    251 
    252 #define	raidunit(x)	DISKUNIT(x)
    253 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
    260 /* Internal representation of a rf_recon_req */
    261 struct rf_recon_req_internal {
    262 	RF_RowCol_t col;
    263 	RF_ReconReqFlags_t flags;
    264 	void   *raidPtr;
    265 };
    266 
    267 /*
    268  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    269  * Be aware that large numbers can allow the driver to consume a lot of
    270  * kernel memory, especially on writes, and in degraded mode reads.
    271  *
    272  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    273  * a single 64K write will typically require 64K for the old data,
    274  * 64K for the old parity, and 64K for the new parity, for a total
    275  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
    277  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    278  *
    279  * Now in degraded mode, for example, a 64K read on the above setup may
    280  * require data reconstruction, which will require *all* of the 4 remaining
    281  * disks to participate -- 4 * 32K/disk == 128K again.
    282  */
    283 
/* Default value; a definition supplied before this point takes precedence. */
#if !defined(RAIDOUTSTANDING)
#define RAIDOUTSTANDING   6
#endif

/* dev_t of the RAW_PART partition on the same major/unit as (dev). */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    290 
    291 /* declared here, and made public, for the benefit of KVM stuff.. */
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req_internal *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	sc->sc_unit = unit;
    342 	cv_init(&sc->sc_cv, "raidunit");
    343 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    344 	return sc;
    345 }
    346 
    347 static void
    348 raiddestroy(struct raid_softc *sc) {
    349 	cv_destroy(&sc->sc_cv);
    350 	mutex_destroy(&sc->sc_mutex);
    351 	kmem_free(sc, sizeof(*sc));
    352 }
    353 
    354 static struct raid_softc *
    355 raidget(int unit, bool create) {
    356 	struct raid_softc *sc;
    357 	if (unit < 0) {
    358 #ifdef DIAGNOSTIC
    359 		panic("%s: unit %d!", __func__, unit);
    360 #endif
    361 		return NULL;
    362 	}
    363 	mutex_enter(&raid_lock);
    364 	LIST_FOREACH(sc, &raids, sc_link) {
    365 		if (sc->sc_unit == unit) {
    366 			mutex_exit(&raid_lock);
    367 			return sc;
    368 		}
    369 	}
    370 	mutex_exit(&raid_lock);
    371 	if (!create)
    372 		return NULL;
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
/*
 * Intentionally empty; "num" is ignored.  Device attachment and the
 * associated initialization are performed as part of module
 * initialization instead.
 */
void
raidattach(int num)
{
}
    398 
    399 int
    400 rf_autoconfig(device_t self)
    401 {
    402 	RF_AutoConfig_t *ac_list;
    403 	RF_ConfigSet_t *config_sets;
    404 
    405 	if (!raidautoconfig || raidautoconfigdone == true)
    406 		return (0);
    407 
    408 	/* XXX This code can only be run once. */
    409 	raidautoconfigdone = true;
    410 
    411 #ifdef __HAVE_CPU_BOOTCONF
    412 	/*
    413 	 * 0. find the boot device if needed first so we can use it later
    414 	 * this needs to be done before we autoconfigure any raid sets,
    415 	 * because if we use wedges we are not going to be able to open
    416 	 * the boot device later
    417 	 */
    418 	if (booted_device == NULL)
    419 		cpu_bootconf();
    420 #endif
    421 	/* 1. locate all RAID components on the system */
    422 	aprint_debug("Searching for RAID components...\n");
    423 	ac_list = rf_find_raid_components();
    424 
    425 	/* 2. Sort them into their respective sets. */
    426 	config_sets = rf_create_auto_sets(ac_list);
    427 
    428 	/*
    429 	 * 3. Evaluate each set and configure the valid ones.
    430 	 * This gets done in rf_buildroothack().
    431 	 */
    432 	rf_buildroothack(config_sets);
    433 
    434 	return 1;
    435 }
    436 
    437 int
    438 rf_inited(const struct raid_softc *rs) {
    439 	return (rs->sc_flags & RAIDF_INITED) != 0;
    440 }
    441 
    442 RF_Raid_t *
    443 rf_get_raid(struct raid_softc *rs) {
    444 	return &rs->sc_r;
    445 }
    446 
    447 int
    448 rf_get_unit(const struct raid_softc *rs) {
    449 	return rs->sc_unit;
    450 }
    451 
    452 static int
    453 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    454 	const char *bootname;
    455 	size_t len;
    456 
    457 	/* if bdv is NULL, the set can't contain it. exit early. */
    458 	if (bdv == NULL)
    459 		return 0;
    460 
    461 	bootname = device_xname(bdv);
    462 	len = strlen(bootname);
    463 
    464 	for (int col = 0; col < r->numCol; col++) {
    465 		const char *devname = r->Disks[col].devname;
    466 		devname += sizeof("/dev/") - 1;
    467 		if (strncmp(devname, "dk", 2) == 0) {
    468 			const char *parent =
    469 			    dkwedge_get_parent_name(r->Disks[col].dev);
    470 			if (parent != NULL)
    471 				devname = parent;
    472 		}
    473 		if (strncmp(devname, bootname, len) == 0) {
    474 			struct raid_softc *sc = r->softc;
    475 			aprint_debug("raid%d includes boot device %s\n",
    476 			    sc->sc_unit, devname);
    477 			return 1;
    478 		}
    479 	}
    480 	return 0;
    481 }
    482 
    483 void
    484 rf_buildroothack(RF_ConfigSet_t *config_sets)
    485 {
    486 	RF_ConfigSet_t *cset;
    487 	RF_ConfigSet_t *next_cset;
    488 	int num_root;
    489 	struct raid_softc *sc, *rsc;
    490 	struct dk_softc *dksc;
    491 
    492 	sc = rsc = NULL;
    493 	num_root = 0;
    494 	cset = config_sets;
    495 	while (cset != NULL) {
    496 		next_cset = cset->next;
    497 		if (rf_have_enough_components(cset) &&
    498 		    cset->ac->clabel->autoconfigure == 1) {
    499 			sc = rf_auto_config_set(cset);
    500 			if (sc != NULL) {
    501 				aprint_debug("raid%d: configured ok, rootable %d\n",
    502 				    sc->sc_unit, cset->rootable);
    503 				if (cset->rootable) {
    504 					rsc = sc;
    505 					num_root++;
    506 				}
    507 			} else {
    508 				/* The autoconfig didn't work :( */
    509 				aprint_debug("Autoconfig failed\n");
    510 				rf_release_all_vps(cset);
    511 			}
    512 		} else {
    513 			/* we're not autoconfiguring this set...
    514 			   release the associated resources */
    515 			rf_release_all_vps(cset);
    516 		}
    517 		/* cleanup */
    518 		rf_cleanup_config_set(cset);
    519 		cset = next_cset;
    520 	}
    521 	dksc = &rsc->sc_dksc;
    522 
    523 	/* if the user has specified what the root device should be
    524 	   then we don't touch booted_device or boothowto... */
    525 
    526 	if (rootspec != NULL) {
    527 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    528 		return;
    529 	}
    530 
    531 	/* we found something bootable... */
    532 
    533 	/*
    534 	 * XXX: The following code assumes that the root raid
    535 	 * is the first ('a') partition. This is about the best
    536 	 * we can do with a BSD disklabel, but we might be able
    537 	 * to do better with a GPT label, by setting a specified
    538 	 * attribute to indicate the root partition. We can then
    539 	 * stash the partition number in the r->root_partition
    540 	 * high bits (the bottom 2 bits are already used). For
    541 	 * now we just set booted_partition to 0 when we override
    542 	 * root.
    543 	 */
    544 	if (num_root == 1) {
    545 		device_t candidate_root;
    546 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    547 			char cname[sizeof(cset->ac->devname)];
    548 			/* XXX: assume partition 'a' first */
    549 			snprintf(cname, sizeof(cname), "%s%c",
    550 			    device_xname(dksc->sc_dev), 'a');
    551 			candidate_root = dkwedge_find_by_wname(cname);
    552 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    553 			    cname);
    554 			if (candidate_root == NULL) {
    555 				/*
    556 				 * If that is not found, because we don't use
    557 				 * disklabel, return the first dk child
    558 				 * XXX: we can skip the 'a' check above
    559 				 * and always do this...
    560 				 */
    561 				size_t i = 0;
    562 				candidate_root = dkwedge_find_by_parent(
    563 				    device_xname(dksc->sc_dev), &i);
    564 			}
    565 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    566 			    candidate_root);
    567 		} else
    568 			candidate_root = dksc->sc_dev;
    569 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    570 		DPRINTF("%s: booted_device=%p root_partition=%d "
    571 			"contains_boot=%d",
    572 		    __func__, booted_device, rsc->sc_r.root_partition,
    573 			   rf_containsboot(&rsc->sc_r, booted_device));
    574 		/* XXX the check for booted_device == NULL can probably be
    575 		 * dropped, now that rf_containsboot handles that case.
    576 		 */
    577 		if (booted_device == NULL ||
    578 		    rsc->sc_r.root_partition == 1 ||
    579 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    580 			booted_device = candidate_root;
    581 			booted_method = "raidframe/single";
    582 			booted_partition = 0;	/* XXX assume 'a' */
    583 		}
    584 	} else if (num_root > 1) {
    585 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    586 		    booted_device);
    587 
    588 		/*
    589 		 * Maybe the MD code can help. If it cannot, then
    590 		 * setroot() will discover that we have no
    591 		 * booted_device and will ask the user if nothing was
    592 		 * hardwired in the kernel config file
    593 		 */
    594 		if (booted_device == NULL)
    595 			return;
    596 
    597 		num_root = 0;
    598 		mutex_enter(&raid_lock);
    599 		LIST_FOREACH(sc, &raids, sc_link) {
    600 			RF_Raid_t *r = &sc->sc_r;
    601 			if (r->valid == 0)
    602 				continue;
    603 
    604 			if (r->root_partition == 0)
    605 				continue;
    606 
    607 			if (rf_containsboot(r, booted_device)) {
    608 				num_root++;
    609 				rsc = sc;
    610 				dksc = &rsc->sc_dksc;
    611 			}
    612 		}
    613 		mutex_exit(&raid_lock);
    614 
    615 		if (num_root == 1) {
    616 			booted_device = dksc->sc_dev;
    617 			booted_method = "raidframe/multi";
    618 			booted_partition = 0;	/* XXX assume 'a' */
    619 		} else {
    620 			/* we can't guess.. require the user to answer... */
    621 			boothowto |= RB_ASKNAME;
    622 		}
    623 	}
    624 }
    625 
    626 static int
    627 raidsize(dev_t dev)
    628 {
    629 	struct raid_softc *rs;
    630 	struct dk_softc *dksc;
    631 	unsigned int unit;
    632 
    633 	unit = raidunit(dev);
    634 	if ((rs = raidget(unit, false)) == NULL)
    635 		return -1;
    636 	dksc = &rs->sc_dksc;
    637 
    638 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    639 		return -1;
    640 
    641 	return dk_size(dksc, dev);
    642 }
    643 
    644 static int
    645 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    646 {
    647 	unsigned int unit;
    648 	struct raid_softc *rs;
    649 	struct dk_softc *dksc;
    650 
    651 	unit = raidunit(dev);
    652 	if ((rs = raidget(unit, false)) == NULL)
    653 		return ENXIO;
    654 	dksc = &rs->sc_dksc;
    655 
    656 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    657 		return ENODEV;
    658 
    659         /*
    660            Note that blkno is relative to this particular partition.
    661            By adding adding RF_PROTECTED_SECTORS, we get a value that
    662 	   is relative to the partition used for the underlying component.
    663         */
    664 	blkno += RF_PROTECTED_SECTORS;
    665 
    666 	return dk_dump(dksc, dev, blkno, va, size);
    667 }
    668 
    669 static int
    670 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
    671 {
    672 	struct raid_softc *rs = raidsoftc(dev);
    673 	const struct bdevsw *bdev;
    674 	RF_Raid_t *raidPtr;
    675 	int     c, sparecol, j, scol, dumpto;
    676 	int     error = 0;
    677 
    678 	raidPtr = &rs->sc_r;
    679 
    680 	/* we only support dumping to RAID 1 sets */
    681 	if (raidPtr->Layout.numDataCol != 1 ||
    682 	    raidPtr->Layout.numParityCol != 1)
    683 		return EINVAL;
    684 
    685 	if ((error = raidlock(rs)) != 0)
    686 		return error;
    687 
    688 	/* figure out what device is alive.. */
    689 
    690 	/*
    691 	   Look for a component to dump to.  The preference for the
    692 	   component to dump to is as follows:
    693 	   1) the master
    694 	   2) a used_spare of the master
    695 	   3) the slave
    696 	   4) a used_spare of the slave
    697 	*/
    698 
    699 	dumpto = -1;
    700 	for (c = 0; c < raidPtr->numCol; c++) {
    701 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    702 			/* this might be the one */
    703 			dumpto = c;
    704 			break;
    705 		}
    706 	}
    707 
    708 	/*
    709 	   At this point we have possibly selected a live master or a
    710 	   live slave.  We now check to see if there is a spared
    711 	   master (or a spared slave), if we didn't find a live master
    712 	   or a live slave.
    713 	*/
    714 
    715 	for (c = 0; c < raidPtr->numSpare; c++) {
    716 		sparecol = raidPtr->numCol + c;
    717 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    718 			/* How about this one? */
    719 			scol = -1;
    720 			for(j=0;j<raidPtr->numCol;j++) {
    721 				if (raidPtr->Disks[j].spareCol == sparecol) {
    722 					scol = j;
    723 					break;
    724 				}
    725 			}
    726 			if (scol == 0) {
    727 				/*
    728 				   We must have found a spared master!
    729 				   We'll take that over anything else
    730 				   found so far.  (We couldn't have
    731 				   found a real master before, since
    732 				   this is a used spare, and it's
    733 				   saying that it's replacing the
    734 				   master.)  On reboot (with
    735 				   autoconfiguration turned on)
    736 				   sparecol will become the 1st
    737 				   component (component0) of this set.
    738 				*/
    739 				dumpto = sparecol;
    740 				break;
    741 			} else if (scol != -1) {
    742 				/*
    743 				   Must be a spared slave.  We'll dump
    744 				   to that if we havn't found anything
    745 				   else so far.
    746 				*/
    747 				if (dumpto == -1)
    748 					dumpto = sparecol;
    749 			}
    750 		}
    751 	}
    752 
    753 	if (dumpto == -1) {
    754 		/* we couldn't find any live components to dump to!?!?
    755 		 */
    756 		error = EINVAL;
    757 		goto out;
    758 	}
    759 
    760 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    761 	if (bdev == NULL) {
    762 		error = ENXIO;
    763 		goto out;
    764 	}
    765 
    766 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    767 				blkno, va, nblk * raidPtr->bytesPerSector);
    768 
    769 out:
    770 	raidunlock(rs);
    771 
    772 	return error;
    773 }
    774 
    775 /* ARGSUSED */
    776 static int
    777 raidopen(dev_t dev, int flags, int fmt,
    778     struct lwp *l)
    779 {
    780 	int     unit = raidunit(dev);
    781 	struct raid_softc *rs;
    782 	struct dk_softc *dksc;
    783 	int     error = 0;
    784 	int     part, pmask;
    785 
    786 	if ((rs = raidget(unit, true)) == NULL)
    787 		return ENXIO;
    788 	if ((error = raidlock(rs)) != 0)
    789 		return (error);
    790 
    791 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    792 		error = EBUSY;
    793 		goto bad;
    794 	}
    795 
    796 	dksc = &rs->sc_dksc;
    797 
    798 	part = DISKPART(dev);
    799 	pmask = (1 << part);
    800 
    801 	if (!DK_BUSY(dksc, pmask) &&
    802 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    803 		/* First one... mark things as dirty... Note that we *MUST*
    804 		 have done a configure before this.  I DO NOT WANT TO BE
    805 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    806 		 THAT THEY BELONG TOGETHER!!!!! */
    807 		/* XXX should check to see if we're only open for reading
    808 		   here... If so, we needn't do this, but then need some
    809 		   other way of keeping track of what's happened.. */
    810 
    811 		rf_markalldirty(&rs->sc_r);
    812 	}
    813 
    814 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    815 		error = dk_open(dksc, dev, flags, fmt, l);
    816 
    817 bad:
    818 	raidunlock(rs);
    819 
    820 	return (error);
    821 
    822 
    823 }
    824 
    825 static int
    826 raid_lastclose(device_t self)
    827 {
    828 	struct raid_softc *rs = raidsoftc(self);
    829 
    830 	/* Last one... device is not unconfigured yet.
    831 	   Device shutdown has taken care of setting the
    832 	   clean bits if RAIDF_INITED is not set
    833 	   mark things as clean... */
    834 
    835 	rf_update_component_labels(&rs->sc_r,
    836 	    RF_FINAL_COMPONENT_UPDATE);
    837 
    838 	/* pass to unlocked code */
    839 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    840 		rs->sc_flags |= RAIDF_DETACH;
    841 
    842 	return 0;
    843 }
    844 
    845 /* ARGSUSED */
    846 static int
    847 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    848 {
    849 	int     unit = raidunit(dev);
    850 	struct raid_softc *rs;
    851 	struct dk_softc *dksc;
    852 	cfdata_t cf;
    853 	int     error = 0, do_detach = 0, do_put = 0;
    854 
    855 	if ((rs = raidget(unit, false)) == NULL)
    856 		return ENXIO;
    857 	dksc = &rs->sc_dksc;
    858 
    859 	if ((error = raidlock(rs)) != 0)
    860 		return (error);
    861 
    862 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    863 		error = dk_close(dksc, dev, flags, fmt, l);
    864 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    865 			do_detach = 1;
    866 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    867 		do_put = 1;
    868 
    869 	raidunlock(rs);
    870 
    871 	if (do_detach) {
    872 		/* free the pseudo device attach bits */
    873 		cf = device_cfdata(dksc->sc_dev);
    874 		error = config_detach(dksc->sc_dev, 0);
    875 		if (error == 0)
    876 			free(cf, M_RAIDFRAME);
    877 	} else if (do_put) {
    878 		raidput(rs);
    879 	}
    880 
    881 	return (error);
    882 
    883 }
    884 
    885 static void
    886 raid_wakeup(RF_Raid_t *raidPtr)
    887 {
    888 	rf_lock_mutex2(raidPtr->iodone_lock);
    889 	rf_signal_cond2(raidPtr->iodone_cv);
    890 	rf_unlock_mutex2(raidPtr->iodone_lock);
    891 }
    892 
    893 static void
    894 raidstrategy(struct buf *bp)
    895 {
    896 	unsigned int unit;
    897 	struct raid_softc *rs;
    898 	struct dk_softc *dksc;
    899 	RF_Raid_t *raidPtr;
    900 
    901 	unit = raidunit(bp->b_dev);
    902 	if ((rs = raidget(unit, false)) == NULL) {
    903 		bp->b_error = ENXIO;
    904 		goto fail;
    905 	}
    906 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    907 		bp->b_error = ENXIO;
    908 		goto fail;
    909 	}
    910 	dksc = &rs->sc_dksc;
    911 	raidPtr = &rs->sc_r;
    912 
    913 	/* Queue IO only */
    914 	if (dk_strategy_defer(dksc, bp))
    915 		goto done;
    916 
    917 	/* schedule the IO to happen at the next convenient time */
    918 	raid_wakeup(raidPtr);
    919 
    920 done:
    921 	return;
    922 
    923 fail:
    924 	bp->b_resid = bp->b_bcount;
    925 	biodone(bp);
    926 }
    927 
    928 static int
    929 raid_diskstart(device_t dev, struct buf *bp)
    930 {
    931 	struct raid_softc *rs = raidsoftc(dev);
    932 	RF_Raid_t *raidPtr;
    933 
    934 	raidPtr = &rs->sc_r;
    935 	if (!raidPtr->valid) {
    936 		db1_printf(("raid is not valid..\n"));
    937 		return ENODEV;
    938 	}
    939 
    940 	/* XXX */
    941 	bp->b_resid = 0;
    942 
    943 	return raiddoaccess(raidPtr, bp);
    944 }
    945 
    946 void
    947 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    948 {
    949 	struct raid_softc *rs;
    950 	struct dk_softc *dksc;
    951 
    952 	rs = raidPtr->softc;
    953 	dksc = &rs->sc_dksc;
    954 
    955 	dk_done(dksc, bp);
    956 
    957 	rf_lock_mutex2(raidPtr->mutex);
    958 	raidPtr->openings++;
    959 	rf_unlock_mutex2(raidPtr->mutex);
    960 
    961 	/* schedule more IO */
    962 	raid_wakeup(raidPtr);
    963 }
    964 
    965 /* ARGSUSED */
    966 static int
    967 raidread(dev_t dev, struct uio *uio, int flags)
    968 {
    969 	int     unit = raidunit(dev);
    970 	struct raid_softc *rs;
    971 
    972 	if ((rs = raidget(unit, false)) == NULL)
    973 		return ENXIO;
    974 
    975 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    976 		return (ENXIO);
    977 
    978 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    979 
    980 }
    981 
    982 /* ARGSUSED */
    983 static int
    984 raidwrite(dev_t dev, struct uio *uio, int flags)
    985 {
    986 	int     unit = raidunit(dev);
    987 	struct raid_softc *rs;
    988 
    989 	if ((rs = raidget(unit, false)) == NULL)
    990 		return ENXIO;
    991 
    992 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    993 		return (ENXIO);
    994 
    995 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    996 
    997 }
    998 
    999 static int
   1000 raid_detach_unlocked(struct raid_softc *rs)
   1001 {
   1002 	struct dk_softc *dksc = &rs->sc_dksc;
   1003 	RF_Raid_t *raidPtr;
   1004 	int error;
   1005 
   1006 	raidPtr = &rs->sc_r;
   1007 
   1008 	if (DK_BUSY(dksc, 0) ||
   1009 	    raidPtr->recon_in_progress != 0 ||
   1010 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1011 	    raidPtr->copyback_in_progress != 0)
   1012 		return EBUSY;
   1013 
   1014 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1015 		return 0;
   1016 
   1017 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1018 
   1019 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1020 		return error;
   1021 
   1022 	rs->sc_flags &= ~RAIDF_INITED;
   1023 
   1024 	/* Kill off any queued buffers */
   1025 	dk_drain(dksc);
   1026 	bufq_free(dksc->sc_bufq);
   1027 
   1028 	/* Detach the disk. */
   1029 	dkwedge_delall(&dksc->sc_dkdev);
   1030 	disk_detach(&dksc->sc_dkdev);
   1031 	disk_destroy(&dksc->sc_dkdev);
   1032 	dk_detach(dksc);
   1033 
   1034 	return 0;
   1035 }
   1036 
   1037 static bool
   1038 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1039 {
   1040 	switch (cmd) {
   1041 	case RAIDFRAME_ADD_HOT_SPARE:
   1042 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1043 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1044 	case RAIDFRAME_CHECK_PARITY:
   1045 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1046 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1047 	case RAIDFRAME_CHECK_RECON_STATUS:
   1048 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1049 	case RAIDFRAME_COPYBACK:
   1050 	case RAIDFRAME_DELETE_COMPONENT:
   1051 	case RAIDFRAME_FAIL_DISK:
   1052 	case RAIDFRAME_GET_ACCTOTALS:
   1053 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1054 	case RAIDFRAME_GET_INFO:
   1055 	case RAIDFRAME_GET_SIZE:
   1056 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1057 	case RAIDFRAME_INIT_LABELS:
   1058 	case RAIDFRAME_KEEP_ACCTOTALS:
   1059 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1060 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1061 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1062 	case RAIDFRAME_PARITYMAP_STATUS:
   1063 	case RAIDFRAME_REBUILD_IN_PLACE:
   1064 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1065 	case RAIDFRAME_RESET_ACCTOTALS:
   1066 	case RAIDFRAME_REWRITEPARITY:
   1067 	case RAIDFRAME_SET_AUTOCONFIG:
   1068 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1069 	case RAIDFRAME_SET_ROOT:
   1070 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1071 	}
   1072 	return false;
   1073 }
   1074 
   1075 int
   1076 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
   1077 {
   1078 	struct rf_recon_req_internal *rrint;
   1079 
   1080 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1081 		/* Can't do this on a RAID 0!! */
   1082 		return EINVAL;
   1083 	}
   1084 
   1085 	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
   1086 		/* bad column */
   1087 		return EINVAL;
   1088 	}
   1089 
   1090 	rf_lock_mutex2(raidPtr->mutex);
   1091 	if (raidPtr->status == rf_rs_reconstructing) {
   1092 		/* you can't fail a disk while we're reconstructing! */
   1093 		/* XXX wrong for RAID6 */
   1094 		goto out;
   1095 	}
   1096 	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
   1097 	    (raidPtr->numFailures > 0)) {
   1098 		/* some other component has failed.  Let's not make
   1099 		   things worse. XXX wrong for RAID6 */
   1100 		goto out;
   1101 	}
   1102 	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1103 		/* Can't fail a spared disk! */
   1104 		goto out;
   1105 	}
   1106 	rf_unlock_mutex2(raidPtr->mutex);
   1107 
   1108 	/* make a copy of the recon request so that we don't rely on
   1109 	 * the user's buffer */
   1110 	rrint = RF_Malloc(sizeof(*rrint));
   1111 	if (rrint == NULL)
   1112 		return(ENOMEM);
   1113 	rrint->col = rr->col;
   1114 	rrint->flags = rr->flags;
   1115 	rrint->raidPtr = raidPtr;
   1116 
   1117 	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
   1118 	    rrint, "raid_recon");
   1119 out:
   1120 	rf_unlock_mutex2(raidPtr->mutex);
   1121 	return EINVAL;
   1122 }
   1123 
   1124 static int
   1125 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1126 {
   1127 	/* allocate a buffer for the layout-specific data, and copy it in */
   1128 	if (k_cfg->layoutSpecificSize == 0)
   1129 		return 0;
   1130 
   1131 	if (k_cfg->layoutSpecificSize > 10000) {
   1132 	    /* sanity check */
   1133 	    return EINVAL;
   1134 	}
   1135 
   1136 	u_char *specific_buf;
   1137 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1138 	if (specific_buf == NULL)
   1139 		return ENOMEM;
   1140 
   1141 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1142 	    k_cfg->layoutSpecificSize);
   1143 	if (retcode) {
   1144 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1145 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1146 		return retcode;
   1147 	}
   1148 
   1149 	k_cfg->layoutSpecific = specific_buf;
   1150 	return 0;
   1151 }
   1152 
   1153 static int
   1154 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1155 {
   1156 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1157 
   1158 	if (rs->sc_r.valid) {
   1159 		/* There is a valid RAID set running on this unit! */
   1160 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1161 		return EINVAL;
   1162 	}
   1163 
   1164 	/* copy-in the configuration information */
   1165 	/* data points to a pointer to the configuration structure */
   1166 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1167 	if (*k_cfg == NULL) {
   1168 		return ENOMEM;
   1169 	}
   1170 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1171 	if (retcode == 0)
   1172 		return 0;
   1173 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1174 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1175 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1176 	return retcode;
   1177 }
   1178 
   1179 int
   1180 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1181 {
   1182 	int retcode;
   1183 	RF_Raid_t *raidPtr = &rs->sc_r;
   1184 
   1185 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1186 
   1187 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1188 		goto out;
   1189 
   1190 	/* should do some kind of sanity check on the configuration.
   1191 	 * Store the sum of all the bytes in the last byte? */
   1192 
   1193 	/* configure the system */
   1194 
   1195 	/*
   1196 	 * Clear the entire RAID descriptor, just to make sure
   1197 	 *  there is no stale data left in the case of a
   1198 	 *  reconfiguration
   1199 	 */
   1200 	memset(raidPtr, 0, sizeof(*raidPtr));
   1201 	raidPtr->softc = rs;
   1202 	raidPtr->raidid = rs->sc_unit;
   1203 
   1204 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1205 
   1206 	if (retcode == 0) {
   1207 		/* allow this many simultaneous IO's to
   1208 		   this RAID device */
   1209 		raidPtr->openings = RAIDOUTSTANDING;
   1210 
   1211 		raidinit(rs);
   1212 		raid_wakeup(raidPtr);
   1213 		rf_markalldirty(raidPtr);
   1214 	}
   1215 
   1216 	/* free the buffers.  No return code here. */
   1217 	if (k_cfg->layoutSpecificSize) {
   1218 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1219 	}
   1220 out:
   1221 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1222 	if (retcode) {
   1223 		/*
   1224 		 * If configuration failed, set sc_flags so that we
   1225 		 * will detach the device when we close it.
   1226 		 */
   1227 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1228 	}
   1229 	return retcode;
   1230 }
   1231 
#if RF_DISABLED
/*
 * rf_set_component_label -- overwrite the in-core label of one component
 * with `clabel' and flush it.
 *
 * The row of `clabel' is forced to 0.  Returns EINVAL if clabel->column
 * is outside [0, numCol), otherwise 0.
 *
 * XXX the label contents are not validated.  Some fields *should not* be
 * modified this way -- the user should re-init the labels instead of
 * patching them -- and the remaining fields would need filling in before
 * this could be enabled.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	int column;

#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */

	clabel->row = 0;
	column = clabel->column;

	if (column < 0 || column >= raidPtr->numCol)
		return(EINVAL);

	/* XXX this isn't allowed to do anything for now :-) */
	memcpy(raidget_component_label(raidPtr, column), clabel,
	    sizeof(*clabel));
	raidflush_component_label(raidPtr, column);

	return 0;
}
#endif
   1270 
   1271 static int
   1272 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1273 {
   1274 	/*
   1275 	   we only want the serial number from
   1276 	   the above.  We get all the rest of the information
   1277 	   from the config that was used to create this RAID
   1278 	   set.
   1279 	   */
   1280 
   1281 	raidPtr->serial_number = clabel->serial_number;
   1282 
   1283 	for (int column = 0; column < raidPtr->numCol; column++) {
   1284 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1285 		if (RF_DEAD_DISK(diskPtr->status))
   1286 			continue;
   1287 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1288 		    raidPtr, column);
   1289 		/* Zeroing this is important. */
   1290 		memset(ci_label, 0, sizeof(*ci_label));
   1291 		raid_init_component_label(raidPtr, ci_label);
   1292 		ci_label->serial_number = raidPtr->serial_number;
   1293 		ci_label->row = 0; /* we dont' pretend to support more */
   1294 		rf_component_label_set_partitionsize(ci_label,
   1295 		    diskPtr->partitionSize);
   1296 		ci_label->column = column;
   1297 		raidflush_component_label(raidPtr, column);
   1298 		/* XXXjld what about the spares? */
   1299 	}
   1300 
   1301 	return 0;
   1302 }
   1303 
   1304 static int
   1305 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
   1306 {
   1307 
   1308 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1309 		/* Can't do this on a RAID 0!! */
   1310 		return EINVAL;
   1311 	}
   1312 
   1313 	if (raidPtr->recon_in_progress == 1) {
   1314 		/* a reconstruct is already in progress! */
   1315 		return EINVAL;
   1316 	}
   1317 
   1318 	RF_SingleComponent_t component;
   1319 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1320 	component.row = 0; /* we don't support any more */
   1321 	int column = component.column;
   1322 
   1323 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1324 		return EINVAL;
   1325 	}
   1326 
   1327 	rf_lock_mutex2(raidPtr->mutex);
   1328 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1329 	    (raidPtr->numFailures > 0)) {
   1330 		/* XXX 0 above shouldn't be constant!!! */
   1331 		/* some component other than this has failed.
   1332 		   Let's not make things worse than they already
   1333 		   are... */
   1334 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1335 		       raidPtr->raidid);
   1336 		printf("raid%d:     Col: %d   Too many failures.\n",
   1337 		       raidPtr->raidid, column);
   1338 		rf_unlock_mutex2(raidPtr->mutex);
   1339 		return EINVAL;
   1340 	}
   1341 
   1342 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
   1343 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1344 		       raidPtr->raidid);
   1345 		printf("raid%d:    Col: %d   "
   1346 		    "Reconstruction already occurring!\n",
   1347 		    raidPtr->raidid, column);
   1348 
   1349 		rf_unlock_mutex2(raidPtr->mutex);
   1350 		return EINVAL;
   1351 	}
   1352 
   1353 	if (raidPtr->Disks[column].status == rf_ds_spared) {
   1354 		rf_unlock_mutex2(raidPtr->mutex);
   1355 		return EINVAL;
   1356 	}
   1357 
   1358 	rf_unlock_mutex2(raidPtr->mutex);
   1359 
   1360 	struct rf_recon_req_internal *rrint;
   1361 	rrint = RF_Malloc(sizeof(*rrint));
   1362 	if (rrint == NULL)
   1363 		return ENOMEM;
   1364 
   1365 	rrint->col = column;
   1366 	rrint->raidPtr = raidPtr;
   1367 
   1368 	return RF_CREATE_THREAD(raidPtr->recon_thread,
   1369 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
   1370 }
   1371 
   1372 static int
   1373 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1374 {
   1375 	/*
   1376 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1377 	 * so tell the user it's done.
   1378 	 */
   1379 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1380 	    raidPtr->status != rf_rs_reconstructing) {
   1381 		*data = 100;
   1382 		return 0;
   1383 	}
   1384 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1385 		*data = 0;
   1386 		return 0;
   1387 	}
   1388 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1389 	    / raidPtr->reconControl->numRUsTotal);
   1390 	return 0;
   1391 }
   1392 
   1393 static int
   1394 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1395 {
   1396 	int     unit = raidunit(dev);
   1397 	int     part, pmask;
   1398 	struct raid_softc *rs;
   1399 	struct dk_softc *dksc;
   1400 	RF_Config_t *k_cfg;
   1401 	RF_Raid_t *raidPtr;
   1402 	RF_AccTotals_t *totals;
   1403 	RF_SingleComponent_t component;
   1404 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1405 	int retcode = 0;
   1406 	int column;
   1407 	RF_ComponentLabel_t *clabel;
   1408 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1409 	int d;
   1410 
   1411 	if ((rs = raidget(unit, false)) == NULL)
   1412 		return ENXIO;
   1413 
   1414 	dksc = &rs->sc_dksc;
   1415 	raidPtr = &rs->sc_r;
   1416 
   1417 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1418 	    (int) DISKPART(dev), (int) unit, cmd));
   1419 
   1420 	/* Must be initialized for these... */
   1421 	if (rf_must_be_initialized(rs, cmd))
   1422 		return ENXIO;
   1423 
   1424 	switch (cmd) {
   1425 		/* configure the system */
   1426 	case RAIDFRAME_CONFIGURE:
   1427 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1428 			return retcode;
   1429 		return rf_construct(rs, k_cfg);
   1430 
   1431 		/* shutdown the system */
   1432 	case RAIDFRAME_SHUTDOWN:
   1433 
   1434 		part = DISKPART(dev);
   1435 		pmask = (1 << part);
   1436 
   1437 		if ((retcode = raidlock(rs)) != 0)
   1438 			return retcode;
   1439 
   1440 		if (DK_BUSY(dksc, pmask) ||
   1441 		    raidPtr->recon_in_progress != 0 ||
   1442 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1443 		    raidPtr->copyback_in_progress != 0)
   1444 			retcode = EBUSY;
   1445 		else {
   1446 			/* detach and free on close */
   1447 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1448 			retcode = 0;
   1449 		}
   1450 
   1451 		raidunlock(rs);
   1452 
   1453 		return retcode;
   1454 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1455 		return rf_get_component_label(raidPtr, data);
   1456 
   1457 #if RF_DISABLED
   1458 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1459 		return rf_set_component_label(raidPtr, data);
   1460 #endif
   1461 
   1462 	case RAIDFRAME_INIT_LABELS:
   1463 		return rf_init_component_label(raidPtr, data);
   1464 
   1465 	case RAIDFRAME_SET_AUTOCONFIG:
   1466 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1467 		printf("raid%d: New autoconfig value is: %d\n",
   1468 		       raidPtr->raidid, d);
   1469 		*(int *) data = d;
   1470 		return retcode;
   1471 
   1472 	case RAIDFRAME_SET_ROOT:
   1473 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1474 		printf("raid%d: New rootpartition value is: %d\n",
   1475 		       raidPtr->raidid, d);
   1476 		*(int *) data = d;
   1477 		return retcode;
   1478 
   1479 		/* initialize all parity */
   1480 	case RAIDFRAME_REWRITEPARITY:
   1481 
   1482 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1483 			/* Parity for RAID 0 is trivially correct */
   1484 			raidPtr->parity_good = RF_RAID_CLEAN;
   1485 			return 0;
   1486 		}
   1487 
   1488 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1489 			/* Re-write is already in progress! */
   1490 			return EINVAL;
   1491 		}
   1492 
   1493 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1494 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1495 
   1496 	case RAIDFRAME_ADD_HOT_SPARE:
   1497 		sparePtr = (RF_SingleComponent_t *) data;
   1498 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1499 		return rf_add_hot_spare(raidPtr, &component);
   1500 
   1501 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1502 		return retcode;
   1503 
   1504 	case RAIDFRAME_DELETE_COMPONENT:
   1505 		componentPtr = (RF_SingleComponent_t *)data;
   1506 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1507 		return rf_delete_component(raidPtr, &component);
   1508 
   1509 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1510 		componentPtr = (RF_SingleComponent_t *)data;
   1511 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1512 		return rf_incorporate_hot_spare(raidPtr, &component);
   1513 
   1514 	case RAIDFRAME_REBUILD_IN_PLACE:
   1515 		return rf_rebuild_in_place(raidPtr, data);
   1516 
   1517 	case RAIDFRAME_GET_INFO:
   1518 		ucfgp = *(RF_DeviceConfig_t **)data;
   1519 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1520 		if (d_cfg == NULL)
   1521 			return ENOMEM;
   1522 		retcode = rf_get_info(raidPtr, d_cfg);
   1523 		if (retcode == 0) {
   1524 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1525 		}
   1526 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1527 		return retcode;
   1528 
   1529 	case RAIDFRAME_CHECK_PARITY:
   1530 		*(int *) data = raidPtr->parity_good;
   1531 		return 0;
   1532 
   1533 	case RAIDFRAME_PARITYMAP_STATUS:
   1534 		if (rf_paritymap_ineligible(raidPtr))
   1535 			return EINVAL;
   1536 		rf_paritymap_status(raidPtr->parity_map, data);
   1537 		return 0;
   1538 
   1539 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1540 		if (rf_paritymap_ineligible(raidPtr))
   1541 			return EINVAL;
   1542 		if (raidPtr->parity_map == NULL)
   1543 			return ENOENT; /* ??? */
   1544 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1545 			return EINVAL;
   1546 		return 0;
   1547 
   1548 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1549 		if (rf_paritymap_ineligible(raidPtr))
   1550 			return EINVAL;
   1551 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1552 		return 0;
   1553 
   1554 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1555 		if (rf_paritymap_ineligible(raidPtr))
   1556 			return EINVAL;
   1557 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1558 		/* XXX should errors be passed up? */
   1559 		return 0;
   1560 
   1561 	case RAIDFRAME_RESET_ACCTOTALS:
   1562 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1563 		return 0;
   1564 
   1565 	case RAIDFRAME_GET_ACCTOTALS:
   1566 		totals = (RF_AccTotals_t *) data;
   1567 		*totals = raidPtr->acc_totals;
   1568 		return 0;
   1569 
   1570 	case RAIDFRAME_KEEP_ACCTOTALS:
   1571 		raidPtr->keep_acc_totals = *(int *)data;
   1572 		return 0;
   1573 
   1574 	case RAIDFRAME_GET_SIZE:
   1575 		*(int *) data = raidPtr->totalSectors;
   1576 		return 0;
   1577 
   1578 	case RAIDFRAME_FAIL_DISK:
   1579 		return rf_fail_disk(raidPtr, data);
   1580 
   1581 		/* invoke a copyback operation after recon on whatever disk
   1582 		 * needs it, if any */
   1583 	case RAIDFRAME_COPYBACK:
   1584 
   1585 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1586 			/* This makes no sense on a RAID 0!! */
   1587 			return EINVAL;
   1588 		}
   1589 
   1590 		if (raidPtr->copyback_in_progress == 1) {
   1591 			/* Copyback is already in progress! */
   1592 			return EINVAL;
   1593 		}
   1594 
   1595 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1596 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1597 
   1598 		/* return the percentage completion of reconstruction */
   1599 	case RAIDFRAME_CHECK_RECON_STATUS:
   1600 		return rf_check_recon_status(raidPtr, data);
   1601 
   1602 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1603 		rf_check_recon_status_ext(raidPtr, data);
   1604 		return 0;
   1605 
   1606 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1607 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1608 			/* This makes no sense on a RAID 0, so tell the
   1609 			   user it's done. */
   1610 			*(int *) data = 100;
   1611 			return 0;
   1612 		}
   1613 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1614 			*(int *) data = 100 *
   1615 				raidPtr->parity_rewrite_stripes_done /
   1616 				raidPtr->Layout.numStripe;
   1617 		} else {
   1618 			*(int *) data = 100;
   1619 		}
   1620 		return 0;
   1621 
   1622 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1623 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1624 		return 0;
   1625 
   1626 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1627 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1628 			/* This makes no sense on a RAID 0 */
   1629 			*(int *) data = 100;
   1630 			return 0;
   1631 		}
   1632 		if (raidPtr->copyback_in_progress == 1) {
   1633 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1634 				raidPtr->Layout.numStripe;
   1635 		} else {
   1636 			*(int *) data = 100;
   1637 		}
   1638 		return 0;
   1639 
   1640 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1641 		rf_check_copyback_status_ext(raidPtr, data);
   1642 		return 0;
   1643 
   1644 	case RAIDFRAME_SET_LAST_UNIT:
   1645 		for (column = 0; column < raidPtr->numCol; column++)
   1646 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1647 				return EBUSY;
   1648 
   1649 		for (column = 0; column < raidPtr->numCol; column++) {
   1650 			clabel = raidget_component_label(raidPtr, column);
   1651 			clabel->last_unit = *(int *)data;
   1652 			raidflush_component_label(raidPtr, column);
   1653 		}
   1654 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1655 		return 0;
   1656 
   1657 		/* the sparetable daemon calls this to wait for the kernel to
   1658 		 * need a spare table. this ioctl does not return until a
   1659 		 * spare table is needed. XXX -- calling mpsleep here in the
   1660 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1661 		 * -- I should either compute the spare table in the kernel,
   1662 		 * or have a different -- XXX XXX -- interface (a different
   1663 		 * character device) for delivering the table     -- XXX */
   1664 #if RF_DISABLED
   1665 	case RAIDFRAME_SPARET_WAIT:
   1666 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1667 		while (!rf_sparet_wait_queue)
   1668 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1669 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1670 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1671 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1672 
   1673 		/* structure assignment */
   1674 		*((RF_SparetWait_t *) data) = *waitreq;
   1675 
   1676 		RF_Free(waitreq, sizeof(*waitreq));
   1677 		return 0;
   1678 
   1679 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1680 		 * code in it that will cause the dameon to exit */
   1681 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1682 		waitreq = RF_Malloc(sizeof(*waitreq));
   1683 		waitreq->fcol = -1;
   1684 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1685 		waitreq->next = rf_sparet_wait_queue;
   1686 		rf_sparet_wait_queue = waitreq;
   1687 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1688 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1689 		return 0;
   1690 
   1691 		/* used by the spare table daemon to deliver a spare table
   1692 		 * into the kernel */
   1693 	case RAIDFRAME_SEND_SPARET:
   1694 
   1695 		/* install the spare table */
   1696 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1697 
   1698 		/* respond to the requestor.  the return status of the spare
   1699 		 * table installation is passed in the "fcol" field */
   1700 		waitred = RF_Malloc(sizeof(*waitreq));
   1701 		waitreq->fcol = retcode;
   1702 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1703 		waitreq->next = rf_sparet_resp_queue;
   1704 		rf_sparet_resp_queue = waitreq;
   1705 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1706 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1707 
   1708 		return retcode;
   1709 #endif
   1710 	default:
   1711 		/*
   1712 		 * Don't bother trying to load compat modules
   1713 		 * if it is not our ioctl. This is more efficient
   1714 		 * and makes rump tests not depend on compat code
   1715 		 */
   1716 		if (IOCGROUP(cmd) != 'r')
   1717 			break;
   1718 #ifdef _LP64
   1719 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1720 			module_autoload("compat_netbsd32_raid",
   1721 			    MODULE_CLASS_EXEC);
   1722 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1723 			    (rs, cmd, data), enosys(), retcode);
   1724 			if (retcode != EPASSTHROUGH)
   1725 				return retcode;
   1726 		}
   1727 #endif
   1728 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1729 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1730 		    (rs, cmd, data), enosys(), retcode);
   1731 		if (retcode != EPASSTHROUGH)
   1732 			return retcode;
   1733 
   1734 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1735 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1736 		    (rs, cmd, data), enosys(), retcode);
   1737 		if (retcode != EPASSTHROUGH)
   1738 			return retcode;
   1739 		break; /* fall through to the os-specific code below */
   1740 
   1741 	}
   1742 
   1743 	if (!raidPtr->valid)
   1744 		return (EINVAL);
   1745 
   1746 	/*
   1747 	 * Add support for "regular" device ioctls here.
   1748 	 */
   1749 
   1750 	switch (cmd) {
   1751 	case DIOCGCACHE:
   1752 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1753 		break;
   1754 
   1755 	case DIOCCACHESYNC:
   1756 		retcode = rf_sync_component_caches(raidPtr);
   1757 		break;
   1758 
   1759 	default:
   1760 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1761 		break;
   1762 	}
   1763 
   1764 	return (retcode);
   1765 
   1766 }
   1767 
   1768 
   1769 /* raidinit -- complete the rest of the initialization for the
   1770    RAIDframe device.  */
   1771 
   1772 
   1773 static void
   1774 raidinit(struct raid_softc *rs)
   1775 {
   1776 	cfdata_t cf;
   1777 	unsigned int unit;
   1778 	struct dk_softc *dksc = &rs->sc_dksc;
   1779 	RF_Raid_t *raidPtr = &rs->sc_r;
   1780 	device_t dev;
   1781 
   1782 	unit = raidPtr->raidid;
   1783 
   1784 	/* XXX doesn't check bounds. */
   1785 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1786 
   1787 	/* attach the pseudo device */
   1788 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1789 	cf->cf_name = raid_cd.cd_name;
   1790 	cf->cf_atname = raid_cd.cd_name;
   1791 	cf->cf_unit = unit;
   1792 	cf->cf_fstate = FSTATE_STAR;
   1793 
   1794 	dev = config_attach_pseudo(cf);
   1795 	if (dev == NULL) {
   1796 		printf("raid%d: config_attach_pseudo failed\n",
   1797 		    raidPtr->raidid);
   1798 		free(cf, M_RAIDFRAME);
   1799 		return;
   1800 	}
   1801 
   1802 	/* provide a backpointer to the real softc */
   1803 	raidsoftc(dev) = rs;
   1804 
   1805 	/* disk_attach actually creates space for the CPU disklabel, among
   1806 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1807 	 * with disklabels. */
   1808 	dk_init(dksc, dev, DKTYPE_RAID);
   1809 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1810 
   1811 	/* XXX There may be a weird interaction here between this, and
   1812 	 * protectedSectors, as used in RAIDframe.  */
   1813 
   1814 	rs->sc_size = raidPtr->totalSectors;
   1815 
   1816 	/* Attach dk and disk subsystems */
   1817 	dk_attach(dksc);
   1818 	disk_attach(&dksc->sc_dkdev);
   1819 	rf_set_geometry(rs, raidPtr);
   1820 
   1821 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1822 
   1823 	/* mark unit as usuable */
   1824 	rs->sc_flags |= RAIDF_INITED;
   1825 
   1826 	dkwedge_discover(&dksc->sc_dkdev);
   1827 }
   1828 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * rf_GetSpareTableFromDaemon -- ask the spare-table daemon for a table.
 *
 * Pushes `req' onto the wait queue, wakes any waiter, then sleeps until a
 * response shows up on the response queue.  The response record is
 * popped, its fcol field is returned as the status, and the record is
 * freed.
 *
 * XXX
 * The queue entries should be tagged with the raidPtr so that, in the
 * extremely rare case of two recons happening at once, we know which
 * device the spare table was requested for.
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* Queue the request and let the daemon know about it. */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* The wait drops the mutex while sleeping. */
	while (rf_sparet_resp_queue == NULL)
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);

	/* Pop the response; note this is not the record we queued. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;

	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = req->fcol;
	RF_Free(req, sizeof(*req));

	return (status);
}
#endif
   1863 
   1864 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1865  * bp & passes it down.
   1866  * any calls originating in the kernel must use non-blocking I/O
   1867  * do some extra sanity checking to return "appropriate" error values for
   1868  * certain conditions (to make some standard utilities work)
   1869  *
   1870  * Formerly known as: rf_DoAccessKernel
   1871  */
   1872 void
   1873 raidstart(RF_Raid_t *raidPtr)
   1874 {
   1875 	struct raid_softc *rs;
   1876 	struct dk_softc *dksc;
   1877 
   1878 	rs = raidPtr->softc;
   1879 	dksc = &rs->sc_dksc;
   1880 	/* quick check to see if anything has died recently */
   1881 	rf_lock_mutex2(raidPtr->mutex);
   1882 	if (raidPtr->numNewFailures > 0) {
   1883 		rf_unlock_mutex2(raidPtr->mutex);
   1884 		rf_update_component_labels(raidPtr,
   1885 					   RF_NORMAL_COMPONENT_UPDATE);
   1886 		rf_lock_mutex2(raidPtr->mutex);
   1887 		raidPtr->numNewFailures--;
   1888 	}
   1889 	rf_unlock_mutex2(raidPtr->mutex);
   1890 
   1891 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1892 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1893 		return;
   1894 	}
   1895 
   1896 	dk_start(dksc, NULL);
   1897 }
   1898 
   1899 static int
   1900 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1901 {
   1902 	RF_SectorCount_t num_blocks, pb, sum;
   1903 	RF_RaidAddr_t raid_addr;
   1904 	daddr_t blocknum;
   1905 	int     do_async;
   1906 	int rc;
   1907 
   1908 	rf_lock_mutex2(raidPtr->mutex);
   1909 	if (raidPtr->openings == 0) {
   1910 		rf_unlock_mutex2(raidPtr->mutex);
   1911 		return EAGAIN;
   1912 	}
   1913 	rf_unlock_mutex2(raidPtr->mutex);
   1914 
   1915 	blocknum = bp->b_rawblkno;
   1916 
   1917 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1918 		    (int) blocknum));
   1919 
   1920 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1921 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1922 
   1923 	/* *THIS* is where we adjust what block we're going to...
   1924 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1925 	raid_addr = blocknum;
   1926 
   1927 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1928 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1929 	sum = raid_addr + num_blocks + pb;
   1930 	if (1 || rf_debugKernelAccess) {
   1931 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   1932 			    (int) raid_addr, (int) sum, (int) num_blocks,
   1933 			    (int) pb, (int) bp->b_resid));
   1934 	}
   1935 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   1936 	    || (sum < num_blocks) || (sum < pb)) {
   1937 		rc = ENOSPC;
   1938 		goto done;
   1939 	}
   1940 	/*
   1941 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   1942 	 */
   1943 
   1944 	if (bp->b_bcount & raidPtr->sectorMask) {
   1945 		rc = ENOSPC;
   1946 		goto done;
   1947 	}
   1948 	db1_printf(("Calling DoAccess..\n"));
   1949 
   1950 
   1951 	rf_lock_mutex2(raidPtr->mutex);
   1952 	raidPtr->openings--;
   1953 	rf_unlock_mutex2(raidPtr->mutex);
   1954 
   1955 	/*
   1956 	 * Everything is async.
   1957 	 */
   1958 	do_async = 1;
   1959 
   1960 	/* don't ever condition on bp->b_flags & B_WRITE.
   1961 	 * always condition on B_READ instead */
   1962 
   1963 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   1964 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   1965 			 do_async, raid_addr, num_blocks,
   1966 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   1967 
   1968 done:
   1969 	return rc;
   1970 }
   1971 
   1972 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1973 
   1974 int
   1975 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   1976 {
   1977 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   1978 	struct buf *bp;
   1979 
   1980 	req->queue = queue;
   1981 	bp = req->bp;
   1982 
   1983 	switch (req->type) {
   1984 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   1985 		/* XXX need to do something extra here.. */
   1986 		/* I'm leaving this in, as I've never actually seen it used,
   1987 		 * and I'd like folks to report it... GO */
   1988 		printf(("WAKEUP CALLED\n"));
   1989 		queue->numOutstanding++;
   1990 
   1991 		bp->b_flags = 0;
   1992 		bp->b_private = req;
   1993 
   1994 		KernelWakeupFunc(bp);
   1995 		break;
   1996 
   1997 	case RF_IO_TYPE_READ:
   1998 	case RF_IO_TYPE_WRITE:
   1999 #if RF_ACC_TRACE > 0
   2000 		if (req->tracerec) {
   2001 			RF_ETIMER_START(req->tracerec->timer);
   2002 		}
   2003 #endif
   2004 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2005 		    op, queue->rf_cinfo->ci_dev,
   2006 		    req->sectorOffset, req->numSector,
   2007 		    req->buf, KernelWakeupFunc, (void *) req,
   2008 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2009 
   2010 		if (rf_debugKernelAccess) {
   2011 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2012 				(long) bp->b_blkno));
   2013 		}
   2014 		queue->numOutstanding++;
   2015 		queue->last_deq_sector = req->sectorOffset;
   2016 		/* acc wouldn't have been let in if there were any pending
   2017 		 * reqs at any other priority */
   2018 		queue->curPriority = req->priority;
   2019 
   2020 		db1_printf(("Going for %c to unit %d col %d\n",
   2021 			    req->type, queue->raidPtr->raidid,
   2022 			    queue->col));
   2023 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2024 			(int) req->sectorOffset, (int) req->numSector,
   2025 			(int) (req->numSector <<
   2026 			    queue->raidPtr->logBytesPerSector),
   2027 			(int) queue->raidPtr->logBytesPerSector));
   2028 
   2029 		/*
   2030 		 * XXX: drop lock here since this can block at
   2031 		 * least with backing SCSI devices.  Retake it
   2032 		 * to minimize fuss with calling interfaces.
   2033 		 */
   2034 
   2035 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2036 		bdev_strategy(bp);
   2037 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2038 		break;
   2039 
   2040 	default:
   2041 		panic("bad req->type in rf_DispatchKernelIO");
   2042 	}
   2043 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2044 
   2045 	return (0);
   2046 }
   2047 /* this is the callback function associated with a I/O invoked from
   2048    kernel code.
   2049  */
   2050 static void
   2051 KernelWakeupFunc(struct buf *bp)
   2052 {
   2053 	RF_DiskQueueData_t *req = NULL;
   2054 	RF_DiskQueue_t *queue;
   2055 
   2056 	db1_printf(("recovering the request queue:\n"));
   2057 
   2058 	req = bp->b_private;
   2059 
   2060 	queue = (RF_DiskQueue_t *) req->queue;
   2061 
   2062 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2063 
   2064 #if RF_ACC_TRACE > 0
   2065 	if (req->tracerec) {
   2066 		RF_ETIMER_STOP(req->tracerec->timer);
   2067 		RF_ETIMER_EVAL(req->tracerec->timer);
   2068 		rf_lock_mutex2(rf_tracing_mutex);
   2069 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2070 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2071 		req->tracerec->num_phys_ios++;
   2072 		rf_unlock_mutex2(rf_tracing_mutex);
   2073 	}
   2074 #endif
   2075 
   2076 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2077 	 * ballistic, and mark the component as hosed... */
   2078 
   2079 	if (bp->b_error != 0) {
   2080 		/* Mark the disk as dead */
   2081 		/* but only mark it once... */
   2082 		/* and only if it wouldn't leave this RAID set
   2083 		   completely broken */
   2084 		if (((queue->raidPtr->Disks[queue->col].status ==
   2085 		      rf_ds_optimal) ||
   2086 		     (queue->raidPtr->Disks[queue->col].status ==
   2087 		      rf_ds_used_spare)) &&
   2088 		     (queue->raidPtr->numFailures <
   2089 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2090 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2091 			       queue->raidPtr->raidid,
   2092 			       bp->b_error,
   2093 			       queue->raidPtr->Disks[queue->col].devname);
   2094 			queue->raidPtr->Disks[queue->col].status =
   2095 			    rf_ds_failed;
   2096 			queue->raidPtr->status = rf_rs_degraded;
   2097 			queue->raidPtr->numFailures++;
   2098 			queue->raidPtr->numNewFailures++;
   2099 		} else {	/* Disk is already dead... */
   2100 			/* printf("Disk already marked as dead!\n"); */
   2101 		}
   2102 
   2103 	}
   2104 
   2105 	/* Fill in the error value */
   2106 	req->error = bp->b_error;
   2107 
   2108 	/* Drop this one on the "finished" queue... */
   2109 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2110 
   2111 	/* Let the raidio thread know there is work to be done. */
   2112 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2113 
   2114 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2115 }
   2116 
   2117 
   2118 /*
   2119  * initialize a buf structure for doing an I/O in the kernel.
   2120  */
   2121 static void
   2122 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2123        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2124        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2125        struct proc *b_proc)
   2126 {
   2127 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2128 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2129 	bp->b_oflags = 0;
   2130 	bp->b_cflags = 0;
   2131 	bp->b_bcount = numSect << logBytesPerSector;
   2132 	bp->b_bufsize = bp->b_bcount;
   2133 	bp->b_error = 0;
   2134 	bp->b_dev = dev;
   2135 	bp->b_data = bf;
   2136 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2137 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2138 	if (bp->b_bcount == 0) {
   2139 		panic("bp->b_bcount is zero in InitBP!!");
   2140 	}
   2141 	bp->b_proc = b_proc;
   2142 	bp->b_iodone = cbFunc;
   2143 	bp->b_private = cbArg;
   2144 }
   2145 
   2146 /*
   2147  * Wait interruptibly for an exclusive lock.
   2148  *
   2149  * XXX
   2150  * Several drivers do this; it should be abstracted and made MP-safe.
   2151  * (Hmm... where have we seen this warning before :->  GO )
   2152  */
   2153 static int
   2154 raidlock(struct raid_softc *rs)
   2155 {
   2156 	int     error;
   2157 
   2158 	error = 0;
   2159 	mutex_enter(&rs->sc_mutex);
   2160 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2161 		rs->sc_flags |= RAIDF_WANTED;
   2162 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2163 		if (error != 0)
   2164 			goto done;
   2165 	}
   2166 	rs->sc_flags |= RAIDF_LOCKED;
   2167 done:
   2168 	mutex_exit(&rs->sc_mutex);
   2169 	return (error);
   2170 }
   2171 /*
   2172  * Unlock and wake up any waiters.
   2173  */
   2174 static void
   2175 raidunlock(struct raid_softc *rs)
   2176 {
   2177 
   2178 	mutex_enter(&rs->sc_mutex);
   2179 	rs->sc_flags &= ~RAIDF_LOCKED;
   2180 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2181 		rs->sc_flags &= ~RAIDF_WANTED;
   2182 		cv_broadcast(&rs->sc_cv);
   2183 	}
   2184 	mutex_exit(&rs->sc_mutex);
   2185 }
   2186 
   2187 
   2188 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2189 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2190 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2191 
   2192 static daddr_t
   2193 rf_component_info_offset(void)
   2194 {
   2195 
   2196 	return RF_COMPONENT_INFO_OFFSET;
   2197 }
   2198 
   2199 static daddr_t
   2200 rf_component_info_size(unsigned secsize)
   2201 {
   2202 	daddr_t info_size;
   2203 
   2204 	KASSERT(secsize);
   2205 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2206 		info_size = secsize;
   2207 	else
   2208 		info_size = RF_COMPONENT_INFO_SIZE;
   2209 
   2210 	return info_size;
   2211 }
   2212 
   2213 static daddr_t
   2214 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2215 {
   2216 	daddr_t map_offset;
   2217 
   2218 	KASSERT(raidPtr->bytesPerSector);
   2219 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2220 		map_offset = raidPtr->bytesPerSector;
   2221 	else
   2222 		map_offset = RF_COMPONENT_INFO_SIZE;
   2223 	map_offset += rf_component_info_offset();
   2224 
   2225 	return map_offset;
   2226 }
   2227 
   2228 static daddr_t
   2229 rf_parity_map_size(RF_Raid_t *raidPtr)
   2230 {
   2231 	daddr_t map_size;
   2232 
   2233 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2234 		map_size = raidPtr->bytesPerSector;
   2235 	else
   2236 		map_size = RF_PARITY_MAP_SIZE;
   2237 
   2238 	return map_size;
   2239 }
   2240 
   2241 int
   2242 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2243 {
   2244 	RF_ComponentLabel_t *clabel;
   2245 
   2246 	clabel = raidget_component_label(raidPtr, col);
   2247 	clabel->clean = RF_RAID_CLEAN;
   2248 	raidflush_component_label(raidPtr, col);
   2249 	return(0);
   2250 }
   2251 
   2252 
   2253 int
   2254 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2255 {
   2256 	RF_ComponentLabel_t *clabel;
   2257 
   2258 	clabel = raidget_component_label(raidPtr, col);
   2259 	clabel->clean = RF_RAID_DIRTY;
   2260 	raidflush_component_label(raidPtr, col);
   2261 	return(0);
   2262 }
   2263 
   2264 int
   2265 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2266 {
   2267 	KASSERT(raidPtr->bytesPerSector);
   2268 	return raidread_component_label(raidPtr->bytesPerSector,
   2269 	    raidPtr->Disks[col].dev,
   2270 	    raidPtr->raid_cinfo[col].ci_vp,
   2271 	    &raidPtr->raid_cinfo[col].ci_label);
   2272 }
   2273 
   2274 RF_ComponentLabel_t *
   2275 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2276 {
   2277 	return &raidPtr->raid_cinfo[col].ci_label;
   2278 }
   2279 
   2280 int
   2281 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2282 {
   2283 	RF_ComponentLabel_t *label;
   2284 
   2285 	label = &raidPtr->raid_cinfo[col].ci_label;
   2286 	label->mod_counter = raidPtr->mod_counter;
   2287 #ifndef RF_NO_PARITY_MAP
   2288 	label->parity_map_modcount = label->mod_counter;
   2289 #endif
   2290 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2291 	    raidPtr->Disks[col].dev,
   2292 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2293 }
   2294 
   2295 
   2296 static int
   2297 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2298     RF_ComponentLabel_t *clabel)
   2299 {
   2300 	return raidread_component_area(dev, b_vp, clabel,
   2301 	    sizeof(RF_ComponentLabel_t),
   2302 	    rf_component_info_offset(),
   2303 	    rf_component_info_size(secsize));
   2304 }
   2305 
   2306 /* ARGSUSED */
   2307 static int
   2308 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2309     size_t msize, daddr_t offset, daddr_t dsize)
   2310 {
   2311 	struct buf *bp;
   2312 	int error;
   2313 
   2314 	/* XXX should probably ensure that we don't try to do this if
   2315 	   someone has changed rf_protected_sectors. */
   2316 
   2317 	if (b_vp == NULL) {
   2318 		/* For whatever reason, this component is not valid.
   2319 		   Don't try to read a component label from it. */
   2320 		return(EINVAL);
   2321 	}
   2322 
   2323 	/* get a block of the appropriate size... */
   2324 	bp = geteblk((int)dsize);
   2325 	bp->b_dev = dev;
   2326 
   2327 	/* get our ducks in a row for the read */
   2328 	bp->b_blkno = offset / DEV_BSIZE;
   2329 	bp->b_bcount = dsize;
   2330 	bp->b_flags |= B_READ;
   2331  	bp->b_resid = dsize;
   2332 
   2333 	bdev_strategy(bp);
   2334 	error = biowait(bp);
   2335 
   2336 	if (!error) {
   2337 		memcpy(data, bp->b_data, msize);
   2338 	}
   2339 
   2340 	brelse(bp, 0);
   2341 	return(error);
   2342 }
   2343 
   2344 
   2345 static int
   2346 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2347     RF_ComponentLabel_t *clabel)
   2348 {
   2349 	return raidwrite_component_area(dev, b_vp, clabel,
   2350 	    sizeof(RF_ComponentLabel_t),
   2351 	    rf_component_info_offset(),
   2352 	    rf_component_info_size(secsize), 0);
   2353 }
   2354 
   2355 /* ARGSUSED */
   2356 static int
   2357 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2358     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2359 {
   2360 	struct buf *bp;
   2361 	int error;
   2362 
   2363 	/* get a block of the appropriate size... */
   2364 	bp = geteblk((int)dsize);
   2365 	bp->b_dev = dev;
   2366 
   2367 	/* get our ducks in a row for the write */
   2368 	bp->b_blkno = offset / DEV_BSIZE;
   2369 	bp->b_bcount = dsize;
   2370 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2371  	bp->b_resid = dsize;
   2372 
   2373 	memset(bp->b_data, 0, dsize);
   2374 	memcpy(bp->b_data, data, msize);
   2375 
   2376 	bdev_strategy(bp);
   2377 	if (asyncp)
   2378 		return 0;
   2379 	error = biowait(bp);
   2380 	brelse(bp, 0);
   2381 	if (error) {
   2382 #if 1
   2383 		printf("Failed to write RAID component info!\n");
   2384 #endif
   2385 	}
   2386 
   2387 	return(error);
   2388 }
   2389 
   2390 void
   2391 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2392 {
   2393 	int c;
   2394 
   2395 	for (c = 0; c < raidPtr->numCol; c++) {
   2396 		/* Skip dead disks. */
   2397 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2398 			continue;
   2399 		/* XXXjld: what if an error occurs here? */
   2400 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2401 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2402 		    RF_PARITYMAP_NBYTE,
   2403 		    rf_parity_map_offset(raidPtr),
   2404 		    rf_parity_map_size(raidPtr), 0);
   2405 	}
   2406 }
   2407 
   2408 void
   2409 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2410 {
   2411 	struct rf_paritymap_ondisk tmp;
   2412 	int c,first;
   2413 
   2414 	first=1;
   2415 	for (c = 0; c < raidPtr->numCol; c++) {
   2416 		/* Skip dead disks. */
   2417 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2418 			continue;
   2419 		raidread_component_area(raidPtr->Disks[c].dev,
   2420 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2421 		    RF_PARITYMAP_NBYTE,
   2422 		    rf_parity_map_offset(raidPtr),
   2423 		    rf_parity_map_size(raidPtr));
   2424 		if (first) {
   2425 			memcpy(map, &tmp, sizeof(*map));
   2426 			first = 0;
   2427 		} else {
   2428 			rf_paritymap_merge(map, &tmp);
   2429 		}
   2430 	}
   2431 }
   2432 
   2433 void
   2434 rf_markalldirty(RF_Raid_t *raidPtr)
   2435 {
   2436 	RF_ComponentLabel_t *clabel;
   2437 	int sparecol;
   2438 	int c;
   2439 	int j;
   2440 	int scol = -1;
   2441 
   2442 	raidPtr->mod_counter++;
   2443 	for (c = 0; c < raidPtr->numCol; c++) {
   2444 		/* we don't want to touch (at all) a disk that has
   2445 		   failed */
   2446 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2447 			clabel = raidget_component_label(raidPtr, c);
   2448 			if (clabel->status == rf_ds_spared) {
   2449 				/* XXX do something special...
   2450 				   but whatever you do, don't
   2451 				   try to access it!! */
   2452 			} else {
   2453 				raidmarkdirty(raidPtr, c);
   2454 			}
   2455 		}
   2456 	}
   2457 
   2458 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2459 		sparecol = raidPtr->numCol + c;
   2460 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2461 			/*
   2462 
   2463 			   we claim this disk is "optimal" if it's
   2464 			   rf_ds_used_spare, as that means it should be
   2465 			   directly substitutable for the disk it replaced.
   2466 			   We note that too...
   2467 
   2468 			 */
   2469 
   2470 			for(j=0;j<raidPtr->numCol;j++) {
   2471 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2472 					scol = j;
   2473 					break;
   2474 				}
   2475 			}
   2476 
   2477 			clabel = raidget_component_label(raidPtr, sparecol);
   2478 			/* make sure status is noted */
   2479 
   2480 			raid_init_component_label(raidPtr, clabel);
   2481 
   2482 			clabel->row = 0;
   2483 			clabel->column = scol;
   2484 			/* Note: we *don't* change status from rf_ds_used_spare
   2485 			   to rf_ds_optimal */
   2486 			/* clabel.status = rf_ds_optimal; */
   2487 
   2488 			raidmarkdirty(raidPtr, sparecol);
   2489 		}
   2490 	}
   2491 }
   2492 
   2493 
   2494 void
   2495 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2496 {
   2497 	RF_ComponentLabel_t *clabel;
   2498 	int sparecol;
   2499 	int c;
   2500 	int j;
   2501 	int scol;
   2502 	struct raid_softc *rs = raidPtr->softc;
   2503 
   2504 	scol = -1;
   2505 
   2506 	/* XXX should do extra checks to make sure things really are clean,
   2507 	   rather than blindly setting the clean bit... */
   2508 
   2509 	raidPtr->mod_counter++;
   2510 
   2511 	for (c = 0; c < raidPtr->numCol; c++) {
   2512 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2513 			clabel = raidget_component_label(raidPtr, c);
   2514 			/* make sure status is noted */
   2515 			clabel->status = rf_ds_optimal;
   2516 
   2517 			/* note what unit we are configured as */
   2518 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2519 				clabel->last_unit = raidPtr->raidid;
   2520 
   2521 			raidflush_component_label(raidPtr, c);
   2522 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2523 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2524 					raidmarkclean(raidPtr, c);
   2525 				}
   2526 			}
   2527 		}
   2528 		/* else we don't touch it.. */
   2529 	}
   2530 
   2531 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2532 		sparecol = raidPtr->numCol + c;
   2533 		/* Need to ensure that the reconstruct actually completed! */
   2534 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2535 			/*
   2536 
   2537 			   we claim this disk is "optimal" if it's
   2538 			   rf_ds_used_spare, as that means it should be
   2539 			   directly substitutable for the disk it replaced.
   2540 			   We note that too...
   2541 
   2542 			 */
   2543 
   2544 			for(j=0;j<raidPtr->numCol;j++) {
   2545 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2546 					scol = j;
   2547 					break;
   2548 				}
   2549 			}
   2550 
   2551 			/* XXX shouldn't *really* need this... */
   2552 			clabel = raidget_component_label(raidPtr, sparecol);
   2553 			/* make sure status is noted */
   2554 
   2555 			raid_init_component_label(raidPtr, clabel);
   2556 
   2557 			clabel->column = scol;
   2558 			clabel->status = rf_ds_optimal;
   2559 			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
   2560 				clabel->last_unit = raidPtr->raidid;
   2561 
   2562 			raidflush_component_label(raidPtr, sparecol);
   2563 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2564 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2565 					raidmarkclean(raidPtr, sparecol);
   2566 				}
   2567 			}
   2568 		}
   2569 	}
   2570 }
   2571 
   2572 void
   2573 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2574 {
   2575 
   2576 	if (vp != NULL) {
   2577 		if (auto_configured == 1) {
   2578 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2579 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2580 			vput(vp);
   2581 
   2582 		} else {
   2583 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2584 		}
   2585 	}
   2586 }
   2587 
   2588 
   2589 void
   2590 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2591 {
   2592 	int r,c;
   2593 	struct vnode *vp;
   2594 	int acd;
   2595 
   2596 
   2597 	/* We take this opportunity to close the vnodes like we should.. */
   2598 
   2599 	for (c = 0; c < raidPtr->numCol; c++) {
   2600 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2601 		acd = raidPtr->Disks[c].auto_configured;
   2602 		rf_close_component(raidPtr, vp, acd);
   2603 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2604 		raidPtr->Disks[c].auto_configured = 0;
   2605 	}
   2606 
   2607 	for (r = 0; r < raidPtr->numSpare; r++) {
   2608 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2609 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2610 		rf_close_component(raidPtr, vp, acd);
   2611 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2612 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2613 	}
   2614 }
   2615 
   2616 
   2617 void
   2618 rf_ReconThread(struct rf_recon_req_internal *req)
   2619 {
   2620 	int     s;
   2621 	RF_Raid_t *raidPtr;
   2622 
   2623 	s = splbio();
   2624 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2625 	raidPtr->recon_in_progress = 1;
   2626 
   2627 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2628 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2629 
   2630 	RF_Free(req, sizeof(*req));
   2631 
   2632 	raidPtr->recon_in_progress = 0;
   2633 	splx(s);
   2634 
   2635 	/* That's all... */
   2636 	kthread_exit(0);	/* does not return */
   2637 }
   2638 
   2639 void
   2640 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2641 {
   2642 	int retcode;
   2643 	int s;
   2644 
   2645 	raidPtr->parity_rewrite_stripes_done = 0;
   2646 	raidPtr->parity_rewrite_in_progress = 1;
   2647 	s = splbio();
   2648 	retcode = rf_RewriteParity(raidPtr);
   2649 	splx(s);
   2650 	if (retcode) {
   2651 		printf("raid%d: Error re-writing parity (%d)!\n",
   2652 		    raidPtr->raidid, retcode);
   2653 	} else {
   2654 		/* set the clean bit!  If we shutdown correctly,
   2655 		   the clean bit on each component label will get
   2656 		   set */
   2657 		raidPtr->parity_good = RF_RAID_CLEAN;
   2658 	}
   2659 	raidPtr->parity_rewrite_in_progress = 0;
   2660 
   2661 	/* Anyone waiting for us to stop?  If so, inform them... */
   2662 	if (raidPtr->waitShutdown) {
   2663 		rf_lock_mutex2(raidPtr->rad_lock);
   2664 		cv_broadcast(&raidPtr->parity_rewrite_cv);
   2665 		rf_unlock_mutex2(raidPtr->rad_lock);
   2666 	}
   2667 
   2668 	/* That's all... */
   2669 	kthread_exit(0);	/* does not return */
   2670 }
   2671 
   2672 
   2673 void
   2674 rf_CopybackThread(RF_Raid_t *raidPtr)
   2675 {
   2676 	int s;
   2677 
   2678 	raidPtr->copyback_in_progress = 1;
   2679 	s = splbio();
   2680 	rf_CopybackReconstructedData(raidPtr);
   2681 	splx(s);
   2682 	raidPtr->copyback_in_progress = 0;
   2683 
   2684 	/* That's all... */
   2685 	kthread_exit(0);	/* does not return */
   2686 }
   2687 
   2688 
   2689 void
   2690 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2691 {
   2692 	int s;
   2693 	RF_Raid_t *raidPtr;
   2694 
   2695 	s = splbio();
   2696 	raidPtr = req->raidPtr;
   2697 	raidPtr->recon_in_progress = 1;
   2698 	rf_ReconstructInPlace(raidPtr, req->col);
   2699 	RF_Free(req, sizeof(*req));
   2700 	raidPtr->recon_in_progress = 0;
   2701 	splx(s);
   2702 
   2703 	/* That's all... */
   2704 	kthread_exit(0);	/* does not return */
   2705 }
   2706 
   2707 static RF_AutoConfig_t *
   2708 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2709     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2710     unsigned secsize)
   2711 {
   2712 	int good_one = 0;
   2713 	RF_ComponentLabel_t *clabel;
   2714 	RF_AutoConfig_t *ac;
   2715 
   2716 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2717 	if (clabel == NULL) {
   2718 oomem:
   2719 		    while(ac_list) {
   2720 			    ac = ac_list;
   2721 			    if (ac->clabel)
   2722 				    free(ac->clabel, M_RAIDFRAME);
   2723 			    ac_list = ac_list->next;
   2724 			    free(ac, M_RAIDFRAME);
   2725 		    }
   2726 		    printf("RAID auto config: out of memory!\n");
   2727 		    return NULL; /* XXX probably should panic? */
   2728 	}
   2729 
   2730 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2731 		/* Got the label.  Does it look reasonable? */
   2732 		if (rf_reasonable_label(clabel, numsecs) &&
   2733 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2734 #ifdef DEBUG
   2735 			printf("Component on: %s: %llu\n",
   2736 				cname, (unsigned long long)size);
   2737 			rf_print_component_label(clabel);
   2738 #endif
   2739 			/* if it's reasonable, add it, else ignore it. */
   2740 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2741 				M_NOWAIT);
   2742 			if (ac == NULL) {
   2743 				free(clabel, M_RAIDFRAME);
   2744 				goto oomem;
   2745 			}
   2746 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2747 			ac->dev = dev;
   2748 			ac->vp = vp;
   2749 			ac->clabel = clabel;
   2750 			ac->next = ac_list;
   2751 			ac_list = ac;
   2752 			good_one = 1;
   2753 		}
   2754 	}
   2755 	if (!good_one) {
   2756 		/* cleanup */
   2757 		free(clabel, M_RAIDFRAME);
   2758 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2759 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2760 		vput(vp);
   2761 	}
   2762 	return ac_list;
   2763 }
   2764 
   2765 RF_AutoConfig_t *
   2766 rf_find_raid_components(void)
   2767 {
   2768 	struct vnode *vp;
   2769 	struct disklabel label;
   2770 	device_t dv;
   2771 	deviter_t di;
   2772 	dev_t dev;
   2773 	int bmajor, bminor, wedge, rf_part_found;
   2774 	int error;
   2775 	int i;
   2776 	RF_AutoConfig_t *ac_list;
   2777 	uint64_t numsecs;
   2778 	unsigned secsize;
   2779 	int dowedges;
   2780 
   2781 	/* initialize the AutoConfig list */
   2782 	ac_list = NULL;
   2783 
   2784 	/*
   2785 	 * we begin by trolling through *all* the devices on the system *twice*
   2786 	 * first we scan for wedges, second for other devices. This avoids
   2787 	 * using a raw partition instead of a wedge that covers the whole disk
   2788 	 */
   2789 
   2790 	for (dowedges=1; dowedges>=0; --dowedges) {
   2791 		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   2792 		     dv = deviter_next(&di)) {
   2793 
   2794 			/* we are only interested in disks... */
   2795 			if (device_class(dv) != DV_DISK)
   2796 				continue;
   2797 
   2798 			/* we don't care about floppies... */
   2799 			if (device_is_a(dv, "fd")) {
   2800 				continue;
   2801 			}
   2802 
   2803 			/* we don't care about CD's... */
   2804 			if (device_is_a(dv, "cd")) {
   2805 				continue;
   2806 			}
   2807 
   2808 			/* we don't care about md's... */
   2809 			if (device_is_a(dv, "md")) {
   2810 				continue;
   2811 			}
   2812 
   2813 			/* hdfd is the Atari/Hades floppy driver */
   2814 			if (device_is_a(dv, "hdfd")) {
   2815 				continue;
   2816 			}
   2817 
   2818 			/* fdisa is the Atari/Milan floppy driver */
   2819 			if (device_is_a(dv, "fdisa")) {
   2820 				continue;
   2821 			}
   2822 
   2823 			/* are we in the wedges pass ? */
   2824 			wedge = device_is_a(dv, "dk");
   2825 			if (wedge != dowedges) {
   2826 				continue;
   2827 			}
   2828 
   2829 			/* need to find the device_name_to_block_device_major stuff */
   2830 			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   2831 
   2832 			rf_part_found = 0; /*No raid partition as yet*/
   2833 
   2834 			/* get a vnode for the raw partition of this disk */
   2835 			bminor = minor(device_unit(dv));
   2836 			dev = wedge ? makedev(bmajor, bminor) :
   2837 			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   2838 			if (bdevvp(dev, &vp))
   2839 				panic("RAID can't alloc vnode");
   2840 
   2841 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2842 			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   2843 
   2844 			if (error) {
   2845 				/* "Who cares."  Continue looking
   2846 				   for something that exists*/
   2847 				vput(vp);
   2848 				continue;
   2849 			}
   2850 
   2851 			error = getdisksize(vp, &numsecs, &secsize);
   2852 			if (error) {
   2853 				/*
   2854 				 * Pseudo devices like vnd and cgd can be
   2855 				 * opened but may still need some configuration.
   2856 				 * Ignore these quietly.
   2857 				 */
   2858 				if (error != ENXIO)
   2859 					printf("RAIDframe: can't get disk size"
   2860 					    " for dev %s (%d)\n",
   2861 					    device_xname(dv), error);
   2862 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2863 				vput(vp);
   2864 				continue;
   2865 			}
   2866 			if (wedge) {
   2867 				struct dkwedge_info dkw;
   2868 				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   2869 				    NOCRED);
   2870 				if (error) {
   2871 					printf("RAIDframe: can't get wedge info for "
   2872 					    "dev %s (%d)\n", device_xname(dv), error);
   2873 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2874 					vput(vp);
   2875 					continue;
   2876 				}
   2877 
   2878 				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   2879 					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2880 					vput(vp);
   2881 					continue;
   2882 				}
   2883 
   2884 				VOP_UNLOCK(vp);
   2885 				ac_list = rf_get_component(ac_list, dev, vp,
   2886 				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   2887 				rf_part_found = 1; /*There is a raid component on this disk*/
   2888 				continue;
   2889 			}
   2890 
   2891 			/* Ok, the disk exists.  Go get the disklabel. */
   2892 			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   2893 			if (error) {
   2894 				/*
   2895 				 * XXX can't happen - open() would
   2896 				 * have errored out (or faked up one)
   2897 				 */
   2898 				if (error != ENOTTY)
   2899 					printf("RAIDframe: can't get label for dev "
   2900 					    "%s (%d)\n", device_xname(dv), error);
   2901 			}
   2902 
   2903 			/* don't need this any more.  We'll allocate it again
   2904 			   a little later if we really do... */
   2905 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2906 			vput(vp);
   2907 
   2908 			if (error)
   2909 				continue;
   2910 
   2911 			rf_part_found = 0; /*No raid partitions yet*/
   2912 			for (i = 0; i < label.d_npartitions; i++) {
   2913 				char cname[sizeof(ac_list->devname)];
   2914 
   2915 				/* We only support partitions marked as RAID */
   2916 				if (label.d_partitions[i].p_fstype != FS_RAID)
   2917 					continue;
   2918 
   2919 				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   2920 				if (bdevvp(dev, &vp))
   2921 					panic("RAID can't alloc vnode");
   2922 
   2923 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2924 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2925 				if (error) {
   2926 					/* Whatever... */
   2927 					vput(vp);
   2928 					continue;
   2929 				}
   2930 				VOP_UNLOCK(vp);
   2931 				snprintf(cname, sizeof(cname), "%s%c",
   2932 				    device_xname(dv), 'a' + i);
   2933 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2934 					label.d_partitions[i].p_size, numsecs, secsize);
   2935 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   2936 			}
   2937 
   2938 			/*
   2939 			 *If there is no raid component on this disk, either in a
   2940 			 *disklabel or inside a wedge, check the raw partition as well,
   2941 			 *as it is possible to configure raid components on raw disk
   2942 			 *devices.
   2943 			 */
   2944 
   2945 			if (!rf_part_found) {
   2946 				char cname[sizeof(ac_list->devname)];
   2947 
   2948 				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   2949 				if (bdevvp(dev, &vp))
   2950 					panic("RAID can't alloc vnode");
   2951 
   2952 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2953 
   2954 				error = VOP_OPEN(vp, FREAD, NOCRED);
   2955 				if (error) {
   2956 					/* Whatever... */
   2957 					vput(vp);
   2958 					continue;
   2959 				}
   2960 				VOP_UNLOCK(vp);
   2961 				snprintf(cname, sizeof(cname), "%s%c",
   2962 				    device_xname(dv), 'a' + RAW_PART);
   2963 				ac_list = rf_get_component(ac_list, dev, vp, cname,
   2964 					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   2965 			}
   2966 		}
   2967 		deviter_release(&di);
   2968 	}
   2969 	return ac_list;
   2970 }
   2971 
   2972 
   2973 int
   2974 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2975 {
   2976 
   2977 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2978 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2979 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2980 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2981 	    clabel->row >=0 &&
   2982 	    clabel->column >= 0 &&
   2983 	    clabel->num_rows > 0 &&
   2984 	    clabel->num_columns > 0 &&
   2985 	    clabel->row < clabel->num_rows &&
   2986 	    clabel->column < clabel->num_columns &&
   2987 	    clabel->blockSize > 0 &&
   2988 	    /*
   2989 	     * numBlocksHi may contain garbage, but it is ok since
   2990 	     * the type is unsigned.  If it is really garbage,
   2991 	     * rf_fix_old_label_size() will fix it.
   2992 	     */
   2993 	    rf_component_label_numblocks(clabel) > 0) {
   2994 		/*
   2995 		 * label looks reasonable enough...
   2996 		 * let's make sure it has no old garbage.
   2997 		 */
   2998 		if (numsecs)
   2999 			rf_fix_old_label_size(clabel, numsecs);
   3000 		return(1);
   3001 	}
   3002 	return(0);
   3003 }
   3004 
   3005 
   3006 /*
   3007  * For reasons yet unknown, some old component labels have garbage in
   3008  * the newer numBlocksHi region, and this causes lossage.  Since those
   3009  * disks will also have numsecs set to less than 32 bits of sectors,
   3010  * we can determine when this corruption has occurred, and fix it.
   3011  *
   3012  * The exact same problem, with the same unknown reason, happens to
   3013  * the partitionSizeHi member as well.
   3014  */
   3015 static void
   3016 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3017 {
   3018 
   3019 	if (numsecs < ((uint64_t)1 << 32)) {
   3020 		if (clabel->numBlocksHi) {
   3021 			printf("WARNING: total sectors < 32 bits, yet "
   3022 			       "numBlocksHi set\n"
   3023 			       "WARNING: resetting numBlocksHi to zero.\n");
   3024 			clabel->numBlocksHi = 0;
   3025 		}
   3026 
   3027 		if (clabel->partitionSizeHi) {
   3028 			printf("WARNING: total sectors < 32 bits, yet "
   3029 			       "partitionSizeHi set\n"
   3030 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3031 			clabel->partitionSizeHi = 0;
   3032 		}
   3033 	}
   3034 }
   3035 
   3036 
#ifdef DEBUG
/*
 * Debug aid: print the fields of a component label to the console.
 * Only compiled when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	/* Indexed by the low two bits of root_partition. */
	static const char *root_mode[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t nblocks = rf_component_label_numblocks(clabel);
	const char *is_clean = clabel->clean ? "Yes" : "No";
	const char *is_auto = clabel->autoconfigure ? "Yes" : "No";

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number, clabel->mod_counter);
	printf("   Clean: %s Status: %d\n", is_clean, clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, nblocks);
	printf("   Autoconfig: %s\n", is_auto);
	printf("   Root partition: %s\n",
	    root_mode[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
   3070 
   3071 RF_ConfigSet_t *
   3072 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3073 {
   3074 	RF_AutoConfig_t *ac;
   3075 	RF_ConfigSet_t *config_sets;
   3076 	RF_ConfigSet_t *cset;
   3077 	RF_AutoConfig_t *ac_next;
   3078 
   3079 
   3080 	config_sets = NULL;
   3081 
   3082 	/* Go through the AutoConfig list, and figure out which components
   3083 	   belong to what sets.  */
   3084 	ac = ac_list;
   3085 	while(ac!=NULL) {
   3086 		/* we're going to putz with ac->next, so save it here
   3087 		   for use at the end of the loop */
   3088 		ac_next = ac->next;
   3089 
   3090 		if (config_sets == NULL) {
   3091 			/* will need at least this one... */
   3092 			config_sets = (RF_ConfigSet_t *)
   3093 				malloc(sizeof(RF_ConfigSet_t),
   3094 				       M_RAIDFRAME, M_NOWAIT);
   3095 			if (config_sets == NULL) {
   3096 				panic("rf_create_auto_sets: No memory!");
   3097 			}
   3098 			/* this one is easy :) */
   3099 			config_sets->ac = ac;
   3100 			config_sets->next = NULL;
   3101 			config_sets->rootable = 0;
   3102 			ac->next = NULL;
   3103 		} else {
   3104 			/* which set does this component fit into? */
   3105 			cset = config_sets;
   3106 			while(cset!=NULL) {
   3107 				if (rf_does_it_fit(cset, ac)) {
   3108 					/* looks like it matches... */
   3109 					ac->next = cset->ac;
   3110 					cset->ac = ac;
   3111 					break;
   3112 				}
   3113 				cset = cset->next;
   3114 			}
   3115 			if (cset==NULL) {
   3116 				/* didn't find a match above... new set..*/
   3117 				cset = (RF_ConfigSet_t *)
   3118 					malloc(sizeof(RF_ConfigSet_t),
   3119 					       M_RAIDFRAME, M_NOWAIT);
   3120 				if (cset == NULL) {
   3121 					panic("rf_create_auto_sets: No memory!");
   3122 				}
   3123 				cset->ac = ac;
   3124 				ac->next = NULL;
   3125 				cset->next = config_sets;
   3126 				cset->rootable = 0;
   3127 				config_sets = cset;
   3128 			}
   3129 		}
   3130 		ac = ac_next;
   3131 	}
   3132 
   3133 
   3134 	return(config_sets);
   3135 }
   3136 
   3137 static int
   3138 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3139 {
   3140 	RF_ComponentLabel_t *clabel1, *clabel2;
   3141 
   3142 	/* If this one matches the *first* one in the set, that's good
   3143 	   enough, since the other members of the set would have been
   3144 	   through here too... */
   3145 	/* note that we are not checking partitionSize here..
   3146 
   3147 	   Note that we are also not checking the mod_counters here.
   3148 	   If everything else matches except the mod_counter, that's
   3149 	   good enough for this test.  We will deal with the mod_counters
   3150 	   a little later in the autoconfiguration process.
   3151 
   3152 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3153 
   3154 	   The reason we don't check for this is that failed disks
   3155 	   will have lower modification counts.  If those disks are
   3156 	   not added to the set they used to belong to, then they will
   3157 	   form their own set, which may result in 2 different sets,
   3158 	   for example, competing to be configured at raid0, and
   3159 	   perhaps competing to be the root filesystem set.  If the
   3160 	   wrong ones get configured, or both attempt to become /,
   3161 	   weird behaviour and or serious lossage will occur.  Thus we
   3162 	   need to bring them into the fold here, and kick them out at
   3163 	   a later point.
   3164 
   3165 	*/
   3166 
   3167 	clabel1 = cset->ac->clabel;
   3168 	clabel2 = ac->clabel;
   3169 	if ((clabel1->version == clabel2->version) &&
   3170 	    (clabel1->serial_number == clabel2->serial_number) &&
   3171 	    (clabel1->num_rows == clabel2->num_rows) &&
   3172 	    (clabel1->num_columns == clabel2->num_columns) &&
   3173 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3174 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3175 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3176 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3177 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3178 	    (clabel1->blockSize == clabel2->blockSize) &&
   3179 	    rf_component_label_numblocks(clabel1) ==
   3180 	    rf_component_label_numblocks(clabel2) &&
   3181 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3182 	    (clabel1->root_partition == clabel2->root_partition) &&
   3183 	    (clabel1->last_unit == clabel2->last_unit) &&
   3184 	    (clabel1->config_order == clabel2->config_order)) {
   3185 		/* if it get's here, it almost *has* to be a match */
   3186 	} else {
   3187 		/* it's not consistent with somebody in the set..
   3188 		   punt */
   3189 		return(0);
   3190 	}
   3191 	/* all was fine.. it must fit... */
   3192 	return(1);
   3193 }
   3194 
   3195 int
   3196 rf_have_enough_components(RF_ConfigSet_t *cset)
   3197 {
   3198 	RF_AutoConfig_t *ac;
   3199 	RF_AutoConfig_t *auto_config;
   3200 	RF_ComponentLabel_t *clabel;
   3201 	int c;
   3202 	int num_cols;
   3203 	int num_missing;
   3204 	int mod_counter;
   3205 	int mod_counter_found;
   3206 	int even_pair_failed;
   3207 	char parity_type;
   3208 
   3209 
   3210 	/* check to see that we have enough 'live' components
   3211 	   of this set.  If so, we can configure it if necessary */
   3212 
   3213 	num_cols = cset->ac->clabel->num_columns;
   3214 	parity_type = cset->ac->clabel->parityConfig;
   3215 
   3216 	/* XXX Check for duplicate components!?!?!? */
   3217 
   3218 	/* Determine what the mod_counter is supposed to be for this set. */
   3219 
   3220 	mod_counter_found = 0;
   3221 	mod_counter = 0;
   3222 	ac = cset->ac;
   3223 	while(ac!=NULL) {
   3224 		if (mod_counter_found==0) {
   3225 			mod_counter = ac->clabel->mod_counter;
   3226 			mod_counter_found = 1;
   3227 		} else {
   3228 			if (ac->clabel->mod_counter > mod_counter) {
   3229 				mod_counter = ac->clabel->mod_counter;
   3230 			}
   3231 		}
   3232 		ac = ac->next;
   3233 	}
   3234 
   3235 	num_missing = 0;
   3236 	auto_config = cset->ac;
   3237 
   3238 	even_pair_failed = 0;
   3239 	for(c=0; c<num_cols; c++) {
   3240 		ac = auto_config;
   3241 		while(ac!=NULL) {
   3242 			if ((ac->clabel->column == c) &&
   3243 			    (ac->clabel->mod_counter == mod_counter)) {
   3244 				/* it's this one... */
   3245 #ifdef DEBUG
   3246 				printf("Found: %s at %d\n",
   3247 				       ac->devname,c);
   3248 #endif
   3249 				break;
   3250 			}
   3251 			ac=ac->next;
   3252 		}
   3253 		if (ac==NULL) {
   3254 				/* Didn't find one here! */
   3255 				/* special case for RAID 1, especially
   3256 				   where there are more than 2
   3257 				   components (where RAIDframe treats
   3258 				   things a little differently :( ) */
   3259 			if (parity_type == '1') {
   3260 				if (c%2 == 0) { /* even component */
   3261 					even_pair_failed = 1;
   3262 				} else { /* odd component.  If
   3263 					    we're failed, and
   3264 					    so is the even
   3265 					    component, it's
   3266 					    "Good Night, Charlie" */
   3267 					if (even_pair_failed == 1) {
   3268 						return(0);
   3269 					}
   3270 				}
   3271 			} else {
   3272 				/* normal accounting */
   3273 				num_missing++;
   3274 			}
   3275 		}
   3276 		if ((parity_type == '1') && (c%2 == 1)) {
   3277 				/* Just did an even component, and we didn't
   3278 				   bail.. reset the even_pair_failed flag,
   3279 				   and go on to the next component.... */
   3280 			even_pair_failed = 0;
   3281 		}
   3282 	}
   3283 
   3284 	clabel = cset->ac->clabel;
   3285 
   3286 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3287 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3288 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3289 		/* XXX this needs to be made *much* more general */
   3290 		/* Too many failures */
   3291 		return(0);
   3292 	}
   3293 	/* otherwise, all is well, and we've got enough to take a kick
   3294 	   at autoconfiguring this set */
   3295 	return(1);
   3296 }
   3297 
   3298 void
   3299 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3300 			RF_Raid_t *raidPtr)
   3301 {
   3302 	RF_ComponentLabel_t *clabel;
   3303 	int i;
   3304 
   3305 	clabel = ac->clabel;
   3306 
   3307 	/* 1. Fill in the common stuff */
   3308 	config->numCol = clabel->num_columns;
   3309 	config->numSpare = 0; /* XXX should this be set here? */
   3310 	config->sectPerSU = clabel->sectPerSU;
   3311 	config->SUsPerPU = clabel->SUsPerPU;
   3312 	config->SUsPerRU = clabel->SUsPerRU;
   3313 	config->parityConfig = clabel->parityConfig;
   3314 	/* XXX... */
   3315 	strcpy(config->diskQueueType,"fifo");
   3316 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3317 	config->layoutSpecificSize = 0; /* XXX ?? */
   3318 
   3319 	while(ac!=NULL) {
   3320 		/* row/col values will be in range due to the checks
   3321 		   in reasonable_label() */
   3322 		strcpy(config->devnames[0][ac->clabel->column],
   3323 		       ac->devname);
   3324 		ac = ac->next;
   3325 	}
   3326 
   3327 	for(i=0;i<RF_MAXDBGV;i++) {
   3328 		config->debugVars[i][0] = 0;
   3329 	}
   3330 }
   3331 
   3332 int
   3333 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3334 {
   3335 	RF_ComponentLabel_t *clabel;
   3336 	int column;
   3337 	int sparecol;
   3338 
   3339 	raidPtr->autoconfigure = new_value;
   3340 
   3341 	for(column=0; column<raidPtr->numCol; column++) {
   3342 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3343 			clabel = raidget_component_label(raidPtr, column);
   3344 			clabel->autoconfigure = new_value;
   3345 			raidflush_component_label(raidPtr, column);
   3346 		}
   3347 	}
   3348 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3349 		sparecol = raidPtr->numCol + column;
   3350 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3351 			clabel = raidget_component_label(raidPtr, sparecol);
   3352 			clabel->autoconfigure = new_value;
   3353 			raidflush_component_label(raidPtr, sparecol);
   3354 		}
   3355 	}
   3356 	return(new_value);
   3357 }
   3358 
   3359 int
   3360 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3361 {
   3362 	RF_ComponentLabel_t *clabel;
   3363 	int column;
   3364 	int sparecol;
   3365 
   3366 	raidPtr->root_partition = new_value;
   3367 	for(column=0; column<raidPtr->numCol; column++) {
   3368 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3369 			clabel = raidget_component_label(raidPtr, column);
   3370 			clabel->root_partition = new_value;
   3371 			raidflush_component_label(raidPtr, column);
   3372 		}
   3373 	}
   3374 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3375 		sparecol = raidPtr->numCol + column;
   3376 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3377 			clabel = raidget_component_label(raidPtr, sparecol);
   3378 			clabel->root_partition = new_value;
   3379 			raidflush_component_label(raidPtr, sparecol);
   3380 		}
   3381 	}
   3382 	return(new_value);
   3383 }
   3384 
   3385 void
   3386 rf_release_all_vps(RF_ConfigSet_t *cset)
   3387 {
   3388 	RF_AutoConfig_t *ac;
   3389 
   3390 	ac = cset->ac;
   3391 	while(ac!=NULL) {
   3392 		/* Close the vp, and give it back */
   3393 		if (ac->vp) {
   3394 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3395 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3396 			vput(ac->vp);
   3397 			ac->vp = NULL;
   3398 		}
   3399 		ac = ac->next;
   3400 	}
   3401 }
   3402 
   3403 
   3404 void
   3405 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3406 {
   3407 	RF_AutoConfig_t *ac;
   3408 	RF_AutoConfig_t *next_ac;
   3409 
   3410 	ac = cset->ac;
   3411 	while(ac!=NULL) {
   3412 		next_ac = ac->next;
   3413 		/* nuke the label */
   3414 		free(ac->clabel, M_RAIDFRAME);
   3415 		/* cleanup the config structure */
   3416 		free(ac, M_RAIDFRAME);
   3417 		/* "next.." */
   3418 		ac = next_ac;
   3419 	}
   3420 	/* and, finally, nuke the config set */
   3421 	free(cset, M_RAIDFRAME);
   3422 }
   3423 
   3424 
   3425 void
   3426 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3427 {
   3428 	/* current version number */
   3429 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3430 	clabel->serial_number = raidPtr->serial_number;
   3431 	clabel->mod_counter = raidPtr->mod_counter;
   3432 
   3433 	clabel->num_rows = 1;
   3434 	clabel->num_columns = raidPtr->numCol;
   3435 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3436 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3437 
   3438 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3439 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3440 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3441 
   3442 	clabel->blockSize = raidPtr->bytesPerSector;
   3443 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3444 
   3445 	/* XXX not portable */
   3446 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3447 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3448 	clabel->autoconfigure = raidPtr->autoconfigure;
   3449 	clabel->root_partition = raidPtr->root_partition;
   3450 	clabel->last_unit = raidPtr->raidid;
   3451 	clabel->config_order = raidPtr->config_order;
   3452 
   3453 #ifndef RF_NO_PARITY_MAP
   3454 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3455 #endif
   3456 }
   3457 
   3458 struct raid_softc *
   3459 rf_auto_config_set(RF_ConfigSet_t *cset)
   3460 {
   3461 	RF_Raid_t *raidPtr;
   3462 	RF_Config_t *config;
   3463 	int raidID;
   3464 	struct raid_softc *sc;
   3465 
   3466 #ifdef DEBUG
   3467 	printf("RAID autoconfigure\n");
   3468 #endif
   3469 
   3470 	/* 1. Create a config structure */
   3471 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3472 	if (config == NULL) {
   3473 		printf("%s: Out of mem - config!?!?\n", __func__);
   3474 				/* XXX do something more intelligent here. */
   3475 		return NULL;
   3476 	}
   3477 
   3478 	/*
   3479 	   2. Figure out what RAID ID this one is supposed to live at
   3480 	   See if we can get the same RAID dev that it was configured
   3481 	   on last time..
   3482 	*/
   3483 
   3484 	raidID = cset->ac->clabel->last_unit;
   3485 	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
   3486 	     sc = raidget(++raidID, false))
   3487 		continue;
   3488 #ifdef DEBUG
   3489 	printf("Configuring raid%d:\n",raidID);
   3490 #endif
   3491 
   3492 	if (sc == NULL)
   3493 		sc = raidget(raidID, true);
   3494 	if (sc == NULL) {
   3495 		printf("%s: Out of mem - softc!?!?\n", __func__);
   3496 				/* XXX do something more intelligent here. */
   3497 		free(config, M_RAIDFRAME);
   3498 		return NULL;
   3499 	}
   3500 
   3501 	raidPtr = &sc->sc_r;
   3502 
   3503 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3504 	raidPtr->softc = sc;
   3505 	raidPtr->raidid = raidID;
   3506 	raidPtr->openings = RAIDOUTSTANDING;
   3507 
   3508 	/* 3. Build the configuration structure */
   3509 	rf_create_configuration(cset->ac, config, raidPtr);
   3510 
   3511 	/* 4. Do the configuration */
   3512 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3513 		raidinit(sc);
   3514 
   3515 		rf_markalldirty(raidPtr);
   3516 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3517 		switch (cset->ac->clabel->root_partition) {
   3518 		case 1:	/* Force Root */
   3519 		case 2:	/* Soft Root: root when boot partition part of raid */
   3520 			/*
   3521 			 * everything configured just fine.  Make a note
   3522 			 * that this set is eligible to be root,
   3523 			 * or forced to be root
   3524 			 */
   3525 			cset->rootable = cset->ac->clabel->root_partition;
   3526 			/* XXX do this here? */
   3527 			raidPtr->root_partition = cset->rootable;
   3528 			break;
   3529 		default:
   3530 			break;
   3531 		}
   3532 	} else {
   3533 		raidput(sc);
   3534 		sc = NULL;
   3535 	}
   3536 
   3537 	/* 5. Cleanup */
   3538 	free(config, M_RAIDFRAME);
   3539 	return sc;
   3540 }
   3541 
   3542 void
   3543 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3544 	     size_t xmin, size_t xmax)
   3545 {
   3546 	int error;
   3547 
   3548 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3549 	pool_sethiwat(p, xmax);
   3550 	if ((error = pool_prime(p, xmin)) != 0)
   3551 		panic("%s: failed to prime pool: %d", __func__, error);
   3552 	pool_setlowat(p, xmin);
   3553 }
   3554 
   3555 /*
   3556  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3557  * to see if there is IO pending and if that IO could possibly be done
   3558  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3559  * otherwise.
   3560  *
   3561  */
   3562 int
   3563 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3564 {
   3565 	struct raid_softc *rs;
   3566 	struct dk_softc *dksc;
   3567 
   3568 	rs = raidPtr->softc;
   3569 	dksc = &rs->sc_dksc;
   3570 
   3571 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3572 		return 1;
   3573 
   3574 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3575 		/* there is work to do */
   3576 		return 0;
   3577 	}
   3578 	/* default is nothing to do */
   3579 	return 1;
   3580 }
   3581 
   3582 int
   3583 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3584 {
   3585 	uint64_t numsecs;
   3586 	unsigned secsize;
   3587 	int error;
   3588 
   3589 	error = getdisksize(vp, &numsecs, &secsize);
   3590 	if (error == 0) {
   3591 		diskPtr->blockSize = secsize;
   3592 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3593 		diskPtr->partitionSize = numsecs;
   3594 		return 0;
   3595 	}
   3596 	return error;
   3597 }
   3598 
   3599 static int
   3600 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3601 {
   3602 	return 1;
   3603 }
   3604 
   3605 static void
   3606 raid_attach(device_t parent, device_t self, void *aux)
   3607 {
   3608 }
   3609 
   3610 
   3611 static int
   3612 raid_detach(device_t self, int flags)
   3613 {
   3614 	int error;
   3615 	struct raid_softc *rs = raidsoftc(self);
   3616 
   3617 	if (rs == NULL)
   3618 		return ENXIO;
   3619 
   3620 	if ((error = raidlock(rs)) != 0)
   3621 		return (error);
   3622 
   3623 	error = raid_detach_unlocked(rs);
   3624 
   3625 	raidunlock(rs);
   3626 
   3627 	/* XXX raid can be referenced here */
   3628 
   3629 	if (error)
   3630 		return error;
   3631 
   3632 	/* Free the softc */
   3633 	raidput(rs);
   3634 
   3635 	return 0;
   3636 }
   3637 
   3638 static void
   3639 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3640 {
   3641 	struct dk_softc *dksc = &rs->sc_dksc;
   3642 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3643 
   3644 	memset(dg, 0, sizeof(*dg));
   3645 
   3646 	dg->dg_secperunit = raidPtr->totalSectors;
   3647 	dg->dg_secsize = raidPtr->bytesPerSector;
   3648 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3649 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3650 
   3651 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3652 }
   3653 
   3654 /*
   3655  * Get cache info for all the components (including spares).
   3656  * Returns intersection of all the cache flags of all disks, or first
   3657  * error if any encountered.
   3658  * XXXfua feature flags can change as spares are added - lock down somehow
   3659  */
   3660 static int
   3661 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3662 {
   3663 	int c;
   3664 	int error;
   3665 	int dkwhole = 0, dkpart;
   3666 
   3667 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3668 		/*
   3669 		 * Check any non-dead disk, even when currently being
   3670 		 * reconstructed.
   3671 		 */
   3672 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3673 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3674 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3675 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3676 			if (error) {
   3677 				if (error != ENODEV) {
   3678 					printf("raid%d: get cache for component %s failed\n",
   3679 					    raidPtr->raidid,
   3680 					    raidPtr->Disks[c].devname);
   3681 				}
   3682 
   3683 				return error;
   3684 			}
   3685 
   3686 			if (c == 0)
   3687 				dkwhole = dkpart;
   3688 			else
   3689 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3690 		}
   3691 	}
   3692 
   3693 	*data = dkwhole;
   3694 
   3695 	return 0;
   3696 }
   3697 
   3698 /*
   3699  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3700  * We end up returning whatever error was returned by the first cache flush
   3701  * that fails.
   3702  */
   3703 
   3704 int
   3705 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3706 {
   3707 	int c, sparecol;
   3708 	int e,error;
   3709 	int force = 1;
   3710 
   3711 	error = 0;
   3712 	for (c = 0; c < raidPtr->numCol; c++) {
   3713 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3714 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3715 					  &force, FWRITE, NOCRED);
   3716 			if (e) {
   3717 				if (e != ENODEV)
   3718 					printf("raid%d: cache flush to component %s failed.\n",
   3719 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3720 				if (error == 0) {
   3721 					error = e;
   3722 				}
   3723 			}
   3724 		}
   3725 	}
   3726 
   3727 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3728 		sparecol = raidPtr->numCol + c;
   3729 		/* Need to ensure that the reconstruct actually completed! */
   3730 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3731 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3732 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3733 			if (e) {
   3734 				if (e != ENODEV)
   3735 					printf("raid%d: cache flush to component %s failed.\n",
   3736 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3737 				if (error == 0) {
   3738 					error = e;
   3739 				}
   3740 			}
   3741 		}
   3742 	}
   3743 	return error;
   3744 }
   3745 
   3746 /* Fill in info with the current status */
   3747 void
   3748 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3749 {
   3750 
   3751 	if (raidPtr->status != rf_rs_reconstructing) {
   3752 		info->total = 100;
   3753 		info->completed = 100;
   3754 	} else {
   3755 		info->total = raidPtr->reconControl->numRUsTotal;
   3756 		info->completed = raidPtr->reconControl->numRUsComplete;
   3757 	}
   3758 	info->remaining = info->total - info->completed;
   3759 }
   3760 
   3761 /* Fill in info with the current status */
   3762 void
   3763 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3764 {
   3765 
   3766 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3767 		info->total = raidPtr->Layout.numStripe;
   3768 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3769 	} else {
   3770 		info->completed = 100;
   3771 		info->total = 100;
   3772 	}
   3773 	info->remaining = info->total - info->completed;
   3774 }
   3775 
   3776 /* Fill in info with the current status */
   3777 void
   3778 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3779 {
   3780 
   3781 	if (raidPtr->copyback_in_progress == 1) {
   3782 		info->total = raidPtr->Layout.numStripe;
   3783 		info->completed = raidPtr->copyback_stripes_done;
   3784 		info->remaining = info->total - info->completed;
   3785 	} else {
   3786 		info->remaining = 0;
   3787 		info->completed = 100;
   3788 		info->total = 100;
   3789 	}
   3790 }
   3791 
   3792 /* Fill in config with the current info */
   3793 int
   3794 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3795 {
   3796 	int	d, i, j;
   3797 
   3798 	if (!raidPtr->valid)
   3799 		return (ENODEV);
   3800 	config->cols = raidPtr->numCol;
   3801 	config->ndevs = raidPtr->numCol;
   3802 	if (config->ndevs >= RF_MAX_DISKS)
   3803 		return (ENOMEM);
   3804 	config->nspares = raidPtr->numSpare;
   3805 	if (config->nspares >= RF_MAX_DISKS)
   3806 		return (ENOMEM);
   3807 	config->maxqdepth = raidPtr->maxQueueDepth;
   3808 	d = 0;
   3809 	for (j = 0; j < config->cols; j++) {
   3810 		config->devs[d] = raidPtr->Disks[j];
   3811 		d++;
   3812 	}
   3813 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3814 		config->spares[i] = raidPtr->Disks[j];
   3815 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3816 			/* XXX: raidctl(8) expects to see this as a used spare */
   3817 			config->spares[i].status = rf_ds_used_spare;
   3818 		}
   3819 	}
   3820 	return 0;
   3821 }
   3822 
   3823 int
   3824 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3825 {
   3826 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3827 	RF_ComponentLabel_t *raid_clabel;
   3828 	int column = clabel->column;
   3829 
   3830 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3831 		return EINVAL;
   3832 	raid_clabel = raidget_component_label(raidPtr, column);
   3833 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3834 
   3835 	return 0;
   3836 }
   3837 
   3838 /*
   3839  * Module interface
   3840  */
   3841 
   3842 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3843 
   3844 #ifdef _MODULE
   3845 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3846 #endif
   3847 
   3848 static int raid_modcmd(modcmd_t, void *);
   3849 static int raid_modcmd_init(void);
   3850 static int raid_modcmd_fini(void);
   3851 
   3852 static int
   3853 raid_modcmd(modcmd_t cmd, void *data)
   3854 {
   3855 	int error;
   3856 
   3857 	error = 0;
   3858 	switch (cmd) {
   3859 	case MODULE_CMD_INIT:
   3860 		error = raid_modcmd_init();
   3861 		break;
   3862 	case MODULE_CMD_FINI:
   3863 		error = raid_modcmd_fini();
   3864 		break;
   3865 	default:
   3866 		error = ENOTTY;
   3867 		break;
   3868 	}
   3869 	return error;
   3870 }
   3871 
   3872 static int
   3873 raid_modcmd_init(void)
   3874 {
   3875 	int error;
   3876 	int bmajor, cmajor;
   3877 
   3878 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
   3879 	mutex_enter(&raid_lock);
   3880 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3881 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
   3882 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
   3883 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
   3884 
   3885 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
   3886 #endif
   3887 
   3888 	bmajor = cmajor = -1;
   3889 	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
   3890 	    &raid_cdevsw, &cmajor);
   3891 	if (error != 0 && error != EEXIST) {
   3892 		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
   3893 		mutex_exit(&raid_lock);
   3894 		return error;
   3895 	}
   3896 #ifdef _MODULE
   3897 	error = config_cfdriver_attach(&raid_cd);
   3898 	if (error != 0) {
   3899 		aprint_error("%s: config_cfdriver_attach failed %d\n",
   3900 		    __func__, error);
   3901 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3902 		mutex_exit(&raid_lock);
   3903 		return error;
   3904 	}
   3905 #endif
   3906 	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3907 	if (error != 0) {
   3908 		aprint_error("%s: config_cfattach_attach failed %d\n",
   3909 		    __func__, error);
   3910 #ifdef _MODULE
   3911 		config_cfdriver_detach(&raid_cd);
   3912 #endif
   3913 		devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3914 		mutex_exit(&raid_lock);
   3915 		return error;
   3916 	}
   3917 
   3918 	raidautoconfigdone = false;
   3919 
   3920 	mutex_exit(&raid_lock);
   3921 
   3922 	if (error == 0) {
   3923 		if (rf_BootRaidframe(true) == 0)
   3924 			aprint_verbose("Kernelized RAIDframe activated\n");
   3925 		else
   3926 			panic("Serious error activating RAID!!");
   3927 	}
   3928 
   3929 	/*
   3930 	 * Register a finalizer which will be used to auto-config RAID
   3931 	 * sets once all real hardware devices have been found.
   3932 	 */
   3933 	error = config_finalize_register(NULL, rf_autoconfig);
   3934 	if (error != 0) {
   3935 		aprint_error("WARNING: unable to register RAIDframe "
   3936 		    "finalizer\n");
   3937 		error = 0;
   3938 	}
   3939 
   3940 	return error;
   3941 }
   3942 
   3943 static int
   3944 raid_modcmd_fini(void)
   3945 {
   3946 	int error;
   3947 
   3948 	mutex_enter(&raid_lock);
   3949 
   3950 	/* Don't allow unload if raid device(s) exist.  */
   3951 	if (!LIST_EMPTY(&raids)) {
   3952 		mutex_exit(&raid_lock);
   3953 		return EBUSY;
   3954 	}
   3955 
   3956 	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
   3957 	if (error != 0) {
   3958 		aprint_error("%s: cannot detach cfattach\n",__func__);
   3959 		mutex_exit(&raid_lock);
   3960 		return error;
   3961 	}
   3962 #ifdef _MODULE
   3963 	error = config_cfdriver_detach(&raid_cd);
   3964 	if (error != 0) {
   3965 		aprint_error("%s: cannot detach cfdriver\n",__func__);
   3966 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3967 		mutex_exit(&raid_lock);
   3968 		return error;
   3969 	}
   3970 #endif
   3971 	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
   3972 	if (error != 0) {
   3973 		aprint_error("%s: cannot detach devsw\n",__func__);
   3974 #ifdef _MODULE
   3975 		config_cfdriver_attach(&raid_cd);
   3976 #endif
   3977 		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
   3978 		mutex_exit(&raid_lock);
   3979 		return error;
   3980 	}
   3981 	rf_BootRaidframe(false);
   3982 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   3983 	rf_destroy_mutex2(rf_sparet_wait_mutex);
   3984 	rf_destroy_cond2(rf_sparet_wait_cv);
   3985 	rf_destroy_cond2(rf_sparet_resp_cv);
   3986 #endif
   3987 	mutex_exit(&raid_lock);
   3988 	mutex_destroy(&raid_lock);
   3989 
   3990 	return error;
   3991 }
   3992