Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.376.4.6
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.376.4.6 2023/10/18 12:11:52 martin Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.376.4.6 2023/10/18 12:11:52 martin Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
#ifdef DEBUG
int     rf_kdebug_level = 0;
/*
 * Debug printf; call as db1_printf(("fmt", args));.  Wrapped in
 * do/while(0) so the macro is a single statement: the bare-`if' form
 * could capture a dangling `else' at the call site, and the bare-block
 * form broke `if (x) db1_printf(...); else ...'.
 */
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (/*CONSTCOND*/0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (/*CONSTCOND*/0)
#endif				/* DEBUG */
    160 
    161 #define DEVICE_XNAME(dev) dev ? device_xname(dev) : "null"
    162 
    163 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    164 static rf_declare_mutex2(rf_sparet_wait_mutex);
    165 static rf_declare_cond2(rf_sparet_wait_cv);
    166 static rf_declare_cond2(rf_sparet_resp_cv);
    167 
    168 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    169 						 * spare table */
    170 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    171 						 * installation process */
    172 #endif
    173 
    174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    175 
    176 /* prototypes */
    177 static void KernelWakeupFunc(struct buf *);
    178 static void InitBP(struct buf *, struct vnode *, unsigned,
    179     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    180     void *, int, struct proc *);
    181 static void raidinit(struct raid_softc *);
    182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    184 
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 static int raid_diskstart(device_t, struct buf *bp);
    200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    201 static int raid_lastclose(device_t);
    202 
    203 static dev_type_open(raidopen);
    204 static dev_type_close(raidclose);
    205 static dev_type_read(raidread);
    206 static dev_type_write(raidwrite);
    207 static dev_type_ioctl(raidioctl);
    208 static dev_type_strategy(raidstrategy);
    209 static dev_type_dump(raiddump);
    210 static dev_type_size(raidsize);
    211 
/* Block device switch: entry points for the raid block device. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    222 
/* Character device switch: entry points for the raw raid device. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    237 
/* Hooks used by the common disk(9)/dk framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    247 
    248 #define	raidunit(x)	DISKUNIT(x)
    249 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    250 
    251 extern struct cfdriver raid_cd;
    252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    253     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    254     DVF_DETACH_SHUTDOWN);
    255 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column the request applies to */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* RF_Raid_t the request belongs to */
};
    262 
    263 /*
    264  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    265  * Be aware that large numbers can allow the driver to consume a lot of
    266  * kernel memory, especially on writes, and in degraded mode reads.
    267  *
    268  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    269  * a single 64K write will typically require 64K for the old data,
    270  * 64K for the old parity, and 64K for the new parity, for a total
    271  * of 192K (if the parity buffer is not re-used immediately).
    272  * Even it if is used immediately, that's still 128K, which when multiplied
    273  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    274  *
    275  * Now in degraded mode, for example, a 64K read on the above setup may
    276  * require data reconstruction, which will require *all* of the 4 remaining
    277  * disks to participate -- 4 * 32K/disk == 128K again.
    278  */
    279 
    280 #ifndef RAIDOUTSTANDING
    281 #define RAIDOUTSTANDING   6
    282 #endif
    283 
    284 #define RAIDLABELDEV(dev)	\
    285 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    286 
    287 /* declared here, and made public, for the benefit of KVM stuff.. */
    288 
    289 static int raidlock(struct raid_softc *);
    290 static void raidunlock(struct raid_softc *);
    291 
    292 static int raid_detach_unlocked(struct raid_softc *);
    293 
    294 static void rf_markalldirty(RF_Raid_t *);
    295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    296 
    297 void rf_ReconThread(struct rf_recon_req_internal *);
    298 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    299 void rf_CopybackThread(RF_Raid_t *raidPtr);
    300 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    301 int rf_autoconfig(device_t);
    302 void rf_buildroothack(RF_ConfigSet_t *);
    303 
    304 RF_AutoConfig_t *rf_find_raid_components(void);
    305 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    306 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    307 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    308 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    309 int rf_set_autoconfig(RF_Raid_t *, int);
    310 int rf_set_rootpartition(RF_Raid_t *, int);
    311 void rf_release_all_vps(RF_ConfigSet_t *);
    312 void rf_cleanup_config_set(RF_ConfigSet_t *);
    313 int rf_have_enough_components(RF_ConfigSet_t *);
    314 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    316 
    317 /*
    318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    320  * in the kernel config file.
    321  */
    322 #ifdef RAID_AUTOCONFIG
    323 int raidautoconfig = 1;
    324 #else
    325 int raidautoconfig = 0;
    326 #endif
    327 static bool raidautoconfigdone = false;
    328 
    329 struct RF_Pools_s rf_pools;
    330 
    331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    332 static kmutex_t raid_lock;
    333 
    334 static struct raid_softc *
    335 raidcreate(int unit) {
    336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    337 	sc->sc_unit = unit;
    338 	cv_init(&sc->sc_cv, "raidunit");
    339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    340 	return sc;
    341 }
    342 
    343 static void
    344 raiddestroy(struct raid_softc *sc) {
    345 	cv_destroy(&sc->sc_cv);
    346 	mutex_destroy(&sc->sc_mutex);
    347 	kmem_free(sc, sizeof(*sc));
    348 }
    349 
    350 static struct raid_softc *
    351 raidget(int unit, bool create) {
    352 	struct raid_softc *sc;
    353 	if (unit < 0) {
    354 #ifdef DIAGNOSTIC
    355 		panic("%s: unit %d!", __func__, unit);
    356 #endif
    357 		return NULL;
    358 	}
    359 	mutex_enter(&raid_lock);
    360 	LIST_FOREACH(sc, &raids, sc_link) {
    361 		if (sc->sc_unit == unit) {
    362 			mutex_exit(&raid_lock);
    363 			return sc;
    364 		}
    365 	}
    366 	mutex_exit(&raid_lock);
    367 	if (!create)
    368 		return NULL;
    369 	sc = raidcreate(unit);
    370 	mutex_enter(&raid_lock);
    371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    372 	mutex_exit(&raid_lock);
    373 	return sc;
    374 }
    375 
/* Remove `sc' from the global unit list and free it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    383 
/*
 * Legacy pseudo-device attach hook.  Intentionally empty: device
 * attachment and the associated initialization are performed during
 * module initialization instead.
 */
void
raidattach(int num)
{
	/* Nothing to do here. */
}
    393 
    394 int
    395 rf_autoconfig(device_t self)
    396 {
    397 	RF_AutoConfig_t *ac_list;
    398 	RF_ConfigSet_t *config_sets;
    399 
    400 	if (!raidautoconfig || raidautoconfigdone == true)
    401 		return (0);
    402 
    403 	/* XXX This code can only be run once. */
    404 	raidautoconfigdone = true;
    405 
    406 #ifdef __HAVE_CPU_BOOTCONF
    407 	/*
    408 	 * 0. find the boot device if needed first so we can use it later
    409 	 * this needs to be done before we autoconfigure any raid sets,
    410 	 * because if we use wedges we are not going to be able to open
    411 	 * the boot device later
    412 	 */
    413 	if (booted_device == NULL)
    414 		cpu_bootconf();
    415 #endif
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 int
    433 rf_inited(const struct raid_softc *rs) {
    434 	return (rs->sc_flags & RAIDF_INITED) != 0;
    435 }
    436 
/* Return the RAIDframe state embedded in the softc `rs'. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    441 
/* Return the unit number of the softc `rs'. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    446 
    447 static int
    448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    449 	const char *bootname;
    450 	size_t len;
    451 
    452 	/* if bdv is NULL, the set can't contain it. exit early. */
    453 	if (bdv == NULL)
    454 		return 0;
    455 
    456 	bootname = device_xname(bdv);
    457 	len = strlen(bootname);
    458 
    459 	for (int col = 0; col < r->numCol; col++) {
    460 		const char *devname = r->Disks[col].devname;
    461 		devname += sizeof("/dev/") - 1;
    462 		if (strncmp(devname, "dk", 2) == 0) {
    463 			const char *parent =
    464 			    dkwedge_get_parent_name(r->Disks[col].dev);
    465 			if (parent != NULL)
    466 				devname = parent;
    467 		}
    468 		if (strncmp(devname, bootname, len) == 0) {
    469 			struct raid_softc *sc = r->softc;
    470 			aprint_debug("raid%d includes boot device %s\n",
    471 			    sc->sc_unit, devname);
    472 			return 1;
    473 		}
    474 	}
    475 	return 0;
    476 }
    477 
    478 
/*
 * Example setup:
 * dk1 at wd0: "raid@wd0", 171965 blocks at 32802, type: raidframe
 * dk3 at wd1: "raid@wd1", 171965 blocks at 32802, type: raidframe
 * raid1: Components: /dev/dk1 /dev/dk3
 * dk4 at raid1: "empty@raid1", 8192 blocks at 34, type: msdos
 * dk5 at raid1: "root@raid1", 163517 blocks at 8226, type: ffs
 *
 * If booted from wd0, booted_device will be
 * disk wd0, startblk = 41092, nblks = 163517
 *
 * That is, dk5 with startblk computed from the beginning of wd0
 * instead of beginning of raid1:
 * 32802 + 64 (RF_PROTECTED_SECTORS) + 8226 = 41092
 *
 * In order to find the boot wedge, we must iterate on each component,
 * find its offset from disk beginning, and look for the boot wedge with
 * startblk adjusted.
 */
    498 static device_t
    499 rf_find_bootwedge(struct raid_softc *rsc)
    500 {
    501 	RF_Raid_t *r = &rsc->sc_r;
    502 	const char *bootname;
    503 	size_t len;
    504 	device_t rdev = NULL;
    505 
    506 	if (booted_device == NULL)
    507 		goto out;
    508 
    509 	bootname = device_xname(booted_device);
    510 	len = strlen(bootname);
    511 
    512 	aprint_debug("%s: booted_device %s, startblk = %"PRId64", "
    513 		     "nblks = %"PRId64"\n", __func__,
    514 		     bootname, booted_startblk, booted_nblks);
    515 
    516 	for (int col = 0; col < r->numCol; col++) {
    517 		const char *devname = r->Disks[col].devname;
    518 		const char *parent;
    519 		struct disk *dk;
    520 		u_int nwedges;
    521 		struct dkwedge_info *dkwi;
    522 		struct dkwedge_list dkwl;
    523 		size_t dkwi_len;
    524 		int i;
    525 
    526 		devname += sizeof("/dev/") - 1;
    527 		if (strncmp(devname, "dk", 2) != 0)
    528 			continue;
    529 
    530 		parent = dkwedge_get_parent_name(r->Disks[col].dev);
    531 		if (parent == NULL) {
    532 			aprint_debug("%s: cannot find parent for "
    533 				     "component /dev/%s", __func__, devname);
    534 			continue;
    535 		}
    536 
    537 		if (strncmp(parent, bootname, len) != 0)
    538 			continue;
    539 
    540 		aprint_debug("%s: looking up wedge %s in device %s\n",
    541 			     __func__, devname, parent);
    542 
    543 		dk = disk_find(parent);
    544 		nwedges = dk->dk_nwedges;
    545 		dkwi_len = sizeof(*dkwi) * nwedges;
    546 		dkwi = RF_Malloc(dkwi_len);
    547 
    548 		dkwl.dkwl_buf = dkwi;
    549 		dkwl.dkwl_bufsize = dkwi_len;
    550 		dkwl.dkwl_nwedges = 0;
    551 		dkwl.dkwl_ncopied = 0;
    552 
    553 		if (dkwedge_list(dk, &dkwl, curlwp) == 0) {
    554 			daddr_t startblk;
    555 
    556 			for (i = 0; i < dkwl.dkwl_ncopied; i++) {
    557 				if (strcmp(dkwi[i].dkw_devname, devname) == 0)
    558 					break;
    559 			}
    560 
    561 			KASSERT(i < dkwl.dkwl_ncopied);
    562 
    563 			aprint_debug("%s: wedge %s, "
    564 				     "startblk = %"PRId64", "
    565 				     "nblks = %"PRId64"\n",
    566 				     __func__,
    567 				     dkwi[i].dkw_devname,
    568 				     dkwi[i].dkw_offset,
    569 				     dkwi[i].dkw_size);
    570 
    571 			startblk = booted_startblk
    572 				 - dkwi[i].dkw_offset
    573 				 - RF_PROTECTED_SECTORS;
    574 
    575 			aprint_debug("%s: looking for wedge in %s, "
    576 				     "startblk = %"PRId64", "
    577 				     "nblks = %"PRId64"\n",
    578 				     __func__,
    579 				     DEVICE_XNAME(rsc->sc_dksc.sc_dev),
    580 				     startblk, booted_nblks);
    581 
    582 			rdev = dkwedge_find_partition(rsc->sc_dksc.sc_dev,
    583 						      startblk,
    584 						      booted_nblks);
    585 			if (rdev) {
    586 				aprint_debug("%s: root candidate wedge %s "
    587 					     "shifted from %s\n", __func__,
    588 					     device_xname(rdev),
    589 					     dkwi[i].dkw_devname);
    590 				goto done;
    591 			} else {
    592 				aprint_debug("%s: not found\n", __func__);
    593 			}
    594 		}
    595 
    596 		aprint_debug("%s: nothing found for col %d\n", __func__, col);
    597 done:
    598 		RF_Free(dkwi, dkwi_len);
    599 	}
    600 
    601 out:
    602 	if (!rdev)
    603 		aprint_debug("%s: nothing found\n", __func__);
    604 
    605 	return rdev;
    606 }
    607 
/*
 * Configure all eligible autoconfiguration sets, then try to work out
 * which configured raid set (if any) holds the root file system and
 * point booted_device at it.  If several sets claim root and we cannot
 * disambiguate, fall back to asking the user (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/* Pass 1: configure each usable set; count the rootable ones. */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				/* Remember the last rootable set seen. */
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */
	if (num_root == 1) {
		device_t candidate_root = NULL;
		dksc = &rsc->sc_dksc;

		if (dksc->sc_dkdev.dk_nwedges != 0) {

			/* Find the wedge we booted from */
			candidate_root = rf_find_bootwedge(rsc);

			/* Try first partition */
			if (candidate_root == NULL) {
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root %s\n",
			    __func__, DEVICE_XNAME(candidate_root));
		} else {
			/* No wedges: the raid device itself is the root. */
			candidate_root = dksc->sc_dev;
		}

		aprint_debug("%s: candidate root = %s, booted_device = %s, "
			     "root_partition = %d, contains_boot=%d\n",
		    __func__, DEVICE_XNAME(candidate_root),
		    DEVICE_XNAME(booted_device), rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));

		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device = %s\n", __func__,
			    DEVICE_XNAME(booted_device));
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %s\n", __func__, num_root,
		    DEVICE_XNAME(booted_device));

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Pass 2: of the rootable sets, pick the one that
		   actually contains the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    736 
    737 static int
    738 raidsize(dev_t dev)
    739 {
    740 	struct raid_softc *rs;
    741 	struct dk_softc *dksc;
    742 	unsigned int unit;
    743 
    744 	unit = raidunit(dev);
    745 	if ((rs = raidget(unit, false)) == NULL)
    746 		return -1;
    747 	dksc = &rs->sc_dksc;
    748 
    749 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    750 		return -1;
    751 
    752 	return dk_size(dksc, dev);
    753 }
    754 
    755 static int
    756 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    757 {
    758 	unsigned int unit;
    759 	struct raid_softc *rs;
    760 	struct dk_softc *dksc;
    761 
    762 	unit = raidunit(dev);
    763 	if ((rs = raidget(unit, false)) == NULL)
    764 		return ENXIO;
    765 	dksc = &rs->sc_dksc;
    766 
    767 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    768 		return ENODEV;
    769 
    770         /*
    771            Note that blkno is relative to this particular partition.
    772            By adding adding RF_PROTECTED_SECTORS, we get a value that
    773 	   is relative to the partition used for the underlying component.
    774         */
    775 	blkno += RF_PROTECTED_SECTORS;
    776 
    777 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    778 }
    779 
/*
 * dk(9) dumpblocks hook: write `nblk' blocks at `blkno' from `va'
 * directly to a single live component.  Only RAID 1 sets are supported,
 * since only there does one component hold a full copy of the data.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if
			   any) this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump off to the chosen component's block driver. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    885 
    886 /* ARGSUSED */
    887 static int
    888 raidopen(dev_t dev, int flags, int fmt,
    889     struct lwp *l)
    890 {
    891 	int     unit = raidunit(dev);
    892 	struct raid_softc *rs;
    893 	struct dk_softc *dksc;
    894 	int     error = 0;
    895 	int     part, pmask;
    896 
    897 	if ((rs = raidget(unit, true)) == NULL)
    898 		return ENXIO;
    899 	if ((error = raidlock(rs)) != 0)
    900 		return (error);
    901 
    902 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    903 		error = EBUSY;
    904 		goto bad;
    905 	}
    906 
    907 	dksc = &rs->sc_dksc;
    908 
    909 	part = DISKPART(dev);
    910 	pmask = (1 << part);
    911 
    912 	if (!DK_BUSY(dksc, pmask) &&
    913 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    914 		/* First one... mark things as dirty... Note that we *MUST*
    915 		 have done a configure before this.  I DO NOT WANT TO BE
    916 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    917 		 THAT THEY BELONG TOGETHER!!!!! */
    918 		/* XXX should check to see if we're only open for reading
    919 		   here... If so, we needn't do this, but then need some
    920 		   other way of keeping track of what's happened.. */
    921 
    922 		rf_markalldirty(&rs->sc_r);
    923 	}
    924 
    925 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    926 		error = dk_open(dksc, dev, flags, fmt, l);
    927 
    928 bad:
    929 	raidunlock(rs);
    930 
    931 	return (error);
    932 
    933 
    934 }
    935 
    936 static int
    937 raid_lastclose(device_t self)
    938 {
    939 	struct raid_softc *rs = raidsoftc(self);
    940 
    941 	/* Last one... device is not unconfigured yet.
    942 	   Device shutdown has taken care of setting the
    943 	   clean bits if RAIDF_INITED is not set
    944 	   mark things as clean... */
    945 
    946 	rf_update_component_labels(&rs->sc_r,
    947 	    RF_FINAL_COMPONENT_UPDATE);
    948 
    949 	/* pass to unlocked code */
    950 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    951 		rs->sc_flags |= RAIDF_DETACH;
    952 
    953 	return 0;
    954 }
    955 
    956 /* ARGSUSED */
    957 static int
    958 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    959 {
    960 	int     unit = raidunit(dev);
    961 	struct raid_softc *rs;
    962 	struct dk_softc *dksc;
    963 	cfdata_t cf;
    964 	int     error = 0, do_detach = 0, do_put = 0;
    965 
    966 	if ((rs = raidget(unit, false)) == NULL)
    967 		return ENXIO;
    968 	dksc = &rs->sc_dksc;
    969 
    970 	if ((error = raidlock(rs)) != 0)
    971 		return (error);
    972 
    973 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    974 		error = dk_close(dksc, dev, flags, fmt, l);
    975 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    976 			do_detach = 1;
    977 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    978 		do_put = 1;
    979 
    980 	raidunlock(rs);
    981 
    982 	if (do_detach) {
    983 		/* free the pseudo device attach bits */
    984 		cf = device_cfdata(dksc->sc_dev);
    985 		error = config_detach(dksc->sc_dev, 0);
    986 		if (error == 0)
    987 			free(cf, M_RAIDFRAME);
    988 	} else if (do_put) {
    989 		raidput(rs);
    990 	}
    991 
    992 	return (error);
    993 
    994 }
    995 
    996 static void
    997 raid_wakeup(RF_Raid_t *raidPtr)
    998 {
    999 	rf_lock_mutex2(raidPtr->iodone_lock);
   1000 	rf_signal_cond2(raidPtr->iodone_cv);
   1001 	rf_unlock_mutex2(raidPtr->iodone_lock);
   1002 }
   1003 
   1004 static void
   1005 raidstrategy(struct buf *bp)
   1006 {
   1007 	unsigned int unit;
   1008 	struct raid_softc *rs;
   1009 	struct dk_softc *dksc;
   1010 	RF_Raid_t *raidPtr;
   1011 
   1012 	unit = raidunit(bp->b_dev);
   1013 	if ((rs = raidget(unit, false)) == NULL) {
   1014 		bp->b_error = ENXIO;
   1015 		goto fail;
   1016 	}
   1017 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1018 		bp->b_error = ENXIO;
   1019 		goto fail;
   1020 	}
   1021 	dksc = &rs->sc_dksc;
   1022 	raidPtr = &rs->sc_r;
   1023 
   1024 	/* Queue IO only */
   1025 	if (dk_strategy_defer(dksc, bp))
   1026 		goto done;
   1027 
   1028 	/* schedule the IO to happen at the next convenient time */
   1029 	raid_wakeup(raidPtr);
   1030 
   1031 done:
   1032 	return;
   1033 
   1034 fail:
   1035 	bp->b_resid = bp->b_bcount;
   1036 	biodone(bp);
   1037 }
   1038 
   1039 static int
   1040 raid_diskstart(device_t dev, struct buf *bp)
   1041 {
   1042 	struct raid_softc *rs = raidsoftc(dev);
   1043 	RF_Raid_t *raidPtr;
   1044 
   1045 	raidPtr = &rs->sc_r;
   1046 	if (!raidPtr->valid) {
   1047 		db1_printf(("raid is not valid..\n"));
   1048 		return ENODEV;
   1049 	}
   1050 
   1051 	/* XXX */
   1052 	bp->b_resid = 0;
   1053 
   1054 	return raiddoaccess(raidPtr, bp);
   1055 }
   1056 
   1057 void
   1058 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
   1059 {
   1060 	struct raid_softc *rs;
   1061 	struct dk_softc *dksc;
   1062 
   1063 	rs = raidPtr->softc;
   1064 	dksc = &rs->sc_dksc;
   1065 
   1066 	dk_done(dksc, bp);
   1067 
   1068 	rf_lock_mutex2(raidPtr->mutex);
   1069 	raidPtr->openings++;
   1070 	rf_unlock_mutex2(raidPtr->mutex);
   1071 
   1072 	/* schedule more IO */
   1073 	raid_wakeup(raidPtr);
   1074 }
   1075 
   1076 /* ARGSUSED */
   1077 static int
   1078 raidread(dev_t dev, struct uio *uio, int flags)
   1079 {
   1080 	int     unit = raidunit(dev);
   1081 	struct raid_softc *rs;
   1082 
   1083 	if ((rs = raidget(unit, false)) == NULL)
   1084 		return ENXIO;
   1085 
   1086 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1087 		return (ENXIO);
   1088 
   1089 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
   1090 
   1091 }
   1092 
   1093 /* ARGSUSED */
   1094 static int
   1095 raidwrite(dev_t dev, struct uio *uio, int flags)
   1096 {
   1097 	int     unit = raidunit(dev);
   1098 	struct raid_softc *rs;
   1099 
   1100 	if ((rs = raidget(unit, false)) == NULL)
   1101 		return ENXIO;
   1102 
   1103 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1104 		return (ENXIO);
   1105 
   1106 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1107 
   1108 }
   1109 
   1110 static int
   1111 raid_detach_unlocked(struct raid_softc *rs)
   1112 {
   1113 	struct dk_softc *dksc = &rs->sc_dksc;
   1114 	RF_Raid_t *raidPtr;
   1115 	int error;
   1116 
   1117 	raidPtr = &rs->sc_r;
   1118 
   1119 	if (DK_BUSY(dksc, 0) ||
   1120 	    raidPtr->recon_in_progress != 0 ||
   1121 	    raidPtr->parity_rewrite_in_progress != 0 ||
   1122 	    raidPtr->copyback_in_progress != 0)
   1123 		return EBUSY;
   1124 
   1125 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1126 		return 0;
   1127 
   1128 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1129 
   1130 	if ((error = rf_Shutdown(raidPtr)) != 0)
   1131 		return error;
   1132 
   1133 	rs->sc_flags &= ~RAIDF_INITED;
   1134 
   1135 	/* Kill off any queued buffers */
   1136 	dk_drain(dksc);
   1137 	bufq_free(dksc->sc_bufq);
   1138 
   1139 	/* Detach the disk. */
   1140 	dkwedge_delall(&dksc->sc_dkdev);
   1141 	disk_detach(&dksc->sc_dkdev);
   1142 	disk_destroy(&dksc->sc_dkdev);
   1143 	dk_detach(dksc);
   1144 
   1145 	return 0;
   1146 }
   1147 
   1148 static bool
   1149 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1150 {
   1151 	switch (cmd) {
   1152 	case RAIDFRAME_ADD_HOT_SPARE:
   1153 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1154 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1155 	case RAIDFRAME_CHECK_PARITY:
   1156 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1157 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1158 	case RAIDFRAME_CHECK_RECON_STATUS:
   1159 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1160 	case RAIDFRAME_COPYBACK:
   1161 	case RAIDFRAME_DELETE_COMPONENT:
   1162 	case RAIDFRAME_FAIL_DISK:
   1163 	case RAIDFRAME_GET_ACCTOTALS:
   1164 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1165 	case RAIDFRAME_GET_INFO:
   1166 	case RAIDFRAME_GET_SIZE:
   1167 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1168 	case RAIDFRAME_INIT_LABELS:
   1169 	case RAIDFRAME_KEEP_ACCTOTALS:
   1170 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1171 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1172 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1173 	case RAIDFRAME_PARITYMAP_STATUS:
   1174 	case RAIDFRAME_REBUILD_IN_PLACE:
   1175 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1176 	case RAIDFRAME_RESET_ACCTOTALS:
   1177 	case RAIDFRAME_REWRITEPARITY:
   1178 	case RAIDFRAME_SET_AUTOCONFIG:
   1179 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1180 	case RAIDFRAME_SET_LAST_UNIT:
   1181 	case RAIDFRAME_SET_ROOT:
   1182 	case RAIDFRAME_SHUTDOWN:
   1183 		return (rs->sc_flags & RAIDF_INITED) == 0;
   1184 	}
   1185 	return false;
   1186 }
   1187 
   1188 int
   1189 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
   1190 {
   1191 	struct rf_recon_req_internal *rrint;
   1192 
   1193 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1194 		/* Can't do this on a RAID 0!! */
   1195 		return EINVAL;
   1196 	}
   1197 
   1198 	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
   1199 		/* bad column */
   1200 		return EINVAL;
   1201 	}
   1202 
   1203 	rf_lock_mutex2(raidPtr->mutex);
   1204 	if (raidPtr->status == rf_rs_reconstructing) {
   1205 		/* you can't fail a disk while we're reconstructing! */
   1206 		/* XXX wrong for RAID6 */
   1207 		goto out;
   1208 	}
   1209 	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
   1210 	    (raidPtr->numFailures > 0)) {
   1211 		/* some other component has failed.  Let's not make
   1212 		   things worse. XXX wrong for RAID6 */
   1213 		goto out;
   1214 	}
   1215 	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1216 		/* Can't fail a spared disk! */
   1217 		goto out;
   1218 	}
   1219 	rf_unlock_mutex2(raidPtr->mutex);
   1220 
   1221 	/* make a copy of the recon request so that we don't rely on
   1222 	 * the user's buffer */
   1223 	rrint = RF_Malloc(sizeof(*rrint));
   1224 	if (rrint == NULL)
   1225 		return(ENOMEM);
   1226 	rrint->col = rr->col;
   1227 	rrint->flags = rr->flags;
   1228 	rrint->raidPtr = raidPtr;
   1229 
   1230 	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
   1231 	    rrint, "raid_recon");
   1232 out:
   1233 	rf_unlock_mutex2(raidPtr->mutex);
   1234 	return EINVAL;
   1235 }
   1236 
   1237 static int
   1238 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1239 {
   1240 	/* allocate a buffer for the layout-specific data, and copy it in */
   1241 	if (k_cfg->layoutSpecificSize == 0)
   1242 		return 0;
   1243 
   1244 	if (k_cfg->layoutSpecificSize > 10000) {
   1245 	    /* sanity check */
   1246 	    return EINVAL;
   1247 	}
   1248 
   1249 	u_char *specific_buf;
   1250 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1251 	if (specific_buf == NULL)
   1252 		return ENOMEM;
   1253 
   1254 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1255 	    k_cfg->layoutSpecificSize);
   1256 	if (retcode) {
   1257 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1258 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1259 		return retcode;
   1260 	}
   1261 
   1262 	k_cfg->layoutSpecific = specific_buf;
   1263 	return 0;
   1264 }
   1265 
   1266 static int
   1267 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1268 {
   1269 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1270 
   1271 	if (rs->sc_r.valid) {
   1272 		/* There is a valid RAID set running on this unit! */
   1273 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1274 		return EINVAL;
   1275 	}
   1276 
   1277 	/* copy-in the configuration information */
   1278 	/* data points to a pointer to the configuration structure */
   1279 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1280 	if (*k_cfg == NULL) {
   1281 		return ENOMEM;
   1282 	}
   1283 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1284 	if (retcode == 0)
   1285 		return 0;
   1286 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1287 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1288 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1289 	return retcode;
   1290 }
   1291 
   1292 int
   1293 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1294 {
   1295 	int retcode, i;
   1296 	RF_Raid_t *raidPtr = &rs->sc_r;
   1297 
   1298 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1299 
   1300 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1301 		goto out;
   1302 
   1303 	/* should do some kind of sanity check on the configuration.
   1304 	 * Store the sum of all the bytes in the last byte? */
   1305 
   1306 	/* Force nul-termination on all strings. */
   1307 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
   1308 	for (i = 0; i < RF_MAXCOL; i++) {
   1309 		ZERO_FINAL(k_cfg->devnames[0][i]);
   1310 	}
   1311 	for (i = 0; i < RF_MAXSPARE; i++) {
   1312 		ZERO_FINAL(k_cfg->spare_names[i]);
   1313 	}
   1314 	for (i = 0; i < RF_MAXDBGV; i++) {
   1315 		ZERO_FINAL(k_cfg->debugVars[i]);
   1316 	}
   1317 #undef ZERO_FINAL
   1318 
   1319 	/* Check some basic limits. */
   1320 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
   1321 		retcode = EINVAL;
   1322 		goto out;
   1323 	}
   1324 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
   1325 		retcode = EINVAL;
   1326 		goto out;
   1327 	}
   1328 
   1329 	/* configure the system */
   1330 
   1331 	/*
   1332 	 * Clear the entire RAID descriptor, just to make sure
   1333 	 *  there is no stale data left in the case of a
   1334 	 *  reconfiguration
   1335 	 */
   1336 	memset(raidPtr, 0, sizeof(*raidPtr));
   1337 	raidPtr->softc = rs;
   1338 	raidPtr->raidid = rs->sc_unit;
   1339 
   1340 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1341 
   1342 	if (retcode == 0) {
   1343 		/* allow this many simultaneous IO's to
   1344 		   this RAID device */
   1345 		raidPtr->openings = RAIDOUTSTANDING;
   1346 
   1347 		raidinit(rs);
   1348 		raid_wakeup(raidPtr);
   1349 		rf_markalldirty(raidPtr);
   1350 	}
   1351 
   1352 	/* free the buffers.  No return code here. */
   1353 	if (k_cfg->layoutSpecificSize) {
   1354 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1355 	}
   1356 out:
   1357 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1358 	if (retcode) {
   1359 		/*
   1360 		 * If configuration failed, set sc_flags so that we
   1361 		 * will detach the device when we close it.
   1362 		 */
   1363 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1364 	}
   1365 	return retcode;
   1366 }
   1367 
   1368 #if RF_DISABLED
   1369 static int
   1370 rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1371 {
   1372 
   1373 	/* XXX check the label for valid stuff... */
   1374 	/* Note that some things *should not* get modified --
   1375 	   the user should be re-initing the labels instead of
   1376 	   trying to patch things.
   1377 	   */
   1378 #ifdef DEBUG
   1379 	int raidid = raidPtr->raidid;
   1380 	printf("raid%d: Got component label:\n", raidid);
   1381 	printf("raid%d: Version: %d\n", raidid, clabel->version);
   1382 	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1383 	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1384 	printf("raid%d: Column: %d\n", raidid, clabel->column);
   1385 	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1386 	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1387 	printf("raid%d: Status: %d\n", raidid, clabel->status);
   1388 #endif	/* DEBUG */
   1389 	clabel->row = 0;
   1390 	int column = clabel->column;
   1391 
   1392 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1393 		return(EINVAL);
   1394 	}
   1395 
   1396 	/* XXX this isn't allowed to do anything for now :-) */
   1397 
   1398 	/* XXX and before it is, we need to fill in the rest
   1399 	   of the fields!?!?!?! */
   1400 	memcpy(raidget_component_label(raidPtr, column),
   1401 	    clabel, sizeof(*clabel));
   1402 	raidflush_component_label(raidPtr, column);
   1403 	return 0;
   1404 }
   1405 #endif
   1406 
   1407 static int
   1408 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1409 {
   1410 	/*
   1411 	   we only want the serial number from
   1412 	   the above.  We get all the rest of the information
   1413 	   from the config that was used to create this RAID
   1414 	   set.
   1415 	   */
   1416 
   1417 	raidPtr->serial_number = clabel->serial_number;
   1418 
   1419 	for (int column = 0; column < raidPtr->numCol; column++) {
   1420 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1421 		if (RF_DEAD_DISK(diskPtr->status))
   1422 			continue;
   1423 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1424 		    raidPtr, column);
   1425 		/* Zeroing this is important. */
   1426 		memset(ci_label, 0, sizeof(*ci_label));
   1427 		raid_init_component_label(raidPtr, ci_label);
   1428 		ci_label->serial_number = raidPtr->serial_number;
   1429 		ci_label->row = 0; /* we dont' pretend to support more */
   1430 		rf_component_label_set_partitionsize(ci_label,
   1431 		    diskPtr->partitionSize);
   1432 		ci_label->column = column;
   1433 		raidflush_component_label(raidPtr, column);
   1434 		/* XXXjld what about the spares? */
   1435 	}
   1436 
   1437 	return 0;
   1438 }
   1439 
   1440 static int
   1441 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
   1442 {
   1443 
   1444 	if (raidPtr->Layout.map->faultsTolerated == 0) {
   1445 		/* Can't do this on a RAID 0!! */
   1446 		return EINVAL;
   1447 	}
   1448 
   1449 	if (raidPtr->recon_in_progress == 1) {
   1450 		/* a reconstruct is already in progress! */
   1451 		return EINVAL;
   1452 	}
   1453 
   1454 	RF_SingleComponent_t component;
   1455 	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1456 	component.row = 0; /* we don't support any more */
   1457 	int column = component.column;
   1458 
   1459 	if ((column < 0) || (column >= raidPtr->numCol)) {
   1460 		return EINVAL;
   1461 	}
   1462 
   1463 	rf_lock_mutex2(raidPtr->mutex);
   1464 	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1465 	    (raidPtr->numFailures > 0)) {
   1466 		/* XXX 0 above shouldn't be constant!!! */
   1467 		/* some component other than this has failed.
   1468 		   Let's not make things worse than they already
   1469 		   are... */
   1470 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1471 		       raidPtr->raidid);
   1472 		printf("raid%d:     Col: %d   Too many failures.\n",
   1473 		       raidPtr->raidid, column);
   1474 		rf_unlock_mutex2(raidPtr->mutex);
   1475 		return EINVAL;
   1476 	}
   1477 
   1478 	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
   1479 		printf("raid%d: Unable to reconstruct to disk at:\n",
   1480 		       raidPtr->raidid);
   1481 		printf("raid%d:    Col: %d   "
   1482 		    "Reconstruction already occurring!\n",
   1483 		    raidPtr->raidid, column);
   1484 
   1485 		rf_unlock_mutex2(raidPtr->mutex);
   1486 		return EINVAL;
   1487 	}
   1488 
   1489 	if (raidPtr->Disks[column].status == rf_ds_spared) {
   1490 		rf_unlock_mutex2(raidPtr->mutex);
   1491 		return EINVAL;
   1492 	}
   1493 
   1494 	rf_unlock_mutex2(raidPtr->mutex);
   1495 
   1496 	struct rf_recon_req_internal *rrint;
   1497 	rrint = RF_Malloc(sizeof(*rrint));
   1498 	if (rrint == NULL)
   1499 		return ENOMEM;
   1500 
   1501 	rrint->col = column;
   1502 	rrint->raidPtr = raidPtr;
   1503 
   1504 	return RF_CREATE_THREAD(raidPtr->recon_thread,
   1505 	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
   1506 }
   1507 
   1508 static int
   1509 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1510 {
   1511 	/*
   1512 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1513 	 * so tell the user it's done.
   1514 	 */
   1515 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1516 	    raidPtr->status != rf_rs_reconstructing) {
   1517 		*data = 100;
   1518 		return 0;
   1519 	}
   1520 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1521 		*data = 0;
   1522 		return 0;
   1523 	}
   1524 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1525 	    / raidPtr->reconControl->numRUsTotal);
   1526 	return 0;
   1527 }
   1528 
   1529 /*
   1530  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
   1531  * on the component_name[] array.
   1532  */
   1533 static void
   1534 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
   1535 {
   1536 
   1537 	memcpy(component, data, sizeof *component);
   1538 	component->component_name[sizeof(component->component_name) - 1] = '\0';
   1539 }
   1540 
   1541 static int
   1542 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1543 {
   1544 	int     unit = raidunit(dev);
   1545 	int     part, pmask;
   1546 	struct raid_softc *rs;
   1547 	struct dk_softc *dksc;
   1548 	RF_Config_t *k_cfg;
   1549 	RF_Raid_t *raidPtr;
   1550 	RF_AccTotals_t *totals;
   1551 	RF_SingleComponent_t component;
   1552 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1553 	int retcode = 0;
   1554 	int column;
   1555 	RF_ComponentLabel_t *clabel;
   1556 	int d;
   1557 
   1558 	if ((rs = raidget(unit, false)) == NULL)
   1559 		return ENXIO;
   1560 
   1561 	dksc = &rs->sc_dksc;
   1562 	raidPtr = &rs->sc_r;
   1563 
   1564 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1565 	    (int) DISKPART(dev), (int) unit, cmd));
   1566 
   1567 	/* Must be initialized for these... */
   1568 	if (rf_must_be_initialized(rs, cmd))
   1569 		return ENXIO;
   1570 
   1571 	switch (cmd) {
   1572 		/* configure the system */
   1573 	case RAIDFRAME_CONFIGURE:
   1574 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1575 			return retcode;
   1576 		return rf_construct(rs, k_cfg);
   1577 
   1578 		/* shutdown the system */
   1579 	case RAIDFRAME_SHUTDOWN:
   1580 
   1581 		part = DISKPART(dev);
   1582 		pmask = (1 << part);
   1583 
   1584 		if ((retcode = raidlock(rs)) != 0)
   1585 			return retcode;
   1586 
   1587 		if (DK_BUSY(dksc, pmask) ||
   1588 		    raidPtr->recon_in_progress != 0 ||
   1589 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1590 		    raidPtr->copyback_in_progress != 0)
   1591 			retcode = EBUSY;
   1592 		else {
   1593 			/* detach and free on close */
   1594 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1595 			retcode = 0;
   1596 		}
   1597 
   1598 		raidunlock(rs);
   1599 
   1600 		return retcode;
   1601 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1602 		return rf_get_component_label(raidPtr, data);
   1603 
   1604 #if RF_DISABLED
   1605 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1606 		return rf_set_component_label(raidPtr, data);
   1607 #endif
   1608 
   1609 	case RAIDFRAME_INIT_LABELS:
   1610 		return rf_init_component_label(raidPtr, data);
   1611 
   1612 	case RAIDFRAME_SET_AUTOCONFIG:
   1613 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1614 		printf("raid%d: New autoconfig value is: %d\n",
   1615 		       raidPtr->raidid, d);
   1616 		*(int *) data = d;
   1617 		return retcode;
   1618 
   1619 	case RAIDFRAME_SET_ROOT:
   1620 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1621 		printf("raid%d: New rootpartition value is: %d\n",
   1622 		       raidPtr->raidid, d);
   1623 		*(int *) data = d;
   1624 		return retcode;
   1625 
   1626 		/* initialize all parity */
   1627 	case RAIDFRAME_REWRITEPARITY:
   1628 
   1629 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1630 			/* Parity for RAID 0 is trivially correct */
   1631 			raidPtr->parity_good = RF_RAID_CLEAN;
   1632 			return 0;
   1633 		}
   1634 
   1635 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1636 			/* Re-write is already in progress! */
   1637 			return EINVAL;
   1638 		}
   1639 
   1640 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1641 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1642 
   1643 	case RAIDFRAME_ADD_HOT_SPARE:
   1644 		rf_copy_single_component(&component, data);
   1645 		return rf_add_hot_spare(raidPtr, &component);
   1646 
   1647 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1648 		return retcode;
   1649 
   1650 	case RAIDFRAME_DELETE_COMPONENT:
   1651 		rf_copy_single_component(&component, data);
   1652 		return rf_delete_component(raidPtr, &component);
   1653 
   1654 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1655 		rf_copy_single_component(&component, data);
   1656 		return rf_incorporate_hot_spare(raidPtr, &component);
   1657 
   1658 	case RAIDFRAME_REBUILD_IN_PLACE:
   1659 		return rf_rebuild_in_place(raidPtr, data);
   1660 
   1661 	case RAIDFRAME_GET_INFO:
   1662 		ucfgp = *(RF_DeviceConfig_t **)data;
   1663 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1664 		if (d_cfg == NULL)
   1665 			return ENOMEM;
   1666 		retcode = rf_get_info(raidPtr, d_cfg);
   1667 		if (retcode == 0) {
   1668 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1669 		}
   1670 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1671 		return retcode;
   1672 
   1673 	case RAIDFRAME_CHECK_PARITY:
   1674 		*(int *) data = raidPtr->parity_good;
   1675 		return 0;
   1676 
   1677 	case RAIDFRAME_PARITYMAP_STATUS:
   1678 		if (rf_paritymap_ineligible(raidPtr))
   1679 			return EINVAL;
   1680 		rf_paritymap_status(raidPtr->parity_map, data);
   1681 		return 0;
   1682 
   1683 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1684 		if (rf_paritymap_ineligible(raidPtr))
   1685 			return EINVAL;
   1686 		if (raidPtr->parity_map == NULL)
   1687 			return ENOENT; /* ??? */
   1688 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1689 			return EINVAL;
   1690 		return 0;
   1691 
   1692 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1693 		if (rf_paritymap_ineligible(raidPtr))
   1694 			return EINVAL;
   1695 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1696 		return 0;
   1697 
   1698 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1699 		if (rf_paritymap_ineligible(raidPtr))
   1700 			return EINVAL;
   1701 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1702 		/* XXX should errors be passed up? */
   1703 		return 0;
   1704 
   1705 	case RAIDFRAME_RESET_ACCTOTALS:
   1706 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1707 		return 0;
   1708 
   1709 	case RAIDFRAME_GET_ACCTOTALS:
   1710 		totals = (RF_AccTotals_t *) data;
   1711 		*totals = raidPtr->acc_totals;
   1712 		return 0;
   1713 
   1714 	case RAIDFRAME_KEEP_ACCTOTALS:
   1715 		raidPtr->keep_acc_totals = *(int *)data;
   1716 		return 0;
   1717 
   1718 	case RAIDFRAME_GET_SIZE:
   1719 		*(int *) data = raidPtr->totalSectors;
   1720 		return 0;
   1721 
   1722 	case RAIDFRAME_FAIL_DISK:
   1723 		return rf_fail_disk(raidPtr, data);
   1724 
   1725 		/* invoke a copyback operation after recon on whatever disk
   1726 		 * needs it, if any */
   1727 	case RAIDFRAME_COPYBACK:
   1728 
   1729 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1730 			/* This makes no sense on a RAID 0!! */
   1731 			return EINVAL;
   1732 		}
   1733 
   1734 		if (raidPtr->copyback_in_progress == 1) {
   1735 			/* Copyback is already in progress! */
   1736 			return EINVAL;
   1737 		}
   1738 
   1739 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1740 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1741 
   1742 		/* return the percentage completion of reconstruction */
   1743 	case RAIDFRAME_CHECK_RECON_STATUS:
   1744 		return rf_check_recon_status(raidPtr, data);
   1745 
   1746 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1747 		rf_check_recon_status_ext(raidPtr, data);
   1748 		return 0;
   1749 
   1750 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1751 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1752 			/* This makes no sense on a RAID 0, so tell the
   1753 			   user it's done. */
   1754 			*(int *) data = 100;
   1755 			return 0;
   1756 		}
   1757 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1758 			*(int *) data = 100 *
   1759 				raidPtr->parity_rewrite_stripes_done /
   1760 				raidPtr->Layout.numStripe;
   1761 		} else {
   1762 			*(int *) data = 100;
   1763 		}
   1764 		return 0;
   1765 
   1766 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1767 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1768 		return 0;
   1769 
   1770 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1771 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1772 			/* This makes no sense on a RAID 0 */
   1773 			*(int *) data = 100;
   1774 			return 0;
   1775 		}
   1776 		if (raidPtr->copyback_in_progress == 1) {
   1777 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1778 				raidPtr->Layout.numStripe;
   1779 		} else {
   1780 			*(int *) data = 100;
   1781 		}
   1782 		return 0;
   1783 
   1784 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1785 		rf_check_copyback_status_ext(raidPtr, data);
   1786 		return 0;
   1787 
   1788 	case RAIDFRAME_SET_LAST_UNIT:
   1789 		for (column = 0; column < raidPtr->numCol; column++)
   1790 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1791 				return EBUSY;
   1792 
   1793 		for (column = 0; column < raidPtr->numCol; column++) {
   1794 			clabel = raidget_component_label(raidPtr, column);
   1795 			clabel->last_unit = *(int *)data;
   1796 			raidflush_component_label(raidPtr, column);
   1797 		}
   1798 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1799 		return 0;
   1800 
   1801 		/* the sparetable daemon calls this to wait for the kernel to
   1802 		 * need a spare table. this ioctl does not return until a
   1803 		 * spare table is needed. XXX -- calling mpsleep here in the
   1804 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1805 		 * -- I should either compute the spare table in the kernel,
   1806 		 * or have a different -- XXX XXX -- interface (a different
   1807 		 * character device) for delivering the table     -- XXX */
   1808 #if RF_DISABLED
   1809 	case RAIDFRAME_SPARET_WAIT:
   1810 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1811 		while (!rf_sparet_wait_queue)
   1812 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1813 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1814 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1815 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1816 
   1817 		/* structure assignment */
   1818 		*((RF_SparetWait_t *) data) = *waitreq;
   1819 
   1820 		RF_Free(waitreq, sizeof(*waitreq));
   1821 		return 0;
   1822 
   1823 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1824 		 * code in it that will cause the dameon to exit */
   1825 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1826 		waitreq = RF_Malloc(sizeof(*waitreq));
   1827 		waitreq->fcol = -1;
   1828 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1829 		waitreq->next = rf_sparet_wait_queue;
   1830 		rf_sparet_wait_queue = waitreq;
   1831 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1832 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1833 		return 0;
   1834 
   1835 		/* used by the spare table daemon to deliver a spare table
   1836 		 * into the kernel */
   1837 	case RAIDFRAME_SEND_SPARET:
   1838 
   1839 		/* install the spare table */
   1840 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1841 
   1842 		/* respond to the requestor.  the return status of the spare
   1843 		 * table installation is passed in the "fcol" field */
   1844 		waitred = RF_Malloc(sizeof(*waitreq));
   1845 		waitreq->fcol = retcode;
   1846 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1847 		waitreq->next = rf_sparet_resp_queue;
   1848 		rf_sparet_resp_queue = waitreq;
   1849 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1850 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1851 
   1852 		return retcode;
   1853 #endif
   1854 	default:
   1855 		/*
   1856 		 * Don't bother trying to load compat modules
   1857 		 * if it is not our ioctl. This is more efficient
   1858 		 * and makes rump tests not depend on compat code
   1859 		 */
   1860 		if (IOCGROUP(cmd) != 'r')
   1861 			break;
   1862 #ifdef _LP64
   1863 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1864 			module_autoload("compat_netbsd32_raid",
   1865 			    MODULE_CLASS_EXEC);
   1866 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1867 			    (rs, cmd, data), enosys(), retcode);
   1868 			if (retcode != EPASSTHROUGH)
   1869 				return retcode;
   1870 		}
   1871 #endif
   1872 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1873 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1874 		    (rs, cmd, data), enosys(), retcode);
   1875 		if (retcode != EPASSTHROUGH)
   1876 			return retcode;
   1877 
   1878 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1879 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1880 		    (rs, cmd, data), enosys(), retcode);
   1881 		if (retcode != EPASSTHROUGH)
   1882 			return retcode;
   1883 		break; /* fall through to the os-specific code below */
   1884 
   1885 	}
   1886 
   1887 	if (!raidPtr->valid)
   1888 		return (EINVAL);
   1889 
   1890 	/*
   1891 	 * Add support for "regular" device ioctls here.
   1892 	 */
   1893 
   1894 	switch (cmd) {
   1895 	case DIOCGCACHE:
   1896 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1897 		break;
   1898 
   1899 	case DIOCCACHESYNC:
   1900 		retcode = rf_sync_component_caches(raidPtr);
   1901 		break;
   1902 
   1903 	default:
   1904 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1905 		break;
   1906 	}
   1907 
   1908 	return (retcode);
   1909 
   1910 }
   1911 
   1912 
   1913 /* raidinit -- complete the rest of the initialization for the
   1914    RAIDframe device.  */
   1915 
   1916 
   1917 static void
   1918 raidinit(struct raid_softc *rs)
   1919 {
   1920 	cfdata_t cf;
   1921 	unsigned int unit;
   1922 	struct dk_softc *dksc = &rs->sc_dksc;
   1923 	RF_Raid_t *raidPtr = &rs->sc_r;
   1924 	device_t dev;
   1925 
   1926 	unit = raidPtr->raidid;
   1927 
   1928 	/* XXX doesn't check bounds. */
   1929 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
   1930 
   1931 	/* attach the pseudo device */
   1932 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1933 	cf->cf_name = raid_cd.cd_name;
   1934 	cf->cf_atname = raid_cd.cd_name;
   1935 	cf->cf_unit = unit;
   1936 	cf->cf_fstate = FSTATE_STAR;
   1937 
   1938 	dev = config_attach_pseudo(cf);
   1939 	if (dev == NULL) {
   1940 		printf("raid%d: config_attach_pseudo failed\n",
   1941 		    raidPtr->raidid);
   1942 		free(cf, M_RAIDFRAME);
   1943 		return;
   1944 	}
   1945 
   1946 	/* provide a backpointer to the real softc */
   1947 	raidsoftc(dev) = rs;
   1948 
   1949 	/* disk_attach actually creates space for the CPU disklabel, among
   1950 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1951 	 * with disklabels. */
   1952 	dk_init(dksc, dev, DKTYPE_RAID);
   1953 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1954 
   1955 	/* XXX There may be a weird interaction here between this, and
   1956 	 * protectedSectors, as used in RAIDframe.  */
   1957 
   1958 	rs->sc_size = raidPtr->totalSectors;
   1959 
   1960 	/* Attach dk and disk subsystems */
   1961 	dk_attach(dksc);
   1962 	disk_attach(&dksc->sc_dkdev);
   1963 	rf_set_geometry(rs, raidPtr);
   1964 
   1965 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
   1966 
   1967 	/* mark unit as usuable */
   1968 	rs->sc_flags |= RAIDF_INITED;
   1969 
   1970 	dkwedge_discover(&dksc->sc_dkdev);
   1971 }
   1972 
   1973 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1974 /* wake up the daemon & tell it to get us a spare table
   1975  * XXX
   1976  * the entries in the queues should be tagged with the raidPtr
   1977  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1979  * XXX
   1980  *
   1981  * XXX This code is not currently used. GO
   1982  */
   1983 int
   1984 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1985 {
   1986 	int     retcode;
   1987 
   1988 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1989 	req->next = rf_sparet_wait_queue;
   1990 	rf_sparet_wait_queue = req;
   1991 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1992 
   1993 	/* mpsleep unlocks the mutex */
   1994 	while (!rf_sparet_resp_queue) {
   1995 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1996 	}
   1997 	req = rf_sparet_resp_queue;
   1998 	rf_sparet_resp_queue = req->next;
   1999 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   2000 
   2001 	retcode = req->fcol;
   2002 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2003 					 * alloc'd */
   2004 	return (retcode);
   2005 }
   2006 #endif
   2007 
   2008 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2009  * bp & passes it down.
   2010  * any calls originating in the kernel must use non-blocking I/O
   2011  * do some extra sanity checking to return "appropriate" error values for
   2012  * certain conditions (to make some standard utilities work)
   2013  *
   2014  * Formerly known as: rf_DoAccessKernel
   2015  */
   2016 void
   2017 raidstart(RF_Raid_t *raidPtr)
   2018 {
   2019 	struct raid_softc *rs;
   2020 	struct dk_softc *dksc;
   2021 
   2022 	rs = raidPtr->softc;
   2023 	dksc = &rs->sc_dksc;
   2024 	/* quick check to see if anything has died recently */
   2025 	rf_lock_mutex2(raidPtr->mutex);
   2026 	if (raidPtr->numNewFailures > 0) {
   2027 		rf_unlock_mutex2(raidPtr->mutex);
   2028 		rf_update_component_labels(raidPtr,
   2029 					   RF_NORMAL_COMPONENT_UPDATE);
   2030 		rf_lock_mutex2(raidPtr->mutex);
   2031 		raidPtr->numNewFailures--;
   2032 	}
   2033 	rf_unlock_mutex2(raidPtr->mutex);
   2034 
   2035 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   2036 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   2037 		return;
   2038 	}
   2039 
   2040 	dk_start(dksc, NULL);
   2041 }
   2042 
   2043 static int
   2044 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   2045 {
   2046 	RF_SectorCount_t num_blocks, pb, sum;
   2047 	RF_RaidAddr_t raid_addr;
   2048 	daddr_t blocknum;
   2049 	int     do_async;
   2050 	int rc;
   2051 
   2052 	rf_lock_mutex2(raidPtr->mutex);
   2053 	if (raidPtr->openings == 0) {
   2054 		rf_unlock_mutex2(raidPtr->mutex);
   2055 		return EAGAIN;
   2056 	}
   2057 	rf_unlock_mutex2(raidPtr->mutex);
   2058 
   2059 	blocknum = bp->b_rawblkno;
   2060 
   2061 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2062 		    (int) blocknum));
   2063 
   2064 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2065 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2066 
   2067 	/* *THIS* is where we adjust what block we're going to...
   2068 	 * but DO NOT TOUCH bp->b_blkno!!! */
   2069 	raid_addr = blocknum;
   2070 
   2071 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2072 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2073 	sum = raid_addr + num_blocks + pb;
   2074 	if (1 || rf_debugKernelAccess) {
   2075 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2076 			    (int) raid_addr, (int) sum, (int) num_blocks,
   2077 			    (int) pb, (int) bp->b_resid));
   2078 	}
   2079 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2080 	    || (sum < num_blocks) || (sum < pb)) {
   2081 		rc = ENOSPC;
   2082 		goto done;
   2083 	}
   2084 	/*
   2085 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2086 	 */
   2087 
   2088 	if (bp->b_bcount & raidPtr->sectorMask) {
   2089 		rc = ENOSPC;
   2090 		goto done;
   2091 	}
   2092 	db1_printf(("Calling DoAccess..\n"));
   2093 
   2094 
   2095 	rf_lock_mutex2(raidPtr->mutex);
   2096 	raidPtr->openings--;
   2097 	rf_unlock_mutex2(raidPtr->mutex);
   2098 
   2099 	/*
   2100 	 * Everything is async.
   2101 	 */
   2102 	do_async = 1;
   2103 
   2104 	/* don't ever condition on bp->b_flags & B_WRITE.
   2105 	 * always condition on B_READ instead */
   2106 
   2107 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2108 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2109 			 do_async, raid_addr, num_blocks,
   2110 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2111 
   2112 done:
   2113 	return rc;
   2114 }
   2115 
/*
 * Invoke an I/O from kernel mode.  The disk queue mutex must be held on
 * entry; it is dropped around bdev_strategy() (which can block) and then
 * retaken.  Completion is reported asynchronously through
 * KernelWakeupFunc().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* Fake an already-complete I/O so the normal completion
		 * path (KernelWakeupFunc) runs. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Point the buf at the component device and route its
		 * completion to KernelWakeupFunc(). */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/*
 * Callback (b_iodone) for component I/O issued by rf_DispatchKernelIO().
 * Records the completion status in the request, possibly marks the
 * component failed on error, appends the request to the raid's iodone
 * queue and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* b_private was set to the request by InitBP()/dispatch. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead, but only mark it once, and only
		 * if doing so wouldn't leave this RAID set completely
		 * broken (i.e. we still have fault tolerance to spare). */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update from
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2260 
   2261 
   2262 /*
   2263  * initialize a buf structure for doing an I/O in the kernel.
   2264  */
/*
 * Initialize a buf structure for doing an I/O in the kernel.
 * The transfer covers numSect sectors starting at startSect on `dev';
 * on completion, cbFunc is invoked with the buf (cbArg is stored in
 * b_private for the callback to recover).
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	/* byte count = sectors scaled by the component's sector size */
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert sector address to DEV_BSIZE units for the block layer */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
   2289 
   2290 /*
   2291  * Wait interruptibly for an exclusive lock.
   2292  *
   2293  * XXX
   2294  * Several drivers do this; it should be abstracted and made MP-safe.
   2295  * (Hmm... where have we seen this warning before :->  GO )
   2296  */
   2297 static int
   2298 raidlock(struct raid_softc *rs)
   2299 {
   2300 	int     error;
   2301 
   2302 	error = 0;
   2303 	mutex_enter(&rs->sc_mutex);
   2304 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2305 		rs->sc_flags |= RAIDF_WANTED;
   2306 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2307 		if (error != 0)
   2308 			goto done;
   2309 	}
   2310 	rs->sc_flags |= RAIDF_LOCKED;
   2311 done:
   2312 	mutex_exit(&rs->sc_mutex);
   2313 	return (error);
   2314 }
   2315 /*
   2316  * Unlock and wake up any waiters.
   2317  */
   2318 static void
   2319 raidunlock(struct raid_softc *rs)
   2320 {
   2321 
   2322 	mutex_enter(&rs->sc_mutex);
   2323 	rs->sc_flags &= ~RAIDF_LOCKED;
   2324 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2325 		rs->sc_flags &= ~RAIDF_WANTED;
   2326 		cv_broadcast(&rs->sc_cv);
   2327 	}
   2328 	mutex_exit(&rs->sc_mutex);
   2329 }
   2330 
   2331 
   2332 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2333 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2334 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2335 
/*
 * Byte offset of the component info (label) area from the start of the
 * component.  Constant today; kept behind a function so it could vary.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2342 
   2343 static daddr_t
   2344 rf_component_info_size(unsigned secsize)
   2345 {
   2346 	daddr_t info_size;
   2347 
   2348 	KASSERT(secsize);
   2349 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2350 		info_size = secsize;
   2351 	else
   2352 		info_size = RF_COMPONENT_INFO_SIZE;
   2353 
   2354 	return info_size;
   2355 }
   2356 
   2357 static daddr_t
   2358 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2359 {
   2360 	daddr_t map_offset;
   2361 
   2362 	KASSERT(raidPtr->bytesPerSector);
   2363 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2364 		map_offset = raidPtr->bytesPerSector;
   2365 	else
   2366 		map_offset = RF_COMPONENT_INFO_SIZE;
   2367 	map_offset += rf_component_info_offset();
   2368 
   2369 	return map_offset;
   2370 }
   2371 
   2372 static daddr_t
   2373 rf_parity_map_size(RF_Raid_t *raidPtr)
   2374 {
   2375 	daddr_t map_size;
   2376 
   2377 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2378 		map_size = raidPtr->bytesPerSector;
   2379 	else
   2380 		map_size = RF_PARITY_MAP_SIZE;
   2381 
   2382 	return map_size;
   2383 }
   2384 
   2385 int
   2386 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2387 {
   2388 	RF_ComponentLabel_t *clabel;
   2389 
   2390 	clabel = raidget_component_label(raidPtr, col);
   2391 	clabel->clean = RF_RAID_CLEAN;
   2392 	raidflush_component_label(raidPtr, col);
   2393 	return(0);
   2394 }
   2395 
   2396 
   2397 int
   2398 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2399 {
   2400 	RF_ComponentLabel_t *clabel;
   2401 
   2402 	clabel = raidget_component_label(raidPtr, col);
   2403 	clabel->clean = RF_RAID_DIRTY;
   2404 	raidflush_component_label(raidPtr, col);
   2405 	return(0);
   2406 }
   2407 
/*
 * Read column `col's component label from disk into the in-core copy
 * (raid_cinfo[col].ci_label).  Returns 0 on success or an error from
 * the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2417 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2423 
   2424 int
   2425 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2426 {
   2427 	RF_ComponentLabel_t *label;
   2428 
   2429 	label = &raidPtr->raid_cinfo[col].ci_label;
   2430 	label->mod_counter = raidPtr->mod_counter;
   2431 #ifndef RF_NO_PARITY_MAP
   2432 	label->parity_map_modcount = label->mod_counter;
   2433 #endif
   2434 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2435 	    raidPtr->Disks[col].dev,
   2436 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2437 }
   2438 
   2439 
/*
 * Read the component label for `dev'/`b_vp' into `clabel'.  The label
 * lives in the component info area, whose on-disk size depends on the
 * component's sector size.
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2449 
   2450 /* ARGSUSED */
   2451 static int
   2452 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2453     size_t msize, daddr_t offset, daddr_t dsize)
   2454 {
   2455 	struct buf *bp;
   2456 	int error;
   2457 
   2458 	/* XXX should probably ensure that we don't try to do this if
   2459 	   someone has changed rf_protected_sectors. */
   2460 
   2461 	if (b_vp == NULL) {
   2462 		/* For whatever reason, this component is not valid.
   2463 		   Don't try to read a component label from it. */
   2464 		return(EINVAL);
   2465 	}
   2466 
   2467 	/* get a block of the appropriate size... */
   2468 	bp = geteblk((int)dsize);
   2469 	bp->b_dev = dev;
   2470 
   2471 	/* get our ducks in a row for the read */
   2472 	bp->b_blkno = offset / DEV_BSIZE;
   2473 	bp->b_bcount = dsize;
   2474 	bp->b_flags |= B_READ;
   2475  	bp->b_resid = dsize;
   2476 
   2477 	bdev_strategy(bp);
   2478 	error = biowait(bp);
   2479 
   2480 	if (!error) {
   2481 		memcpy(data, bp->b_data, msize);
   2482 	}
   2483 
   2484 	brelse(bp, 0);
   2485 	return(error);
   2486 }
   2487 
   2488 
/*
 * Write `clabel' to the component info area of `dev'/`b_vp'.
 * The write is synchronous (asyncp == 0).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2498 
   2499 /* ARGSUSED */
   2500 static int
   2501 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2502     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2503 {
   2504 	struct buf *bp;
   2505 	int error;
   2506 
   2507 	/* get a block of the appropriate size... */
   2508 	bp = geteblk((int)dsize);
   2509 	bp->b_dev = dev;
   2510 
   2511 	/* get our ducks in a row for the write */
   2512 	bp->b_blkno = offset / DEV_BSIZE;
   2513 	bp->b_bcount = dsize;
   2514 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2515  	bp->b_resid = dsize;
   2516 
   2517 	memset(bp->b_data, 0, dsize);
   2518 	memcpy(bp->b_data, data, msize);
   2519 
   2520 	bdev_strategy(bp);
   2521 	if (asyncp)
   2522 		return 0;
   2523 	error = biowait(bp);
   2524 	brelse(bp, 0);
   2525 	if (error) {
   2526 #if 1
   2527 		printf("Failed to write RAID component info!\n");
   2528 #endif
   2529 	}
   2530 
   2531 	return(error);
   2532 }
   2533 
   2534 void
   2535 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2536 {
   2537 	int c;
   2538 
   2539 	for (c = 0; c < raidPtr->numCol; c++) {
   2540 		/* Skip dead disks. */
   2541 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2542 			continue;
   2543 		/* XXXjld: what if an error occurs here? */
   2544 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2545 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2546 		    RF_PARITYMAP_NBYTE,
   2547 		    rf_parity_map_offset(raidPtr),
   2548 		    rf_parity_map_size(raidPtr), 0);
   2549 	}
   2550 }
   2551 
   2552 void
   2553 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2554 {
   2555 	struct rf_paritymap_ondisk tmp;
   2556 	int c,first;
   2557 
   2558 	first=1;
   2559 	for (c = 0; c < raidPtr->numCol; c++) {
   2560 		/* Skip dead disks. */
   2561 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2562 			continue;
   2563 		raidread_component_area(raidPtr->Disks[c].dev,
   2564 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2565 		    RF_PARITYMAP_NBYTE,
   2566 		    rf_parity_map_offset(raidPtr),
   2567 		    rf_parity_map_size(raidPtr));
   2568 		if (first) {
   2569 			memcpy(map, &tmp, sizeof(*map));
   2570 			first = 0;
   2571 		} else {
   2572 			rf_paritymap_merge(map, &tmp);
   2573 		}
   2574 	}
   2575 }
   2576 
/*
 * Mark every live component (and every in-use spare) dirty on disk.
 * Bumps the array's modification counter first so all freshly written
 * labels carry the new value.  Dead disks are skipped entirely, and
 * rf_ds_spared components are left alone.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for.
			   NOTE(review): if no column points at this spare,
			   scol keeps its previous value (-1 initially) —
			   presumably a match always exists; confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2636 
   2637 
/*
 * Write current label state to every optimal component and every in-use
 * spare.  `final' distinguishes a normal update from the
 * RF_FINAL_COMPONENT_UPDATE done at shutdown; in the final case the
 * clean bit is also set when parity is known good.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Map the spare back to the column it replaced.
			   NOTE(review): scol stays -1 if no column points
			   at this spare — presumably impossible here;
			   confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2715 
   2716 void
   2717 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2718 {
   2719 
   2720 	if (vp != NULL) {
   2721 		if (auto_configured == 1) {
   2722 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2723 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2724 			vput(vp);
   2725 
   2726 		} else {
   2727 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2728 		}
   2729 	}
   2730 }
   2731 
   2732 
   2733 void
   2734 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2735 {
   2736 	int r,c;
   2737 	struct vnode *vp;
   2738 	int acd;
   2739 
   2740 
   2741 	/* We take this opportunity to close the vnodes like we should.. */
   2742 
   2743 	for (c = 0; c < raidPtr->numCol; c++) {
   2744 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2745 		acd = raidPtr->Disks[c].auto_configured;
   2746 		rf_close_component(raidPtr, vp, acd);
   2747 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2748 		raidPtr->Disks[c].auto_configured = 0;
   2749 	}
   2750 
   2751 	for (r = 0; r < raidPtr->numSpare; r++) {
   2752 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2753 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2754 		rf_close_component(raidPtr, vp, acd);
   2755 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2756 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2757 	}
   2758 }
   2759 
   2760 
   2761 void
   2762 rf_ReconThread(struct rf_recon_req_internal *req)
   2763 {
   2764 	int     s;
   2765 	RF_Raid_t *raidPtr;
   2766 
   2767 	s = splbio();
   2768 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2769 	raidPtr->recon_in_progress = 1;
   2770 
   2771 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2772 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2773 
   2774 	RF_Free(req, sizeof(*req));
   2775 
   2776 	raidPtr->recon_in_progress = 0;
   2777 	splx(s);
   2778 
   2779 	/* That's all... */
   2780 	kthread_exit(0);	/* does not return */
   2781 }
   2782 
   2783 void
   2784 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2785 {
   2786 	int retcode;
   2787 	int s;
   2788 
   2789 	raidPtr->parity_rewrite_stripes_done = 0;
   2790 	raidPtr->parity_rewrite_in_progress = 1;
   2791 	s = splbio();
   2792 	retcode = rf_RewriteParity(raidPtr);
   2793 	splx(s);
   2794 	if (retcode) {
   2795 		printf("raid%d: Error re-writing parity (%d)!\n",
   2796 		    raidPtr->raidid, retcode);
   2797 	} else {
   2798 		/* set the clean bit!  If we shutdown correctly,
   2799 		   the clean bit on each component label will get
   2800 		   set */
   2801 		raidPtr->parity_good = RF_RAID_CLEAN;
   2802 	}
   2803 	raidPtr->parity_rewrite_in_progress = 0;
   2804 
   2805 	/* Anyone waiting for us to stop?  If so, inform them... */
   2806 	if (raidPtr->waitShutdown) {
   2807 		rf_lock_mutex2(raidPtr->rad_lock);
   2808 		cv_broadcast(&raidPtr->parity_rewrite_cv);
   2809 		rf_unlock_mutex2(raidPtr->rad_lock);
   2810 	}
   2811 
   2812 	/* That's all... */
   2813 	kthread_exit(0);	/* does not return */
   2814 }
   2815 
   2816 
   2817 void
   2818 rf_CopybackThread(RF_Raid_t *raidPtr)
   2819 {
   2820 	int s;
   2821 
   2822 	raidPtr->copyback_in_progress = 1;
   2823 	s = splbio();
   2824 	rf_CopybackReconstructedData(raidPtr);
   2825 	splx(s);
   2826 	raidPtr->copyback_in_progress = 0;
   2827 
   2828 	/* That's all... */
   2829 	kthread_exit(0);	/* does not return */
   2830 }
   2831 
   2832 
   2833 void
   2834 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2835 {
   2836 	int s;
   2837 	RF_Raid_t *raidPtr;
   2838 
   2839 	s = splbio();
   2840 	raidPtr = req->raidPtr;
   2841 	raidPtr->recon_in_progress = 1;
   2842 	rf_ReconstructInPlace(raidPtr, req->col);
   2843 	RF_Free(req, sizeof(*req));
   2844 	raidPtr->recon_in_progress = 0;
   2845 	splx(s);
   2846 
   2847 	/* That's all... */
   2848 	kthread_exit(0);	/* does not return */
   2849 }
   2850 
/*
 * Read the component label from (dev, vp) and, if it looks like a valid
 * RAIDframe component no larger than 'size' sectors, prepend a new
 * RF_AutoConfig_t (which takes over both the label allocation and the
 * open vnode) to ac_list.  Otherwise the label is freed and the vnode
 * is closed and released.
 *
 * NOTE(review): callers appear to pass vp unlocked (they VOP_UNLOCK
 * before calling) — confirm against the call sites if changing locking.
 *
 * Returns the (possibly new) head of ac_list.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: drop the label and give the vnode back */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2892 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 *
 * The device tree is walked twice: first considering only wedges (dk),
 * then everything else, so that a wedge covering a whole disk is
 * preferred over that disk's raw partition.  For each candidate
 * partition/wedge a vnode is opened and handed to rf_get_component(),
 * which keeps it open if a valid component label is found.
 *
 * Returns a linked list of RF_AutoConfig_t entries (NULL if none).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedges pass: accept only wedges whose
				   partition type is explicitly RAIDframe */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over the (unlocked,
				   still-open) vnode from here on */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3099 
   3100 
   3101 int
   3102 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3103 {
   3104 
   3105 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3106 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3107 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3108 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3109 	    clabel->row >=0 &&
   3110 	    clabel->column >= 0 &&
   3111 	    clabel->num_rows > 0 &&
   3112 	    clabel->num_columns > 0 &&
   3113 	    clabel->row < clabel->num_rows &&
   3114 	    clabel->column < clabel->num_columns &&
   3115 	    clabel->blockSize > 0 &&
   3116 	    /*
   3117 	     * numBlocksHi may contain garbage, but it is ok since
   3118 	     * the type is unsigned.  If it is really garbage,
   3119 	     * rf_fix_old_label_size() will fix it.
   3120 	     */
   3121 	    rf_component_label_numblocks(clabel) > 0) {
   3122 		/*
   3123 		 * label looks reasonable enough...
   3124 		 * let's make sure it has no old garbage.
   3125 		 */
   3126 		if (numsecs)
   3127 			rf_fix_old_label_size(clabel, numsecs);
   3128 		return(1);
   3129 	}
   3130 	return(0);
   3131 }
   3132 
   3133 
   3134 /*
   3135  * For reasons yet unknown, some old component labels have garbage in
   3136  * the newer numBlocksHi region, and this causes lossage.  Since those
   3137  * disks will also have numsecs set to less than 32 bits of sectors,
   3138  * we can determine when this corruption has occurred, and fix it.
   3139  *
   3140  * The exact same problem, with the same unknown reason, happens to
   3141  * the partitionSizeHi member as well.
   3142  */
   3143 static void
   3144 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3145 {
   3146 
   3147 	if (numsecs < ((uint64_t)1 << 32)) {
   3148 		if (clabel->numBlocksHi) {
   3149 			printf("WARNING: total sectors < 32 bits, yet "
   3150 			       "numBlocksHi set\n"
   3151 			       "WARNING: resetting numBlocksHi to zero.\n");
   3152 			clabel->numBlocksHi = 0;
   3153 		}
   3154 
   3155 		if (clabel->partitionSizeHi) {
   3156 			printf("WARNING: total sectors < 32 bits, yet "
   3157 			       "partitionSizeHi set\n"
   3158 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3159 			clabel->partitionSizeHi = 0;
   3160 		}
   3161 	}
   3162 }
   3163 
   3164 
#ifdef DEBUG
/*
 * Debug helper: dump all interesting fields of a component label to
 * the console.  Compiled only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* index by (root_partition & 3); see rf_auto_config_set() for
	   the meaning of the Force/Soft root values */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3198 
   3199 RF_ConfigSet_t *
   3200 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3201 {
   3202 	RF_AutoConfig_t *ac;
   3203 	RF_ConfigSet_t *config_sets;
   3204 	RF_ConfigSet_t *cset;
   3205 	RF_AutoConfig_t *ac_next;
   3206 
   3207 
   3208 	config_sets = NULL;
   3209 
   3210 	/* Go through the AutoConfig list, and figure out which components
   3211 	   belong to what sets.  */
   3212 	ac = ac_list;
   3213 	while(ac!=NULL) {
   3214 		/* we're going to putz with ac->next, so save it here
   3215 		   for use at the end of the loop */
   3216 		ac_next = ac->next;
   3217 
   3218 		if (config_sets == NULL) {
   3219 			/* will need at least this one... */
   3220 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3221 				       M_RAIDFRAME, M_WAITOK);
   3222 			/* this one is easy :) */
   3223 			config_sets->ac = ac;
   3224 			config_sets->next = NULL;
   3225 			config_sets->rootable = 0;
   3226 			ac->next = NULL;
   3227 		} else {
   3228 			/* which set does this component fit into? */
   3229 			cset = config_sets;
   3230 			while(cset!=NULL) {
   3231 				if (rf_does_it_fit(cset, ac)) {
   3232 					/* looks like it matches... */
   3233 					ac->next = cset->ac;
   3234 					cset->ac = ac;
   3235 					break;
   3236 				}
   3237 				cset = cset->next;
   3238 			}
   3239 			if (cset==NULL) {
   3240 				/* didn't find a match above... new set..*/
   3241 				cset = malloc(sizeof(RF_ConfigSet_t),
   3242 					       M_RAIDFRAME, M_WAITOK);
   3243 				cset->ac = ac;
   3244 				ac->next = NULL;
   3245 				cset->next = config_sets;
   3246 				cset->rootable = 0;
   3247 				config_sets = cset;
   3248 			}
   3249 		}
   3250 		ac = ac_next;
   3251 	}
   3252 
   3253 
   3254 	return(config_sets);
   3255 }
   3256 
   3257 static int
   3258 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3259 {
   3260 	RF_ComponentLabel_t *clabel1, *clabel2;
   3261 
   3262 	/* If this one matches the *first* one in the set, that's good
   3263 	   enough, since the other members of the set would have been
   3264 	   through here too... */
   3265 	/* note that we are not checking partitionSize here..
   3266 
   3267 	   Note that we are also not checking the mod_counters here.
   3268 	   If everything else matches except the mod_counter, that's
   3269 	   good enough for this test.  We will deal with the mod_counters
   3270 	   a little later in the autoconfiguration process.
   3271 
   3272 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3273 
   3274 	   The reason we don't check for this is that failed disks
   3275 	   will have lower modification counts.  If those disks are
   3276 	   not added to the set they used to belong to, then they will
   3277 	   form their own set, which may result in 2 different sets,
   3278 	   for example, competing to be configured at raid0, and
   3279 	   perhaps competing to be the root filesystem set.  If the
   3280 	   wrong ones get configured, or both attempt to become /,
   3281 	   weird behaviour and or serious lossage will occur.  Thus we
   3282 	   need to bring them into the fold here, and kick them out at
   3283 	   a later point.
   3284 
   3285 	*/
   3286 
   3287 	clabel1 = cset->ac->clabel;
   3288 	clabel2 = ac->clabel;
   3289 	if ((clabel1->version == clabel2->version) &&
   3290 	    (clabel1->serial_number == clabel2->serial_number) &&
   3291 	    (clabel1->num_rows == clabel2->num_rows) &&
   3292 	    (clabel1->num_columns == clabel2->num_columns) &&
   3293 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3294 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3295 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3296 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3297 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3298 	    (clabel1->blockSize == clabel2->blockSize) &&
   3299 	    rf_component_label_numblocks(clabel1) ==
   3300 	    rf_component_label_numblocks(clabel2) &&
   3301 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3302 	    (clabel1->root_partition == clabel2->root_partition) &&
   3303 	    (clabel1->last_unit == clabel2->last_unit) &&
   3304 	    (clabel1->config_order == clabel2->config_order)) {
   3305 		/* if it get's here, it almost *has* to be a match */
   3306 	} else {
   3307 		/* it's not consistent with somebody in the set..
   3308 		   punt */
   3309 		return(0);
   3310 	}
   3311 	/* all was fine.. it must fit... */
   3312 	return(1);
   3313 }
   3314 
/*
 * Decide whether configuration set 'cset' has enough live components to
 * be configured.  A component counts as live only if its mod_counter
 * equals the highest mod_counter in the set (stale components from
 * previously failed disks are ignored).
 *
 * RAID 1 is special-cased: components are treated as even/odd pairs,
 * and the set is only rejected if *both* members of a pair are missing.
 * For RAID 0 no missing components are tolerated; for RAID 4/5 at most
 * one.  Returns 1 if the set is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a component with the current
	   mod_counter; anything older is treated as missing */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3417 
   3418 void
   3419 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3420 			RF_Raid_t *raidPtr)
   3421 {
   3422 	RF_ComponentLabel_t *clabel;
   3423 	int i;
   3424 
   3425 	clabel = ac->clabel;
   3426 
   3427 	/* 1. Fill in the common stuff */
   3428 	config->numCol = clabel->num_columns;
   3429 	config->numSpare = 0; /* XXX should this be set here? */
   3430 	config->sectPerSU = clabel->sectPerSU;
   3431 	config->SUsPerPU = clabel->SUsPerPU;
   3432 	config->SUsPerRU = clabel->SUsPerRU;
   3433 	config->parityConfig = clabel->parityConfig;
   3434 	/* XXX... */
   3435 	strcpy(config->diskQueueType,"fifo");
   3436 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3437 	config->layoutSpecificSize = 0; /* XXX ?? */
   3438 
   3439 	while(ac!=NULL) {
   3440 		/* row/col values will be in range due to the checks
   3441 		   in reasonable_label() */
   3442 		strcpy(config->devnames[0][ac->clabel->column],
   3443 		       ac->devname);
   3444 		ac = ac->next;
   3445 	}
   3446 
   3447 	for(i=0;i<RF_MAXDBGV;i++) {
   3448 		config->debugVars[i][0] = 0;
   3449 	}
   3450 }
   3451 
   3452 int
   3453 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3454 {
   3455 	RF_ComponentLabel_t *clabel;
   3456 	int column;
   3457 	int sparecol;
   3458 
   3459 	raidPtr->autoconfigure = new_value;
   3460 
   3461 	for(column=0; column<raidPtr->numCol; column++) {
   3462 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3463 			clabel = raidget_component_label(raidPtr, column);
   3464 			clabel->autoconfigure = new_value;
   3465 			raidflush_component_label(raidPtr, column);
   3466 		}
   3467 	}
   3468 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3469 		sparecol = raidPtr->numCol + column;
   3470 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3471 			clabel = raidget_component_label(raidPtr, sparecol);
   3472 			clabel->autoconfigure = new_value;
   3473 			raidflush_component_label(raidPtr, sparecol);
   3474 		}
   3475 	}
   3476 	return(new_value);
   3477 }
   3478 
   3479 int
   3480 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3481 {
   3482 	RF_ComponentLabel_t *clabel;
   3483 	int column;
   3484 	int sparecol;
   3485 
   3486 	raidPtr->root_partition = new_value;
   3487 	for(column=0; column<raidPtr->numCol; column++) {
   3488 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3489 			clabel = raidget_component_label(raidPtr, column);
   3490 			clabel->root_partition = new_value;
   3491 			raidflush_component_label(raidPtr, column);
   3492 		}
   3493 	}
   3494 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3495 		sparecol = raidPtr->numCol + column;
   3496 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3497 			clabel = raidget_component_label(raidPtr, sparecol);
   3498 			clabel->root_partition = new_value;
   3499 			raidflush_component_label(raidPtr, sparecol);
   3500 		}
   3501 	}
   3502 	return(new_value);
   3503 }
   3504 
   3505 void
   3506 rf_release_all_vps(RF_ConfigSet_t *cset)
   3507 {
   3508 	RF_AutoConfig_t *ac;
   3509 
   3510 	ac = cset->ac;
   3511 	while(ac!=NULL) {
   3512 		/* Close the vp, and give it back */
   3513 		if (ac->vp) {
   3514 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3515 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3516 			vput(ac->vp);
   3517 			ac->vp = NULL;
   3518 		}
   3519 		ac = ac->next;
   3520 	}
   3521 }
   3522 
   3523 
   3524 void
   3525 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3526 {
   3527 	RF_AutoConfig_t *ac;
   3528 	RF_AutoConfig_t *next_ac;
   3529 
   3530 	ac = cset->ac;
   3531 	while(ac!=NULL) {
   3532 		next_ac = ac->next;
   3533 		/* nuke the label */
   3534 		free(ac->clabel, M_RAIDFRAME);
   3535 		/* cleanup the config structure */
   3536 		free(ac, M_RAIDFRAME);
   3537 		/* "next.." */
   3538 		ac = next_ac;
   3539 	}
   3540 	/* and, finally, nuke the config set */
   3541 	free(cset, M_RAIDFRAME);
   3542 }
   3543 
   3544 
/*
 * Initialize a component label from the current in-core state of the
 * RAID set.  Per-component fields (row/column, partitionSize, ...) are
 * NOT set here; callers fill those in afterwards.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3577 
/*
 * Autoconfigure one configuration set: pick a raid unit number
 * (preferring the unit it was last configured on, falling back to the
 * next free one), build an RF_Config_t, and configure the set.  On
 * success the set's root eligibility is recorded in cset->rootable and
 * the softc is returned; on failure the softc is released and NULL is
 * returned.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* walk forward from last_unit to the first unit not already
	   configured (raidget(..., false) does not allocate) */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);	/* allocate a fresh unit */
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3649 
   3650 void
   3651 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3652 	     size_t xmin, size_t xmax)
   3653 {
   3654 	int error;
   3655 
   3656 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3657 	pool_sethiwat(p, xmax);
   3658 	if ((error = pool_prime(p, xmin)) != 0)
   3659 		panic("%s: failed to prime pool: %d", __func__, error);
   3660 	pool_setlowat(p, xmin);
   3661 }
   3662 
   3663 /*
   3664  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3665  * to see if there is IO pending and if that IO could possibly be done
   3666  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3667  * otherwise.
   3668  *
   3669  */
   3670 int
   3671 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3672 {
   3673 	struct raid_softc *rs;
   3674 	struct dk_softc *dksc;
   3675 
   3676 	rs = raidPtr->softc;
   3677 	dksc = &rs->sc_dksc;
   3678 
   3679 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3680 		return 1;
   3681 
   3682 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3683 		/* there is work to do */
   3684 		return 0;
   3685 	}
   3686 	/* default is nothing to do */
   3687 	return 1;
   3688 }
   3689 
   3690 int
   3691 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3692 {
   3693 	uint64_t numsecs;
   3694 	unsigned secsize;
   3695 	int error;
   3696 
   3697 	error = getdisksize(vp, &numsecs, &secsize);
   3698 	if (error == 0) {
   3699 		diskPtr->blockSize = secsize;
   3700 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3701 		diskPtr->partitionSize = numsecs;
   3702 		return 0;
   3703 	}
   3704 	return error;
   3705 }
   3706 
/* Autoconf match: the raid pseudo-device always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3712 
/* Autoconf attach: nothing to do; real setup happens at configure time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3717 
   3718 
/*
 * Autoconf detach: take the softc lock, tear down the unit, and free
 * the softc.  Returns 0 on success, ENXIO if there is no softc, or the
 * error from locking/teardown.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}
   3745 
/*
 * Publish a synthetic disk geometry for the RAID unit based on the
 * array layout: one "track" per stripe of data sectors, and 4 tracks
 * per column (fabricated values — there is no physical geometry).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3761 
   3762 /*
   3763  * Get cache info for all the components (including spares).
   3764  * Returns intersection of all the cache flags of all disks, or first
   3765  * error if any encountered.
   3766  * XXXfua feature flags can change as spares are added - lock down somehow
   3767  */
   3768 static int
   3769 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3770 {
   3771 	int c;
   3772 	int error;
   3773 	int dkwhole = 0, dkpart;
   3774 
   3775 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3776 		/*
   3777 		 * Check any non-dead disk, even when currently being
   3778 		 * reconstructed.
   3779 		 */
   3780 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   3781 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3782 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3783 			if (error) {
   3784 				if (error != ENODEV) {
   3785 					printf("raid%d: get cache for component %s failed\n",
   3786 					    raidPtr->raidid,
   3787 					    raidPtr->Disks[c].devname);
   3788 				}
   3789 
   3790 				return error;
   3791 			}
   3792 
   3793 			if (c == 0)
   3794 				dkwhole = dkpart;
   3795 			else
   3796 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3797 		}
   3798 	}
   3799 
   3800 	*data = dkwhole;
   3801 
   3802 	return 0;
   3803 }
   3804 
   3805 /*
   3806  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3807  * We end up returning whatever error was returned by the first cache flush
   3808  * that fails.
   3809  */
   3810 
   3811 int
   3812 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3813 {
   3814 	int c, sparecol;
   3815 	int e,error;
   3816 	int force = 1;
   3817 
   3818 	error = 0;
   3819 	for (c = 0; c < raidPtr->numCol; c++) {
   3820 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3821 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3822 					  &force, FWRITE, NOCRED);
   3823 			if (e) {
   3824 				if (e != ENODEV)
   3825 					printf("raid%d: cache flush to component %s failed.\n",
   3826 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3827 				if (error == 0) {
   3828 					error = e;
   3829 				}
   3830 			}
   3831 		}
   3832 	}
   3833 
   3834 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3835 		sparecol = raidPtr->numCol + c;
   3836 		/* Need to ensure that the reconstruct actually completed! */
   3837 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3838 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3839 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3840 			if (e) {
   3841 				if (e != ENODEV)
   3842 					printf("raid%d: cache flush to component %s failed.\n",
   3843 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3844 				if (error == 0) {
   3845 					error = e;
   3846 				}
   3847 			}
   3848 		}
   3849 	}
   3850 	return error;
   3851 }
   3852 
   3853 /* Fill in info with the current status */
   3854 void
   3855 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3856 {
   3857 
   3858 	memset(info, 0, sizeof(*info));
   3859 
   3860 	if (raidPtr->status != rf_rs_reconstructing) {
   3861 		info->total = 100;
   3862 		info->completed = 100;
   3863 	} else {
   3864 		info->total = raidPtr->reconControl->numRUsTotal;
   3865 		info->completed = raidPtr->reconControl->numRUsComplete;
   3866 	}
   3867 	info->remaining = info->total - info->completed;
   3868 }
   3869 
   3870 /* Fill in info with the current status */
   3871 void
   3872 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3873 {
   3874 
   3875 	memset(info, 0, sizeof(*info));
   3876 
   3877 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3878 		info->total = raidPtr->Layout.numStripe;
   3879 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3880 	} else {
   3881 		info->completed = 100;
   3882 		info->total = 100;
   3883 	}
   3884 	info->remaining = info->total - info->completed;
   3885 }
   3886 
   3887 /* Fill in info with the current status */
   3888 void
   3889 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3890 {
   3891 
   3892 	memset(info, 0, sizeof(*info));
   3893 
   3894 	if (raidPtr->copyback_in_progress == 1) {
   3895 		info->total = raidPtr->Layout.numStripe;
   3896 		info->completed = raidPtr->copyback_stripes_done;
   3897 		info->remaining = info->total - info->completed;
   3898 	} else {
   3899 		info->remaining = 0;
   3900 		info->completed = 100;
   3901 		info->total = 100;
   3902 	}
   3903 }
   3904 
   3905 /* Fill in config with the current info */
   3906 int
   3907 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3908 {
   3909 	int	d, i, j;
   3910 
   3911 	if (!raidPtr->valid)
   3912 		return (ENODEV);
   3913 	config->cols = raidPtr->numCol;
   3914 	config->ndevs = raidPtr->numCol;
   3915 	if (config->ndevs >= RF_MAX_DISKS)
   3916 		return (ENOMEM);
   3917 	config->nspares = raidPtr->numSpare;
   3918 	if (config->nspares >= RF_MAX_DISKS)
   3919 		return (ENOMEM);
   3920 	config->maxqdepth = raidPtr->maxQueueDepth;
   3921 	d = 0;
   3922 	for (j = 0; j < config->cols; j++) {
   3923 		config->devs[d] = raidPtr->Disks[j];
   3924 		d++;
   3925 	}
   3926 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3927 		config->spares[i] = raidPtr->Disks[j];
   3928 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3929 			/* XXX: raidctl(8) expects to see this as a used spare */
   3930 			config->spares[i].status = rf_ds_used_spare;
   3931 		}
   3932 	}
   3933 	return 0;
   3934 }
   3935 
   3936 int
   3937 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3938 {
   3939 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3940 	RF_ComponentLabel_t *raid_clabel;
   3941 	int column = clabel->column;
   3942 
   3943 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3944 		return EINVAL;
   3945 	raid_clabel = raidget_component_label(raidPtr, column);
   3946 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3947 
   3948 	return 0;
   3949 }
   3950 
   3951 /*
   3952  * Module interface
   3953  */
   3954 
   3955 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3956 
   3957 #ifdef _MODULE
   3958 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3959 #endif
   3960 
   3961 static int raid_modcmd(modcmd_t, void *);
   3962 static int raid_modcmd_init(void);
   3963 static int raid_modcmd_fini(void);
   3964 
   3965 static int
   3966 raid_modcmd(modcmd_t cmd, void *data)
   3967 {
   3968 	int error;
   3969 
   3970 	error = 0;
   3971 	switch (cmd) {
   3972 	case MODULE_CMD_INIT:
   3973 		error = raid_modcmd_init();
   3974 		break;
   3975 	case MODULE_CMD_FINI:
   3976 		error = raid_modcmd_fini();
   3977 		break;
   3978 	default:
   3979 		error = ENOTTY;
   3980 		break;
   3981 	}
   3982 	return error;
   3983 }
   3984 
/*
 * Initialize the raid module: set up raid_lock, attach the block/char
 * device switch and the autoconf cfdriver/cfattach glue, boot the
 * RAIDframe core, and register a finalizer that auto-configures RAID
 * sets once device discovery is complete.  On any attach failure the
 * pieces installed so far are rolled back and the error is returned.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 asks devsw_attach to allocate the majors dynamically. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be present. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attached above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver (module builds) and devsw. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * error is necessarily 0 here (config_cfattach_attach succeeded),
	 * so this test always takes the true branch.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: only automatic configuration is lost. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   4055 
/*
 * Tear down the raid module.  Refuses with EBUSY while any raid unit
 * still exists.  Detach proceeds in reverse order of initialization
 * (cfattach, cfdriver, devsw); if a later step fails, the steps
 * already undone are re-attached so the module remains usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back: restore cfdriver and cfattach detached above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core and its per-module resources. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4105