      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.250.4.13 2012/10/24 03:03:53 riz Exp $	*/
      2 /*-
      3  * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This code is derived from software contributed to The NetBSD Foundation
      7  * by Greg Oster; Jason R. Thorpe.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     28  * POSSIBILITY OF SUCH DAMAGE.
     29  */
     30 
     31 /*
     32  * Copyright (c) 1990, 1993
     33  *      The Regents of the University of California.  All rights reserved.
     34  *
     35  * This code is derived from software contributed to Berkeley by
     36  * the Systems Programming Group of the University of Utah Computer
     37  * Science Department.
     38  *
     39  * Redistribution and use in source and binary forms, with or without
     40  * modification, are permitted provided that the following conditions
     41  * are met:
     42  * 1. Redistributions of source code must retain the above copyright
     43  *    notice, this list of conditions and the following disclaimer.
     44  * 2. Redistributions in binary form must reproduce the above copyright
     45  *    notice, this list of conditions and the following disclaimer in the
     46  *    documentation and/or other materials provided with the distribution.
     47  * 3. Neither the name of the University nor the names of its contributors
     48  *    may be used to endorse or promote products derived from this software
     49  *    without specific prior written permission.
     50  *
     51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     61  * SUCH DAMAGE.
     62  *
     63  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     64  *
     65  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     66  */
     67 
     68 /*
     69  * Copyright (c) 1988 University of Utah.
     70  *
     71  * This code is derived from software contributed to Berkeley by
     72  * the Systems Programming Group of the University of Utah Computer
     73  * Science Department.
     74  *
     75  * Redistribution and use in source and binary forms, with or without
     76  * modification, are permitted provided that the following conditions
     77  * are met:
     78  * 1. Redistributions of source code must retain the above copyright
     79  *    notice, this list of conditions and the following disclaimer.
     80  * 2. Redistributions in binary form must reproduce the above copyright
     81  *    notice, this list of conditions and the following disclaimer in the
     82  *    documentation and/or other materials provided with the distribution.
     83  * 3. All advertising materials mentioning features or use of this software
     84  *    must display the following acknowledgement:
     85  *      This product includes software developed by the University of
     86  *      California, Berkeley and its contributors.
     87  * 4. Neither the name of the University nor the names of its contributors
     88  *    may be used to endorse or promote products derived from this software
     89  *    without specific prior written permission.
     90  *
     91  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     92  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     93  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     94  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     95  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     96  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     97  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     98  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     99  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    100  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    101  * SUCH DAMAGE.
    102  *
    103  * from: Utah $Hdr: cd.c 1.6 90/11/28$
    104  *
    105  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
    106  */
    107 
    108 /*
    109  * Copyright (c) 1995 Carnegie-Mellon University.
    110  * All rights reserved.
    111  *
    112  * Authors: Mark Holland, Jim Zelenka
    113  *
    114  * Permission to use, copy, modify and distribute this software and
    115  * its documentation is hereby granted, provided that both the copyright
    116  * notice and this permission notice appear in all copies of the
    117  * software, derivative works or modified versions, and any portions
    118  * thereof, and that both notices appear in supporting documentation.
    119  *
    120  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
    121  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
    122  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
    123  *
    124  * Carnegie Mellon requests users of this software to return to
    125  *
    126  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
    127  *  School of Computer Science
    128  *  Carnegie Mellon University
    129  *  Pittsburgh PA 15213-3890
    130  *
    131  * any improvements or extensions that they make and grant Carnegie the
    132  * rights to redistribute these changes.
    133  */
    134 
    135 /***********************************************************
    136  *
    137  * rf_kintf.c -- the kernel interface routines for RAIDframe
    138  *
    139  ***********************************************************/
    140 
    141 #include <sys/cdefs.h>
    142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.250.4.13 2012/10/24 03:03:53 riz Exp $");
    143 
    144 #include <sys/param.h>
    145 #include <sys/errno.h>
    146 #include <sys/pool.h>
    147 #include <sys/proc.h>
    148 #include <sys/queue.h>
    149 #include <sys/disk.h>
    150 #include <sys/device.h>
    151 #include <sys/stat.h>
    152 #include <sys/ioctl.h>
    153 #include <sys/fcntl.h>
    154 #include <sys/systm.h>
    155 #include <sys/vnode.h>
    156 #include <sys/disklabel.h>
    157 #include <sys/conf.h>
    158 #include <sys/buf.h>
    159 #include <sys/bufq.h>
    160 #include <sys/user.h>
    161 #include <sys/reboot.h>
    162 #include <sys/kauth.h>
    163 
    164 #include <prop/proplib.h>
    165 
    166 #include <dev/raidframe/raidframevar.h>
    167 #include <dev/raidframe/raidframeio.h>
    168 #include <dev/raidframe/rf_paritymap.h>
    169 #include "raid.h"
    170 #include "opt_raid_autoconfig.h"
    171 #include "rf_raid.h"
    172 #include "rf_copyback.h"
    173 #include "rf_dag.h"
    174 #include "rf_dagflags.h"
    175 #include "rf_desc.h"
    176 #include "rf_diskqueue.h"
    177 #include "rf_etimer.h"
    178 #include "rf_general.h"
    179 #include "rf_kintf.h"
    180 #include "rf_options.h"
    181 #include "rf_driver.h"
    182 #include "rf_parityscan.h"
    183 #include "rf_threadstuff.h"
    184 
    185 #ifdef DEBUG
    186 int     rf_kdebug_level = 0;
    187 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    188 #else				/* DEBUG */
    189 #define db1_printf(a) { }
    190 #endif				/* DEBUG */
    191 
    192 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    193 
    194 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    195 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
    196 
    197 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    198 						 * spare table */
    199 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    200 						 * installation process */
    201 #endif
    202 
    203 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    204 
    205 /* prototypes */
    206 static void KernelWakeupFunc(struct buf *);
    207 static void InitBP(struct buf *, struct vnode *, unsigned,
    208     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    209     void *, int, struct proc *);
    210 static void raidinit(RF_Raid_t *);
    211 
    212 void raidattach(int);
    213 static int raid_match(struct device *, struct cfdata *, void *);
    214 static void raid_attach(struct device *, struct device *, void *);
    215 static int raid_detach(struct device *, int);
    216 
    217 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    218     daddr_t, daddr_t);
    219 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    220     daddr_t, daddr_t, int);
    221 
    222 static int raidwrite_component_label(unsigned,
    223     dev_t, struct vnode *, RF_ComponentLabel_t *);
    224 static int raidread_component_label(unsigned,
    225     dev_t, struct vnode *, RF_ComponentLabel_t *);
    226 
    227 
    228 dev_type_open(raidopen);
    229 dev_type_close(raidclose);
    230 dev_type_read(raidread);
    231 dev_type_write(raidwrite);
    232 dev_type_ioctl(raidioctl);
    233 dev_type_strategy(raidstrategy);
    234 dev_type_dump(raiddump);
    235 dev_type_size(raidsize);
    236 
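         /* Block- and character-device switch entries for the raid pseudo-device. */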
    237 const struct bdevsw raid_bdevsw = {
    238 	raidopen, raidclose, raidstrategy, raidioctl,
    239 	raiddump, raidsize, D_DISK
    240 };
    241 
    242 const struct cdevsw raid_cdevsw = {
    243 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    244 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    245 };
    246 
    247 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    248 
    249 /* XXX Not sure if the following should be replacing the raidPtrs above,
    250    or if it should be used in conjunction with that...
    251 */
    252 
    253 struct raid_softc {
    254 	struct device *sc_dev;
    255 	int     sc_flags;	/* flags */
    256 	int     sc_cflags;	/* configuration flags */
    257 	uint64_t sc_size;	/* size of the raid device */
    258 	char    sc_xname[20];	/* XXX external name */
    259 	struct disk sc_dkdev;	/* generic disk device info */
    260 	struct bufq_state *buf_queue;	/* used for the device queue */
    261 };
    262 /* sc_flags */
    263 #define RAIDF_INITED	0x01	/* unit has been initialized */
    264 #define RAIDF_WLABEL	0x02	/* label area is writable */
    265 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    266 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    267 #define RAIDF_LOCKED	0x80	/* unit is locked */
    268 
    269 #define	raidunit(x)	DISKUNIT(x)
    270 int numraid = 0;
    271 
    272 extern struct cfdriver raid_cd;
    273 CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
    274     raid_match, raid_attach, raid_detach, NULL);
    275 
    276 /*
    277  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    278  * Be aware that large numbers can allow the driver to consume a lot of
    279  * kernel memory, especially on writes, and in degraded mode reads.
    280  *
    281  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    282  * a single 64K write will typically require 64K for the old data,
    283  * 64K for the old parity, and 64K for the new parity, for a total
    284  * of 192K (if the parity buffer is not re-used immediately).
285  * Even if it is used immediately, that's still 128K, which when multiplied
    286  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    287  *
    288  * Now in degraded mode, for example, a 64K read on the above setup may
    289  * require data reconstruction, which will require *all* of the 4 remaining
    290  * disks to participate -- 4 * 32K/disk == 128K again.
    291  */
    292 
    293 #ifndef RAIDOUTSTANDING
    294 #define RAIDOUTSTANDING   6
    295 #endif
    296 
    297 #define RAIDLABELDEV(dev)	\
    298 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    299 
    300 /* declared here, and made public, for the benefit of KVM stuff.. */
    301 struct raid_softc *raid_softc;
    302 
    303 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    304 				     struct disklabel *);
    305 static void raidgetdisklabel(dev_t);
    306 static void raidmakedisklabel(struct raid_softc *);
    307 
    308 static int raidlock(struct raid_softc *);
    309 static void raidunlock(struct raid_softc *);
    310 
    311 static void rf_markalldirty(RF_Raid_t *);
    312 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    313 
    314 void rf_ReconThread(struct rf_recon_req *);
    315 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    316 void rf_CopybackThread(RF_Raid_t *raidPtr);
    317 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    318 int rf_autoconfig(struct device *self);
    319 void rf_buildroothack(RF_ConfigSet_t *);
    320 
    321 RF_AutoConfig_t *rf_find_raid_components(void);
    322 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    323 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    324 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    325 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    326 int rf_set_autoconfig(RF_Raid_t *, int);
    327 int rf_set_rootpartition(RF_Raid_t *, int);
    328 void rf_release_all_vps(RF_ConfigSet_t *);
    329 void rf_cleanup_config_set(RF_ConfigSet_t *);
    330 int rf_have_enough_components(RF_ConfigSet_t *);
    331 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    332 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    333 
    334 static int raidautoconfig = 0; /* Debugging, mostly.  Set to 0 to not
    335 				  allow autoconfig to take place.
    336 				  Note that this is overridden by having
    337 				  RAID_AUTOCONFIG as an option in the
    338 				  kernel config file.  */
    339 
    340 struct RF_Pools_s rf_pools;
    341 
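         /*
          * raidattach() -- pseudo-device attach routine, called at boot with the
          * number of RAID units to support.  Allocate the raidPtrs and raid_softc
          * arrays, boot the RAIDframe core, attach the cfattach, and register
          * rf_autoconfig() as a config finalizer.
          */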
    342 void
    343 raidattach(int num)
    344 {
    345 	int raidID;
    346 	int i, rc;
    347 
    348 	aprint_debug("raidattach: Asked for %d units\n", num);
    349 
    350 	if (num <= 0) {
    351 #ifdef DIAGNOSTIC
    352 		panic("raidattach: count <= 0");
    353 #endif
    354 		return;
    355 	}
    356 	/* This is where all the initialization stuff gets done. */
    357 
    358 	numraid = num;
    359 
    360 	/* Make some space for requested number of units... */
    361 
    362 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    363 	if (raidPtrs == NULL) {
    364 		panic("raidPtrs is NULL!!");
    365 	}
    366 
    367 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    368 	rf_mutex_init(&rf_sparet_wait_mutex);
    369 
    370 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    371 #endif
    372 
    373 	for (i = 0; i < num; i++)
    374 		raidPtrs[i] = NULL;
    375 	rc = rf_BootRaidframe();
    376 	if (rc == 0)
    377 		aprint_normal("Kernelized RAIDframe activated\n");
    378 	else
    379 		panic("Serious error booting RAID!!");
    380 
    381 	/* put together some datastructures like the CCD device does.. This
    382 	 * lets us lock the device and what-not when it gets opened. */
    383 
    384 	raid_softc = (struct raid_softc *)
    385 		malloc(num * sizeof(struct raid_softc),
    386 		       M_RAIDFRAME, M_NOWAIT);
    387 	if (raid_softc == NULL) {
    388 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    389 		return;
    390 	}
    391 
    392 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    393 
    394 	for (raidID = 0; raidID < num; raidID++) {
    395 		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
    396 
    397 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    398 			  (RF_Raid_t *));
    399 		if (raidPtrs[raidID] == NULL) {
    400 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    401 			numraid = raidID;
    402 			return;
    403 		}
    404 	}
    405 
    406 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    407 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    408 	}
    409 
    410 #ifdef RAID_AUTOCONFIG
    411 	raidautoconfig = 1;
    412 #endif
    413 
    414 	/*
    415 	 * Register a finalizer which will be used to auto-config RAID
    416 	 * sets once all real hardware devices have been found.
    417 	 */
    418 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    419 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    420 }
    421 
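         /*
          * rf_autoconfig() -- config finalizer, run once after all real hardware
          * has attached.  Locate RAID components on the system, sort them into
          * configuration sets, and hand the sets to rf_buildroothack() for
          * configuration.  Returns 0 if autoconfiguration is disabled (or has
          * already run), 1 otherwise.
          */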
    422 int
    423 rf_autoconfig(struct device *self)
    424 {
    425 	RF_AutoConfig_t *ac_list;
    426 	RF_ConfigSet_t *config_sets;
    427 
    428 	if (raidautoconfig == 0)
    429 		return (0);
    430 
    431 	/* XXX This code can only be run once. */
    432 	raidautoconfig = 0;
    433 
    434 	/* 1. locate all RAID components on the system */
    435 	aprint_debug("Searching for RAID components...\n");
    436 	ac_list = rf_find_raid_components();
    437 
    438 	/* 2. Sort them into their respective sets. */
    439 	config_sets = rf_create_auto_sets(ac_list);
    440 
    441 	/*
442 	 * 3. Evaluate each set and configure the valid ones.
    443 	 * This gets done in rf_buildroothack().
    444 	 */
    445 	rf_buildroothack(config_sets);
    446 
    447 	return 1;
    448 }
    449 
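         /*
          * rf_buildroothack() -- walk the list of configuration sets,
          * auto-configure the ones that have enough components and are marked
          * for autoconfiguration, and then try to determine whether one of the
          * newly configured sets should supply the root device (booted_device).
          */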
    450 void
    451 rf_buildroothack(RF_ConfigSet_t *config_sets)
    452 {
    453 	RF_ConfigSet_t *cset;
    454 	RF_ConfigSet_t *next_cset;
    455 	int retcode;
    456 	int raidID;
    457 	int rootID;
    458 	int col;
    459 	int num_root;
    460 	char *devname;
    461 
    462 	rootID = 0;
    463 	num_root = 0;
    464 	cset = config_sets;
    465 	while(cset != NULL ) {
    466 		next_cset = cset->next;
    467 		if (rf_have_enough_components(cset) &&
    468 		    cset->ac->clabel->autoconfigure==1) {
    469 			retcode = rf_auto_config_set(cset,&raidID);
    470 			if (!retcode) {
    471 				aprint_debug("raid%d: configured ok\n", raidID);
    472 				if (cset->rootable) {
    473 					rootID = raidID;
    474 					num_root++;
    475 				}
    476 			} else {
    477 				/* The autoconfig didn't work :( */
    478 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    479 				rf_release_all_vps(cset);
    480 			}
    481 		} else {
    482 			/* we're not autoconfiguring this set...
    483 			   release the associated resources */
    484 			rf_release_all_vps(cset);
    485 		}
    486 		/* cleanup */
    487 		rf_cleanup_config_set(cset);
    488 		cset = next_cset;
    489 	}
    490 
    491 	/* if the user has specified what the root device should be
    492 	   then we don't touch booted_device or boothowto... */
    493 
    494 	if (rootspec != NULL)
    495 		return;
    496 
    497 	/* we found something bootable... */
    498 
    499 	if (num_root == 1) {
    500 		booted_device = raid_softc[rootID].sc_dev;
    501 	} else if (num_root > 1) {
    502 
    503 		/*
    504 		 * Maybe the MD code can help. If it cannot, then
    505 		 * setroot() will discover that we have no
    506 		 * booted_device and will ask the user if nothing was
    507 		 * hardwired in the kernel config file
    508 		 */
    509 
    510 		if (booted_device == NULL)
    511 			cpu_rootconf();
    512 		if (booted_device == NULL)
    513 			return;
    514 
    515 		num_root = 0;
    516 		for (raidID = 0; raidID < numraid; raidID++) {
    517 			if (raidPtrs[raidID]->valid == 0)
    518 				continue;
    519 
    520 			if (raidPtrs[raidID]->root_partition == 0)
    521 				continue;
    522 
    523 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    524 				devname = raidPtrs[raidID]->Disks[col].devname;
    525 				devname += sizeof("/dev/") - 1;
    526 				if (strncmp(devname, device_xname(booted_device),
    527 					    strlen(device_xname(booted_device))) != 0)
    528 					continue;
    529 				aprint_debug("raid%d includes boot device %s\n",
    530 				       raidID, devname);
    531 				num_root++;
    532 				rootID = raidID;
    533 			}
    534 		}
    535 
    536 		if (num_root == 1) {
    537 			booted_device = raid_softc[rootID].sc_dev;
    538 		} else {
    539 			/* we can't guess.. require the user to answer... */
    540 			boothowto |= RB_ASKNAME;
    541 		}
    542 	}
    543 }
    544 
    545 
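         /*
          * raidsize() -- return the size, in DEV_BSIZE sectors, of the indicated
          * partition if it is an FS_SWAP partition, or -1 if the unit is not
          * configured or the partition is of another type.
          */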
    546 int
    547 raidsize(dev_t dev)
    548 {
    549 	struct raid_softc *rs;
    550 	struct disklabel *lp;
    551 	int     part, unit, omask, size;
    552 
    553 	unit = raidunit(dev);
    554 	if (unit >= numraid)
    555 		return (-1);
    556 	rs = &raid_softc[unit];
    557 
    558 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    559 		return (-1);
    560 
    561 	part = DISKPART(dev);
    562 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    563 	lp = rs->sc_dkdev.dk_label;
    564 
    565 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    566 		return (-1);
    567 
    568 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    569 		size = -1;
    570 	else
    571 		size = lp->d_partitions[part].p_size *
    572 		    (lp->d_secsize / DEV_BSIZE);
    573 
    574 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    575 		return (-1);
    576 
    577 	return (size);
    578 
    579 }
    580 
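         /*
          * raiddump() -- crash-dump entry point.  Dumps are only supported to
          * RAID 1 sets: pick a live component (the master first, then a spare
          * standing in for the master, then the slave, then a spare standing in
          * for the slave) and pass the dump through to that component's dump
          * routine.
          */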
    581 int
    582 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    583 {
    584 	int     unit = raidunit(dev);
    585 	struct raid_softc *rs;
    586 	const struct bdevsw *bdev;
    587 	struct disklabel *lp;
    588 	RF_Raid_t *raidPtr;
    589 	daddr_t offset;
    590 	int     part, c, sparecol, j, scol, dumpto;
    591 	int     error = 0;
    592 
    593 	if (unit >= numraid)
    594 		return (ENXIO);
    595 
    596 	rs = &raid_softc[unit];
    597 	raidPtr = raidPtrs[unit];
    598 
    599 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    600 		return ENXIO;
    601 
    602 	/* we only support dumping to RAID 1 sets */
    603 	if (raidPtr->Layout.numDataCol != 1 ||
    604 	    raidPtr->Layout.numParityCol != 1)
    605 		return EINVAL;
    606 
    607 
    608 	if ((error = raidlock(rs)) != 0)
    609 		return error;
    610 
    611 	if (size % DEV_BSIZE != 0) {
    612 		error = EINVAL;
    613 		goto out;
    614 	}
    615 
    616 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    617 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    618 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    619 		    size / DEV_BSIZE, rs->sc_size);
    620 		error = EINVAL;
    621 		goto out;
    622 	}
    623 
    624 	part = DISKPART(dev);
    625 	lp = rs->sc_dkdev.dk_label;
    626 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    627 
    628 	/* figure out what device is alive.. */
    629 
    630 	/*
    631 	   Look for a component to dump to.  The preference for the
    632 	   component to dump to is as follows:
    633 	   1) the master
    634 	   2) a used_spare of the master
    635 	   3) the slave
    636 	   4) a used_spare of the slave
    637 	*/
    638 
    639 	dumpto = -1;
    640 	for (c = 0; c < raidPtr->numCol; c++) {
    641 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    642 			/* this might be the one */
    643 			dumpto = c;
    644 			break;
    645 		}
    646 	}
    647 
    648 	/*
    649 	   At this point we have possibly selected a live master or a
    650 	   live slave.  We now check to see if there is a spared
    651 	   master (or a spared slave), if we didn't find a live master
    652 	   or a live slave.
    653 	*/
    654 
    655 	for (c = 0; c < raidPtr->numSpare; c++) {
    656 		sparecol = raidPtr->numCol + c;
    657 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    658 			/* How about this one? */
    659 			scol = -1;
    660 			for(j=0;j<raidPtr->numCol;j++) {
    661 				if (raidPtr->Disks[j].spareCol == sparecol) {
    662 					scol = j;
    663 					break;
    664 				}
    665 			}
    666 			if (scol == 0) {
    667 				/*
    668 				   We must have found a spared master!
    669 				   We'll take that over anything else
    670 				   found so far.  (We couldn't have
    671 				   found a real master before, since
    672 				   this is a used spare, and it's
    673 				   saying that it's replacing the
    674 				   master.)  On reboot (with
    675 				   autoconfiguration turned on)
    676 				   sparecol will become the 1st
    677 				   component (component0) of this set.
    678 				*/
    679 				dumpto = sparecol;
    680 				break;
    681 			} else if (scol != -1) {
    682 				/*
    683 				   Must be a spared slave.  We'll dump
684 				   to that if we haven't found anything
    685 				   else so far.
    686 				*/
    687 				if (dumpto == -1)
    688 					dumpto = sparecol;
    689 			}
    690 		}
    691 	}
    692 
    693 	if (dumpto == -1) {
    694 		/* we couldn't find any live components to dump to!?!?
    695 		 */
    696 		error = EINVAL;
    697 		goto out;
    698 	}
    699 
    700 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    701 
    702 	/*
    703 	   Note that blkno is relative to this particular partition.
    704 	   By adding the offset of this partition in the RAID
    705 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    706 	   value that is relative to the partition used for the
    707 	   underlying component.
    708 	*/
    709 
    710 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    711 				blkno + offset, va, size);
    712 
    713 out:
    714 	raidunlock(rs);
    715 
    716 	return error;
    717 }
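         /*
          * raidopen() -- open the raid device.  Take the unit lock, read the
          * disklabel on first open, refuse non-raw opens when wedges exist or
          * the partition does not exist, and mark all components dirty on the
          * first open of a configured set.
          */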
    718 /* ARGSUSED */
    719 int
    720 raidopen(dev_t dev, int flags, int fmt,
    721     struct lwp *l)
    722 {
    723 	int     unit = raidunit(dev);
    724 	struct raid_softc *rs;
    725 	struct disklabel *lp;
    726 	int     part, pmask;
    727 	int     error = 0;
    728 
    729 	if (unit >= numraid)
    730 		return (ENXIO);
    731 	rs = &raid_softc[unit];
    732 
    733 	if ((error = raidlock(rs)) != 0)
    734 		return (error);
    735 	lp = rs->sc_dkdev.dk_label;
    736 
    737 	part = DISKPART(dev);
    738 
    739 	/*
    740 	 * If there are wedges, and this is not RAW_PART, then we
    741 	 * need to fail.
    742 	 */
    743 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    744 		error = EBUSY;
    745 		goto bad;
    746 	}
    747 	pmask = (1 << part);
    748 
    749 	if ((rs->sc_flags & RAIDF_INITED) &&
    750 	    (rs->sc_dkdev.dk_openmask == 0))
    751 		raidgetdisklabel(dev);
    752 
    753 	/* make sure that this partition exists */
    754 
    755 	if (part != RAW_PART) {
    756 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    757 		    ((part >= lp->d_npartitions) ||
    758 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    759 			error = ENXIO;
    760 			goto bad;
    761 		}
    762 	}
    763 	/* Prevent this unit from being unconfigured while open. */
    764 	switch (fmt) {
    765 	case S_IFCHR:
    766 		rs->sc_dkdev.dk_copenmask |= pmask;
    767 		break;
    768 
    769 	case S_IFBLK:
    770 		rs->sc_dkdev.dk_bopenmask |= pmask;
    771 		break;
    772 	}
    773 
    774 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    775 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    776 		/* First one... mark things as dirty... Note that we *MUST*
    777 		 have done a configure before this.  I DO NOT WANT TO BE
    778 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    779 		 THAT THEY BELONG TOGETHER!!!!! */
    780 		/* XXX should check to see if we're only open for reading
    781 		   here... If so, we needn't do this, but then need some
    782 		   other way of keeping track of what's happened.. */
    783 
    784 		rf_markalldirty( raidPtrs[unit] );
    785 	}
    786 
    787 
    788 	rs->sc_dkdev.dk_openmask =
    789 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    790 
    791 bad:
    792 	raidunlock(rs);
    793 
    794 	return (error);
    795 
    796 
    797 }
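         /*
          * raidclose() -- close the raid device.  On the last close of a
          * configured set, write final (clean) component labels; if the system
          * is shutting down, also shut the RAID set down and detach the
          * pseudo-device.
          */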
    798 /* ARGSUSED */
    799 int
    800 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    801 {
    802 	int     unit = raidunit(dev);
    803 	struct cfdata *cf;
    804 	struct raid_softc *rs;
    805 	int     error = 0;
    806 	int     part;
    807 
    808 	if (unit >= numraid)
    809 		return (ENXIO);
    810 	rs = &raid_softc[unit];
    811 
    812 	if ((error = raidlock(rs)) != 0)
    813 		return (error);
    814 
    815 	part = DISKPART(dev);
    816 
    817 	/* ...that much closer to allowing unconfiguration... */
    818 	switch (fmt) {
    819 	case S_IFCHR:
    820 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    821 		break;
    822 
    823 	case S_IFBLK:
    824 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    825 		break;
    826 	}
    827 	rs->sc_dkdev.dk_openmask =
    828 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    829 
    830 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    831 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
832 		/* Last one... device is not unconfigured yet.
833 		   Device shutdown has taken care of setting the
834 		   clean bits if RAIDF_INITED is not set;
835 		   mark things as clean... */
    836 
    837 		rf_update_component_labels(raidPtrs[unit],
    838 						 RF_FINAL_COMPONENT_UPDATE);
    839 		if (doing_shutdown) {
    840 			/* last one, and we're going down, so
    841 			   lights out for this RAID set too. */
    842 			error = rf_Shutdown(raidPtrs[unit]);
    843 
    844 			/* It's no longer initialized... */
    845 			rs->sc_flags &= ~RAIDF_INITED;
    846 
    847 			/* detach the device */
    848 
    849 			cf = device_cfdata(rs->sc_dev);
    850 			error = config_detach(rs->sc_dev, DETACH_QUIET);
    851 			free(cf, M_RAIDFRAME);
    852 
    853 			/* Detach the disk. */
    854 			dkwedge_delall(&rs->sc_dkdev);
    855 			disk_detach(&rs->sc_dkdev);
    856 			disk_destroy(&rs->sc_dkdev);
    857 		}
    858 	}
    859 
    860 	raidunlock(rs);
    861 	return (0);
    862 
    863 }
    864 
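         /*
          * raidstrategy() -- queue a buffer for I/O on the raid device.  After
          * validity and bounds checks, the buf is placed on the unit's buffer
          * queue and the RAIDframe I/O thread is woken up to service it.
          */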
    865 void
    866 raidstrategy(struct buf *bp)
    867 {
    868 	int s;
    869 
    870 	unsigned int raidID = raidunit(bp->b_dev);
    871 	RF_Raid_t *raidPtr;
    872 	struct raid_softc *rs = &raid_softc[raidID];
    873 	int     wlabel;
    874 
    875 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
    876 		bp->b_error = ENXIO;
    877 		goto done;
    878 	}
    879 	if (raidID >= numraid || !raidPtrs[raidID]) {
    880 		bp->b_error = ENODEV;
    881 		goto done;
    882 	}
    883 	raidPtr = raidPtrs[raidID];
    884 	if (!raidPtr->valid) {
    885 		bp->b_error = ENODEV;
    886 		goto done;
    887 	}
    888 	if (bp->b_bcount == 0) {
    889 		db1_printf(("b_bcount is zero..\n"));
    890 		goto done;
    891 	}
    892 
    893 	/*
    894 	 * Do bounds checking and adjust transfer.  If there's an
    895 	 * error, the bounds check will flag that for us.
    896 	 */
    897 
    898 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    899 	if (DISKPART(bp->b_dev) == RAW_PART) {
    900 		uint64_t size; /* device size in DEV_BSIZE unit */
    901 
    902 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    903 			size = raidPtr->totalSectors <<
    904 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    905 		} else {
    906 			size = raidPtr->totalSectors >>
    907 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    908 		}
    909 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    910 			goto done;
    911 		}
    912 	} else {
    913 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    914 			db1_printf(("Bounds check failed!!:%d %d\n",
    915 				(int) bp->b_blkno, (int) wlabel));
    916 			goto done;
    917 		}
    918 	}
    919 	s = splbio();
    920 
    921 	bp->b_resid = 0;
    922 
    923 	/* stuff it onto our queue */
    924 	BUFQ_PUT(rs->buf_queue, bp);
    925 
926 	/* schedule the IO to happen at the next convenient time */
    927 	wakeup(&(raidPtrs[raidID]->iodone));
    928 
    929 	splx(s);
    930 	return;
    931 
    932 done:
    933 	bp->b_resid = bp->b_bcount;
    934 	biodone(bp);
    935 }
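         /*
          * raidread()/raidwrite() -- character-device read and write entry
          * points; both simply hand the request to physio() via raidstrategy().
          */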
    936 /* ARGSUSED */
    937 int
    938 raidread(dev_t dev, struct uio *uio, int flags)
    939 {
    940 	int     unit = raidunit(dev);
    941 	struct raid_softc *rs;
    942 
    943 	if (unit >= numraid)
    944 		return (ENXIO);
    945 	rs = &raid_softc[unit];
    946 
    947 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    948 		return (ENXIO);
    949 
    950 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    951 
    952 }
    953 /* ARGSUSED */
    954 int
    955 raidwrite(dev_t dev, struct uio *uio, int flags)
    956 {
    957 	int     unit = raidunit(dev);
    958 	struct raid_softc *rs;
    959 
    960 	if (unit >= numraid)
    961 		return (ENXIO);
    962 	rs = &raid_softc[unit];
    963 
    964 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    965 		return (ENXIO);
    966 
    967 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    968 
    969 }
    970 
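         /*
          * raidioctl() -- ioctl entry point.  Handles the standard disk ioctls
          * (disklabel, wedge, and cache operations) as well as the RAIDFRAME_*
          * control operations used by raidctl(8): configure/unconfigure,
          * component-label management, failing and rebuilding components, hot
          * spares, parity rewrite and copyback, and assorted status queries.
          */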
    971 int
    972 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    973 {
    974 	int     unit = raidunit(dev);
    975 	int     error = 0;
    976 	int     part, pmask, s;
    977 	struct cfdata *cf;
    978 	struct raid_softc *rs;
    979 	RF_Config_t *k_cfg, *u_cfg;
    980 	RF_Raid_t *raidPtr;
    981 	RF_RaidDisk_t *diskPtr;
    982 	RF_AccTotals_t *totals;
    983 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    984 	u_char *specific_buf;
    985 	int retcode = 0;
    986 	int column;
    987 /*	int raidid; */
    988 	struct rf_recon_req *rrcopy, *rr;
    989 	RF_ComponentLabel_t *clabel;
    990 	RF_ComponentLabel_t *ci_label;
    991 	RF_ComponentLabel_t **clabel_ptr;
    992 	RF_SingleComponent_t *sparePtr,*componentPtr;
    993 	RF_SingleComponent_t component;
    994 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    995 	int i, j, d;
    996 #ifdef __HAVE_OLD_DISKLABEL
    997 	struct disklabel newlabel;
    998 #endif
    999 	struct dkwedge_info *dkw;
   1000 
   1001 	if (unit >= numraid)
   1002 		return (ENXIO);
   1003 	rs = &raid_softc[unit];
   1004 	raidPtr = raidPtrs[unit];
   1005 
   1006 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1007 		(int) DISKPART(dev), (int) unit, cmd));
   1008 
   1009 	/* Must be open for writes for these commands... */
   1010 	switch (cmd) {
   1011 #ifdef DIOCGSECTORSIZE
   1012 	case DIOCGSECTORSIZE:
   1013 		*(u_int *)data = raidPtr->bytesPerSector;
   1014 		return 0;
   1015 	case DIOCGMEDIASIZE:
   1016 		*(off_t *)data =
   1017 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1018 		return 0;
   1019 #endif
   1020 	case DIOCSDINFO:
   1021 	case DIOCWDINFO:
   1022 #ifdef __HAVE_OLD_DISKLABEL
   1023 	case ODIOCWDINFO:
   1024 	case ODIOCSDINFO:
   1025 #endif
   1026 	case DIOCWLABEL:
   1027 	case DIOCAWEDGE:
   1028 	case DIOCDWEDGE:
   1029 	case DIOCSSTRATEGY:
   1030 		if ((flag & FWRITE) == 0)
   1031 			return (EBADF);
   1032 	}
   1033 
   1034 	/* Must be initialized for these... */
   1035 	switch (cmd) {
   1036 	case DIOCGDINFO:
   1037 	case DIOCSDINFO:
   1038 	case DIOCWDINFO:
   1039 #ifdef __HAVE_OLD_DISKLABEL
   1040 	case ODIOCGDINFO:
   1041 	case ODIOCWDINFO:
   1042 	case ODIOCSDINFO:
   1043 	case ODIOCGDEFLABEL:
   1044 #endif
   1045 	case DIOCGPART:
   1046 	case DIOCWLABEL:
   1047 	case DIOCGDEFLABEL:
   1048 	case DIOCAWEDGE:
   1049 	case DIOCDWEDGE:
   1050 	case DIOCLWEDGES:
   1051 	case DIOCCACHESYNC:
   1052 	case RAIDFRAME_SHUTDOWN:
   1053 	case RAIDFRAME_REWRITEPARITY:
   1054 	case RAIDFRAME_GET_INFO:
   1055 	case RAIDFRAME_RESET_ACCTOTALS:
   1056 	case RAIDFRAME_GET_ACCTOTALS:
   1057 	case RAIDFRAME_KEEP_ACCTOTALS:
   1058 	case RAIDFRAME_GET_SIZE:
   1059 	case RAIDFRAME_FAIL_DISK:
   1060 	case RAIDFRAME_COPYBACK:
   1061 	case RAIDFRAME_CHECK_RECON_STATUS:
   1062 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1063 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1064 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1065 	case RAIDFRAME_ADD_HOT_SPARE:
   1066 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1067 	case RAIDFRAME_INIT_LABELS:
   1068 	case RAIDFRAME_REBUILD_IN_PLACE:
   1069 	case RAIDFRAME_CHECK_PARITY:
   1070 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1071 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1072 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1073 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1074 	case RAIDFRAME_SET_AUTOCONFIG:
   1075 	case RAIDFRAME_SET_ROOT:
   1076 	case RAIDFRAME_DELETE_COMPONENT:
   1077 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1078 	case RAIDFRAME_PARITYMAP_STATUS:
   1079 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1080 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1081 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1082 	case DIOCGSTRATEGY:
   1083 	case DIOCSSTRATEGY:
   1084 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1085 			return (ENXIO);
   1086 	}
   1087 
   1088 	switch (cmd) {
   1089 
   1090 		/* configure the system */
   1091 	case RAIDFRAME_CONFIGURE:
   1092 
   1093 		if (raidPtr->valid) {
   1094 			/* There is a valid RAID set running on this unit! */
   1095 			printf("raid%d: Device already configured!\n",unit);
   1096 			return(EINVAL);
   1097 		}
   1098 
   1099 		/* copy-in the configuration information */
   1100 		/* data points to a pointer to the configuration structure */
   1101 
   1102 		u_cfg = *((RF_Config_t **) data);
   1103 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1104 		if (k_cfg == NULL) {
   1105 			return (ENOMEM);
   1106 		}
   1107 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1108 		if (retcode) {
   1109 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1110 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1111 				retcode));
   1112 			return (retcode);
   1113 		}
   1114 		/* allocate a buffer for the layout-specific data, and copy it
   1115 		 * in */
   1116 		if (k_cfg->layoutSpecificSize) {
   1117 			if (k_cfg->layoutSpecificSize > 10000) {
   1118 				/* sanity check */
   1119 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1120 				return (EINVAL);
   1121 			}
   1122 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1123 			    (u_char *));
   1124 			if (specific_buf == NULL) {
   1125 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1126 				return (ENOMEM);
   1127 			}
   1128 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1129 			    k_cfg->layoutSpecificSize);
   1130 			if (retcode) {
   1131 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1132 				RF_Free(specific_buf,
   1133 					k_cfg->layoutSpecificSize);
   1134 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1135 					retcode));
   1136 				return (retcode);
   1137 			}
   1138 		} else
   1139 			specific_buf = NULL;
   1140 		k_cfg->layoutSpecific = specific_buf;
   1141 
   1142 		/* should do some kind of sanity check on the configuration.
   1143 		 * Store the sum of all the bytes in the last byte? */
   1144 
   1145 		/* configure the system */
   1146 
   1147 		/*
   1148 		 * Clear the entire RAID descriptor, just to make sure
   1149 		 *  there is no stale data left in the case of a
   1150 		 *  reconfiguration
   1151 		 */
   1152 		memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
   1153 		raidPtr->raidid = unit;
   1154 
   1155 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1156 
   1157 		if (retcode == 0) {
   1158 
   1159 			/* allow this many simultaneous IO's to
   1160 			   this RAID device */
   1161 			raidPtr->openings = RAIDOUTSTANDING;
   1162 
   1163 			raidinit(raidPtr);
   1164 			rf_markalldirty(raidPtr);
   1165 		}
   1166 		/* free the buffers.  No return code here. */
   1167 		if (k_cfg->layoutSpecificSize) {
   1168 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1169 		}
   1170 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1171 
   1172 		return (retcode);
   1173 
   1174 		/* shutdown the system */
   1175 	case RAIDFRAME_SHUTDOWN:
   1176 
   1177 		if ((error = raidlock(rs)) != 0)
   1178 			return (error);
   1179 
   1180 		/*
   1181 		 * If somebody has a partition mounted, we shouldn't
   1182 		 * shutdown.
   1183 		 */
   1184 
   1185 		part = DISKPART(dev);
   1186 		pmask = (1 << part);
   1187 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1188 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1189 			(rs->sc_dkdev.dk_copenmask & pmask))) {
   1190 			raidunlock(rs);
   1191 			return (EBUSY);
   1192 		}
   1193 
   1194 		retcode = rf_Shutdown(raidPtr);
   1195 
   1196 		/* It's no longer initialized... */
   1197 		rs->sc_flags &= ~RAIDF_INITED;
   1198 
   1199 		/* free the pseudo device attach bits */
   1200 
   1201 		cf = device_cfdata(rs->sc_dev);
   1202 		/* XXX this causes us to not return any errors
   1203 		   from the above call to rf_Shutdown() */
   1204 		retcode = config_detach(rs->sc_dev, DETACH_QUIET);
   1205 		free(cf, M_RAIDFRAME);
   1206 
   1207 		/* Detach the disk. */
   1208 		dkwedge_delall(&rs->sc_dkdev);
   1209 		disk_detach(&rs->sc_dkdev);
   1210 		disk_destroy(&rs->sc_dkdev);
   1211 
   1212 		raidunlock(rs);
   1213 
   1214 		return (retcode);
   1215 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1216 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1217 		/* need to read the component label for the disk indicated
   1218 		   by row,column in clabel */
   1219 
   1220 		/*
   1221 		 * Perhaps there should be an option to skip the in-core
   1222 		 * copy and hit the disk, as with disklabel(8).
   1223 		 */
   1224 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1225 
   1226 		retcode = copyin( *clabel_ptr, clabel,
   1227 				  sizeof(RF_ComponentLabel_t));
   1228 
1229 		if (retcode) {
         			RF_Free(clabel, sizeof(*clabel));	/* don't leak the in-kernel copy */
1230 			return(retcode);
1231 		}
   1232 
   1233 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1234 
   1235 		column = clabel->column;
   1236 
   1237 		if ((column < 0) || (column >= raidPtr->numCol +
   1238 				     raidPtr->numSpare)) {
   1239 			return(EINVAL);
   1240 		}
   1241 
   1242 		RF_Free(clabel, sizeof(*clabel));
   1243 
   1244 		clabel = raidget_component_label(raidPtr, column);
   1245 
   1246 		if (retcode == 0) {
   1247 			retcode = copyout(clabel, *clabel_ptr,
   1248 					  sizeof(RF_ComponentLabel_t));
   1249 		}
   1250 		return (retcode);
   1251 
   1252 #if 0
   1253 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1254 		clabel = (RF_ComponentLabel_t *) data;
   1255 
   1256 		/* XXX check the label for valid stuff... */
   1257 		/* Note that some things *should not* get modified --
   1258 		   the user should be re-initing the labels instead of
   1259 		   trying to patch things.
   1260 		   */
   1261 
   1262 		raidid = raidPtr->raidid;
   1263 #ifdef DEBUG
   1264 		printf("raid%d: Got component label:\n", raidid);
   1265 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1266 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1267 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1268 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1269 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1270 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1271 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1272 #endif
   1273 		clabel->row = 0;
   1274 		column = clabel->column;
   1275 
   1276 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1277 			return(EINVAL);
   1278 		}
   1279 
   1280 		/* XXX this isn't allowed to do anything for now :-) */
   1281 
   1282 		/* XXX and before it is, we need to fill in the rest
   1283 		   of the fields!?!?!?! */
   1284 		memcpy(raidget_component_label(raidPtr, column),
   1285 		    clabel, sizeof(*clabel));
   1286 		raidflush_component_label(raidPtr, column);
   1287 		return (0);
   1288 #endif
   1289 
   1290 	case RAIDFRAME_INIT_LABELS:
   1291 		clabel = (RF_ComponentLabel_t *) data;
   1292 		/*
   1293 		   we only want the serial number from
   1294 		   the above.  We get all the rest of the information
   1295 		   from the config that was used to create this RAID
   1296 		   set.
   1297 		   */
   1298 
   1299 		raidPtr->serial_number = clabel->serial_number;
   1300 
   1301 		for(column=0;column<raidPtr->numCol;column++) {
   1302 			diskPtr = &raidPtr->Disks[column];
   1303 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1304 				ci_label = raidget_component_label(raidPtr,
   1305 				    column);
   1306 				/* Zeroing this is important. */
   1307 				memset(ci_label, 0, sizeof(*ci_label));
   1308 				raid_init_component_label(raidPtr, ci_label);
   1309 				ci_label->serial_number =
   1310 				    raidPtr->serial_number;
1311 				ci_label->row = 0; /* we don't pretend to support more */
   1312 				rf_component_label_set_partitionsize(ci_label,
   1313 				    diskPtr->partitionSize);
   1314 				ci_label->column = column;
   1315 				raidflush_component_label(raidPtr, column);
   1316 			}
   1317 			/* XXXjld what about the spares? */
   1318 		}
   1319 
   1320 		return (retcode);
   1321 	case RAIDFRAME_SET_AUTOCONFIG:
   1322 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1323 		printf("raid%d: New autoconfig value is: %d\n",
   1324 		       raidPtr->raidid, d);
   1325 		*(int *) data = d;
   1326 		return (retcode);
   1327 
   1328 	case RAIDFRAME_SET_ROOT:
   1329 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1330 		printf("raid%d: New rootpartition value is: %d\n",
   1331 		       raidPtr->raidid, d);
   1332 		*(int *) data = d;
   1333 		return (retcode);
   1334 
   1335 		/* initialize all parity */
   1336 	case RAIDFRAME_REWRITEPARITY:
   1337 
   1338 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1339 			/* Parity for RAID 0 is trivially correct */
   1340 			raidPtr->parity_good = RF_RAID_CLEAN;
   1341 			return(0);
   1342 		}
   1343 
   1344 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1345 			/* Re-write is already in progress! */
   1346 			return(EINVAL);
   1347 		}
   1348 
   1349 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1350 					   rf_RewriteParityThread,
   1351 					   raidPtr,"raid_parity");
   1352 		return (retcode);
   1353 
   1354 
   1355 	case RAIDFRAME_ADD_HOT_SPARE:
   1356 		sparePtr = (RF_SingleComponent_t *) data;
   1357 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1358 		retcode = rf_add_hot_spare(raidPtr, &component);
   1359 		return(retcode);
   1360 
   1361 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1362 		return(retcode);
   1363 
   1364 	case RAIDFRAME_DELETE_COMPONENT:
   1365 		componentPtr = (RF_SingleComponent_t *)data;
   1366 		memcpy( &component, componentPtr,
   1367 			sizeof(RF_SingleComponent_t));
   1368 		retcode = rf_delete_component(raidPtr, &component);
   1369 		return(retcode);
   1370 
   1371 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1372 		componentPtr = (RF_SingleComponent_t *)data;
   1373 		memcpy( &component, componentPtr,
   1374 			sizeof(RF_SingleComponent_t));
   1375 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1376 		return(retcode);
   1377 
   1378 	case RAIDFRAME_REBUILD_IN_PLACE:
   1379 
   1380 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1381 			/* Can't do this on a RAID 0!! */
   1382 			return(EINVAL);
   1383 		}
   1384 
   1385 		if (raidPtr->recon_in_progress == 1) {
   1386 			/* a reconstruct is already in progress! */
   1387 			return(EINVAL);
   1388 		}
   1389 
   1390 		componentPtr = (RF_SingleComponent_t *) data;
   1391 		memcpy( &component, componentPtr,
   1392 			sizeof(RF_SingleComponent_t));
   1393 		component.row = 0; /* we don't support any more */
   1394 		column = component.column;
   1395 
   1396 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1397 			return(EINVAL);
   1398 		}
   1399 
   1400 		RF_LOCK_MUTEX(raidPtr->mutex);
   1401 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1402 		    (raidPtr->numFailures > 0)) {
   1403 			/* XXX 0 above shouldn't be constant!!! */
   1404 			/* some component other than this has failed.
   1405 			   Let's not make things worse than they already
   1406 			   are... */
   1407 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1408 			       raidPtr->raidid);
   1409 			printf("raid%d:     Col: %d   Too many failures.\n",
   1410 			       raidPtr->raidid, column);
   1411 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1412 			return (EINVAL);
   1413 		}
   1414 		if (raidPtr->Disks[column].status ==
   1415 		    rf_ds_reconstructing) {
   1416 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1417 			       raidPtr->raidid);
   1418 			printf("raid%d:    Col: %d   Reconstruction already occuring!\n", raidPtr->raidid, column);
   1419 
   1420 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1421 			return (EINVAL);
   1422 		}
   1423 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1424 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1425 			return (EINVAL);
   1426 		}
   1427 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1428 
   1429 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1430 		if (rrcopy == NULL)
   1431 			return(ENOMEM);
   1432 
   1433 		rrcopy->raidPtr = (void *) raidPtr;
   1434 		rrcopy->col = column;
   1435 
   1436 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1437 					   rf_ReconstructInPlaceThread,
   1438 					   rrcopy,"raid_reconip");
   1439 		return(retcode);
   1440 
   1441 	case RAIDFRAME_GET_INFO:
   1442 		if (!raidPtr->valid)
   1443 			return (ENODEV);
   1444 		ucfgp = (RF_DeviceConfig_t **) data;
   1445 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1446 			  (RF_DeviceConfig_t *));
   1447 		if (d_cfg == NULL)
   1448 			return (ENOMEM);
   1449 		d_cfg->rows = 1; /* there is only 1 row now */
   1450 		d_cfg->cols = raidPtr->numCol;
   1451 		d_cfg->ndevs = raidPtr->numCol;
   1452 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1453 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1454 			return (ENOMEM);
   1455 		}
   1456 		d_cfg->nspares = raidPtr->numSpare;
   1457 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1458 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1459 			return (ENOMEM);
   1460 		}
   1461 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1462 		d = 0;
   1463 		for (j = 0; j < d_cfg->cols; j++) {
   1464 			d_cfg->devs[d] = raidPtr->Disks[j];
   1465 			d++;
   1466 		}
   1467 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1468 			d_cfg->spares[i] = raidPtr->Disks[j];
   1469 		}
   1470 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1471 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1472 
   1473 		return (retcode);
   1474 
   1475 	case RAIDFRAME_CHECK_PARITY:
   1476 		*(int *) data = raidPtr->parity_good;
   1477 		return (0);
   1478 
   1479 	case RAIDFRAME_PARITYMAP_STATUS:
   1480 		rf_paritymap_status(raidPtr->parity_map,
   1481 		    (struct rf_pmstat *)data);
   1482 		return 0;
   1483 
   1484 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1485 		if (raidPtr->parity_map == NULL)
   1486 			return ENOENT; /* ??? */
   1487 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1488 			(struct rf_pmparams *)data, 1))
   1489 			return EINVAL;
   1490 		return 0;
   1491 
   1492 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1493 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1494 		return 0;
   1495 
   1496 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1497 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1498 		/* XXX should errors be passed up? */
   1499 		return 0;
   1500 
   1501 	case RAIDFRAME_RESET_ACCTOTALS:
   1502 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1503 		return (0);
   1504 
   1505 	case RAIDFRAME_GET_ACCTOTALS:
   1506 		totals = (RF_AccTotals_t *) data;
   1507 		*totals = raidPtr->acc_totals;
   1508 		return (0);
   1509 
   1510 	case RAIDFRAME_KEEP_ACCTOTALS:
   1511 		raidPtr->keep_acc_totals = *(int *)data;
   1512 		return (0);
   1513 
   1514 	case RAIDFRAME_GET_SIZE:
   1515 		*(int *) data = raidPtr->totalSectors;
   1516 		return (0);
   1517 
   1518 		/* fail a disk & optionally start reconstruction */
   1519 	case RAIDFRAME_FAIL_DISK:
   1520 
   1521 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1522 			/* Can't do this on a RAID 0!! */
   1523 			return(EINVAL);
   1524 		}
   1525 
   1526 		rr = (struct rf_recon_req *) data;
   1527 		rr->row = 0;
   1528 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1529 			return (EINVAL);
   1530 
   1531 
   1532 		RF_LOCK_MUTEX(raidPtr->mutex);
   1533 		if (raidPtr->status == rf_rs_reconstructing) {
   1534 			/* you can't fail a disk while we're reconstructing! */
   1535 			/* XXX wrong for RAID6 */
   1536 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1537 			return (EINVAL);
   1538 		}
   1539 		if ((raidPtr->Disks[rr->col].status ==
   1540 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1541 			/* some other component has failed.  Let's not make
   1542 			   things worse. XXX wrong for RAID6 */
   1543 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1544 			return (EINVAL);
   1545 		}
   1546 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1547 			/* Can't fail a spared disk! */
   1548 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1549 			return (EINVAL);
   1550 		}
   1551 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1552 
   1553 		/* make a copy of the recon request so that we don't rely on
   1554 		 * the user's buffer */
   1555 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1556 		if (rrcopy == NULL)
   1557 			return(ENOMEM);
   1558 		memcpy(rrcopy, rr, sizeof(*rr));
   1559 		rrcopy->raidPtr = (void *) raidPtr;
   1560 
   1561 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1562 					   rf_ReconThread,
   1563 					   rrcopy,"raid_recon");
   1564 		return (0);
   1565 
   1566 		/* invoke a copyback operation after recon on whatever disk
   1567 		 * needs it, if any */
   1568 	case RAIDFRAME_COPYBACK:
   1569 
   1570 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1571 			/* This makes no sense on a RAID 0!! */
   1572 			return(EINVAL);
   1573 		}
   1574 
   1575 		if (raidPtr->copyback_in_progress == 1) {
   1576 			/* Copyback is already in progress! */
   1577 			return(EINVAL);
   1578 		}
   1579 
   1580 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1581 					   rf_CopybackThread,
   1582 					   raidPtr,"raid_copyback");
   1583 		return (retcode);
   1584 
   1585 		/* return the percentage completion of reconstruction */
   1586 	case RAIDFRAME_CHECK_RECON_STATUS:
   1587 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1588 			/* This makes no sense on a RAID 0, so tell the
   1589 			   user it's done. */
   1590 			*(int *) data = 100;
   1591 			return(0);
   1592 		}
   1593 		if (raidPtr->status != rf_rs_reconstructing)
   1594 			*(int *) data = 100;
   1595 		else {
   1596 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1597 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1598 			} else {
   1599 				*(int *) data = 0;
   1600 			}
   1601 		}
   1602 		return (0);
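         	/* The _EXT status ioctls take a pointer to a user-space
         	 * RF_ProgressInfo_t and fill it in via copyout().  A minimal
         	 * (hypothetical) userland sketch:
         	 *	RF_ProgressInfo_t pi, *pip = &pi;
         	 *	ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS_EXT, &pip);
         	 */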
   1603 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1604 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1605 		if (raidPtr->status != rf_rs_reconstructing) {
   1606 			progressInfo.remaining = 0;
   1607 			progressInfo.completed = 100;
   1608 			progressInfo.total = 100;
   1609 		} else {
   1610 			progressInfo.total =
   1611 				raidPtr->reconControl->numRUsTotal;
   1612 			progressInfo.completed =
   1613 				raidPtr->reconControl->numRUsComplete;
   1614 			progressInfo.remaining = progressInfo.total -
   1615 				progressInfo.completed;
   1616 		}
   1617 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1618 				  sizeof(RF_ProgressInfo_t));
   1619 		return (retcode);
   1620 
   1621 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1622 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1623 			/* This makes no sense on a RAID 0, so tell the
   1624 			   user it's done. */
   1625 			*(int *) data = 100;
   1626 			return(0);
   1627 		}
   1628 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1629 			*(int *) data = 100 *
   1630 				raidPtr->parity_rewrite_stripes_done /
   1631 				raidPtr->Layout.numStripe;
   1632 		} else {
   1633 			*(int *) data = 100;
   1634 		}
   1635 		return (0);
   1636 
   1637 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1638 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1639 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1640 			progressInfo.total = raidPtr->Layout.numStripe;
   1641 			progressInfo.completed =
   1642 				raidPtr->parity_rewrite_stripes_done;
   1643 			progressInfo.remaining = progressInfo.total -
   1644 				progressInfo.completed;
   1645 		} else {
   1646 			progressInfo.remaining = 0;
   1647 			progressInfo.completed = 100;
   1648 			progressInfo.total = 100;
   1649 		}
   1650 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1651 				  sizeof(RF_ProgressInfo_t));
   1652 		return (retcode);
   1653 
   1654 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1655 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1656 			/* This makes no sense on a RAID 0 */
   1657 			*(int *) data = 100;
   1658 			return(0);
   1659 		}
   1660 		if (raidPtr->copyback_in_progress == 1) {
   1661 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1662 				raidPtr->Layout.numStripe;
   1663 		} else {
   1664 			*(int *) data = 100;
   1665 		}
   1666 		return (0);
   1667 
   1668 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1669 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1670 		if (raidPtr->copyback_in_progress == 1) {
   1671 			progressInfo.total = raidPtr->Layout.numStripe;
   1672 			progressInfo.completed =
   1673 				raidPtr->copyback_stripes_done;
   1674 			progressInfo.remaining = progressInfo.total -
   1675 				progressInfo.completed;
   1676 		} else {
   1677 			progressInfo.remaining = 0;
   1678 			progressInfo.completed = 100;
   1679 			progressInfo.total = 100;
   1680 		}
   1681 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1682 				  sizeof(RF_ProgressInfo_t));
   1683 		return (retcode);
   1684 
   1685 		/* the sparetable daemon calls this to wait for the kernel to
   1686 		 * need a spare table. this ioctl does not return until a
   1687 		 * spare table is needed. XXX -- calling mpsleep here in the
   1688 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1689 		 * -- I should either compute the spare table in the kernel,
   1690 		 * or have a different -- XXX XXX -- interface (a different
   1691 		 * character device) for delivering the table     -- XXX */
   1692 #if 0
   1693 	case RAIDFRAME_SPARET_WAIT:
   1694 		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1695 		while (!rf_sparet_wait_queue)
   1696 			mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
   1697 		waitreq = rf_sparet_wait_queue;
   1698 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1699 		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1700 
   1701 		/* structure assignment */
   1702 		*((RF_SparetWait_t *) data) = *waitreq;
   1703 
   1704 		RF_Free(waitreq, sizeof(*waitreq));
   1705 		return (0);
   1706 
   1707 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1708 		 * code in it that will cause the daemon to exit */
   1709 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1710 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1711 		waitreq->fcol = -1;
   1712 		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1713 		waitreq->next = rf_sparet_wait_queue;
   1714 		rf_sparet_wait_queue = waitreq;
   1715 		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1716 		wakeup(&rf_sparet_wait_queue);
   1717 		return (0);
   1718 
   1719 		/* used by the spare table daemon to deliver a spare table
   1720 		 * into the kernel */
   1721 	case RAIDFRAME_SEND_SPARET:
   1722 
   1723 		/* install the spare table */
   1724 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1725 
   1726 		/* respond to the requestor.  the return status of the spare
   1727 		 * table installation is passed in the "fcol" field */
   1728 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1729 		waitreq->fcol = retcode;
   1730 		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1731 		waitreq->next = rf_sparet_resp_queue;
   1732 		rf_sparet_resp_queue = waitreq;
   1733 		wakeup(&rf_sparet_resp_queue);
   1734 		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1735 
   1736 		return (retcode);
   1737 #endif
   1738 
   1739 	default:
   1740 		break; /* fall through to the os-specific code below */
   1741 
   1742 	}
   1743 
   1744 	if (!raidPtr->valid)
   1745 		return (EINVAL);
   1746 
   1747 	/*
   1748 	 * Add support for "regular" device ioctls here.
   1749 	 */
   1750 
   1751 	switch (cmd) {
   1752 	case DIOCGDINFO:
   1753 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1754 		break;
   1755 #ifdef __HAVE_OLD_DISKLABEL
   1756 	case ODIOCGDINFO:
   1757 		newlabel = *(rs->sc_dkdev.dk_label);
   1758 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1759 			return ENOTTY;
   1760 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1761 		break;
   1762 #endif
   1763 
   1764 	case DIOCGPART:
   1765 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1766 		((struct partinfo *) data)->part =
   1767 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1768 		break;
   1769 
   1770 	case DIOCWDINFO:
   1771 	case DIOCSDINFO:
   1772 #ifdef __HAVE_OLD_DISKLABEL
   1773 	case ODIOCWDINFO:
   1774 	case ODIOCSDINFO:
   1775 #endif
   1776 	{
   1777 		struct disklabel *lp;
   1778 #ifdef __HAVE_OLD_DISKLABEL
   1779 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1780 			memset(&newlabel, 0, sizeof newlabel);
   1781 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1782 			lp = &newlabel;
   1783 		} else
   1784 #endif
   1785 		lp = (struct disklabel *)data;
   1786 
   1787 		if ((error = raidlock(rs)) != 0)
   1788 			return (error);
   1789 
   1790 		rs->sc_flags |= RAIDF_LABELLING;
   1791 
   1792 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1793 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1794 		if (error == 0) {
   1795 			if (cmd == DIOCWDINFO
   1796 #ifdef __HAVE_OLD_DISKLABEL
   1797 			    || cmd == ODIOCWDINFO
   1798 #endif
   1799 			   )
   1800 				error = writedisklabel(RAIDLABELDEV(dev),
   1801 				    raidstrategy, rs->sc_dkdev.dk_label,
   1802 				    rs->sc_dkdev.dk_cpulabel);
   1803 		}
   1804 		rs->sc_flags &= ~RAIDF_LABELLING;
   1805 
   1806 		raidunlock(rs);
   1807 
   1808 		if (error)
   1809 			return (error);
   1810 		break;
   1811 	}
   1812 
   1813 	case DIOCWLABEL:
   1814 		if (*(int *) data != 0)
   1815 			rs->sc_flags |= RAIDF_WLABEL;
   1816 		else
   1817 			rs->sc_flags &= ~RAIDF_WLABEL;
   1818 		break;
   1819 
   1820 	case DIOCGDEFLABEL:
   1821 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1822 		break;
   1823 
   1824 #ifdef __HAVE_OLD_DISKLABEL
   1825 	case ODIOCGDEFLABEL:
   1826 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1827 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1828 			return ENOTTY;
   1829 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1830 		break;
   1831 #endif
   1832 
   1833 	case DIOCAWEDGE:
   1834 	case DIOCDWEDGE:
   1835 	    	dkw = (void *)data;
   1836 
   1837 		/* If the ioctl happens here, the parent is us. */
   1838 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1839 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1840 
   1841 	case DIOCLWEDGES:
   1842 		return dkwedge_list(&rs->sc_dkdev,
   1843 		    (struct dkwedge_list *)data, l);
   1844 	case DIOCCACHESYNC:
   1845 		return rf_sync_component_caches(raidPtr);
   1846 
   1847 	case DIOCGSTRATEGY:
   1848 	    {
   1849 		struct disk_strategy *dks = (void *)data;
   1850 
   1851 		s = splbio();
   1852 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1853 		    sizeof(dks->dks_name));
   1854 		splx(s);
   1855 		dks->dks_paramlen = 0;
   1856 
   1857 		return 0;
   1858 	    }
   1859 
   1860 	case DIOCSSTRATEGY:
   1861 	    {
   1862 		struct disk_strategy *dks = (void *)data;
   1863 		struct bufq_state *new;
   1864 		struct bufq_state *old;
   1865 
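         		/* Switch strategies by allocating the new queue,
         		 * draining the old queue's buffers into it at splbio,
         		 * and freeing the old queue only after the swap. */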
   1866 		if (dks->dks_param != NULL) {
   1867 			return EINVAL;
   1868 		}
   1869 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1870 		error = bufq_alloc(&new, dks->dks_name,
   1871 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1872 		if (error) {
   1873 			return error;
   1874 		}
   1875 		s = splbio();
   1876 		old = rs->buf_queue;
   1877 		bufq_move(new, old);
   1878 		rs->buf_queue = new;
   1879 		splx(s);
   1880 		bufq_free(old);
   1881 
   1882 		return 0;
   1883 	    }
   1884 
   1885 	default:
   1886 		retcode = ENOTTY;
   1887 	}
   1888 	return (retcode);
   1889 
   1890 }
   1891 
   1892 
   1893 /* raidinit -- complete the rest of the initialization for the
   1894    RAIDframe device.  */
   1895 
   1896 
   1897 static void
   1898 raidinit(RF_Raid_t *raidPtr)
   1899 {
   1900 	struct cfdata *cf;
   1901 	struct raid_softc *rs;
   1902 	int     unit;
   1903 
   1904 	unit = raidPtr->raidid;
   1905 
   1906 	rs = &raid_softc[unit];
   1907 
   1908 	/* XXX should check return code first... */
   1909 	rs->sc_flags |= RAIDF_INITED;
   1910 
   1911 	/* XXX doesn't check bounds. */
   1912 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1913 
   1914 	/* attach the pseudo device */
   1915 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1916 	cf->cf_name = raid_cd.cd_name;
   1917 	cf->cf_atname = raid_cd.cd_name;
   1918 	cf->cf_unit = unit;
   1919 	cf->cf_fstate = FSTATE_STAR;
   1920 
   1921 	rs->sc_dev = config_attach_pseudo(cf);
   1922 
   1923 	if (rs->sc_dev==NULL) {
   1924 		printf("raid%d: config_attach_pseudo failed\n",
   1925 		       raidPtr->raidid);
   1926 	}
   1927 
   1928 	/* disk_attach actually creates space for the CPU disklabel, among
   1929 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1930 	 * with disklabels. */
   1931 
   1932 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1933 	disk_attach(&rs->sc_dkdev);
   1934 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1935 
   1936 	/* XXX There may be a weird interaction here between this, and
   1937 	 * protectedSectors, as used in RAIDframe.  */
   1938 
   1939 	rs->sc_size = raidPtr->totalSectors;
   1940 
   1941 	dkwedge_discover(&rs->sc_dkdev);
   1942 
   1943 	rf_set_properties(rs, raidPtr);
   1944 
   1945 }
   1946 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1947 /* wake up the daemon & tell it to get us a spare table
   1948  * XXX
   1949  * the entries in the queues should be tagged with the raidPtr
   1950  * so that in the extremely rare case that two recons happen at once,
    1951  * we know for which device we're requesting a spare table
   1952  * XXX
   1953  *
   1954  * XXX This code is not currently used. GO
   1955  */
   1956 int
   1957 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1958 {
   1959 	int     retcode;
   1960 
   1961 	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
   1962 	req->next = rf_sparet_wait_queue;
   1963 	rf_sparet_wait_queue = req;
   1964 	wakeup(&rf_sparet_wait_queue);
   1965 
   1966 	/* mpsleep unlocks the mutex */
   1967 	while (!rf_sparet_resp_queue) {
   1968 		tsleep(&rf_sparet_resp_queue, PRIBIO,
   1969 		    "raidframe getsparetable", 0);
   1970 	}
   1971 	req = rf_sparet_resp_queue;
   1972 	rf_sparet_resp_queue = req->next;
   1973 	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
   1974 
   1975 	retcode = req->fcol;
   1976 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1977 					 * alloc'd */
   1978 	return (retcode);
   1979 }
   1980 #endif
   1981 
    1982 /* A wrapper around rf_DoAccess that extracts the appropriate info from the
    1983  * bp and passes it down.
    1984  * Any calls originating in the kernel must use non-blocking I/O.
    1985  * Do some extra sanity checking to return "appropriate" error values for
    1986  * certain conditions (to make some standard utilities work).
   1987  *
   1988  * Formerly known as: rf_DoAccessKernel
   1989  */
   1990 void
   1991 raidstart(RF_Raid_t *raidPtr)
   1992 {
   1993 	RF_SectorCount_t num_blocks, pb, sum;
   1994 	RF_RaidAddr_t raid_addr;
   1995 	struct partition *pp;
   1996 	daddr_t blocknum;
   1997 	int     unit;
   1998 	struct raid_softc *rs;
   1999 	int     do_async;
   2000 	struct buf *bp;
   2001 	int rc;
   2002 
   2003 	unit = raidPtr->raidid;
   2004 	rs = &raid_softc[unit];
   2005 
   2006 	/* quick check to see if anything has died recently */
   2007 	RF_LOCK_MUTEX(raidPtr->mutex);
   2008 	if (raidPtr->numNewFailures > 0) {
   2009 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   2010 		rf_update_component_labels(raidPtr,
   2011 					   RF_NORMAL_COMPONENT_UPDATE);
   2012 		RF_LOCK_MUTEX(raidPtr->mutex);
   2013 		raidPtr->numNewFailures--;
   2014 	}
   2015 
   2016 	/* Check to see if we're at the limit... */
   2017 	while (raidPtr->openings > 0) {
   2018 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   2019 
   2020 		/* get the next item, if any, from the queue */
   2021 		if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
   2022 			/* nothing more to do */
   2023 			return;
   2024 		}
   2025 
   2026 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2027 		 * partition.. Need to make it absolute to the underlying
   2028 		 * device.. */
   2029 
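         		/* b_blkno is in DEV_BSIZE units; scale it to this set's
         		 * sector size before adding the partition offset. */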
   2030 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2031 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2032 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2033 			blocknum += pp->p_offset;
   2034 		}
   2035 
   2036 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2037 			    (int) blocknum));
   2038 
   2039 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2040 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2041 
   2042 		/* *THIS* is where we adjust what block we're going to...
   2043 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2044 		raid_addr = blocknum;
   2045 
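         		/* num_blocks counts whole sectors; pb is 1 if the request
         		 * ends in a partial sector.  The checks below reject
         		 * requests that run past the end of the set or overflow. */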
   2046 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2047 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2048 		sum = raid_addr + num_blocks + pb;
   2049 		if (1 || rf_debugKernelAccess) {
   2050 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2051 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2052 				    (int) pb, (int) bp->b_resid));
   2053 		}
   2054 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2055 		    || (sum < num_blocks) || (sum < pb)) {
   2056 			bp->b_error = ENOSPC;
   2057 			bp->b_resid = bp->b_bcount;
   2058 			biodone(bp);
   2059 			RF_LOCK_MUTEX(raidPtr->mutex);
   2060 			continue;
   2061 		}
   2062 		/*
   2063 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2064 		 */
   2065 
   2066 		if (bp->b_bcount & raidPtr->sectorMask) {
   2067 			bp->b_error = EINVAL;
   2068 			bp->b_resid = bp->b_bcount;
   2069 			biodone(bp);
   2070 			RF_LOCK_MUTEX(raidPtr->mutex);
   2071 			continue;
   2072 
   2073 		}
   2074 		db1_printf(("Calling DoAccess..\n"));
   2075 
   2076 
   2077 		RF_LOCK_MUTEX(raidPtr->mutex);
   2078 		raidPtr->openings--;
   2079 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   2080 
   2081 		/*
   2082 		 * Everything is async.
   2083 		 */
   2084 		do_async = 1;
   2085 
   2086 		disk_busy(&rs->sc_dkdev);
   2087 
   2088 		/* XXX we're still at splbio() here... do we *really*
   2089 		   need to be? */
   2090 
   2091 		/* don't ever condition on bp->b_flags & B_WRITE.
   2092 		 * always condition on B_READ instead */
   2093 
   2094 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2095 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2096 				 do_async, raid_addr, num_blocks,
   2097 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2098 
   2099 		if (rc) {
   2100 			bp->b_error = rc;
   2101 			bp->b_resid = bp->b_bcount;
   2102 			biodone(bp);
   2103 			/* continue loop */
   2104 		}
   2105 
   2106 		RF_LOCK_MUTEX(raidPtr->mutex);
   2107 	}
   2108 	RF_UNLOCK_MUTEX(raidPtr->mutex);
   2109 }
   2110 
   2111 
   2112 
   2113 
   2114 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2115 
   2116 int
   2117 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2118 {
   2119 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2120 	struct buf *bp;
   2121 
   2122 	req->queue = queue;
   2123 
   2124 #if DIAGNOSTIC
   2125 	if (queue->raidPtr->raidid >= numraid) {
   2126 		printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
   2127 		    numraid);
   2128 		panic("Invalid Unit number in rf_DispatchKernelIO");
   2129 	}
   2130 #endif
   2131 
   2132 	bp = req->bp;
   2133 
   2134 	switch (req->type) {
   2135 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2136 		/* XXX need to do something extra here.. */
   2137 		/* I'm leaving this in, as I've never actually seen it used,
   2138 		 * and I'd like folks to report it... GO */
    2139 		printf("WAKEUP CALLED\n");
   2140 		queue->numOutstanding++;
   2141 
   2142 		bp->b_flags = 0;
   2143 		bp->b_private = req;
   2144 
   2145 		KernelWakeupFunc(bp);
   2146 		break;
   2147 
   2148 	case RF_IO_TYPE_READ:
   2149 	case RF_IO_TYPE_WRITE:
   2150 #if RF_ACC_TRACE > 0
   2151 		if (req->tracerec) {
   2152 			RF_ETIMER_START(req->tracerec->timer);
   2153 		}
   2154 #endif
   2155 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2156 		    op, queue->rf_cinfo->ci_dev,
   2157 		    req->sectorOffset, req->numSector,
   2158 		    req->buf, KernelWakeupFunc, (void *) req,
   2159 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2160 
   2161 		if (rf_debugKernelAccess) {
   2162 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2163 				(long) bp->b_blkno));
   2164 		}
   2165 		queue->numOutstanding++;
   2166 		queue->last_deq_sector = req->sectorOffset;
   2167 		/* acc wouldn't have been let in if there were any pending
   2168 		 * reqs at any other priority */
   2169 		queue->curPriority = req->priority;
   2170 
   2171 		db1_printf(("Going for %c to unit %d col %d\n",
   2172 			    req->type, queue->raidPtr->raidid,
   2173 			    queue->col));
   2174 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2175 			(int) req->sectorOffset, (int) req->numSector,
   2176 			(int) (req->numSector <<
   2177 			    queue->raidPtr->logBytesPerSector),
   2178 			(int) queue->raidPtr->logBytesPerSector));
   2179 
   2180 		/*
   2181 		 * XXX: drop lock here since this can block at
   2182 		 * least with backing SCSI devices.  Retake it
   2183 		 * to minimize fuss with calling interfaces.
   2184 		 */
   2185 
   2186 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2187 		bdev_strategy(bp);
   2188 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2189 		break;
   2190 
   2191 	default:
   2192 		panic("bad req->type in rf_DispatchKernelIO");
   2193 	}
   2194 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2195 
   2196 	return (0);
   2197 }
    2198 /* this is the callback function associated with an I/O invoked from
   2199    kernel code.
   2200  */
   2201 static void
   2202 KernelWakeupFunc(struct buf *bp)
   2203 {
   2204 	RF_DiskQueueData_t *req = NULL;
   2205 	RF_DiskQueue_t *queue;
   2206 	int s;
   2207 
   2208 	s = splbio();
   2209 	db1_printf(("recovering the request queue:\n"));
   2210 	req = bp->b_private;
   2211 
   2212 	queue = (RF_DiskQueue_t *) req->queue;
   2213 
   2214 #if RF_ACC_TRACE > 0
   2215 	if (req->tracerec) {
   2216 		RF_ETIMER_STOP(req->tracerec->timer);
   2217 		RF_ETIMER_EVAL(req->tracerec->timer);
   2218 		RF_LOCK_MUTEX(rf_tracing_mutex);
   2219 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2220 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2221 		req->tracerec->num_phys_ios++;
   2222 		RF_UNLOCK_MUTEX(rf_tracing_mutex);
   2223 	}
   2224 #endif
   2225 
   2226 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2227 	 * ballistic, and mark the component as hosed... */
   2228 
   2229 	if (bp->b_error != 0) {
   2230 		/* Mark the disk as dead */
   2231 		/* but only mark it once... */
   2232 		/* and only if it wouldn't leave this RAID set
   2233 		   completely broken */
   2234 		if (((queue->raidPtr->Disks[queue->col].status ==
   2235 		      rf_ds_optimal) ||
   2236 		     (queue->raidPtr->Disks[queue->col].status ==
   2237 		      rf_ds_used_spare)) &&
   2238 		     (queue->raidPtr->numFailures <
   2239 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2240 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2241 			       queue->raidPtr->raidid,
   2242 			       queue->raidPtr->Disks[queue->col].devname);
   2243 			queue->raidPtr->Disks[queue->col].status =
   2244 			    rf_ds_failed;
   2245 			queue->raidPtr->status = rf_rs_degraded;
   2246 			queue->raidPtr->numFailures++;
   2247 			queue->raidPtr->numNewFailures++;
   2248 		} else {	/* Disk is already dead... */
   2249 			/* printf("Disk already marked as dead!\n"); */
   2250 		}
   2251 
   2252 	}
   2253 
   2254 	/* Fill in the error value */
   2255 
   2256 	req->error = bp->b_error;
   2257 
   2258 	simple_lock(&queue->raidPtr->iodone_lock);
   2259 
   2260 	/* Drop this one on the "finished" queue... */
   2261 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2262 
   2263 	/* Let the raidio thread know there is work to be done. */
   2264 	wakeup(&(queue->raidPtr->iodone));
   2265 
   2266 	simple_unlock(&queue->raidPtr->iodone_lock);
   2267 
   2268 	splx(s);
   2269 }
   2270 
   2271 
   2272 
   2273 /*
   2274  * initialize a buf structure for doing an I/O in the kernel.
   2275  */
   2276 static void
   2277 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2278        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2279        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2280        struct proc *b_proc)
   2281 {
   2282 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2283 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2284 	bp->b_oflags = 0;
   2285 	bp->b_cflags = 0;
   2286 	bp->b_bcount = numSect << logBytesPerSector;
   2287 	bp->b_bufsize = bp->b_bcount;
   2288 	bp->b_error = 0;
   2289 	bp->b_dev = dev;
   2290 	bp->b_data = bf;
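         	/* convert the RAID sector address back to DEV_BSIZE blocks */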
   2291 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2292 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2293 	if (bp->b_bcount == 0) {
   2294 		panic("bp->b_bcount is zero in InitBP!!");
   2295 	}
   2296 	bp->b_proc = b_proc;
   2297 	bp->b_iodone = cbFunc;
   2298 	bp->b_private = cbArg;
   2299 }
   2300 
   2301 static void
   2302 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2303 		    struct disklabel *lp)
   2304 {
   2305 	memset(lp, 0, sizeof(*lp));
   2306 
   2307 	/* fabricate a label... */
   2308 	lp->d_secperunit = raidPtr->totalSectors;
   2309 	lp->d_secsize = raidPtr->bytesPerSector;
   2310 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2311 	lp->d_ntracks = 4 * raidPtr->numCol;
   2312 	lp->d_ncylinders = raidPtr->totalSectors /
   2313 		(lp->d_nsectors * lp->d_ntracks);
   2314 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2315 
   2316 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2317 	lp->d_type = DTYPE_RAID;
   2318 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2319 	lp->d_rpm = 3600;
   2320 	lp->d_interleave = 1;
   2321 	lp->d_flags = 0;
   2322 
   2323 	lp->d_partitions[RAW_PART].p_offset = 0;
   2324 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2325 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2326 	lp->d_npartitions = RAW_PART + 1;
   2327 
   2328 	lp->d_magic = DISKMAGIC;
   2329 	lp->d_magic2 = DISKMAGIC;
   2330 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2331 
   2332 }
   2333 /*
   2334  * Read the disklabel from the raid device.  If one is not present, fake one
   2335  * up.
   2336  */
   2337 static void
   2338 raidgetdisklabel(dev_t dev)
   2339 {
   2340 	int     unit = raidunit(dev);
   2341 	struct raid_softc *rs = &raid_softc[unit];
   2342 	const char   *errstring;
   2343 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2344 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2345 	RF_Raid_t *raidPtr;
   2346 
   2347 	db1_printf(("Getting the disklabel...\n"));
   2348 
   2349 	memset(clp, 0, sizeof(*clp));
   2350 
   2351 	raidPtr = raidPtrs[unit];
   2352 
   2353 	raidgetdefaultlabel(raidPtr, rs, lp);
   2354 
   2355 	/*
   2356 	 * Call the generic disklabel extraction routine.
   2357 	 */
   2358 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2359 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2360 	if (errstring)
   2361 		raidmakedisklabel(rs);
   2362 	else {
   2363 		int     i;
   2364 		struct partition *pp;
   2365 
   2366 		/*
   2367 		 * Sanity check whether the found disklabel is valid.
   2368 		 *
    2369 		 * This is necessary since the total size of the raid device
    2370 		 * may vary when the interleave is changed even though exactly
    2371 		 * the same components are used, and an old disklabel may be
    2372 		 * used if one is found.
   2373 		 */
   2374 		if (lp->d_secperunit != rs->sc_size)
   2375 			printf("raid%d: WARNING: %s: "
   2376 			    "total sector size in disklabel (%" PRIu32 ") != "
   2377 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2378 			    lp->d_secperunit, rs->sc_size);
   2379 		for (i = 0; i < lp->d_npartitions; i++) {
   2380 			pp = &lp->d_partitions[i];
   2381 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2382 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2383 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2384 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2385 		}
   2386 	}
   2387 
   2388 }
   2389 /*
   2390  * Take care of things one might want to take care of in the event
   2391  * that a disklabel isn't present.
   2392  */
   2393 static void
   2394 raidmakedisklabel(struct raid_softc *rs)
   2395 {
   2396 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2397 	db1_printf(("Making a label..\n"));
   2398 
   2399 	/*
   2400 	 * For historical reasons, if there's no disklabel present
   2401 	 * the raw partition must be marked FS_BSDFFS.
   2402 	 */
   2403 
   2404 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2405 
   2406 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2407 
   2408 	lp->d_checksum = dkcksum(lp);
   2409 }
   2410 /*
   2411  * Wait interruptibly for an exclusive lock.
   2412  *
   2413  * XXX
   2414  * Several drivers do this; it should be abstracted and made MP-safe.
   2415  * (Hmm... where have we seen this warning before :->  GO )
   2416  */
   2417 static int
   2418 raidlock(struct raid_softc *rs)
   2419 {
   2420 	int     error;
   2421 
   2422 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2423 		rs->sc_flags |= RAIDF_WANTED;
   2424 		if ((error =
   2425 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2426 			return (error);
   2427 	}
   2428 	rs->sc_flags |= RAIDF_LOCKED;
   2429 	return (0);
   2430 }
   2431 /*
   2432  * Unlock and wake up any waiters.
   2433  */
   2434 static void
   2435 raidunlock(struct raid_softc *rs)
   2436 {
   2437 
   2438 	rs->sc_flags &= ~RAIDF_LOCKED;
   2439 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2440 		rs->sc_flags &= ~RAIDF_WANTED;
   2441 		wakeup(rs);
   2442 	}
   2443 }
   2444 
   2445 
   2446 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2447 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2448 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
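         /*
          * On-disk layout of per-component metadata: the component label lives
          * RF_COMPONENT_INFO_OFFSET bytes into each component and is padded to
          * at least one sector; the parity map (when used) immediately follows
          * it and is likewise padded to at least one sector.
          */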
   2449 
   2450 static daddr_t
   2451 rf_component_info_offset(void)
   2452 {
   2453 
   2454 	return RF_COMPONENT_INFO_OFFSET;
   2455 }
   2456 
   2457 static daddr_t
   2458 rf_component_info_size(unsigned secsize)
   2459 {
   2460 	daddr_t info_size;
   2461 
   2462 	KASSERT(secsize);
   2463 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2464 		info_size = secsize;
   2465 	else
   2466 		info_size = RF_COMPONENT_INFO_SIZE;
   2467 
   2468 	return info_size;
   2469 }
   2470 
   2471 static daddr_t
   2472 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2473 {
   2474 	daddr_t map_offset;
   2475 
   2476 	KASSERT(raidPtr->bytesPerSector);
   2477 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2478 		map_offset = raidPtr->bytesPerSector;
   2479 	else
   2480 		map_offset = RF_COMPONENT_INFO_SIZE;
   2481 	map_offset += rf_component_info_offset();
   2482 
   2483 	return map_offset;
   2484 }
   2485 
   2486 static daddr_t
   2487 rf_parity_map_size(RF_Raid_t *raidPtr)
   2488 {
   2489 	daddr_t map_size;
   2490 
   2491 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2492 		map_size = raidPtr->bytesPerSector;
   2493 	else
   2494 		map_size = RF_PARITY_MAP_SIZE;
   2495 
   2496 	return map_size;
   2497 }
   2498 
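         /* Mark a component label clean (or dirty, below) in core and flush it
            out to the component. */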
   2499 int
   2500 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2501 {
   2502 	RF_ComponentLabel_t *clabel;
   2503 
   2504 	clabel = raidget_component_label(raidPtr, col);
   2505 	clabel->clean = RF_RAID_CLEAN;
   2506 	raidflush_component_label(raidPtr, col);
   2507 	return(0);
   2508 }
   2509 
   2510 
   2511 int
   2512 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2513 {
   2514 	RF_ComponentLabel_t *clabel;
   2515 
   2516 	clabel = raidget_component_label(raidPtr, col);
   2517 	clabel->clean = RF_RAID_DIRTY;
   2518 	raidflush_component_label(raidPtr, col);
   2519 	return(0);
   2520 }
   2521 
   2522 int
   2523 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2524 {
   2525 	KASSERT(raidPtr->bytesPerSector);
   2526 	return raidread_component_label(raidPtr->bytesPerSector,
   2527 	    raidPtr->Disks[col].dev,
   2528 	    raidPtr->raid_cinfo[col].ci_vp,
   2529 	    &raidPtr->raid_cinfo[col].ci_label);
   2530 }
   2531 
   2532 RF_ComponentLabel_t *
   2533 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2534 {
   2535 	return &raidPtr->raid_cinfo[col].ci_label;
   2536 }
   2537 
   2538 int
   2539 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2540 {
   2541 	RF_ComponentLabel_t *label;
   2542 
   2543 	label = &raidPtr->raid_cinfo[col].ci_label;
   2544 	label->mod_counter = raidPtr->mod_counter;
   2545 #ifndef RF_NO_PARITY_MAP
   2546 	label->parity_map_modcount = label->mod_counter;
   2547 #endif
   2548 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2549 	    raidPtr->Disks[col].dev,
   2550 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2551 }
   2552 
   2553 
   2554 static int
   2555 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2556     RF_ComponentLabel_t *clabel)
   2557 {
   2558 	return raidread_component_area(dev, b_vp, clabel,
   2559 	    sizeof(RF_ComponentLabel_t),
   2560 	    rf_component_info_offset(),
   2561 	    rf_component_info_size(secsize));
   2562 }
   2563 
   2564 /* ARGSUSED */
   2565 static int
   2566 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2567     size_t msize, daddr_t offset, daddr_t dsize)
   2568 {
   2569 	struct buf *bp;
   2570 	const struct bdevsw *bdev;
   2571 	int error;
   2572 
   2573 	/* XXX should probably ensure that we don't try to do this if
   2574 	   someone has changed rf_protected_sectors. */
   2575 
   2576 	if (b_vp == NULL) {
   2577 		/* For whatever reason, this component is not valid.
   2578 		   Don't try to read a component label from it. */
   2579 		return(EINVAL);
   2580 	}
   2581 
   2582 	/* get a block of the appropriate size... */
   2583 	bp = geteblk((int)dsize);
   2584 	bp->b_dev = dev;
   2585 
   2586 	/* get our ducks in a row for the read */
   2587 	bp->b_blkno = offset / DEV_BSIZE;
   2588 	bp->b_bcount = dsize;
   2589 	bp->b_flags |= B_READ;
   2590  	bp->b_resid = dsize;
   2591 
   2592 	bdev = bdevsw_lookup(bp->b_dev);
   2593 	if (bdev == NULL)
   2594 		return (ENXIO);
   2595 	(*bdev->d_strategy)(bp);
   2596 
   2597 	error = biowait(bp);
   2598 
   2599 	if (!error) {
   2600 		memcpy(data, bp->b_data, msize);
   2601 	}
   2602 
   2603 	brelse(bp, 0);
   2604 	return(error);
   2605 }
   2606 
   2607 
   2608 static int
   2609 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2610     RF_ComponentLabel_t *clabel)
   2611 {
   2612 	return raidwrite_component_area(dev, b_vp, clabel,
   2613 	    sizeof(RF_ComponentLabel_t),
   2614 	    rf_component_info_offset(),
   2615 	    rf_component_info_size(secsize), 0);
   2616 }
   2617 
   2618 /* ARGSUSED */
   2619 static int
   2620 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2621     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2622 {
   2623 	struct buf *bp;
   2624 	const struct bdevsw *bdev;
   2625 	int error;
   2626 
   2627 	/* get a block of the appropriate size... */
   2628 	bp = geteblk((int)dsize);
   2629 	bp->b_dev = dev;
   2630 
   2631 	/* get our ducks in a row for the write */
   2632 	bp->b_blkno = offset / DEV_BSIZE;
   2633 	bp->b_bcount = dsize;
   2634 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2635  	bp->b_resid = dsize;
   2636 
   2637 	memset(bp->b_data, 0, dsize);
   2638 	memcpy(bp->b_data, data, msize);
   2639 
   2640 	bdev = bdevsw_lookup(bp->b_dev);
   2641 	if (bdev == NULL)
   2642 		return (ENXIO);
   2643 	(*bdev->d_strategy)(bp);
   2644 	if (asyncp)
   2645 		return 0;
   2646 	error = biowait(bp);
   2647 	brelse(bp, 0);
   2648 	if (error) {
   2649 #if 1
   2650 		printf("Failed to write RAID component info!\n");
   2651 #endif
   2652 	}
   2653 
   2654 	return(error);
   2655 }
   2656 
   2657 void
   2658 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2659 {
   2660 	int c;
   2661 
   2662 	for (c = 0; c < raidPtr->numCol; c++) {
   2663 		/* Skip dead disks. */
   2664 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2665 			continue;
   2666 		/* XXXjld: what if an error occurs here? */
   2667 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2668 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2669 		    RF_PARITYMAP_NBYTE,
   2670 		    rf_parity_map_offset(raidPtr),
   2671 		    rf_parity_map_size(raidPtr), 0);
   2672 	}
   2673 }
   2674 
   2675 void
   2676 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2677 {
   2678 	struct rf_paritymap_ondisk tmp;
   2679 	int c,first;
   2680 
   2681 	first=1;
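         	/* Read each live component's copy of the map and merge them, so
         	 * that dirty regions recorded on any component are preserved. */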
   2682 	for (c = 0; c < raidPtr->numCol; c++) {
   2683 		/* Skip dead disks. */
   2684 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2685 			continue;
   2686 		raidread_component_area(raidPtr->Disks[c].dev,
   2687 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2688 		    RF_PARITYMAP_NBYTE,
   2689 		    rf_parity_map_offset(raidPtr),
   2690 		    rf_parity_map_size(raidPtr));
   2691 		if (first) {
   2692 			memcpy(map, &tmp, sizeof(*map));
   2693 			first = 0;
   2694 		} else {
   2695 			rf_paritymap_merge(map, &tmp);
   2696 		}
   2697 	}
   2698 }
   2699 
   2700 void
   2701 rf_markalldirty(RF_Raid_t *raidPtr)
   2702 {
   2703 	RF_ComponentLabel_t *clabel;
   2704 	int sparecol;
   2705 	int c;
   2706 	int j;
   2707 	int scol = -1;
   2708 
   2709 	raidPtr->mod_counter++;
   2710 	for (c = 0; c < raidPtr->numCol; c++) {
   2711 		/* we don't want to touch (at all) a disk that has
   2712 		   failed */
   2713 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2714 			clabel = raidget_component_label(raidPtr, c);
   2715 			if (clabel->status == rf_ds_spared) {
   2716 				/* XXX do something special...
   2717 				   but whatever you do, don't
   2718 				   try to access it!! */
   2719 			} else {
   2720 				raidmarkdirty(raidPtr, c);
   2721 			}
   2722 		}
   2723 	}
   2724 
   2725 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2726 		sparecol = raidPtr->numCol + c;
   2727 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2728 			/*
   2729 
   2730 			   we claim this disk is "optimal" if it's
   2731 			   rf_ds_used_spare, as that means it should be
   2732 			   directly substitutable for the disk it replaced.
   2733 			   We note that too...
   2734 
   2735 			 */
   2736 
   2737 			for(j=0;j<raidPtr->numCol;j++) {
   2738 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2739 					scol = j;
   2740 					break;
   2741 				}
   2742 			}
   2743 
   2744 			clabel = raidget_component_label(raidPtr, sparecol);
   2745 			/* make sure status is noted */
   2746 
   2747 			raid_init_component_label(raidPtr, clabel);
   2748 
   2749 			clabel->row = 0;
   2750 			clabel->column = scol;
   2751 			/* Note: we *don't* change status from rf_ds_used_spare
   2752 			   to rf_ds_optimal */
   2753 			/* clabel.status = rf_ds_optimal; */
   2754 
   2755 			raidmarkdirty(raidPtr, sparecol);
   2756 		}
   2757 	}
   2758 }
   2759 
   2760 
   2761 void
   2762 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2763 {
   2764 	RF_ComponentLabel_t *clabel;
   2765 	int sparecol;
   2766 	int c;
   2767 	int j;
   2768 	int scol;
   2769 
   2770 	scol = -1;
   2771 
   2772 	/* XXX should do extra checks to make sure things really are clean,
   2773 	   rather than blindly setting the clean bit... */
   2774 
   2775 	raidPtr->mod_counter++;
   2776 
   2777 	for (c = 0; c < raidPtr->numCol; c++) {
   2778 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2779 			clabel = raidget_component_label(raidPtr, c);
   2780 			/* make sure status is noted */
   2781 			clabel->status = rf_ds_optimal;
   2782 
   2783 			/* note what unit we are configured as */
   2784 			clabel->last_unit = raidPtr->raidid;
   2785 
   2786 			raidflush_component_label(raidPtr, c);
   2787 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2788 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2789 					raidmarkclean(raidPtr, c);
   2790 				}
   2791 			}
   2792 		}
   2793 		/* else we don't touch it.. */
   2794 	}
   2795 
   2796 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2797 		sparecol = raidPtr->numCol + c;
   2798 		/* Need to ensure that the reconstruct actually completed! */
   2799 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2800 			/*
   2801 
   2802 			   we claim this disk is "optimal" if it's
   2803 			   rf_ds_used_spare, as that means it should be
   2804 			   directly substitutable for the disk it replaced.
   2805 			   We note that too...
   2806 
   2807 			 */
   2808 
   2809 			for(j=0;j<raidPtr->numCol;j++) {
   2810 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2811 					scol = j;
   2812 					break;
   2813 				}
   2814 			}
   2815 
   2816 			/* XXX shouldn't *really* need this... */
   2817 			clabel = raidget_component_label(raidPtr, sparecol);
   2818 			/* make sure status is noted */
   2819 
   2820 			raid_init_component_label(raidPtr, clabel);
   2821 
   2822 			clabel->column = scol;
   2823 			clabel->status = rf_ds_optimal;
   2824 			clabel->last_unit = raidPtr->raidid;
   2825 
   2826 			raidflush_component_label(raidPtr, sparecol);
   2827 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2828 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2829 					raidmarkclean(raidPtr, sparecol);
   2830 				}
   2831 			}
   2832 		}
   2833 	}
   2834 }
   2835 
   2836 void
   2837 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2838 {
   2839 
   2840 	if (vp != NULL) {
   2841 		if (auto_configured == 1) {
   2842 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2843 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2844 			vput(vp);
   2845 
   2846 		} else {
   2847 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2848 		}
   2849 	}
   2850 }
   2851 
   2852 
   2853 void
   2854 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2855 {
   2856 	int r,c;
   2857 	struct vnode *vp;
   2858 	int acd;
   2859 
   2860 
   2861 	/* We take this opportunity to close the vnodes like we should.. */
   2862 
   2863 	for (c = 0; c < raidPtr->numCol; c++) {
   2864 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2865 		acd = raidPtr->Disks[c].auto_configured;
   2866 		rf_close_component(raidPtr, vp, acd);
   2867 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2868 		raidPtr->Disks[c].auto_configured = 0;
   2869 	}
   2870 
   2871 	for (r = 0; r < raidPtr->numSpare; r++) {
   2872 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2873 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2874 		rf_close_component(raidPtr, vp, acd);
   2875 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2876 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2877 	}
   2878 }
   2879 
   2880 
   2881 void
   2882 rf_ReconThread(struct rf_recon_req *req)
   2883 {
   2884 	int     s;
   2885 	RF_Raid_t *raidPtr;
   2886 
   2887 	s = splbio();
   2888 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2889 	raidPtr->recon_in_progress = 1;
   2890 
   2891 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2892 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2893 
   2894 	RF_Free(req, sizeof(*req));
   2895 
   2896 	raidPtr->recon_in_progress = 0;
   2897 	splx(s);
   2898 
   2899 	/* That's all... */
   2900 	kthread_exit(0);	/* does not return */
   2901 }
   2902 
   2903 void
   2904 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2905 {
   2906 	int retcode;
   2907 	int s;
   2908 
   2909 	raidPtr->parity_rewrite_stripes_done = 0;
   2910 	raidPtr->parity_rewrite_in_progress = 1;
   2911 	s = splbio();
   2912 	retcode = rf_RewriteParity(raidPtr);
   2913 	splx(s);
   2914 	if (retcode) {
   2915 		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
   2916 	} else {
   2917 		/* set the clean bit!  If we shutdown correctly,
   2918 		   the clean bit on each component label will get
   2919 		   set */
   2920 		raidPtr->parity_good = RF_RAID_CLEAN;
   2921 	}
   2922 	raidPtr->parity_rewrite_in_progress = 0;
   2923 
   2924 	/* Anyone waiting for us to stop?  If so, inform them... */
   2925 	if (raidPtr->waitShutdown) {
   2926 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2927 	}
   2928 
   2929 	/* That's all... */
   2930 	kthread_exit(0);	/* does not return */
   2931 }
   2932 
   2933 
   2934 void
   2935 rf_CopybackThread(RF_Raid_t *raidPtr)
   2936 {
   2937 	int s;
   2938 
   2939 	raidPtr->copyback_in_progress = 1;
   2940 	s = splbio();
   2941 	rf_CopybackReconstructedData(raidPtr);
   2942 	splx(s);
   2943 	raidPtr->copyback_in_progress = 0;
   2944 
   2945 	/* That's all... */
   2946 	kthread_exit(0);	/* does not return */
   2947 }
   2948 
   2949 
   2950 void
   2951 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2952 {
   2953 	int s;
   2954 	RF_Raid_t *raidPtr;
   2955 
   2956 	s = splbio();
   2957 	raidPtr = req->raidPtr;
   2958 	raidPtr->recon_in_progress = 1;
   2959 	rf_ReconstructInPlace(raidPtr, req->col);
   2960 	RF_Free(req, sizeof(*req));
   2961 	raidPtr->recon_in_progress = 0;
   2962 	splx(s);
   2963 
   2964 	/* That's all... */
   2965 	kthread_exit(0);	/* does not return */
   2966 }
   2967 
   2968 static RF_AutoConfig_t *
   2969 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2970     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2971     unsigned secsize)
   2972 {
   2973 	int good_one = 0;
   2974 	RF_ComponentLabel_t *clabel;
   2975 	RF_AutoConfig_t *ac;
   2976 
   2977 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2978 	if (clabel == NULL) {
   2979 oomem:
   2980 		    while(ac_list) {
   2981 			    ac = ac_list;
   2982 			    if (ac->clabel)
   2983 				    free(ac->clabel, M_RAIDFRAME);
   2984 			    ac_list = ac_list->next;
   2985 			    free(ac, M_RAIDFRAME);
   2986 		    }
   2987 		    printf("RAID auto config: out of memory!\n");
   2988 		    return NULL; /* XXX probably should panic? */
   2989 	}
   2990 
   2991 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2992 		/* Got the label.  Does it look reasonable? */
   2993 		if (rf_reasonable_label(clabel, numsecs) &&
   2994 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2995 #ifdef DEBUG
   2996 			printf("Component on: %s: %llu\n",
   2997 				cname, (unsigned long long)size);
   2998 			rf_print_component_label(clabel);
   2999 #endif
   3000 			/* if it's reasonable, add it, else ignore it. */
   3001 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3002 				M_NOWAIT);
   3003 			if (ac == NULL) {
   3004 				free(clabel, M_RAIDFRAME);
   3005 				goto oomem;
   3006 			}
   3007 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3008 			ac->dev = dev;
   3009 			ac->vp = vp;
   3010 			ac->clabel = clabel;
   3011 			ac->next = ac_list;
   3012 			ac_list = ac;
   3013 			good_one = 1;
   3014 		}
   3015 	}
   3016 	if (!good_one) {
   3017 		/* cleanup */
   3018 		free(clabel, M_RAIDFRAME);
   3019 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3020 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3021 		vput(vp);
   3022 	}
   3023 	return ac_list;
   3024 }
   3025 
   3026 RF_AutoConfig_t *
    3027 rf_find_raid_components(void)
   3028 {
   3029 	struct vnode *vp;
   3030 	struct disklabel label;
   3031 	struct device *dv;
   3032 	dev_t dev;
   3033 	int bmajor, bminor, wedge;
   3034 	int error;
   3035 	int i;
   3036 	RF_AutoConfig_t *ac_list;
   3037 	uint64_t numsecs;
   3038 	unsigned secsize;
   3039 
   3040 	RF_ASSERT(raidPtr->bytesPerSector < rf_component_info_offset());
   3041 
   3042 	/* initialize the AutoConfig list */
   3043 	ac_list = NULL;
   3044 
   3045 	/* we begin by trolling through *all* the devices on the system */
   3046 
   3047 	for (dv = alldevs.tqh_first; dv != NULL;
   3048 	     dv = dv->dv_list.tqe_next) {
   3049 
   3050 		/* we are only interested in disks... */
   3051 		if (device_class(dv) != DV_DISK)
   3052 			continue;
   3053 
   3054 		/* we don't care about floppies... */
   3055 		if (device_is_a(dv, "fd")) {
   3056 			continue;
   3057 		}
   3058 
   3059 		/* we don't care about CD's... */
   3060 		if (device_is_a(dv, "cd")) {
   3061 			continue;
   3062 		}
   3063 
   3064 		/* we don't care about md's... */
   3065 		if (device_is_a(dv, "md")) {
   3066 			continue;
   3067 		}
   3068 
   3069 		/* hdfd is the Atari/Hades floppy driver */
   3070 		if (device_is_a(dv, "hdfd")) {
   3071 			continue;
   3072 		}
   3073 
   3074 		/* fdisa is the Atari/Milan floppy driver */
   3075 		if (device_is_a(dv, "fdisa")) {
   3076 			continue;
   3077 		}
   3078 
   3079 		/* need to find the device_name_to_block_device_major stuff */
   3080 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3081 
   3082 		/* get a vnode for the raw partition of this disk */
   3083 
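         		/* dk(4) wedges have no partition letters, so use the
         		 * unit's minor directly; other disks get their raw
         		 * partition. */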
   3084 		wedge = device_is_a(dv, "dk");
   3085 		bminor = minor(device_unit(dv));
   3086 		dev = wedge ? makedev(bmajor, bminor) :
   3087 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3088 		if (bdevvp(dev, &vp))
   3089 			panic("RAID can't alloc vnode");
   3090 
   3091 		error = VOP_OPEN(vp, FREAD, NOCRED);
   3092 
   3093 		if (error) {
   3094 			/* "Who cares."  Continue looking
    3095 			   for something that exists */
   3096 			vput(vp);
   3097 			continue;
   3098 		}
   3099 
   3100 		error = getdisksize(vp, &numsecs, &secsize);
   3101 		if (error) {
   3102 			vput(vp);
   3103 			continue;
   3104 		}
   3105 		if (wedge) {
   3106 			struct dkwedge_info dkw;
   3107 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3108 			    NOCRED);
   3109 			if (error) {
   3110 				printf("RAIDframe: can't get wedge info for "
   3111 				    "dev %s (%d)\n", device_xname(dv), error);
   3112 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3113 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3114 				vput(vp);
   3115 				continue;
   3116 			}
   3117 
   3118 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3119 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3120 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3121 				vput(vp);
   3122 				continue;
   3123 			}
   3124 
   3125 			ac_list = rf_get_component(ac_list, dev, vp,
   3126 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3127 			continue;
   3128 		}
   3129 
   3130 		/* Ok, the disk exists.  Go get the disklabel. */
   3131 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3132 		if (error) {
   3133 			/*
   3134 			 * XXX can't happen - open() would
   3135 			 * have errored out (or faked up one)
   3136 			 */
   3137 			if (error != ENOTTY)
   3138 				printf("RAIDframe: can't get label for dev "
   3139 				    "%s (%d)\n", device_xname(dv), error);
   3140 		}
   3141 
   3142 		/* don't need this any more.  We'll allocate it again
   3143 		   a little later if we really do... */
   3144 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3145 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3146 		vput(vp);
   3147 
   3148 		if (error)
   3149 			continue;
   3150 
   3151 		for (i = 0; i < label.d_npartitions; i++) {
   3152 			char cname[sizeof(ac_list->devname)];
   3153 
   3154 			/* We only support partitions marked as RAID */
   3155 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3156 				continue;
   3157 
   3158 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3159 			if (bdevvp(dev, &vp))
   3160 				panic("RAID can't alloc vnode");
   3161 
   3162 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3163 			if (error) {
   3164 				/* Whatever... */
   3165 				vput(vp);
   3166 				continue;
   3167 			}
   3168 			snprintf(cname, sizeof(cname), "%s%c",
   3169 			    device_xname(dv), 'a' + i);
   3170 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3171 				label.d_partitions[i].p_size, numsecs, secsize);
   3172 		}
   3173 	}
   3174 	return ac_list;
   3175 }
   3176 
   3177 
   3178 static int
   3179 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3180 {
   3181 
   3182 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3183 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3184 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3185 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3186 	    clabel->row >=0 &&
   3187 	    clabel->column >= 0 &&
   3188 	    clabel->num_rows > 0 &&
   3189 	    clabel->num_columns > 0 &&
   3190 	    clabel->row < clabel->num_rows &&
   3191 	    clabel->column < clabel->num_columns &&
   3192 	    clabel->blockSize > 0 &&
   3193 	    /*
   3194 	     * numBlocksHi may contain garbage, but it is ok since
   3195 	     * the type is unsigned.  If it is really garbage,
   3196 	     * rf_fix_old_label_size() will fix it.
   3197 	     */
   3198 	    rf_component_label_numblocks(clabel) > 0) {
   3199 		/*
   3200 		 * label looks reasonable enough...
   3201 		 * let's make sure it has no old garbage.
   3202 		 */
   3203 		rf_fix_old_label_size(clabel, numsecs);
   3204 		return(1);
   3205 	}
   3206 	return(0);
   3207 }
   3208 
   3209 
   3210 /*
   3211  * For reasons yet unknown, some old component labels have garbage in
   3212  * the newer numBlocksHi region, and this causes lossage.  Since those
   3213  * disks will also have numsecs set to less than 32 bits of sectors,
    3214  * we can determine when this corruption has occurred, and fix it.
   3215  *
   3216  * The exact same problem, with the same unknown reason, happens to
   3217  * the partitionSizeHi member as well.
   3218  */
   3219 static void
   3220 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3221 {
   3222 
   3223 	if (numsecs < ((uint64_t)1 << 32)) {
   3224 		if (clabel->numBlocksHi) {
   3225 			printf("WARNING: total sectors < 32 bits, yet "
   3226 			       "numBlocksHi set\n"
   3227 			       "WARNING: resetting numBlocksHi to zero.\n");
   3228 			clabel->numBlocksHi = 0;
   3229 		}
   3230 
   3231 		if (clabel->partitionSizeHi) {
   3232 			printf("WARNING: total sectors < 32 bits, yet "
   3233 			       "partitionSizeHi set\n"
   3234 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3235 			clabel->partitionSizeHi = 0;
   3236 		}
   3237 	}
   3238 }
   3239 
   3240 
   3241 #ifdef DEBUG
   3242 void
   3243 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3244 {
   3245 	uint64_t numBlocks;
   3246 
   3247 	numBlocks = rf_component_label_numblocks(clabel);
   3248 
   3249 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3250 	       clabel->row, clabel->column,
   3251 	       clabel->num_rows, clabel->num_columns);
   3252 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3253 	       clabel->version, clabel->serial_number,
   3254 	       clabel->mod_counter);
   3255 	printf("   Clean: %s Status: %d\n",
   3256 	       clabel->clean ? "Yes" : "No", clabel->status );
   3257 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3258 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3259 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3260 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3261 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
   3262 	printf("   Contains root partition: %s\n",
   3263 	       clabel->root_partition ? "Yes" : "No" );
   3264 	printf("   Last configured as: raid%d\n", clabel->last_unit );
   3265 #if 0
   3266 	   printf("   Config order: %d\n", clabel->config_order);
   3267 #endif
   3268 
   3269 }
   3270 #endif
   3271 
   3272 RF_ConfigSet_t *
   3273 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3274 {
   3275 	RF_AutoConfig_t *ac;
   3276 	RF_ConfigSet_t *config_sets;
   3277 	RF_ConfigSet_t *cset;
   3278 	RF_AutoConfig_t *ac_next;
   3279 
   3280 
   3281 	config_sets = NULL;
   3282 
   3283 	/* Go through the AutoConfig list, and figure out which components
   3284 	   belong to what sets.  */
   3285 	ac = ac_list;
   3286 	while(ac!=NULL) {
   3287 		/* we're going to putz with ac->next, so save it here
   3288 		   for use at the end of the loop */
   3289 		ac_next = ac->next;
   3290 
   3291 		if (config_sets == NULL) {
   3292 			/* will need at least this one... */
   3293 			config_sets = (RF_ConfigSet_t *)
   3294 				malloc(sizeof(RF_ConfigSet_t),
   3295 				       M_RAIDFRAME, M_NOWAIT);
   3296 			if (config_sets == NULL) {
   3297 				panic("rf_create_auto_sets: No memory!");
   3298 			}
   3299 			/* this one is easy :) */
   3300 			config_sets->ac = ac;
   3301 			config_sets->next = NULL;
   3302 			config_sets->rootable = 0;
   3303 			ac->next = NULL;
   3304 		} else {
   3305 			/* which set does this component fit into? */
   3306 			cset = config_sets;
   3307 			while(cset!=NULL) {
   3308 				if (rf_does_it_fit(cset, ac)) {
   3309 					/* looks like it matches... */
   3310 					ac->next = cset->ac;
   3311 					cset->ac = ac;
   3312 					break;
   3313 				}
   3314 				cset = cset->next;
   3315 			}
   3316 			if (cset==NULL) {
   3317 				/* didn't find a match above... new set..*/
   3318 				cset = (RF_ConfigSet_t *)
   3319 					malloc(sizeof(RF_ConfigSet_t),
   3320 					       M_RAIDFRAME, M_NOWAIT);
   3321 				if (cset == NULL) {
   3322 					panic("rf_create_auto_sets: No memory!");
   3323 				}
   3324 				cset->ac = ac;
   3325 				ac->next = NULL;
   3326 				cset->next = config_sets;
   3327 				cset->rootable = 0;
   3328 				config_sets = cset;
   3329 			}
   3330 		}
   3331 		ac = ac_next;
   3332 	}
   3333 
   3334 
   3335 	return(config_sets);
   3336 }
   3337 
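         /*
          * rf_does_it_fit -- decide whether the component described by ac
          * belongs to the configuration set cset, by comparing its component
          * label against that of the set's first member.  Returns 1 on a
          * match, 0 otherwise.  partitionSize and the mod_counter are
          * deliberately not compared; see the comment below.
          */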
   3338 static int
   3339 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3340 {
   3341 	RF_ComponentLabel_t *clabel1, *clabel2;
   3342 
   3343 	/* If this one matches the *first* one in the set, that's good
   3344 	   enough, since the other members of the set would have been
   3345 	   through here too... */
   3346 	/* note that we are not checking partitionSize here..
   3347 
   3348 	   Note that we are also not checking the mod_counters here.
    3349 	   If everything else matches except the mod_counter, that's
   3350 	   good enough for this test.  We will deal with the mod_counters
   3351 	   a little later in the autoconfiguration process.
   3352 
   3353 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3354 
   3355 	   The reason we don't check for this is that failed disks
   3356 	   will have lower modification counts.  If those disks are
   3357 	   not added to the set they used to belong to, then they will
   3358 	   form their own set, which may result in 2 different sets,
   3359 	   for example, competing to be configured at raid0, and
   3360 	   perhaps competing to be the root filesystem set.  If the
   3361 	   wrong ones get configured, or both attempt to become /,
    3362 	   weird behaviour and/or serious lossage will occur.  Thus we
   3363 	   need to bring them into the fold here, and kick them out at
   3364 	   a later point.
   3365 
   3366 	*/
   3367 
   3368 	clabel1 = cset->ac->clabel;
   3369 	clabel2 = ac->clabel;
   3370 	if ((clabel1->version == clabel2->version) &&
   3371 	    (clabel1->serial_number == clabel2->serial_number) &&
   3372 	    (clabel1->num_rows == clabel2->num_rows) &&
   3373 	    (clabel1->num_columns == clabel2->num_columns) &&
   3374 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3375 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3376 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3377 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3378 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3379 	    (clabel1->blockSize == clabel2->blockSize) &&
   3380 	    rf_component_label_numblocks(clabel1) ==
   3381 	    rf_component_label_numblocks(clabel2) &&
   3382 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3383 	    (clabel1->root_partition == clabel2->root_partition) &&
   3384 	    (clabel1->last_unit == clabel2->last_unit) &&
   3385 	    (clabel1->config_order == clabel2->config_order)) {
    3386 		/* if it gets here, it almost *has* to be a match */
   3387 	} else {
   3388 		/* it's not consistent with somebody in the set..
   3389 		   punt */
   3390 		return(0);
   3391 	}
   3392 	/* all was fine.. it must fit... */
   3393 	return(1);
   3394 }
   3395 
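         /*
          * rf_have_enough_components -- decide whether a configuration set
          * still has enough live components to be configured.  The largest
          * mod_counter in the set is taken as current, and components with
          * stale counters are treated as missing.  RAID 1 is special-cased:
          * the set survives unless both members of an even/odd column pair
          * are missing.  Otherwise RAID 0 tolerates no missing components,
          * and RAID 4/5 tolerate at most one.  Returns 1 if the set can be
          * configured, 0 if not.
          */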
   3396 int
   3397 rf_have_enough_components(RF_ConfigSet_t *cset)
   3398 {
   3399 	RF_AutoConfig_t *ac;
   3400 	RF_AutoConfig_t *auto_config;
   3401 	RF_ComponentLabel_t *clabel;
   3402 	int c;
   3403 	int num_cols;
   3404 	int num_missing;
   3405 	int mod_counter;
   3406 	int mod_counter_found;
   3407 	int even_pair_failed;
   3408 	char parity_type;
   3409 
   3410 
   3411 	/* check to see that we have enough 'live' components
   3412 	   of this set.  If so, we can configure it if necessary */
   3413 
   3414 	num_cols = cset->ac->clabel->num_columns;
   3415 	parity_type = cset->ac->clabel->parityConfig;
   3416 
   3417 	/* XXX Check for duplicate components!?!?!? */
   3418 
   3419 	/* Determine what the mod_counter is supposed to be for this set. */
   3420 
   3421 	mod_counter_found = 0;
   3422 	mod_counter = 0;
   3423 	ac = cset->ac;
   3424 	while(ac!=NULL) {
   3425 		if (mod_counter_found==0) {
   3426 			mod_counter = ac->clabel->mod_counter;
   3427 			mod_counter_found = 1;
   3428 		} else {
   3429 			if (ac->clabel->mod_counter > mod_counter) {
   3430 				mod_counter = ac->clabel->mod_counter;
   3431 			}
   3432 		}
   3433 		ac = ac->next;
   3434 	}
   3435 
   3436 	num_missing = 0;
   3437 	auto_config = cset->ac;
   3438 
   3439 	even_pair_failed = 0;
   3440 	for(c=0; c<num_cols; c++) {
   3441 		ac = auto_config;
   3442 		while(ac!=NULL) {
   3443 			if ((ac->clabel->column == c) &&
   3444 			    (ac->clabel->mod_counter == mod_counter)) {
   3445 				/* it's this one... */
   3446 #ifdef DEBUG
   3447 				printf("Found: %s at %d\n",
   3448 				       ac->devname,c);
   3449 #endif
   3450 				break;
   3451 			}
   3452 			ac=ac->next;
   3453 		}
   3454 		if (ac==NULL) {
   3455 				/* Didn't find one here! */
   3456 				/* special case for RAID 1, especially
   3457 				   where there are more than 2
   3458 				   components (where RAIDframe treats
   3459 				   things a little differently :( ) */
   3460 			if (parity_type == '1') {
   3461 				if (c%2 == 0) { /* even component */
   3462 					even_pair_failed = 1;
   3463 				} else { /* odd component.  If
   3464 					    we're failed, and
   3465 					    so is the even
   3466 					    component, it's
   3467 					    "Good Night, Charlie" */
   3468 					if (even_pair_failed == 1) {
   3469 						return(0);
   3470 					}
   3471 				}
   3472 			} else {
   3473 				/* normal accounting */
   3474 				num_missing++;
   3475 			}
   3476 		}
   3477 		if ((parity_type == '1') && (c%2 == 1)) {
    3478 				/* Just did the odd component of a pair, and
    3479 				   we didn't bail.. reset the even_pair_failed
    3480 				   flag, and go on to the next pair.... */
   3481 			even_pair_failed = 0;
   3482 		}
   3483 	}
   3484 
   3485 	clabel = cset->ac->clabel;
   3486 
   3487 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3488 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3489 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3490 		/* XXX this needs to be made *much* more general */
   3491 		/* Too many failures */
   3492 		return(0);
   3493 	}
   3494 	/* otherwise, all is well, and we've got enough to take a kick
   3495 	   at autoconfiguring this set */
   3496 	return(1);
   3497 }
   3498 
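         /*
          * rf_create_configuration -- build an RF_Config_t for rf_Configure()
          * from an auto-detected set: the layout parameters are copied from
          * the first component label, and the device name of each component
          * is filled in at its column.  The disk queue type is currently
          * hardwired to "fifo".
          */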
   3499 void
   3500 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3501 			RF_Raid_t *raidPtr)
   3502 {
   3503 	RF_ComponentLabel_t *clabel;
   3504 	int i;
   3505 
   3506 	clabel = ac->clabel;
   3507 
   3508 	/* 1. Fill in the common stuff */
   3509 	config->numRow = clabel->num_rows = 1;
   3510 	config->numCol = clabel->num_columns;
   3511 	config->numSpare = 0; /* XXX should this be set here? */
   3512 	config->sectPerSU = clabel->sectPerSU;
   3513 	config->SUsPerPU = clabel->SUsPerPU;
   3514 	config->SUsPerRU = clabel->SUsPerRU;
   3515 	config->parityConfig = clabel->parityConfig;
   3516 	/* XXX... */
   3517 	strcpy(config->diskQueueType,"fifo");
   3518 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3519 	config->layoutSpecificSize = 0; /* XXX ?? */
   3520 
   3521 	while(ac!=NULL) {
   3522 		/* row/col values will be in range due to the checks
   3523 		   in reasonable_label() */
   3524 		strcpy(config->devnames[0][ac->clabel->column],
   3525 		       ac->devname);
   3526 		ac = ac->next;
   3527 	}
   3528 
   3529 	for(i=0;i<RF_MAXDBGV;i++) {
   3530 		config->debugVars[i][0] = 0;
   3531 	}
   3532 }
   3533 
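         /*
          * rf_set_autoconfig -- set the autoconfigure flag for the RAID set
          * and push the new value into the component labels of all optimal
          * components and all in-use spares, flushing each label to disk.
          * Returns the new value.
          */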
   3534 int
   3535 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3536 {
   3537 	RF_ComponentLabel_t *clabel;
   3538 	int column;
   3539 	int sparecol;
   3540 
   3541 	raidPtr->autoconfigure = new_value;
   3542 
   3543 	for(column=0; column<raidPtr->numCol; column++) {
   3544 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3545 			clabel = raidget_component_label(raidPtr, column);
   3546 			clabel->autoconfigure = new_value;
   3547 			raidflush_component_label(raidPtr, column);
   3548 		}
   3549 	}
   3550 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3551 		sparecol = raidPtr->numCol + column;
   3552 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3553 			clabel = raidget_component_label(raidPtr, sparecol);
   3554 			clabel->autoconfigure = new_value;
   3555 			raidflush_component_label(raidPtr, sparecol);
   3556 		}
   3557 	}
   3558 	return(new_value);
   3559 }
   3560 
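         /*
          * rf_set_rootpartition -- as rf_set_autoconfig(), but for the
          * root_partition flag.
          */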
   3561 int
   3562 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3563 {
   3564 	RF_ComponentLabel_t *clabel;
   3565 	int column;
   3566 	int sparecol;
   3567 
   3568 	raidPtr->root_partition = new_value;
   3569 	for(column=0; column<raidPtr->numCol; column++) {
   3570 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3571 			clabel = raidget_component_label(raidPtr, column);
   3572 			clabel->root_partition = new_value;
   3573 			raidflush_component_label(raidPtr, column);
   3574 		}
   3575 	}
   3576 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3577 		sparecol = raidPtr->numCol + column;
   3578 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3579 			clabel = raidget_component_label(raidPtr, sparecol);
   3580 			clabel->root_partition = new_value;
   3581 			raidflush_component_label(raidPtr, sparecol);
   3582 		}
   3583 	}
   3584 	return(new_value);
   3585 }
   3586 
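         /*
          * rf_release_all_vps -- close and release the vnodes held by the
          * autoconfig entries of a configuration set.
          */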
   3587 void
   3588 rf_release_all_vps(RF_ConfigSet_t *cset)
   3589 {
   3590 	RF_AutoConfig_t *ac;
   3591 
   3592 	ac = cset->ac;
   3593 	while(ac!=NULL) {
   3594 		/* Close the vp, and give it back */
   3595 		if (ac->vp) {
   3596 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3597 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3598 			vput(ac->vp);
   3599 			ac->vp = NULL;
   3600 		}
   3601 		ac = ac->next;
   3602 	}
   3603 }
   3604 
   3605 
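         /*
          * rf_cleanup_config_set -- free the component labels and autoconfig
          * entries of a configuration set, and then the set itself.
          */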
   3606 void
   3607 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3608 {
   3609 	RF_AutoConfig_t *ac;
   3610 	RF_AutoConfig_t *next_ac;
   3611 
   3612 	ac = cset->ac;
   3613 	while(ac!=NULL) {
   3614 		next_ac = ac->next;
   3615 		/* nuke the label */
   3616 		free(ac->clabel, M_RAIDFRAME);
   3617 		/* cleanup the config structure */
   3618 		free(ac, M_RAIDFRAME);
   3619 		/* "next.." */
   3620 		ac = next_ac;
   3621 	}
   3622 	/* and, finally, nuke the config set */
   3623 	free(cset, M_RAIDFRAME);
   3624 }
   3625 
   3626 
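         /*
          * raid_init_component_label -- initialize an in-core component label
          * from the current state of raidPtr: serial number, mod counter,
          * geometry, layout parameters and the autoconfigure/root flags.
          * The clean flag starts out as RF_RAID_DIRTY.
          */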
   3627 void
   3628 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3629 {
   3630 	/* current version number */
   3631 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3632 	clabel->serial_number = raidPtr->serial_number;
   3633 	clabel->mod_counter = raidPtr->mod_counter;
   3634 
   3635 	clabel->num_rows = 1;
   3636 	clabel->num_columns = raidPtr->numCol;
   3637 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3638 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3639 
   3640 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3641 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3642 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3643 
   3644 	clabel->blockSize = raidPtr->bytesPerSector;
   3645 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3646 
   3647 	/* XXX not portable */
   3648 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3649 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3650 	clabel->autoconfigure = raidPtr->autoconfigure;
   3651 	clabel->root_partition = raidPtr->root_partition;
   3652 	clabel->last_unit = raidPtr->raidid;
   3653 	clabel->config_order = raidPtr->config_order;
   3654 
   3655 #ifndef RF_NO_PARITY_MAP
   3656 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3657 #endif
   3658 }
   3659 
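         /*
          * rf_auto_config_set -- configure the RAID set described by cset.
          * A unit number is chosen (the one recorded in last_unit if it is
          * free, otherwise the highest free unit), an RF_Config_t is built
          * from the component labels, and the set is handed to rf_Configure().
          * On success the unit is initialized, its components are marked
          * dirty, and the set is flagged rootable if its labels request a
          * root partition.  Returns 0 on success, with the unit number
          * stored in *unit.
          */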
   3660 int
   3661 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3662 {
   3663 	RF_Raid_t *raidPtr;
   3664 	RF_Config_t *config;
   3665 	int raidID;
   3666 	int retcode;
   3667 
   3668 #ifdef DEBUG
   3669 	printf("RAID autoconfigure\n");
   3670 #endif
   3671 
   3672 	retcode = 0;
   3673 	*unit = -1;
   3674 
   3675 	/* 1. Create a config structure */
   3676 
   3677 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3678 				       M_RAIDFRAME,
   3679 				       M_NOWAIT);
   3680 	if (config==NULL) {
   3681 		printf("Out of mem!?!?\n");
   3682 				/* XXX do something more intelligent here. */
   3683 		return(1);
   3684 	}
   3685 
   3686 	memset(config, 0, sizeof(RF_Config_t));
   3687 
   3688 	/*
   3689 	   2. Figure out what RAID ID this one is supposed to live at
   3690 	   See if we can get the same RAID dev that it was configured
   3691 	   on last time..
   3692 	*/
   3693 
   3694 	raidID = cset->ac->clabel->last_unit;
   3695 	if ((raidID < 0) || (raidID >= numraid)) {
   3696 		/* let's not wander off into lala land. */
   3697 		raidID = numraid - 1;
   3698 	}
   3699 	if (raidPtrs[raidID]->valid != 0) {
   3700 
   3701 		/*
   3702 		   Nope... Go looking for an alternative...
    3703 		   Start the search high so we don't immediately grab
    3704 		   raid0 just because it happens to be free.
   3705 		*/
   3706 
   3707 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3708 			if (raidPtrs[raidID]->valid == 0) {
   3709 				/* can use this one! */
   3710 				break;
   3711 			}
   3712 		}
   3713 	}
   3714 
   3715 	if (raidID < 0) {
   3716 		/* punt... */
   3717 		printf("Unable to auto configure this set!\n");
   3718 		printf("(Out of RAID devs!)\n");
   3719 		free(config, M_RAIDFRAME);
   3720 		return(1);
   3721 	}
   3722 
   3723 #ifdef DEBUG
   3724 	printf("Configuring raid%d:\n",raidID);
   3725 #endif
   3726 
   3727 	raidPtr = raidPtrs[raidID];
   3728 
   3729 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3730 	raidPtr->raidid = raidID;
   3731 	raidPtr->openings = RAIDOUTSTANDING;
   3732 
   3733 	/* 3. Build the configuration structure */
   3734 	rf_create_configuration(cset->ac, config, raidPtr);
   3735 
   3736 	/* 4. Do the configuration */
   3737 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3738 
   3739 	if (retcode == 0) {
   3740 
   3741 		raidinit(raidPtrs[raidID]);
   3742 
   3743 		rf_markalldirty(raidPtrs[raidID]);
   3744 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3745 		if (cset->ac->clabel->root_partition==1) {
   3746 			/* everything configured just fine.  Make a note
   3747 			   that this set is eligible to be root. */
   3748 			cset->rootable = 1;
   3749 			/* XXX do this here? */
   3750 			raidPtrs[raidID]->root_partition = 1;
   3751 		}
   3752 	}
   3753 
   3754 	/* 5. Cleanup */
   3755 	free(config, M_RAIDFRAME);
   3756 
   3757 	*unit = raidID;
   3758 	return(retcode);
   3759 }
   3760 
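         /*
          * rf_disk_unbusy -- record completion of the I/O described by desc
          * with the disk(9) framework, charging the number of bytes actually
          * transferred against the RAID unit's dkdev.
          */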
   3761 void
   3762 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3763 {
   3764 	struct buf *bp;
   3765 
   3766 	bp = (struct buf *)desc->bp;
   3767 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3768 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3769 }
   3770 
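         /*
          * rf_pool_init -- convenience wrapper around pool_init(9): create
          * the pool at IPL_BIO, prime it with xmin items, and set the low
          * and high watermarks to xmin and xmax respectively.
          */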
   3771 void
   3772 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3773 	     size_t xmin, size_t xmax)
   3774 {
   3775 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3776 	pool_sethiwat(p, xmax);
   3777 	pool_prime(p, xmin);
   3778 	pool_setlowat(p, xmin);
   3779 }
   3780 
   3781 /*
   3782  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3783  * if there is IO pending and if that IO could possibly be done for a
   3784  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3785  * otherwise.
   3786  *
   3787  */
   3788 
   3789 int
   3790 rf_buf_queue_check(int raidid)
   3791 {
   3792 	if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
   3793 	    raidPtrs[raidid]->openings > 0) {
   3794 		/* there is work to do */
   3795 		return 0;
   3796 	}
   3797 	/* default is nothing to do */
   3798 	return 1;
   3799 }
   3800 
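         /*
          * rf_getdisksize -- determine the sector size and usable size of a
          * component.  DIOCGPART (disklabel partitions) is tried first, with
          * DIOCGWEDGEINFO as the fallback for wedges; in both cases the
          * sectors RAIDframe reserves on each component (rf_protectedSectors)
          * are subtracted from the usable block count.  Returns 0 on success,
          * otherwise the error from the last ioctl attempted.
          */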
   3801 int
   3802 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
   3803 {
   3804 	struct partinfo dpart;
   3805 	struct dkwedge_info dkw;
   3806 	int error;
   3807 
   3808 	error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
   3809 	if (error == 0) {
   3810 		diskPtr->blockSize = dpart.disklab->d_secsize;
   3811 		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
   3812 		diskPtr->partitionSize = dpart.part->p_size;
   3813 		return 0;
   3814 	}
   3815 
   3816 	error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
   3817 	if (error == 0) {
   3818 		struct disk *pdk;
   3819 
   3820 		if ((pdk = disk_find(dkw.dkw_parent)) != NULL)
   3821 			diskPtr->blockSize = DEV_BSIZE << pdk->dk_blkshift;
   3822 		else
   3823 			diskPtr->blockSize = 512;	/* XXX */
   3824 		diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
   3825 		diskPtr->partitionSize = dkw.dkw_size;
   3826 		return 0;
   3827 	}
   3828 	return error;
   3829 }
   3830 
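         /*
          * Autoconfiguration glue for the raid pseudo-device: raid_match
          * always matches, raid_attach does nothing, and raid_detach refuses
          * to detach a unit that is still initialized.
          */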
   3831 static int
   3832 raid_match(struct device *self, struct cfdata *cfdata,
   3833     void *aux)
   3834 {
   3835 	return 1;
   3836 }
   3837 
   3838 static void
   3839 raid_attach(struct device *parent, struct device *self,
   3840     void *aux)
   3841 {
   3842 
   3843 }
   3844 
   3845 
   3846 static int
   3847 raid_detach(struct device *self, int flags)
   3848 {
   3849 	struct raid_softc *rs = (struct raid_softc *)self;
   3850 
   3851 	if (rs->sc_flags & RAIDF_INITED)
   3852 		return EBUSY;
   3853 
   3854 	return 0;
   3855 }
   3856 
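         /*
          * rf_set_properties -- publish a synthetic disk geometry for the
          * RAID set via the proplib "disk-info"/"geometry" device properties.
          * The sector count and sector size are real; the track and cylinder
          * figures are made up from the stripe layout.  Any previously
          * attached dictionary is released.
          */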
   3857 static void
   3858 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3859 {
   3860 	prop_dictionary_t disk_info, odisk_info, geom;
   3861 	disk_info = prop_dictionary_create();
   3862 	geom = prop_dictionary_create();
   3863 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3864 				   raidPtr->totalSectors);
   3865 	prop_dictionary_set_uint32(geom, "sector-size",
   3866 				   raidPtr->bytesPerSector);
   3867 
   3868 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3869 				   raidPtr->Layout.dataSectorsPerStripe);
   3870 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3871 				   4 * raidPtr->numCol);
   3872 
   3873 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3874 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3875 	   (4 * raidPtr->numCol)));
   3876 
   3877 	prop_dictionary_set(disk_info, "geometry", geom);
   3878 	prop_object_release(geom);
   3879 	prop_dictionary_set(device_properties(rs->sc_dev),
   3880 			    "disk-info", disk_info);
   3881 	odisk_info = rs->sc_dkdev.dk_info;
   3882 	rs->sc_dkdev.dk_info = disk_info;
   3883 	if (odisk_info)
   3884 		prop_object_release(odisk_info);
   3885 }
   3886 
   3887 /*
   3888  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3889  * We end up returning whatever error was returned by the first cache flush
   3890  * that fails.
   3891  */
   3892 
   3893 int
   3894 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3895 {
   3896 	int c, sparecol;
   3897 	int e,error;
   3898 	int force = 1;
   3899 
   3900 	error = 0;
   3901 	for (c = 0; c < raidPtr->numCol; c++) {
   3902 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3903 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3904 					  &force, FWRITE, NOCRED);
   3905 			if (e) {
   3906 				if (e != ENODEV)
   3907 					printf("raid%d: cache flush to component %s failed.\n",
   3908 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3909 				if (error == 0) {
   3910 					error = e;
   3911 				}
   3912 			}
   3913 		}
   3914 	}
   3915 
   3916 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3917 		sparecol = raidPtr->numCol + c;
   3918 		/* Need to ensure that the reconstruct actually completed! */
   3919 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3920 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3921 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3922 			if (e) {
   3923 				if (e != ENODEV)
   3924 					printf("raid%d: cache flush to component %s failed.\n",
   3925 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3926 				if (error == 0) {
   3927 					error = e;
   3928 				}
   3929 			}
   3930 		}
   3931 	}
   3932 	return error;
   3933 }
   3934