      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.295.6.2 2012/08/13 19:41:29 riz Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.295.6.2 2012/08/13 19:41:29 riz Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #ifdef DEBUG
    156 int     rf_kdebug_level = 0;
    157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    158 #else				/* DEBUG */
    159 #define db1_printf(a) { }
    160 #endif				/* DEBUG */
    161 
    162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 static void raidinit(RF_Raid_t *);
    183 
    184 void raidattach(int);
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 dev_type_open(raidopen);
    201 dev_type_close(raidclose);
    202 dev_type_read(raidread);
    203 dev_type_write(raidwrite);
    204 dev_type_ioctl(raidioctl);
    205 dev_type_strategy(raidstrategy);
    206 dev_type_dump(raiddump);
    207 dev_type_size(raidsize);
    208 
    209 const struct bdevsw raid_bdevsw = {
    210 	raidopen, raidclose, raidstrategy, raidioctl,
    211 	raiddump, raidsize, D_DISK
    212 };
    213 
    214 const struct cdevsw raid_cdevsw = {
    215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    217 };
    218 
    219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    220 
    221 /* XXX Not sure if the following should be replacing the raidPtrs above,
    222    or if it should be used in conjunction with that...
    223 */
    224 
    225 struct raid_softc {
    226 	device_t sc_dev;
    227 	int     sc_flags;	/* flags */
    228 	int     sc_cflags;	/* configuration flags */
    229 	uint64_t sc_size;	/* size of the raid device */
    230 	char    sc_xname[20];	/* XXX external name */
    231 	struct disk sc_dkdev;	/* generic disk device info */
    232 	struct bufq_state *buf_queue;	/* used for the device queue */
    233 };
    234 /* sc_flags */
    235 #define RAIDF_INITED	0x01	/* unit has been initialized */
    236 #define RAIDF_WLABEL	0x02	/* label area is writable */
    237 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    238 #define RAIDF_SHUTDOWN	0x08	/* unit is being shut down */
    239 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    240 #define RAIDF_LOCKED	0x80	/* unit is locked */
    241 
    242 #define	raidunit(x)	DISKUNIT(x)
    243 int numraid = 0;
    244 
    245 extern struct cfdriver raid_cd;
    246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    247     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    248     DVF_DETACH_SHUTDOWN);
    249 
    250 /*
    251  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    252  * Be aware that large numbers can allow the driver to consume a lot of
    253  * kernel memory, especially on writes, and in degraded mode reads.
    254  *
    255  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    256  * a single 64K write will typically require 64K for the old data,
    257  * 64K for the old parity, and 64K for the new parity, for a total
    258  * of 192K (if the parity buffer is not re-used immediately).
    259  * Even if it is used immediately, that's still 128K, which when multiplied
    260  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    261  *
    262  * Now in degraded mode, for example, a 64K read on the above setup may
    263  * require data reconstruction, which will require *all* of the 4 remaining
    264  * disks to participate -- 4 * 32K/disk == 128K again.
    265  */
    266 
    267 #ifndef RAIDOUTSTANDING
    268 #define RAIDOUTSTANDING   6
    269 #endif
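/*
 * Illustrative estimate (not from the original sources): with the 5-disk,
 * 32k-stripe example above, each outstanding 64K write can pin roughly
 * 64K of incoming data plus 64K old data, 64K old parity and 64K new
 * parity, so a rough upper bound on pinned kernel memory is
 *
 *	peak_bytes ~= RAIDOUTSTANDING * 4 * io_size
 *
 * which for io_size == 64K and the default of 6 is about 1.5MB.  That is
 * why the default is kept small.
 */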
    270 
    271 #define RAIDLABELDEV(dev)	\
    272 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    273 
    274 /* declared here, and made public, for the benefit of KVM stuff.. */
    275 struct raid_softc *raid_softc;
    276 
    277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    278 				     struct disklabel *);
    279 static void raidgetdisklabel(dev_t);
    280 static void raidmakedisklabel(struct raid_softc *);
    281 
    282 static int raidlock(struct raid_softc *);
    283 static void raidunlock(struct raid_softc *);
    284 
    285 static int raid_detach_unlocked(struct raid_softc *);
    286 
    287 static void rf_markalldirty(RF_Raid_t *);
    288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    289 
    290 void rf_ReconThread(struct rf_recon_req *);
    291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    292 void rf_CopybackThread(RF_Raid_t *raidPtr);
    293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    294 int rf_autoconfig(device_t);
    295 void rf_buildroothack(RF_ConfigSet_t *);
    296 
    297 RF_AutoConfig_t *rf_find_raid_components(void);
    298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    302 int rf_set_autoconfig(RF_Raid_t *, int);
    303 int rf_set_rootpartition(RF_Raid_t *, int);
    304 void rf_release_all_vps(RF_ConfigSet_t *);
    305 void rf_cleanup_config_set(RF_ConfigSet_t *);
    306 int rf_have_enough_components(RF_ConfigSet_t *);
    307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    309 
    310 /*
    311  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    312  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    313  * in the kernel config file.
    314  */
    315 #ifdef RAID_AUTOCONFIG
    316 int raidautoconfig = 1;
    317 #else
    318 int raidautoconfig = 0;
    319 #endif
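/*
 * Example kernel configuration fragment (illustrative) that forces
 * raidautoconfig on at build time, as described in the comment above:
 *
 *	options 	RAID_AUTOCONFIG
 */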
    320 static bool raidautoconfigdone = false;
    321 
    322 struct RF_Pools_s rf_pools;
    323 
    324 void
    325 raidattach(int num)
    326 {
    327 	int raidID;
    328 	int i, rc;
    329 
    330 	aprint_debug("raidattach: Asked for %d units\n", num);
    331 
    332 	if (num <= 0) {
    333 #ifdef DIAGNOSTIC
    334 		panic("raidattach: count <= 0");
    335 #endif
    336 		return;
    337 	}
    338 	/* This is where all the initialization stuff gets done. */
    339 
    340 	numraid = num;
    341 
    342 	/* Make some space for requested number of units... */
    343 
    344 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    345 	if (raidPtrs == NULL) {
    346 		panic("raidPtrs is NULL!!");
    347 	}
    348 
    349 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    350 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    351 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    352 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    353 
    354 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    355 #endif
    356 
    357 	for (i = 0; i < num; i++)
    358 		raidPtrs[i] = NULL;
    359 	rc = rf_BootRaidframe();
    360 	if (rc == 0)
    361 		aprint_verbose("Kernelized RAIDframe activated\n");
    362 	else
    363 		panic("Serious error booting RAID!!");
    364 
    365 	/* put together some data structures like the CCD device does.  This
    366 	 * lets us lock the device and what-not when it gets opened. */
    367 
    368 	raid_softc = (struct raid_softc *)
    369 		malloc(num * sizeof(struct raid_softc),
    370 		       M_RAIDFRAME, M_NOWAIT);
    371 	if (raid_softc == NULL) {
    372 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    373 		return;
    374 	}
    375 
    376 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    377 
    378 	for (raidID = 0; raidID < num; raidID++) {
    379 		bufq_alloc(&raid_softc[raidID].buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    380 
    381 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    382 			  (RF_Raid_t *));
    383 		if (raidPtrs[raidID] == NULL) {
    384 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    385 			numraid = raidID;
    386 			return;
    387 		}
    388 	}
    389 
    390 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    391 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    392 	}
    393 
    394 	raidautoconfigdone = false;
    395 
    396 	/*
    397 	 * Register a finalizer which will be used to auto-config RAID
    398 	 * sets once all real hardware devices have been found.
    399 	 */
    400 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    401 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    402 }
    403 
    404 int
    405 rf_autoconfig(device_t self)
    406 {
    407 	RF_AutoConfig_t *ac_list;
    408 	RF_ConfigSet_t *config_sets;
    409 
    410 	if (!raidautoconfig || raidautoconfigdone == true)
    411 		return (0);
    412 
    413 	/* XXX This code can only be run once. */
    414 	raidautoconfigdone = true;
    415 
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 void
    433 rf_buildroothack(RF_ConfigSet_t *config_sets)
    434 {
    435 	RF_ConfigSet_t *cset;
    436 	RF_ConfigSet_t *next_cset;
    437 	int retcode;
    438 	int raidID;
    439 	int rootID;
    440 	int col;
    441 	int num_root;
    442 	char *devname;
    443 
    444 	rootID = 0;
    445 	num_root = 0;
    446 	cset = config_sets;
    447 	while (cset != NULL) {
    448 		next_cset = cset->next;
    449 		if (rf_have_enough_components(cset) &&
    450 		    cset->ac->clabel->autoconfigure==1) {
    451 			retcode = rf_auto_config_set(cset,&raidID);
    452 			if (!retcode) {
    453 				aprint_debug("raid%d: configured ok\n", raidID);
    454 				if (cset->rootable) {
    455 					rootID = raidID;
    456 					num_root++;
    457 				}
    458 			} else {
    459 				/* The autoconfig didn't work :( */
    460 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    461 				rf_release_all_vps(cset);
    462 			}
    463 		} else {
    464 			/* we're not autoconfiguring this set...
    465 			   release the associated resources */
    466 			rf_release_all_vps(cset);
    467 		}
    468 		/* cleanup */
    469 		rf_cleanup_config_set(cset);
    470 		cset = next_cset;
    471 	}
    472 
    473 	/* if the user has specified what the root device should be
    474 	   then we don't touch booted_device or boothowto... */
    475 
    476 	if (rootspec != NULL)
    477 		return;
    478 
    479 	/* we found something bootable... */
    480 
    481 	if (num_root == 1) {
    482 		booted_device = raid_softc[rootID].sc_dev;
    483 	} else if (num_root > 1) {
    484 
    485 		/*
    486 		 * Maybe the MD code can help. If it cannot, then
    487 		 * setroot() will discover that we have no
    488 		 * booted_device and will ask the user if nothing was
    489 		 * hardwired in the kernel config file
    490 		 */
    491 
    492 		if (booted_device == NULL)
    493 			cpu_rootconf();
    494 		if (booted_device == NULL)
    495 			return;
    496 
    497 		num_root = 0;
    498 		for (raidID = 0; raidID < numraid; raidID++) {
    499 			if (raidPtrs[raidID]->valid == 0)
    500 				continue;
    501 
    502 			if (raidPtrs[raidID]->root_partition == 0)
    503 				continue;
    504 
    505 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    506 				devname = raidPtrs[raidID]->Disks[col].devname;
    507 				devname += sizeof("/dev/") - 1;
    508 				if (strncmp(devname, device_xname(booted_device),
    509 					    strlen(device_xname(booted_device))) != 0)
    510 					continue;
    511 				aprint_debug("raid%d includes boot device %s\n",
    512 				       raidID, devname);
    513 				num_root++;
    514 				rootID = raidID;
    515 			}
    516 		}
    517 
    518 		if (num_root == 1) {
    519 			booted_device = raid_softc[rootID].sc_dev;
    520 		} else {
    521 			/* we can't guess.. require the user to answer... */
    522 			boothowto |= RB_ASKNAME;
    523 		}
    524 	}
    525 }
    526 
    527 
    528 int
    529 raidsize(dev_t dev)
    530 {
    531 	struct raid_softc *rs;
    532 	struct disklabel *lp;
    533 	int     part, unit, omask, size;
    534 
    535 	unit = raidunit(dev);
    536 	if (unit >= numraid)
    537 		return (-1);
    538 	rs = &raid_softc[unit];
    539 
    540 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    541 		return (-1);
    542 
    543 	part = DISKPART(dev);
    544 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    545 	lp = rs->sc_dkdev.dk_label;
    546 
    547 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    548 		return (-1);
    549 
    550 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    551 		size = -1;
    552 	else
    553 		size = lp->d_partitions[part].p_size *
    554 		    (lp->d_secsize / DEV_BSIZE);
    555 
    556 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    557 		return (-1);
    558 
    559 	return (size);
    560 
    561 }
    562 
    563 int
    564 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    565 {
    566 	int     unit = raidunit(dev);
    567 	struct raid_softc *rs;
    568 	const struct bdevsw *bdev;
    569 	struct disklabel *lp;
    570 	RF_Raid_t *raidPtr;
    571 	daddr_t offset;
    572 	int     part, c, sparecol, j, scol, dumpto;
    573 	int     error = 0;
    574 
    575 	if (unit >= numraid)
    576 		return (ENXIO);
    577 
    578 	rs = &raid_softc[unit];
    579 	raidPtr = raidPtrs[unit];
    580 
    581 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    582 		return ENXIO;
    583 
    584 	/* we only support dumping to RAID 1 sets */
    585 	if (raidPtr->Layout.numDataCol != 1 ||
    586 	    raidPtr->Layout.numParityCol != 1)
    587 		return EINVAL;
    588 
    589 
    590 	if ((error = raidlock(rs)) != 0)
    591 		return error;
    592 
    593 	if (size % DEV_BSIZE != 0) {
    594 		error = EINVAL;
    595 		goto out;
    596 	}
    597 
    598 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    599 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    600 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    601 		    size / DEV_BSIZE, rs->sc_size);
    602 		error = EINVAL;
    603 		goto out;
    604 	}
    605 
    606 	part = DISKPART(dev);
    607 	lp = rs->sc_dkdev.dk_label;
    608 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    609 
    610 	/* figure out what device is alive.. */
    611 
    612 	/*
    613 	   Look for a component to dump to.  The preference for the
    614 	   component to dump to is as follows:
    615 	   1) the master
    616 	   2) a used_spare of the master
    617 	   3) the slave
    618 	   4) a used_spare of the slave
    619 	*/
    620 
    621 	dumpto = -1;
    622 	for (c = 0; c < raidPtr->numCol; c++) {
    623 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    624 			/* this might be the one */
    625 			dumpto = c;
    626 			break;
    627 		}
    628 	}
    629 
    630 	/*
    631 	   At this point we have possibly selected a live master or a
    632 	   live slave.  We now check to see if there is a spared
    633 	   master (or a spared slave), if we didn't find a live master
    634 	   or a live slave.
    635 	*/
    636 
    637 	for (c = 0; c < raidPtr->numSpare; c++) {
    638 		sparecol = raidPtr->numCol + c;
    639 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    640 			/* How about this one? */
    641 			scol = -1;
    642 			for(j=0;j<raidPtr->numCol;j++) {
    643 				if (raidPtr->Disks[j].spareCol == sparecol) {
    644 					scol = j;
    645 					break;
    646 				}
    647 			}
    648 			if (scol == 0) {
    649 				/*
    650 				   We must have found a spared master!
    651 				   We'll take that over anything else
    652 				   found so far.  (We couldn't have
    653 				   found a real master before, since
    654 				   this is a used spare, and it's
    655 				   saying that it's replacing the
    656 				   master.)  On reboot (with
    657 				   autoconfiguration turned on)
    658 				   sparecol will become the 1st
    659 				   component (component0) of this set.
    660 				*/
    661 				dumpto = sparecol;
    662 				break;
    663 			} else if (scol != -1) {
    664 				/*
    665 				   Must be a spared slave.  We'll dump
    666 				   to that if we haven't found anything
    667 				   else so far.
    668 				*/
    669 				if (dumpto == -1)
    670 					dumpto = sparecol;
    671 			}
    672 		}
    673 	}
    674 
    675 	if (dumpto == -1) {
    676 		/* we couldn't find any live components to dump to!?!?
    677 		 */
    678 		error = EINVAL;
    679 		goto out;
    680 	}
    681 
    682 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    683 
    684 	/*
    685 	   Note that blkno is relative to this particular partition.
    686 	   By adding the offset of this partition in the RAID
    687 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    688 	   value that is relative to the partition used for the
    689 	   underlying component.
    690 	*/
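	/*
	 * Worked example (illustrative numbers only): if this partition
	 * starts at p_offset 1024 within the RAID set, a dump request for
	 * blkno 100 is issued to the component at sector
	 * 100 + 1024 + RF_PROTECTED_SECTORS.
	 */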
    691 
    692 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    693 				blkno + offset, va, size);
    694 
    695 out:
    696 	raidunlock(rs);
    697 
    698 	return error;
    699 }
    700 /* ARGSUSED */
    701 int
    702 raidopen(dev_t dev, int flags, int fmt,
    703     struct lwp *l)
    704 {
    705 	int     unit = raidunit(dev);
    706 	struct raid_softc *rs;
    707 	struct disklabel *lp;
    708 	int     part, pmask;
    709 	int     error = 0;
    710 
    711 	if (unit >= numraid)
    712 		return (ENXIO);
    713 	rs = &raid_softc[unit];
    714 
    715 	if ((error = raidlock(rs)) != 0)
    716 		return (error);
    717 
    718 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    719 		error = EBUSY;
    720 		goto bad;
    721 	}
    722 
    723 	lp = rs->sc_dkdev.dk_label;
    724 
    725 	part = DISKPART(dev);
    726 
    727 	/*
    728 	 * If there are wedges, and this is not RAW_PART, then we
    729 	 * need to fail.
    730 	 */
    731 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    732 		error = EBUSY;
    733 		goto bad;
    734 	}
    735 	pmask = (1 << part);
    736 
    737 	if ((rs->sc_flags & RAIDF_INITED) &&
    738 	    (rs->sc_dkdev.dk_openmask == 0))
    739 		raidgetdisklabel(dev);
    740 
    741 	/* make sure that this partition exists */
    742 
    743 	if (part != RAW_PART) {
    744 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    745 		    ((part >= lp->d_npartitions) ||
    746 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    747 			error = ENXIO;
    748 			goto bad;
    749 		}
    750 	}
    751 	/* Prevent this unit from being unconfigured while open. */
    752 	switch (fmt) {
    753 	case S_IFCHR:
    754 		rs->sc_dkdev.dk_copenmask |= pmask;
    755 		break;
    756 
    757 	case S_IFBLK:
    758 		rs->sc_dkdev.dk_bopenmask |= pmask;
    759 		break;
    760 	}
    761 
    762 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    763 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    764 		/* First one... mark things as dirty... Note that we *MUST*
    765 		 have done a configure before this.  I DO NOT WANT TO BE
    766 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    767 		 THAT THEY BELONG TOGETHER!!!!! */
    768 		/* XXX should check to see if we're only open for reading
    769 		   here... If so, we needn't do this, but then need some
    770 		   other way of keeping track of what's happened.. */
    771 
    772 		rf_markalldirty(raidPtrs[unit]);
    773 	}
    774 
    775 
    776 	rs->sc_dkdev.dk_openmask =
    777 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    778 
    779 bad:
    780 	raidunlock(rs);
    781 
    782 	return (error);
    783 
    784 
    785 }
    786 /* ARGSUSED */
    787 int
    788 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    789 {
    790 	int     unit = raidunit(dev);
    791 	struct raid_softc *rs;
    792 	int     error = 0;
    793 	int     part;
    794 
    795 	if (unit >= numraid)
    796 		return (ENXIO);
    797 	rs = &raid_softc[unit];
    798 
    799 	if ((error = raidlock(rs)) != 0)
    800 		return (error);
    801 
    802 	part = DISKPART(dev);
    803 
    804 	/* ...that much closer to allowing unconfiguration... */
    805 	switch (fmt) {
    806 	case S_IFCHR:
    807 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    808 		break;
    809 
    810 	case S_IFBLK:
    811 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    812 		break;
    813 	}
    814 	rs->sc_dkdev.dk_openmask =
    815 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    816 
    817 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    818 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    819 		/* Last one... device is not unconfigured yet.
    820 		   Mark things as clean here; device shutdown has
    821 		   already taken care of setting the clean bits in
    822 		   the case where RAIDF_INITED is not set. */
    823 
    824 		rf_update_component_labels(raidPtrs[unit],
    825 						 RF_FINAL_COMPONENT_UPDATE);
    826 
    827 		/* If the kernel is shutting down, it will detach
    828 		 * this RAID set soon enough.
    829 		 */
    830 	}
    831 
    832 	raidunlock(rs);
    833 	return (0);
    834 
    835 }
    836 
    837 void
    838 raidstrategy(struct buf *bp)
    839 {
    840 	unsigned int raidID = raidunit(bp->b_dev);
    841 	RF_Raid_t *raidPtr;
    842 	struct raid_softc *rs = &raid_softc[raidID];
    843 	int     wlabel;
    844 
    845 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
    846 		bp->b_error = ENXIO;
    847 		goto done;
    848 	}
    849 	if (raidID >= numraid || !raidPtrs[raidID]) {
    850 		bp->b_error = ENODEV;
    851 		goto done;
    852 	}
    853 	raidPtr = raidPtrs[raidID];
    854 	if (!raidPtr->valid) {
    855 		bp->b_error = ENODEV;
    856 		goto done;
    857 	}
    858 	if (bp->b_bcount == 0) {
    859 		db1_printf(("b_bcount is zero..\n"));
    860 		goto done;
    861 	}
    862 
    863 	/*
    864 	 * Do bounds checking and adjust transfer.  If there's an
    865 	 * error, the bounds check will flag that for us.
    866 	 */
    867 
    868 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    869 	if (DISKPART(bp->b_dev) == RAW_PART) {
    870 		uint64_t size; /* device size in DEV_BSIZE unit */
    871 
    872 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    873 			size = raidPtr->totalSectors <<
    874 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    875 		} else {
    876 			size = raidPtr->totalSectors >>
    877 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    878 		}
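		/*
		 * For example (assuming the usual DEV_BSIZE of 512, i.e.
		 * DEV_BSHIFT == 9): with 512-byte sectors the shift is zero
		 * and size == totalSectors; with 4096-byte sectors
		 * (logBytesPerSector == 12) size == totalSectors << 3.
		 */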
    879 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    880 			goto done;
    881 		}
    882 	} else {
    883 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    884 			db1_printf(("Bounds check failed!!:%d %d\n",
    885 				(int) bp->b_blkno, (int) wlabel));
    886 			goto done;
    887 		}
    888 	}
    889 
    890 	rf_lock_mutex2(raidPtr->iodone_lock);
    891 
    892 	bp->b_resid = 0;
    893 
    894 	/* stuff it onto our queue */
    895 	bufq_put(rs->buf_queue, bp);
    896 
    897 	/* schedule the IO to happen at the next convenient time */
    898 	rf_signal_cond2(raidPtr->iodone_cv);
    899 	rf_unlock_mutex2(raidPtr->iodone_lock);
    900 
    901 	return;
    902 
    903 done:
    904 	bp->b_resid = bp->b_bcount;
    905 	biodone(bp);
    906 }
    907 /* ARGSUSED */
    908 int
    909 raidread(dev_t dev, struct uio *uio, int flags)
    910 {
    911 	int     unit = raidunit(dev);
    912 	struct raid_softc *rs;
    913 
    914 	if (unit >= numraid)
    915 		return (ENXIO);
    916 	rs = &raid_softc[unit];
    917 
    918 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    919 		return (ENXIO);
    920 
    921 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    922 
    923 }
    924 /* ARGSUSED */
    925 int
    926 raidwrite(dev_t dev, struct uio *uio, int flags)
    927 {
    928 	int     unit = raidunit(dev);
    929 	struct raid_softc *rs;
    930 
    931 	if (unit >= numraid)
    932 		return (ENXIO);
    933 	rs = &raid_softc[unit];
    934 
    935 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    936 		return (ENXIO);
    937 
    938 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    939 
    940 }
    941 
    942 static int
    943 raid_detach_unlocked(struct raid_softc *rs)
    944 {
    945 	int error;
    946 	RF_Raid_t *raidPtr;
    947 
    948 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
    949 
    950 	/*
    951 	 * If somebody has a partition mounted, we shouldn't
    952 	 * shut down.
    953 	 */
    954 	if (rs->sc_dkdev.dk_openmask != 0)
    955 		return EBUSY;
    956 
    957 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    958 		;	/* not initialized: nothing to do */
    959 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    960 		return error;
    961 	else
    962 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    963 
    964 	/* Detach the disk. */
    965 	dkwedge_delall(&rs->sc_dkdev);
    966 	disk_detach(&rs->sc_dkdev);
    967 	disk_destroy(&rs->sc_dkdev);
    968 
    969 	aprint_normal_dev(rs->sc_dev, "detached\n");
    970 
    971 	return 0;
    972 }
    973 
    974 int
    975 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    976 {
    977 	int     unit = raidunit(dev);
    978 	int     error = 0;
    979 	int     part, pmask, s;
    980 	cfdata_t cf;
    981 	struct raid_softc *rs;
    982 	RF_Config_t *k_cfg, *u_cfg;
    983 	RF_Raid_t *raidPtr;
    984 	RF_RaidDisk_t *diskPtr;
    985 	RF_AccTotals_t *totals;
    986 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    987 	u_char *specific_buf;
    988 	int retcode = 0;
    989 	int column;
    990 /*	int raidid; */
    991 	struct rf_recon_req *rrcopy, *rr;
    992 	RF_ComponentLabel_t *clabel;
    993 	RF_ComponentLabel_t *ci_label;
    994 	RF_ComponentLabel_t **clabel_ptr;
    995 	RF_SingleComponent_t *sparePtr,*componentPtr;
    996 	RF_SingleComponent_t component;
    997 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    998 	int i, j, d;
    999 #ifdef __HAVE_OLD_DISKLABEL
   1000 	struct disklabel newlabel;
   1001 #endif
   1002 	struct dkwedge_info *dkw;
   1003 
   1004 	if (unit >= numraid)
   1005 		return (ENXIO);
   1006 	rs = &raid_softc[unit];
   1007 	raidPtr = raidPtrs[unit];
   1008 
   1009 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1010 		(int) DISKPART(dev), (int) unit, cmd));
   1011 
   1012 	/* Must be open for writes for these commands... */
   1013 	switch (cmd) {
   1014 #ifdef DIOCGSECTORSIZE
   1015 	case DIOCGSECTORSIZE:
   1016 		*(u_int *)data = raidPtr->bytesPerSector;
   1017 		return 0;
   1018 	case DIOCGMEDIASIZE:
   1019 		*(off_t *)data =
   1020 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1021 		return 0;
   1022 #endif
   1023 	case DIOCSDINFO:
   1024 	case DIOCWDINFO:
   1025 #ifdef __HAVE_OLD_DISKLABEL
   1026 	case ODIOCWDINFO:
   1027 	case ODIOCSDINFO:
   1028 #endif
   1029 	case DIOCWLABEL:
   1030 	case DIOCAWEDGE:
   1031 	case DIOCDWEDGE:
   1032 	case DIOCSSTRATEGY:
   1033 		if ((flag & FWRITE) == 0)
   1034 			return (EBADF);
   1035 	}
   1036 
   1037 	/* Must be initialized for these... */
   1038 	switch (cmd) {
   1039 	case DIOCGDINFO:
   1040 	case DIOCSDINFO:
   1041 	case DIOCWDINFO:
   1042 #ifdef __HAVE_OLD_DISKLABEL
   1043 	case ODIOCGDINFO:
   1044 	case ODIOCWDINFO:
   1045 	case ODIOCSDINFO:
   1046 	case ODIOCGDEFLABEL:
   1047 #endif
   1048 	case DIOCGPART:
   1049 	case DIOCWLABEL:
   1050 	case DIOCGDEFLABEL:
   1051 	case DIOCAWEDGE:
   1052 	case DIOCDWEDGE:
   1053 	case DIOCLWEDGES:
   1054 	case DIOCCACHESYNC:
   1055 	case RAIDFRAME_SHUTDOWN:
   1056 	case RAIDFRAME_REWRITEPARITY:
   1057 	case RAIDFRAME_GET_INFO:
   1058 	case RAIDFRAME_RESET_ACCTOTALS:
   1059 	case RAIDFRAME_GET_ACCTOTALS:
   1060 	case RAIDFRAME_KEEP_ACCTOTALS:
   1061 	case RAIDFRAME_GET_SIZE:
   1062 	case RAIDFRAME_FAIL_DISK:
   1063 	case RAIDFRAME_COPYBACK:
   1064 	case RAIDFRAME_CHECK_RECON_STATUS:
   1065 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1066 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1067 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1068 	case RAIDFRAME_ADD_HOT_SPARE:
   1069 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1070 	case RAIDFRAME_INIT_LABELS:
   1071 	case RAIDFRAME_REBUILD_IN_PLACE:
   1072 	case RAIDFRAME_CHECK_PARITY:
   1073 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1074 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1075 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1076 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1077 	case RAIDFRAME_SET_AUTOCONFIG:
   1078 	case RAIDFRAME_SET_ROOT:
   1079 	case RAIDFRAME_DELETE_COMPONENT:
   1080 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1081 	case RAIDFRAME_PARITYMAP_STATUS:
   1082 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1083 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1084 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1085 	case DIOCGSTRATEGY:
   1086 	case DIOCSSTRATEGY:
   1087 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1088 			return (ENXIO);
   1089 	}
   1090 
   1091 	switch (cmd) {
   1092 #ifdef COMPAT_50
   1093 	case RAIDFRAME_GET_INFO50:
   1094 		return rf_get_info50(raidPtr, data);
   1095 
   1096 	case RAIDFRAME_CONFIGURE50:
   1097 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1098 			return retcode;
   1099 		goto config;
   1100 #endif
   1101 		/* configure the system */
   1102 	case RAIDFRAME_CONFIGURE:
   1103 
   1104 		if (raidPtr->valid) {
   1105 			/* There is a valid RAID set running on this unit! */
   1106 			printf("raid%d: Device already configured!\n",unit);
   1107 			return(EINVAL);
   1108 		}
   1109 
   1110 		/* copy-in the configuration information */
   1111 		/* data points to a pointer to the configuration structure */
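		/*
		 * Illustrative userland usage (an assumption, not part of
		 * this file): the caller passes the address of a pointer to
		 * its RF_Config_t, e.g.
		 *
		 *	RF_Config_t cfg;
		 *	void *cfgp = &cfg;
		 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp);
		 *
		 * so *(RF_Config_t **)data below is the user-space address
		 * of cfg.
		 */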
   1112 
   1113 		u_cfg = *((RF_Config_t **) data);
   1114 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1115 		if (k_cfg == NULL) {
   1116 			return (ENOMEM);
   1117 		}
   1118 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1119 		if (retcode) {
   1120 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1121 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1122 				retcode));
   1123 			return (retcode);
   1124 		}
   1125 		goto config;
   1126 	config:
   1127 		/* allocate a buffer for the layout-specific data, and copy it
   1128 		 * in */
   1129 		if (k_cfg->layoutSpecificSize) {
   1130 			if (k_cfg->layoutSpecificSize > 10000) {
   1131 				/* sanity check */
   1132 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1133 				return (EINVAL);
   1134 			}
   1135 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1136 			    (u_char *));
   1137 			if (specific_buf == NULL) {
   1138 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1139 				return (ENOMEM);
   1140 			}
   1141 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1142 			    k_cfg->layoutSpecificSize);
   1143 			if (retcode) {
   1144 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1145 				RF_Free(specific_buf,
   1146 					k_cfg->layoutSpecificSize);
   1147 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1148 					retcode));
   1149 				return (retcode);
   1150 			}
   1151 		} else
   1152 			specific_buf = NULL;
   1153 		k_cfg->layoutSpecific = specific_buf;
   1154 
   1155 		/* should do some kind of sanity check on the configuration.
   1156 		 * Store the sum of all the bytes in the last byte? */
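		/*
		 * A minimal sketch (disabled, illustrative only) of the
		 * checksum idea suggested above: sum every byte of the
		 * structure except the last and compare against a checksum
		 * that the caller would store in the last byte.
		 */
#if 0
		{
			u_char *cp = (u_char *)k_cfg;
			u_char sum = 0;
			size_t n;

			for (n = 0; n < sizeof(RF_Config_t) - 1; n++)
				sum += cp[n];
			if (sum != cp[sizeof(RF_Config_t) - 1]) {
				if (specific_buf != NULL)
					RF_Free(specific_buf,
					    k_cfg->layoutSpecificSize);
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (EINVAL);
			}
		}
#endif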
   1157 
   1158 		/* configure the system */
   1159 
   1160 		/*
   1161 		 * Clear the entire RAID descriptor, just to make sure
   1162 		 *  there is no stale data left in the case of a
   1163 		 *  reconfiguration
   1164 		 */
   1165 		memset(raidPtr, 0, sizeof(*raidPtr));
   1166 		raidPtr->raidid = unit;
   1167 
   1168 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1169 
   1170 		if (retcode == 0) {
   1171 
   1172 			/* allow this many simultaneous IO's to
   1173 			   this RAID device */
   1174 			raidPtr->openings = RAIDOUTSTANDING;
   1175 
   1176 			raidinit(raidPtr);
   1177 			rf_markalldirty(raidPtr);
   1178 		}
   1179 		/* free the buffers.  No return code here. */
   1180 		if (k_cfg->layoutSpecificSize) {
   1181 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1182 		}
   1183 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1184 
   1185 		return (retcode);
   1186 
   1187 		/* shutdown the system */
   1188 	case RAIDFRAME_SHUTDOWN:
   1189 
   1190 		part = DISKPART(dev);
   1191 		pmask = (1 << part);
   1192 
   1193 		if ((error = raidlock(rs)) != 0)
   1194 			return (error);
   1195 
   1196 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1197 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1198 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1199 			retcode = EBUSY;
   1200 		else {
   1201 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1202 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1203 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1204 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1205 			retcode = 0;
   1206 		}
   1207 
   1208 		raidunlock(rs);
   1209 
   1210 		if (retcode != 0)
   1211 			return retcode;
   1212 
   1213 		/* free the pseudo device attach bits */
   1214 
   1215 		cf = device_cfdata(rs->sc_dev);
   1216 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1217 			free(cf, M_RAIDFRAME);
   1218 
   1219 		return (retcode);
   1220 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1221 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1222 		/* need to read the component label for the disk indicated
   1223 		   by row,column in clabel */
   1224 
   1225 		/*
   1226 		 * Perhaps there should be an option to skip the in-core
   1227 		 * copy and hit the disk, as with disklabel(8).
   1228 		 */
   1229 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1230 
   1231 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1232 
   1233 		if (retcode) {
   1234 			RF_Free(clabel, sizeof(*clabel));
   1235 			return retcode;
   1236 		}
   1237 
   1238 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1239 
   1240 		column = clabel->column;
   1241 
   1242 		if ((column < 0) || (column >= raidPtr->numCol +
   1243 		    raidPtr->numSpare)) {
   1244 			RF_Free(clabel, sizeof(*clabel));
   1245 			return EINVAL;
   1246 		}
   1247 
   1248 		RF_Free(clabel, sizeof(*clabel));
   1249 
   1250 		clabel = raidget_component_label(raidPtr, column);
   1251 
   1252 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1253 
   1254 #if 0
   1255 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1256 		clabel = (RF_ComponentLabel_t *) data;
   1257 
   1258 		/* XXX check the label for valid stuff... */
   1259 		/* Note that some things *should not* get modified --
   1260 		   the user should be re-initing the labels instead of
   1261 		   trying to patch things.
   1262 		   */
   1263 
   1264 		raidid = raidPtr->raidid;
   1265 #ifdef DEBUG
   1266 		printf("raid%d: Got component label:\n", raidid);
   1267 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1268 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1269 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1270 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1271 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1272 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1273 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1274 #endif
   1275 		clabel->row = 0;
   1276 		column = clabel->column;
   1277 
   1278 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1279 			return(EINVAL);
   1280 		}
   1281 
   1282 		/* XXX this isn't allowed to do anything for now :-) */
   1283 
   1284 		/* XXX and before it is, we need to fill in the rest
   1285 		   of the fields!?!?!?! */
   1286 		memcpy(raidget_component_label(raidPtr, column),
   1287 		    clabel, sizeof(*clabel));
   1288 		raidflush_component_label(raidPtr, column);
   1289 		return (0);
   1290 #endif
   1291 
   1292 	case RAIDFRAME_INIT_LABELS:
   1293 		clabel = (RF_ComponentLabel_t *) data;
   1294 		/*
   1295 		   we only want the serial number from
   1296 		   the above.  We get all the rest of the information
   1297 		   from the config that was used to create this RAID
   1298 		   set.
   1299 		   */
   1300 
   1301 		raidPtr->serial_number = clabel->serial_number;
   1302 
   1303 		for(column=0;column<raidPtr->numCol;column++) {
   1304 			diskPtr = &raidPtr->Disks[column];
   1305 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1306 				ci_label = raidget_component_label(raidPtr,
   1307 				    column);
   1308 				/* Zeroing this is important. */
   1309 				memset(ci_label, 0, sizeof(*ci_label));
   1310 				raid_init_component_label(raidPtr, ci_label);
   1311 				ci_label->serial_number =
   1312 				    raidPtr->serial_number;
   1313 				ci_label->row = 0; /* we don't pretend to support more */
   1314 				rf_component_label_set_partitionsize(ci_label,
   1315 				    diskPtr->partitionSize);
   1316 				ci_label->column = column;
   1317 				raidflush_component_label(raidPtr, column);
   1318 			}
   1319 			/* XXXjld what about the spares? */
   1320 		}
   1321 
   1322 		return (retcode);
   1323 	case RAIDFRAME_SET_AUTOCONFIG:
   1324 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1325 		printf("raid%d: New autoconfig value is: %d\n",
   1326 		       raidPtr->raidid, d);
   1327 		*(int *) data = d;
   1328 		return (retcode);
   1329 
   1330 	case RAIDFRAME_SET_ROOT:
   1331 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1332 		printf("raid%d: New rootpartition value is: %d\n",
   1333 		       raidPtr->raidid, d);
   1334 		*(int *) data = d;
   1335 		return (retcode);
   1336 
   1337 		/* initialize all parity */
   1338 	case RAIDFRAME_REWRITEPARITY:
   1339 
   1340 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1341 			/* Parity for RAID 0 is trivially correct */
   1342 			raidPtr->parity_good = RF_RAID_CLEAN;
   1343 			return(0);
   1344 		}
   1345 
   1346 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1347 			/* Re-write is already in progress! */
   1348 			return(EINVAL);
   1349 		}
   1350 
   1351 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1352 					   rf_RewriteParityThread,
   1353 					   raidPtr,"raid_parity");
   1354 		return (retcode);
   1355 
   1356 
   1357 	case RAIDFRAME_ADD_HOT_SPARE:
   1358 		sparePtr = (RF_SingleComponent_t *) data;
   1359 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1360 		retcode = rf_add_hot_spare(raidPtr, &component);
   1361 		return(retcode);
   1362 
   1363 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1364 		return(retcode);
   1365 
   1366 	case RAIDFRAME_DELETE_COMPONENT:
   1367 		componentPtr = (RF_SingleComponent_t *)data;
   1368 		memcpy( &component, componentPtr,
   1369 			sizeof(RF_SingleComponent_t));
   1370 		retcode = rf_delete_component(raidPtr, &component);
   1371 		return(retcode);
   1372 
   1373 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1374 		componentPtr = (RF_SingleComponent_t *)data;
   1375 		memcpy( &component, componentPtr,
   1376 			sizeof(RF_SingleComponent_t));
   1377 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1378 		return(retcode);
   1379 
   1380 	case RAIDFRAME_REBUILD_IN_PLACE:
   1381 
   1382 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1383 			/* Can't do this on a RAID 0!! */
   1384 			return(EINVAL);
   1385 		}
   1386 
   1387 		if (raidPtr->recon_in_progress == 1) {
   1388 			/* a reconstruct is already in progress! */
   1389 			return(EINVAL);
   1390 		}
   1391 
   1392 		componentPtr = (RF_SingleComponent_t *) data;
   1393 		memcpy( &component, componentPtr,
   1394 			sizeof(RF_SingleComponent_t));
   1395 		component.row = 0; /* we don't support any more */
   1396 		column = component.column;
   1397 
   1398 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1399 			return(EINVAL);
   1400 		}
   1401 
   1402 		rf_lock_mutex2(raidPtr->mutex);
   1403 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1404 		    (raidPtr->numFailures > 0)) {
   1405 			/* XXX 0 above shouldn't be constant!!! */
   1406 			/* some component other than this has failed.
   1407 			   Let's not make things worse than they already
   1408 			   are... */
   1409 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1410 			       raidPtr->raidid);
   1411 			printf("raid%d:     Col: %d   Too many failures.\n",
   1412 			       raidPtr->raidid, column);
   1413 			rf_unlock_mutex2(raidPtr->mutex);
   1414 			return (EINVAL);
   1415 		}
   1416 		if (raidPtr->Disks[column].status ==
   1417 		    rf_ds_reconstructing) {
   1418 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1419 			       raidPtr->raidid);
   1420 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1421 
   1422 			rf_unlock_mutex2(raidPtr->mutex);
   1423 			return (EINVAL);
   1424 		}
   1425 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1426 			rf_unlock_mutex2(raidPtr->mutex);
   1427 			return (EINVAL);
   1428 		}
   1429 		rf_unlock_mutex2(raidPtr->mutex);
   1430 
   1431 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1432 		if (rrcopy == NULL)
   1433 			return(ENOMEM);
   1434 
   1435 		rrcopy->raidPtr = (void *) raidPtr;
   1436 		rrcopy->col = column;
   1437 
   1438 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1439 					   rf_ReconstructInPlaceThread,
   1440 					   rrcopy,"raid_reconip");
   1441 		return(retcode);
   1442 
   1443 	case RAIDFRAME_GET_INFO:
   1444 		if (!raidPtr->valid)
   1445 			return (ENODEV);
   1446 		ucfgp = (RF_DeviceConfig_t **) data;
   1447 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1448 			  (RF_DeviceConfig_t *));
   1449 		if (d_cfg == NULL)
   1450 			return (ENOMEM);
   1451 		d_cfg->rows = 1; /* there is only 1 row now */
   1452 		d_cfg->cols = raidPtr->numCol;
   1453 		d_cfg->ndevs = raidPtr->numCol;
   1454 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1455 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1456 			return (ENOMEM);
   1457 		}
   1458 		d_cfg->nspares = raidPtr->numSpare;
   1459 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1460 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1461 			return (ENOMEM);
   1462 		}
   1463 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1464 		d = 0;
   1465 		for (j = 0; j < d_cfg->cols; j++) {
   1466 			d_cfg->devs[d] = raidPtr->Disks[j];
   1467 			d++;
   1468 		}
   1469 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1470 			d_cfg->spares[i] = raidPtr->Disks[j];
   1471 		}
   1472 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1473 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1474 
   1475 		return (retcode);
   1476 
   1477 	case RAIDFRAME_CHECK_PARITY:
   1478 		*(int *) data = raidPtr->parity_good;
   1479 		return (0);
   1480 
   1481 	case RAIDFRAME_PARITYMAP_STATUS:
   1482 		if (rf_paritymap_ineligible(raidPtr))
   1483 			return EINVAL;
   1484 		rf_paritymap_status(raidPtr->parity_map,
   1485 		    (struct rf_pmstat *)data);
   1486 		return 0;
   1487 
   1488 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1489 		if (rf_paritymap_ineligible(raidPtr))
   1490 			return EINVAL;
   1491 		if (raidPtr->parity_map == NULL)
   1492 			return ENOENT; /* ??? */
   1493 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1494 			(struct rf_pmparams *)data, 1))
   1495 			return EINVAL;
   1496 		return 0;
   1497 
   1498 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1499 		if (rf_paritymap_ineligible(raidPtr))
   1500 			return EINVAL;
   1501 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1502 		return 0;
   1503 
   1504 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1505 		if (rf_paritymap_ineligible(raidPtr))
   1506 			return EINVAL;
   1507 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1508 		/* XXX should errors be passed up? */
   1509 		return 0;
   1510 
   1511 	case RAIDFRAME_RESET_ACCTOTALS:
   1512 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1513 		return (0);
   1514 
   1515 	case RAIDFRAME_GET_ACCTOTALS:
   1516 		totals = (RF_AccTotals_t *) data;
   1517 		*totals = raidPtr->acc_totals;
   1518 		return (0);
   1519 
   1520 	case RAIDFRAME_KEEP_ACCTOTALS:
   1521 		raidPtr->keep_acc_totals = *(int *)data;
   1522 		return (0);
   1523 
   1524 	case RAIDFRAME_GET_SIZE:
   1525 		*(int *) data = raidPtr->totalSectors;
   1526 		return (0);
   1527 
   1528 		/* fail a disk & optionally start reconstruction */
   1529 	case RAIDFRAME_FAIL_DISK:
   1530 
   1531 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1532 			/* Can't do this on a RAID 0!! */
   1533 			return(EINVAL);
   1534 		}
   1535 
   1536 		rr = (struct rf_recon_req *) data;
   1537 		rr->row = 0;
   1538 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1539 			return (EINVAL);
   1540 
   1541 
   1542 		rf_lock_mutex2(raidPtr->mutex);
   1543 		if (raidPtr->status == rf_rs_reconstructing) {
   1544 			/* you can't fail a disk while we're reconstructing! */
   1545 			/* XXX wrong for RAID6 */
   1546 			rf_unlock_mutex2(raidPtr->mutex);
   1547 			return (EINVAL);
   1548 		}
   1549 		if ((raidPtr->Disks[rr->col].status ==
   1550 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1551 			/* some other component has failed.  Let's not make
   1552 			   things worse. XXX wrong for RAID6 */
   1553 			rf_unlock_mutex2(raidPtr->mutex);
   1554 			return (EINVAL);
   1555 		}
   1556 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1557 			/* Can't fail a spared disk! */
   1558 			rf_unlock_mutex2(raidPtr->mutex);
   1559 			return (EINVAL);
   1560 		}
   1561 		rf_unlock_mutex2(raidPtr->mutex);
   1562 
   1563 		/* make a copy of the recon request so that we don't rely on
   1564 		 * the user's buffer */
   1565 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1566 		if (rrcopy == NULL)
   1567 			return(ENOMEM);
   1568 		memcpy(rrcopy, rr, sizeof(*rr));
   1569 		rrcopy->raidPtr = (void *) raidPtr;
   1570 
   1571 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1572 					   rf_ReconThread,
   1573 					   rrcopy,"raid_recon");
   1574 		return (0);
   1575 
   1576 		/* invoke a copyback operation after recon on whatever disk
   1577 		 * needs it, if any */
   1578 	case RAIDFRAME_COPYBACK:
   1579 
   1580 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1581 			/* This makes no sense on a RAID 0!! */
   1582 			return(EINVAL);
   1583 		}
   1584 
   1585 		if (raidPtr->copyback_in_progress == 1) {
   1586 			/* Copyback is already in progress! */
   1587 			return(EINVAL);
   1588 		}
   1589 
   1590 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1591 					   rf_CopybackThread,
   1592 					   raidPtr,"raid_copyback");
   1593 		return (retcode);
   1594 
   1595 		/* return the percentage completion of reconstruction */
   1596 	case RAIDFRAME_CHECK_RECON_STATUS:
   1597 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1598 			/* This makes no sense on a RAID 0, so tell the
   1599 			   user it's done. */
   1600 			*(int *) data = 100;
   1601 			return(0);
   1602 		}
   1603 		if (raidPtr->status != rf_rs_reconstructing)
   1604 			*(int *) data = 100;
   1605 		else {
   1606 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1607 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1608 			} else {
   1609 				*(int *) data = 0;
   1610 			}
   1611 		}
   1612 		return (0);
   1613 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1614 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1615 		if (raidPtr->status != rf_rs_reconstructing) {
   1616 			progressInfo.remaining = 0;
   1617 			progressInfo.completed = 100;
   1618 			progressInfo.total = 100;
   1619 		} else {
   1620 			progressInfo.total =
   1621 				raidPtr->reconControl->numRUsTotal;
   1622 			progressInfo.completed =
   1623 				raidPtr->reconControl->numRUsComplete;
   1624 			progressInfo.remaining = progressInfo.total -
   1625 				progressInfo.completed;
   1626 		}
   1627 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1628 				  sizeof(RF_ProgressInfo_t));
   1629 		return (retcode);
   1630 
   1631 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1632 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1633 			/* This makes no sense on a RAID 0, so tell the
   1634 			   user it's done. */
   1635 			*(int *) data = 100;
   1636 			return(0);
   1637 		}
   1638 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1639 			*(int *) data = 100 *
   1640 				raidPtr->parity_rewrite_stripes_done /
   1641 				raidPtr->Layout.numStripe;
   1642 		} else {
   1643 			*(int *) data = 100;
   1644 		}
   1645 		return (0);
   1646 
   1647 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1648 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1649 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1650 			progressInfo.total = raidPtr->Layout.numStripe;
   1651 			progressInfo.completed =
   1652 				raidPtr->parity_rewrite_stripes_done;
   1653 			progressInfo.remaining = progressInfo.total -
   1654 				progressInfo.completed;
   1655 		} else {
   1656 			progressInfo.remaining = 0;
   1657 			progressInfo.completed = 100;
   1658 			progressInfo.total = 100;
   1659 		}
   1660 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1661 				  sizeof(RF_ProgressInfo_t));
   1662 		return (retcode);
   1663 
   1664 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1665 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1666 			/* This makes no sense on a RAID 0 */
   1667 			*(int *) data = 100;
   1668 			return(0);
   1669 		}
   1670 		if (raidPtr->copyback_in_progress == 1) {
   1671 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1672 				raidPtr->Layout.numStripe;
   1673 		} else {
   1674 			*(int *) data = 100;
   1675 		}
   1676 		return (0);
   1677 
   1678 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1679 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1680 		if (raidPtr->copyback_in_progress == 1) {
   1681 			progressInfo.total = raidPtr->Layout.numStripe;
   1682 			progressInfo.completed =
   1683 				raidPtr->copyback_stripes_done;
   1684 			progressInfo.remaining = progressInfo.total -
   1685 				progressInfo.completed;
   1686 		} else {
   1687 			progressInfo.remaining = 0;
   1688 			progressInfo.completed = 100;
   1689 			progressInfo.total = 100;
   1690 		}
   1691 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1692 				  sizeof(RF_ProgressInfo_t));
   1693 		return (retcode);
   1694 
   1695 		/* the sparetable daemon calls this to wait for the kernel to
   1696 		 * need a spare table. this ioctl does not return until a
   1697 		 * spare table is needed. XXX -- calling mpsleep here in the
   1698 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1699 		 * -- I should either compute the spare table in the kernel,
   1700 		 * or have a different -- XXX XXX -- interface (a different
   1701 		 * character device) for delivering the table     -- XXX */
   1702 #if 0
   1703 	case RAIDFRAME_SPARET_WAIT:
   1704 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1705 		while (!rf_sparet_wait_queue)
   1706 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1707 		waitreq = rf_sparet_wait_queue;
   1708 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1709 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1710 
   1711 		/* structure assignment */
   1712 		*((RF_SparetWait_t *) data) = *waitreq;
   1713 
   1714 		RF_Free(waitreq, sizeof(*waitreq));
   1715 		return (0);
   1716 
   1717 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1718 		 * code in it that will cause the daemon to exit */
   1719 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1720 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1721 		waitreq->fcol = -1;
   1722 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1723 		waitreq->next = rf_sparet_wait_queue;
   1724 		rf_sparet_wait_queue = waitreq;
   1725 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1726 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1727 		return (0);
   1728 
   1729 		/* used by the spare table daemon to deliver a spare table
   1730 		 * into the kernel */
   1731 	case RAIDFRAME_SEND_SPARET:
   1732 
   1733 		/* install the spare table */
   1734 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1735 
   1736 		/* respond to the requestor.  the return status of the spare
   1737 		 * table installation is passed in the "fcol" field */
   1738 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1739 		waitreq->fcol = retcode;
   1740 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1741 		waitreq->next = rf_sparet_resp_queue;
   1742 		rf_sparet_resp_queue = waitreq;
   1743 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1744 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1745 
   1746 		return (retcode);
   1747 #endif
   1748 
   1749 	default:
   1750 		break; /* fall through to the os-specific code below */
   1751 
   1752 	}
   1753 
   1754 	if (!raidPtr->valid)
   1755 		return (EINVAL);
   1756 
   1757 	/*
   1758 	 * Add support for "regular" device ioctls here.
   1759 	 */
   1760 
   1761 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1762 	if (error != EPASSTHROUGH)
   1763 		return (error);
   1764 
   1765 	switch (cmd) {
   1766 	case DIOCGDINFO:
   1767 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1768 		break;
   1769 #ifdef __HAVE_OLD_DISKLABEL
   1770 	case ODIOCGDINFO:
   1771 		newlabel = *(rs->sc_dkdev.dk_label);
   1772 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1773 			return ENOTTY;
   1774 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1775 		break;
   1776 #endif
   1777 
   1778 	case DIOCGPART:
   1779 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1780 		((struct partinfo *) data)->part =
   1781 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1782 		break;
   1783 
   1784 	case DIOCWDINFO:
   1785 	case DIOCSDINFO:
   1786 #ifdef __HAVE_OLD_DISKLABEL
   1787 	case ODIOCWDINFO:
   1788 	case ODIOCSDINFO:
   1789 #endif
   1790 	{
   1791 		struct disklabel *lp;
   1792 #ifdef __HAVE_OLD_DISKLABEL
   1793 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1794 			memset(&newlabel, 0, sizeof newlabel);
   1795 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1796 			lp = &newlabel;
   1797 		} else
   1798 #endif
   1799 		lp = (struct disklabel *)data;
   1800 
   1801 		if ((error = raidlock(rs)) != 0)
   1802 			return (error);
   1803 
   1804 		rs->sc_flags |= RAIDF_LABELLING;
   1805 
   1806 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1807 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1808 		if (error == 0) {
   1809 			if (cmd == DIOCWDINFO
   1810 #ifdef __HAVE_OLD_DISKLABEL
   1811 			    || cmd == ODIOCWDINFO
   1812 #endif
   1813 			   )
   1814 				error = writedisklabel(RAIDLABELDEV(dev),
   1815 				    raidstrategy, rs->sc_dkdev.dk_label,
   1816 				    rs->sc_dkdev.dk_cpulabel);
   1817 		}
   1818 		rs->sc_flags &= ~RAIDF_LABELLING;
   1819 
   1820 		raidunlock(rs);
   1821 
   1822 		if (error)
   1823 			return (error);
   1824 		break;
   1825 	}
   1826 
   1827 	case DIOCWLABEL:
   1828 		if (*(int *) data != 0)
   1829 			rs->sc_flags |= RAIDF_WLABEL;
   1830 		else
   1831 			rs->sc_flags &= ~RAIDF_WLABEL;
   1832 		break;
   1833 
   1834 	case DIOCGDEFLABEL:
   1835 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1836 		break;
   1837 
   1838 #ifdef __HAVE_OLD_DISKLABEL
   1839 	case ODIOCGDEFLABEL:
   1840 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1841 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1842 			return ENOTTY;
   1843 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1844 		break;
   1845 #endif
   1846 
   1847 	case DIOCAWEDGE:
   1848 	case DIOCDWEDGE:
   1849 	    	dkw = (void *)data;
   1850 
   1851 		/* If the ioctl happens here, the parent is us. */
   1852 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1853 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1854 
   1855 	case DIOCLWEDGES:
   1856 		return dkwedge_list(&rs->sc_dkdev,
   1857 		    (struct dkwedge_list *)data, l);
   1858 	case DIOCCACHESYNC:
   1859 		return rf_sync_component_caches(raidPtr);
   1860 
   1861 	case DIOCGSTRATEGY:
   1862 	    {
   1863 		struct disk_strategy *dks = (void *)data;
   1864 
   1865 		s = splbio();
   1866 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1867 		    sizeof(dks->dks_name));
   1868 		splx(s);
   1869 		dks->dks_paramlen = 0;
   1870 
   1871 		return 0;
   1872 	    }
   1873 
   1874 	case DIOCSSTRATEGY:
   1875 	    {
   1876 		struct disk_strategy *dks = (void *)data;
   1877 		struct bufq_state *new;
   1878 		struct bufq_state *old;
   1879 
   1880 		if (dks->dks_param != NULL) {
   1881 			return EINVAL;
   1882 		}
   1883 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1884 		error = bufq_alloc(&new, dks->dks_name,
   1885 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1886 		if (error) {
   1887 			return error;
   1888 		}
   1889 		s = splbio();
   1890 		old = rs->buf_queue;
   1891 		bufq_move(new, old);
   1892 		rs->buf_queue = new;
   1893 		splx(s);
   1894 		bufq_free(old);
   1895 
   1896 		return 0;
   1897 	    }
   1898 
   1899 	default:
   1900 		retcode = ENOTTY;
   1901 	}
   1902 	return (retcode);
   1903 
   1904 }
   1905 
   1906 
   1907 /* raidinit -- complete the rest of the initialization for the
   1908    RAIDframe device.  */
   1909 
   1910 
   1911 static void
   1912 raidinit(RF_Raid_t *raidPtr)
   1913 {
   1914 	cfdata_t cf;
   1915 	struct raid_softc *rs;
   1916 	int     unit;
   1917 
   1918 	unit = raidPtr->raidid;
   1919 
   1920 	rs = &raid_softc[unit];
   1921 
   1922 	/* XXX should check return code first... */
   1923 	rs->sc_flags |= RAIDF_INITED;
   1924 
   1925 	/* XXX doesn't check bounds. */
   1926 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1927 
   1928 	/* attach the pseudo device */
   1929 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1930 	cf->cf_name = raid_cd.cd_name;
   1931 	cf->cf_atname = raid_cd.cd_name;
   1932 	cf->cf_unit = unit;
   1933 	cf->cf_fstate = FSTATE_STAR;
   1934 
   1935 	rs->sc_dev = config_attach_pseudo(cf);
   1936 
   1937 	if (rs->sc_dev == NULL) {
   1938 		printf("raid%d: config_attach_pseudo failed\n",
   1939 		    raidPtr->raidid);
   1940 		rs->sc_flags &= ~RAIDF_INITED;
   1941 		free(cf, M_RAIDFRAME);
   1942 		return;
   1943 	}
   1944 
   1945 	/* disk_attach actually creates space for the CPU disklabel, among
   1946 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1947 	 * with disklabels. */
   1948 
   1949 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1950 	disk_attach(&rs->sc_dkdev);
   1951 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1952 
   1953 	/* XXX There may be a weird interaction here between this, and
   1954 	 * protectedSectors, as used in RAIDframe.  */
   1955 
   1956 	rs->sc_size = raidPtr->totalSectors;
   1957 
   1958 	dkwedge_discover(&rs->sc_dkdev);
   1959 
   1960 	rf_set_properties(rs, raidPtr);
   1961 
   1962 }
   1963 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1964 /* wake up the daemon & tell it to get us a spare table
   1965  * XXX
   1966  * the entries in the queues should be tagged with the raidPtr
   1967  * so that in the extremely rare case that two recons happen at once,
   1968  * we know for which device we're requesting a spare table
   1969  * XXX
   1970  *
   1971  * XXX This code is not currently used. GO
   1972  */
   1973 int
   1974 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1975 {
   1976 	int     retcode;
   1977 
   1978 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1979 	req->next = rf_sparet_wait_queue;
   1980 	rf_sparet_wait_queue = req;
   1981 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1982 
   1983 	/* rf_wait_cond2 drops the mutex while we sleep */
   1984 	while (!rf_sparet_resp_queue) {
   1985 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1986 	}
   1987 	req = rf_sparet_resp_queue;
   1988 	rf_sparet_resp_queue = req->next;
   1989 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1990 
   1991 	retcode = req->fcol;
   1992 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1993 					 * alloc'd */
   1994 	return (retcode);
   1995 }
   1996 #endif
   1997 
   1998 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1999  * bp & passes it down.
   2000  * any calls originating in the kernel must use non-blocking I/O
   2001  * do some extra sanity checking to return "appropriate" error values for
   2002  * certain conditions (to make some standard utilities work)
   2003  *
   2004  * Formerly known as: rf_DoAccessKernel
   2005  */
   2006 void
   2007 raidstart(RF_Raid_t *raidPtr)
   2008 {
   2009 	RF_SectorCount_t num_blocks, pb, sum;
   2010 	RF_RaidAddr_t raid_addr;
   2011 	struct partition *pp;
   2012 	daddr_t blocknum;
   2013 	int     unit;
   2014 	struct raid_softc *rs;
   2015 	int     do_async;
   2016 	struct buf *bp;
   2017 	int rc;
   2018 
   2019 	unit = raidPtr->raidid;
   2020 	rs = &raid_softc[unit];
   2021 
   2022 	/* quick check to see if anything has died recently */
   2023 	rf_lock_mutex2(raidPtr->mutex);
   2024 	if (raidPtr->numNewFailures > 0) {
   2025 		rf_unlock_mutex2(raidPtr->mutex);
   2026 		rf_update_component_labels(raidPtr,
   2027 					   RF_NORMAL_COMPONENT_UPDATE);
   2028 		rf_lock_mutex2(raidPtr->mutex);
   2029 		raidPtr->numNewFailures--;
   2030 	}
   2031 
   2032 	/* Check to see if we're at the limit... */
   2033 	while (raidPtr->openings > 0) {
   2034 		rf_unlock_mutex2(raidPtr->mutex);
   2035 
   2036 		/* get the next item, if any, from the queue */
   2037 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2038 			/* nothing more to do */
   2039 			return;
   2040 		}
   2041 
   2042 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2043 		 * partition.. Need to make it absolute to the underlying
   2044 		 * device.. */
   2045 
   2046 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2047 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2048 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2049 			blocknum += pp->p_offset;
   2050 		}
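        		/*
        		 * Example of the unit conversion above: b_blkno is in
        		 * DEV_BSIZE (512-byte) units, so with 2048-byte RAID
        		 * sectors (logBytesPerSector == 11) a b_blkno of 16 is
        		 * byte offset 8192, i.e. RAID sector 4.
        		 */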
   2051 
   2052 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2053 			    (int) blocknum));
   2054 
   2055 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2056 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2057 
   2058 		/* *THIS* is where we adjust what block we're going to...
   2059 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2060 		raid_addr = blocknum;
   2061 
   2062 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2063 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2064 		sum = raid_addr + num_blocks + pb;
   2065 		if (1 || rf_debugKernelAccess) {
   2066 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2067 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2068 				    (int) pb, (int) bp->b_resid));
   2069 		}
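        		/*
        		 * Reject requests that run past the end of the array or
        		 * whose start + length arithmetic has wrapped around (the
        		 * "sum < ..." comparisons below catch such wrap-around).
        		 */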
   2070 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2071 		    || (sum < num_blocks) || (sum < pb)) {
   2072 			bp->b_error = ENOSPC;
   2073 			bp->b_resid = bp->b_bcount;
   2074 			biodone(bp);
   2075 			rf_lock_mutex2(raidPtr->mutex);
   2076 			continue;
   2077 		}
   2078 		/*
   2079 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2080 		 */
   2081 
   2082 		if (bp->b_bcount & raidPtr->sectorMask) {
   2083 			bp->b_error = EINVAL;
   2084 			bp->b_resid = bp->b_bcount;
   2085 			biodone(bp);
   2086 			rf_lock_mutex2(raidPtr->mutex);
   2087 			continue;
   2088 
   2089 		}
   2090 		db1_printf(("Calling DoAccess..\n"));
   2091 
   2092 
   2093 		rf_lock_mutex2(raidPtr->mutex);
   2094 		raidPtr->openings--;
   2095 		rf_unlock_mutex2(raidPtr->mutex);
   2096 
   2097 		/*
   2098 		 * Everything is async.
   2099 		 */
   2100 		do_async = 1;
   2101 
   2102 		disk_busy(&rs->sc_dkdev);
   2103 
   2104 		/* XXX we're still at splbio() here... do we *really*
   2105 		   need to be? */
   2106 
   2107 		/* don't ever condition on bp->b_flags & B_WRITE.
   2108 		 * always condition on B_READ instead */
   2109 
   2110 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2111 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2112 				 do_async, raid_addr, num_blocks,
   2113 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2114 
   2115 		if (rc) {
   2116 			bp->b_error = rc;
   2117 			bp->b_resid = bp->b_bcount;
   2118 			biodone(bp);
   2119 			/* continue loop */
   2120 		}
   2121 
   2122 		rf_lock_mutex2(raidPtr->mutex);
   2123 	}
   2124 	rf_unlock_mutex2(raidPtr->mutex);
   2125 }
   2126 
   2127 
   2128 
   2129 
   2130 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2131 
   2132 int
   2133 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2134 {
   2135 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2136 	struct buf *bp;
   2137 
   2138 	req->queue = queue;
   2139 	bp = req->bp;
   2140 
   2141 	switch (req->type) {
   2142 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2143 		/* XXX need to do something extra here.. */
   2144 		/* I'm leaving this in, as I've never actually seen it used,
   2145 		 * and I'd like folks to report it... GO */
   2146 		printf("WAKEUP CALLED\n");
   2147 		queue->numOutstanding++;
   2148 
   2149 		bp->b_flags = 0;
   2150 		bp->b_private = req;
   2151 
   2152 		KernelWakeupFunc(bp);
   2153 		break;
   2154 
   2155 	case RF_IO_TYPE_READ:
   2156 	case RF_IO_TYPE_WRITE:
   2157 #if RF_ACC_TRACE > 0
   2158 		if (req->tracerec) {
   2159 			RF_ETIMER_START(req->tracerec->timer);
   2160 		}
   2161 #endif
   2162 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2163 		    op, queue->rf_cinfo->ci_dev,
   2164 		    req->sectorOffset, req->numSector,
   2165 		    req->buf, KernelWakeupFunc, (void *) req,
   2166 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2167 
   2168 		if (rf_debugKernelAccess) {
   2169 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2170 				(long) bp->b_blkno));
   2171 		}
   2172 		queue->numOutstanding++;
   2173 		queue->last_deq_sector = req->sectorOffset;
   2174 		/* acc wouldn't have been let in if there were any pending
   2175 		 * reqs at any other priority */
   2176 		queue->curPriority = req->priority;
   2177 
   2178 		db1_printf(("Going for %c to unit %d col %d\n",
   2179 			    req->type, queue->raidPtr->raidid,
   2180 			    queue->col));
   2181 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2182 			(int) req->sectorOffset, (int) req->numSector,
   2183 			(int) (req->numSector <<
   2184 			    queue->raidPtr->logBytesPerSector),
   2185 			(int) queue->raidPtr->logBytesPerSector));
   2186 
   2187 		/*
   2188 		 * XXX: drop lock here since this can block at
   2189 		 * least with backing SCSI devices.  Retake it
   2190 		 * to minimize fuss with calling interfaces.
   2191 		 */
   2192 
   2193 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2194 		bdev_strategy(bp);
   2195 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2196 		break;
   2197 
   2198 	default:
   2199 		panic("bad req->type in rf_DispatchKernelIO");
   2200 	}
   2201 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2202 
   2203 	return (0);
   2204 }
   2205 /* this is the callback function associated with an I/O invoked from
   2206    kernel code.
   2207  */
   2208 static void
   2209 KernelWakeupFunc(struct buf *bp)
   2210 {
   2211 	RF_DiskQueueData_t *req = NULL;
   2212 	RF_DiskQueue_t *queue;
   2213 
   2214 	db1_printf(("recovering the request queue:\n"));
   2215 
   2216 	req = bp->b_private;
   2217 
   2218 	queue = (RF_DiskQueue_t *) req->queue;
   2219 
   2220 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2221 
   2222 #if RF_ACC_TRACE > 0
   2223 	if (req->tracerec) {
   2224 		RF_ETIMER_STOP(req->tracerec->timer);
   2225 		RF_ETIMER_EVAL(req->tracerec->timer);
   2226 		rf_lock_mutex2(rf_tracing_mutex);
   2227 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2228 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2229 		req->tracerec->num_phys_ios++;
   2230 		rf_unlock_mutex2(rf_tracing_mutex);
   2231 	}
   2232 #endif
   2233 
   2234 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2235 	 * ballistic, and mark the component as hosed... */
   2236 
   2237 	if (bp->b_error != 0) {
   2238 		/* Mark the disk as dead */
   2239 		/* but only mark it once... */
   2240 		/* and only if it wouldn't leave this RAID set
   2241 		   completely broken */
   2242 		if (((queue->raidPtr->Disks[queue->col].status ==
   2243 		      rf_ds_optimal) ||
   2244 		     (queue->raidPtr->Disks[queue->col].status ==
   2245 		      rf_ds_used_spare)) &&
   2246 		     (queue->raidPtr->numFailures <
   2247 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2248 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2249 			       queue->raidPtr->raidid,
   2250 			       queue->raidPtr->Disks[queue->col].devname);
   2251 			queue->raidPtr->Disks[queue->col].status =
   2252 			    rf_ds_failed;
   2253 			queue->raidPtr->status = rf_rs_degraded;
   2254 			queue->raidPtr->numFailures++;
   2255 			queue->raidPtr->numNewFailures++;
   2256 		} else {	/* Disk is already dead... */
   2257 			/* printf("Disk already marked as dead!\n"); */
   2258 		}
   2259 
   2260 	}
   2261 
   2262 	/* Fill in the error value */
   2263 	req->error = bp->b_error;
   2264 
   2265 	/* Drop this one on the "finished" queue... */
   2266 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2267 
   2268 	/* Let the raidio thread know there is work to be done. */
   2269 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2270 
   2271 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2272 }
   2273 
   2274 
   2275 /*
   2276  * initialize a buf structure for doing an I/O in the kernel.
   2277  */
   2278 static void
   2279 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2280        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2281        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2282        struct proc *b_proc)
   2283 {
   2284 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2285 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2286 	bp->b_oflags = 0;
   2287 	bp->b_cflags = 0;
   2288 	bp->b_bcount = numSect << logBytesPerSector;
   2289 	bp->b_bufsize = bp->b_bcount;
   2290 	bp->b_error = 0;
   2291 	bp->b_dev = dev;
   2292 	bp->b_data = bf;
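        	/*
        	 * Convert the RAID sector address back into DEV_BSIZE
        	 * (512-byte) units for b_blkno; e.g. with 2048-byte sectors,
        	 * RAID sector 4 is byte offset 8192, i.e. b_blkno 16.
        	 */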
   2293 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2294 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2295 	if (bp->b_bcount == 0) {
   2296 		panic("bp->b_bcount is zero in InitBP!!");
   2297 	}
   2298 	bp->b_proc = b_proc;
   2299 	bp->b_iodone = cbFunc;
   2300 	bp->b_private = cbArg;
   2301 }
   2302 
   2303 static void
   2304 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2305 		    struct disklabel *lp)
   2306 {
   2307 	memset(lp, 0, sizeof(*lp));
   2308 
   2309 	/* fabricate a label... */
   2310 	lp->d_secperunit = raidPtr->totalSectors;
   2311 	lp->d_secsize = raidPtr->bytesPerSector;
   2312 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2313 	lp->d_ntracks = 4 * raidPtr->numCol;
   2314 	lp->d_ncylinders = raidPtr->totalSectors /
   2315 		(lp->d_nsectors * lp->d_ntracks);
   2316 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
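        	/*
        	 * The geometry above is purely fictitious; e.g. a 5-column
        	 * set with 64 data sectors per stripe is described as 64
        	 * "sectors" per track and 20 "tracks" per cylinder, with the
        	 * cylinder count derived from the total size.
        	 */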
   2317 
   2318 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2319 	lp->d_type = DTYPE_RAID;
   2320 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2321 	lp->d_rpm = 3600;
   2322 	lp->d_interleave = 1;
   2323 	lp->d_flags = 0;
   2324 
   2325 	lp->d_partitions[RAW_PART].p_offset = 0;
   2326 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2327 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2328 	lp->d_npartitions = RAW_PART + 1;
   2329 
   2330 	lp->d_magic = DISKMAGIC;
   2331 	lp->d_magic2 = DISKMAGIC;
   2332 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2333 
   2334 }
   2335 /*
   2336  * Read the disklabel from the raid device.  If one is not present, fake one
   2337  * up.
   2338  */
   2339 static void
   2340 raidgetdisklabel(dev_t dev)
   2341 {
   2342 	int     unit = raidunit(dev);
   2343 	struct raid_softc *rs = &raid_softc[unit];
   2344 	const char   *errstring;
   2345 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2346 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2347 	RF_Raid_t *raidPtr;
   2348 
   2349 	db1_printf(("Getting the disklabel...\n"));
   2350 
   2351 	memset(clp, 0, sizeof(*clp));
   2352 
   2353 	raidPtr = raidPtrs[unit];
   2354 
   2355 	raidgetdefaultlabel(raidPtr, rs, lp);
   2356 
   2357 	/*
   2358 	 * Call the generic disklabel extraction routine.
   2359 	 */
   2360 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2361 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2362 	if (errstring)
   2363 		raidmakedisklabel(rs);
   2364 	else {
   2365 		int     i;
   2366 		struct partition *pp;
   2367 
   2368 		/*
   2369 		 * Sanity check whether the found disklabel is valid.
   2370 		 *
   2371 		 * This is necessary since the total size of the raid device
   2372 		 * may vary when the interleave is changed even though exactly
   2373 		 * the same components are used, and an old disklabel may be
   2374 		 * used if one is found.
   2375 		 */
   2376 		if (lp->d_secperunit != rs->sc_size)
   2377 			printf("raid%d: WARNING: %s: "
   2378 			    "total sector size in disklabel (%" PRIu32 ") != "
   2379 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2380 			    lp->d_secperunit, rs->sc_size);
   2381 		for (i = 0; i < lp->d_npartitions; i++) {
   2382 			pp = &lp->d_partitions[i];
   2383 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2384 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2385 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2386 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2387 		}
   2388 	}
   2389 
   2390 }
   2391 /*
   2392  * Take care of things one might want to take care of in the event
   2393  * that a disklabel isn't present.
   2394  */
   2395 static void
   2396 raidmakedisklabel(struct raid_softc *rs)
   2397 {
   2398 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2399 	db1_printf(("Making a label..\n"));
   2400 
   2401 	/*
   2402 	 * For historical reasons, if there's no disklabel present
   2403 	 * the raw partition must be marked FS_BSDFFS.
   2404 	 */
   2405 
   2406 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2407 
   2408 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2409 
   2410 	lp->d_checksum = dkcksum(lp);
   2411 }
   2412 /*
   2413  * Wait interruptibly for an exclusive lock.
   2414  *
   2415  * XXX
   2416  * Several drivers do this; it should be abstracted and made MP-safe.
   2417  * (Hmm... where have we seen this warning before :->  GO )
   2418  */
   2419 static int
   2420 raidlock(struct raid_softc *rs)
   2421 {
   2422 	int     error;
   2423 
   2424 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2425 		rs->sc_flags |= RAIDF_WANTED;
   2426 		if ((error =
   2427 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2428 			return (error);
   2429 	}
   2430 	rs->sc_flags |= RAIDF_LOCKED;
   2431 	return (0);
   2432 }
   2433 /*
   2434  * Unlock and wake up any waiters.
   2435  */
   2436 static void
   2437 raidunlock(struct raid_softc *rs)
   2438 {
   2439 
   2440 	rs->sc_flags &= ~RAIDF_LOCKED;
   2441 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2442 		rs->sc_flags &= ~RAIDF_WANTED;
   2443 		wakeup(rs);
   2444 	}
   2445 }
   2446 
   2447 
   2448 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2449 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2450 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
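        /*
         * Layout of the reserved area at the start of each component (cf.
         * rf_protected_sectors), as computed by the helpers below: the
         * component label lives at byte offset 16384 and occupies
         * max(sector size, 1024) bytes; the parity map follows immediately
         * after it.  E.g. with 512-byte sectors the label area covers bytes
         * 16384-17407 and the parity map begins at byte 17408.
         */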
   2451 
   2452 static daddr_t
   2453 rf_component_info_offset(void)
   2454 {
   2455 
   2456 	return RF_COMPONENT_INFO_OFFSET;
   2457 }
   2458 
   2459 static daddr_t
   2460 rf_component_info_size(unsigned secsize)
   2461 {
   2462 	daddr_t info_size;
   2463 
   2464 	KASSERT(secsize);
   2465 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2466 		info_size = secsize;
   2467 	else
   2468 		info_size = RF_COMPONENT_INFO_SIZE;
   2469 
   2470 	return info_size;
   2471 }
   2472 
   2473 static daddr_t
   2474 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2475 {
   2476 	daddr_t map_offset;
   2477 
   2478 	KASSERT(raidPtr->bytesPerSector);
   2479 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2480 		map_offset = raidPtr->bytesPerSector;
   2481 	else
   2482 		map_offset = RF_COMPONENT_INFO_SIZE;
   2483 	map_offset += rf_component_info_offset();
   2484 
   2485 	return map_offset;
   2486 }
   2487 
   2488 static daddr_t
   2489 rf_parity_map_size(RF_Raid_t *raidPtr)
   2490 {
   2491 	daddr_t map_size;
   2492 
   2493 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2494 		map_size = raidPtr->bytesPerSector;
   2495 	else
   2496 		map_size = RF_PARITY_MAP_SIZE;
   2497 
   2498 	return map_size;
   2499 }
   2500 
   2501 int
   2502 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2503 {
   2504 	RF_ComponentLabel_t *clabel;
   2505 
   2506 	clabel = raidget_component_label(raidPtr, col);
   2507 	clabel->clean = RF_RAID_CLEAN;
   2508 	raidflush_component_label(raidPtr, col);
   2509 	return(0);
   2510 }
   2511 
   2512 
   2513 int
   2514 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2515 {
   2516 	RF_ComponentLabel_t *clabel;
   2517 
   2518 	clabel = raidget_component_label(raidPtr, col);
   2519 	clabel->clean = RF_RAID_DIRTY;
   2520 	raidflush_component_label(raidPtr, col);
   2521 	return(0);
   2522 }
   2523 
   2524 int
   2525 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2526 {
   2527 	KASSERT(raidPtr->bytesPerSector);
   2528 	return raidread_component_label(raidPtr->bytesPerSector,
   2529 	    raidPtr->Disks[col].dev,
   2530 	    raidPtr->raid_cinfo[col].ci_vp,
   2531 	    &raidPtr->raid_cinfo[col].ci_label);
   2532 }
   2533 
   2534 RF_ComponentLabel_t *
   2535 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2536 {
   2537 	return &raidPtr->raid_cinfo[col].ci_label;
   2538 }
   2539 
   2540 int
   2541 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2542 {
   2543 	RF_ComponentLabel_t *label;
   2544 
   2545 	label = &raidPtr->raid_cinfo[col].ci_label;
   2546 	label->mod_counter = raidPtr->mod_counter;
   2547 #ifndef RF_NO_PARITY_MAP
   2548 	label->parity_map_modcount = label->mod_counter;
   2549 #endif
   2550 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2551 	    raidPtr->Disks[col].dev,
   2552 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2553 }
   2554 
   2555 
   2556 static int
   2557 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2558     RF_ComponentLabel_t *clabel)
   2559 {
   2560 	return raidread_component_area(dev, b_vp, clabel,
   2561 	    sizeof(RF_ComponentLabel_t),
   2562 	    rf_component_info_offset(),
   2563 	    rf_component_info_size(secsize));
   2564 }
   2565 
   2566 /* ARGSUSED */
   2567 static int
   2568 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2569     size_t msize, daddr_t offset, daddr_t dsize)
   2570 {
   2571 	struct buf *bp;
   2572 	const struct bdevsw *bdev;
   2573 	int error;
   2574 
   2575 	/* XXX should probably ensure that we don't try to do this if
   2576 	   someone has changed rf_protected_sectors. */
   2577 
   2578 	if (b_vp == NULL) {
   2579 		/* For whatever reason, this component is not valid.
   2580 		   Don't try to read a component label from it. */
   2581 		return(EINVAL);
   2582 	}
   2583 
   2584 	/* get a block of the appropriate size... */
   2585 	bp = geteblk((int)dsize);
   2586 	bp->b_dev = dev;
   2587 
   2588 	/* get our ducks in a row for the read */
   2589 	bp->b_blkno = offset / DEV_BSIZE;
   2590 	bp->b_bcount = dsize;
   2591 	bp->b_flags |= B_READ;
   2592  	bp->b_resid = dsize;
   2593 
   2594 	bdev = bdevsw_lookup(bp->b_dev);
   2595 	if (bdev == NULL)
   2596 		return (ENXIO);
   2597 	(*bdev->d_strategy)(bp);
   2598 
   2599 	error = biowait(bp);
   2600 
   2601 	if (!error) {
   2602 		memcpy(data, bp->b_data, msize);
   2603 	}
   2604 
   2605 	brelse(bp, 0);
   2606 	return(error);
   2607 }
   2608 
   2609 
   2610 static int
   2611 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2612     RF_ComponentLabel_t *clabel)
   2613 {
   2614 	return raidwrite_component_area(dev, b_vp, clabel,
   2615 	    sizeof(RF_ComponentLabel_t),
   2616 	    rf_component_info_offset(),
   2617 	    rf_component_info_size(secsize), 0);
   2618 }
   2619 
   2620 /* ARGSUSED */
   2621 static int
   2622 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2623     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2624 {
   2625 	struct buf *bp;
   2626 	const struct bdevsw *bdev;
   2627 	int error;
   2628 
   2629 	/* get a block of the appropriate size... */
   2630 	bp = geteblk((int)dsize);
   2631 	bp->b_dev = dev;
   2632 
   2633 	/* get our ducks in a row for the write */
   2634 	bp->b_blkno = offset / DEV_BSIZE;
   2635 	bp->b_bcount = dsize;
   2636 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2637  	bp->b_resid = dsize;
   2638 
   2639 	memset(bp->b_data, 0, dsize);
   2640 	memcpy(bp->b_data, data, msize);
   2641 
   2642 	bdev = bdevsw_lookup(bp->b_dev);
   2643 	if (bdev == NULL)
   2644 		return (ENXIO);
   2645 	(*bdev->d_strategy)(bp);
   2646 	if (asyncp)
   2647 		return 0;
   2648 	error = biowait(bp);
   2649 	brelse(bp, 0);
   2650 	if (error) {
   2651 #if 1
   2652 		printf("Failed to write RAID component info!\n");
   2653 #endif
   2654 	}
   2655 
   2656 	return(error);
   2657 }
   2658 
   2659 void
   2660 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2661 {
   2662 	int c;
   2663 
   2664 	for (c = 0; c < raidPtr->numCol; c++) {
   2665 		/* Skip dead disks. */
   2666 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2667 			continue;
   2668 		/* XXXjld: what if an error occurs here? */
   2669 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2670 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2671 		    RF_PARITYMAP_NBYTE,
   2672 		    rf_parity_map_offset(raidPtr),
   2673 		    rf_parity_map_size(raidPtr), 0);
   2674 	}
   2675 }
   2676 
   2677 void
   2678 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2679 {
   2680 	struct rf_paritymap_ondisk tmp;
   2681 	int c,first;
   2682 
   2683 	first=1;
   2684 	for (c = 0; c < raidPtr->numCol; c++) {
   2685 		/* Skip dead disks. */
   2686 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2687 			continue;
   2688 		raidread_component_area(raidPtr->Disks[c].dev,
   2689 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2690 		    RF_PARITYMAP_NBYTE,
   2691 		    rf_parity_map_offset(raidPtr),
   2692 		    rf_parity_map_size(raidPtr));
   2693 		if (first) {
   2694 			memcpy(map, &tmp, sizeof(*map));
   2695 			first = 0;
   2696 		} else {
   2697 			rf_paritymap_merge(map, &tmp);
   2698 		}
   2699 	}
   2700 }
   2701 
   2702 void
   2703 rf_markalldirty(RF_Raid_t *raidPtr)
   2704 {
   2705 	RF_ComponentLabel_t *clabel;
   2706 	int sparecol;
   2707 	int c;
   2708 	int j;
   2709 	int scol = -1;
   2710 
   2711 	raidPtr->mod_counter++;
   2712 	for (c = 0; c < raidPtr->numCol; c++) {
   2713 		/* we don't want to touch (at all) a disk that has
   2714 		   failed */
   2715 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2716 			clabel = raidget_component_label(raidPtr, c);
   2717 			if (clabel->status == rf_ds_spared) {
   2718 				/* XXX do something special...
   2719 				   but whatever you do, don't
   2720 				   try to access it!! */
   2721 			} else {
   2722 				raidmarkdirty(raidPtr, c);
   2723 			}
   2724 		}
   2725 	}
   2726 
   2727 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2728 		sparecol = raidPtr->numCol + c;
   2729 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2730 			/*
   2731 
   2732 			   we claim this disk is "optimal" if it's
   2733 			   rf_ds_used_spare, as that means it should be
   2734 			   directly substitutable for the disk it replaced.
   2735 			   We note that too...
   2736 
   2737 			 */
   2738 
   2739 			for(j=0;j<raidPtr->numCol;j++) {
   2740 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2741 					scol = j;
   2742 					break;
   2743 				}
   2744 			}
   2745 
   2746 			clabel = raidget_component_label(raidPtr, sparecol);
   2747 			/* make sure status is noted */
   2748 
   2749 			raid_init_component_label(raidPtr, clabel);
   2750 
   2751 			clabel->row = 0;
   2752 			clabel->column = scol;
   2753 			/* Note: we *don't* change status from rf_ds_used_spare
   2754 			   to rf_ds_optimal */
   2755 			/* clabel.status = rf_ds_optimal; */
   2756 
   2757 			raidmarkdirty(raidPtr, sparecol);
   2758 		}
   2759 	}
   2760 }
   2761 
   2762 
   2763 void
   2764 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2765 {
   2766 	RF_ComponentLabel_t *clabel;
   2767 	int sparecol;
   2768 	int c;
   2769 	int j;
   2770 	int scol;
   2771 
   2772 	scol = -1;
   2773 
   2774 	/* XXX should do extra checks to make sure things really are clean,
   2775 	   rather than blindly setting the clean bit... */
   2776 
   2777 	raidPtr->mod_counter++;
   2778 
   2779 	for (c = 0; c < raidPtr->numCol; c++) {
   2780 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2781 			clabel = raidget_component_label(raidPtr, c);
   2782 			/* make sure status is noted */
   2783 			clabel->status = rf_ds_optimal;
   2784 
   2785 			/* note what unit we are configured as */
   2786 			clabel->last_unit = raidPtr->raidid;
   2787 
   2788 			raidflush_component_label(raidPtr, c);
   2789 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2790 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2791 					raidmarkclean(raidPtr, c);
   2792 				}
   2793 			}
   2794 		}
   2795 		/* else we don't touch it.. */
   2796 	}
   2797 
   2798 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2799 		sparecol = raidPtr->numCol + c;
   2800 		/* Need to ensure that the reconstruct actually completed! */
   2801 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2802 			/*
   2803 
   2804 			   we claim this disk is "optimal" if it's
   2805 			   rf_ds_used_spare, as that means it should be
   2806 			   directly substitutable for the disk it replaced.
   2807 			   We note that too...
   2808 
   2809 			 */
   2810 
   2811 			for(j=0;j<raidPtr->numCol;j++) {
   2812 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2813 					scol = j;
   2814 					break;
   2815 				}
   2816 			}
   2817 
   2818 			/* XXX shouldn't *really* need this... */
   2819 			clabel = raidget_component_label(raidPtr, sparecol);
   2820 			/* make sure status is noted */
   2821 
   2822 			raid_init_component_label(raidPtr, clabel);
   2823 
   2824 			clabel->column = scol;
   2825 			clabel->status = rf_ds_optimal;
   2826 			clabel->last_unit = raidPtr->raidid;
   2827 
   2828 			raidflush_component_label(raidPtr, sparecol);
   2829 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2830 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2831 					raidmarkclean(raidPtr, sparecol);
   2832 				}
   2833 			}
   2834 		}
   2835 	}
   2836 }
   2837 
   2838 void
   2839 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2840 {
   2841 
   2842 	if (vp != NULL) {
   2843 		if (auto_configured == 1) {
   2844 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2845 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2846 			vput(vp);
   2847 
   2848 		} else {
   2849 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2850 		}
   2851 	}
   2852 }
   2853 
   2854 
   2855 void
   2856 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2857 {
   2858 	int r,c;
   2859 	struct vnode *vp;
   2860 	int acd;
   2861 
   2862 
   2863 	/* We take this opportunity to close the vnodes like we should.. */
   2864 
   2865 	for (c = 0; c < raidPtr->numCol; c++) {
   2866 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2867 		acd = raidPtr->Disks[c].auto_configured;
   2868 		rf_close_component(raidPtr, vp, acd);
   2869 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2870 		raidPtr->Disks[c].auto_configured = 0;
   2871 	}
   2872 
   2873 	for (r = 0; r < raidPtr->numSpare; r++) {
   2874 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2875 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2876 		rf_close_component(raidPtr, vp, acd);
   2877 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2878 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2879 	}
   2880 }
   2881 
   2882 
   2883 void
   2884 rf_ReconThread(struct rf_recon_req *req)
   2885 {
   2886 	int     s;
   2887 	RF_Raid_t *raidPtr;
   2888 
   2889 	s = splbio();
   2890 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2891 	raidPtr->recon_in_progress = 1;
   2892 
   2893 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2894 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2895 
   2896 	RF_Free(req, sizeof(*req));
   2897 
   2898 	raidPtr->recon_in_progress = 0;
   2899 	splx(s);
   2900 
   2901 	/* That's all... */
   2902 	kthread_exit(0);	/* does not return */
   2903 }
   2904 
   2905 void
   2906 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2907 {
   2908 	int retcode;
   2909 	int s;
   2910 
   2911 	raidPtr->parity_rewrite_stripes_done = 0;
   2912 	raidPtr->parity_rewrite_in_progress = 1;
   2913 	s = splbio();
   2914 	retcode = rf_RewriteParity(raidPtr);
   2915 	splx(s);
   2916 	if (retcode) {
   2917 		printf("raid%d: Error re-writing parity (%d)!\n",
   2918 		    raidPtr->raidid, retcode);
   2919 	} else {
   2920 		/* set the clean bit!  If we shutdown correctly,
   2921 		   the clean bit on each component label will get
   2922 		   set */
   2923 		raidPtr->parity_good = RF_RAID_CLEAN;
   2924 	}
   2925 	raidPtr->parity_rewrite_in_progress = 0;
   2926 
   2927 	/* Anyone waiting for us to stop?  If so, inform them... */
   2928 	if (raidPtr->waitShutdown) {
   2929 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2930 	}
   2931 
   2932 	/* That's all... */
   2933 	kthread_exit(0);	/* does not return */
   2934 }
   2935 
   2936 
   2937 void
   2938 rf_CopybackThread(RF_Raid_t *raidPtr)
   2939 {
   2940 	int s;
   2941 
   2942 	raidPtr->copyback_in_progress = 1;
   2943 	s = splbio();
   2944 	rf_CopybackReconstructedData(raidPtr);
   2945 	splx(s);
   2946 	raidPtr->copyback_in_progress = 0;
   2947 
   2948 	/* That's all... */
   2949 	kthread_exit(0);	/* does not return */
   2950 }
   2951 
   2952 
   2953 void
   2954 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2955 {
   2956 	int s;
   2957 	RF_Raid_t *raidPtr;
   2958 
   2959 	s = splbio();
   2960 	raidPtr = req->raidPtr;
   2961 	raidPtr->recon_in_progress = 1;
   2962 	rf_ReconstructInPlace(raidPtr, req->col);
   2963 	RF_Free(req, sizeof(*req));
   2964 	raidPtr->recon_in_progress = 0;
   2965 	splx(s);
   2966 
   2967 	/* That's all... */
   2968 	kthread_exit(0);	/* does not return */
   2969 }
   2970 
   2971 static RF_AutoConfig_t *
   2972 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2973     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2974     unsigned secsize)
   2975 {
   2976 	int good_one = 0;
   2977 	RF_ComponentLabel_t *clabel;
   2978 	RF_AutoConfig_t *ac;
   2979 
   2980 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2981 	if (clabel == NULL) {
   2982 oomem:
   2983 		    while(ac_list) {
   2984 			    ac = ac_list;
   2985 			    if (ac->clabel)
   2986 				    free(ac->clabel, M_RAIDFRAME);
   2987 			    ac_list = ac_list->next;
   2988 			    free(ac, M_RAIDFRAME);
   2989 		    }
   2990 		    printf("RAID auto config: out of memory!\n");
   2991 		    return NULL; /* XXX probably should panic? */
   2992 	}
   2993 
   2994 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2995 		/* Got the label.  Does it look reasonable? */
   2996 		if (rf_reasonable_label(clabel, numsecs) &&
   2997 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2998 #ifdef DEBUG
   2999 			printf("Component on: %s: %llu\n",
   3000 				cname, (unsigned long long)size);
   3001 			rf_print_component_label(clabel);
   3002 #endif
   3003 			/* if it's reasonable, add it, else ignore it. */
   3004 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3005 				M_NOWAIT);
   3006 			if (ac == NULL) {
   3007 				free(clabel, M_RAIDFRAME);
   3008 				goto oomem;
   3009 			}
   3010 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3011 			ac->dev = dev;
   3012 			ac->vp = vp;
   3013 			ac->clabel = clabel;
   3014 			ac->next = ac_list;
   3015 			ac_list = ac;
   3016 			good_one = 1;
   3017 		}
   3018 	}
   3019 	if (!good_one) {
   3020 		/* cleanup */
   3021 		free(clabel, M_RAIDFRAME);
   3022 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3023 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3024 		vput(vp);
   3025 	}
   3026 	return ac_list;
   3027 }
   3028 
   3029 RF_AutoConfig_t *
   3030 rf_find_raid_components(void)
   3031 {
   3032 	struct vnode *vp;
   3033 	struct disklabel label;
   3034 	device_t dv;
   3035 	deviter_t di;
   3036 	dev_t dev;
   3037 	int bmajor, bminor, wedge, rf_part_found;
   3038 	int error;
   3039 	int i;
   3040 	RF_AutoConfig_t *ac_list;
   3041 	uint64_t numsecs;
   3042 	unsigned secsize;
   3043 
   3044 	/* initialize the AutoConfig list */
   3045 	ac_list = NULL;
   3046 
   3047 	/* we begin by trolling through *all* the devices on the system */
   3048 
   3049 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3050 	     dv = deviter_next(&di)) {
   3051 
   3052 		/* we are only interested in disks... */
   3053 		if (device_class(dv) != DV_DISK)
   3054 			continue;
   3055 
   3056 		/* we don't care about floppies... */
   3057 		if (device_is_a(dv, "fd")) {
   3058 			continue;
   3059 		}
   3060 
   3061 		/* we don't care about CD's... */
   3062 		if (device_is_a(dv, "cd")) {
   3063 			continue;
   3064 		}
   3065 
   3066 		/* we don't care about md's... */
   3067 		if (device_is_a(dv, "md")) {
   3068 			continue;
   3069 		}
   3070 
   3071 		/* hdfd is the Atari/Hades floppy driver */
   3072 		if (device_is_a(dv, "hdfd")) {
   3073 			continue;
   3074 		}
   3075 
   3076 		/* fdisa is the Atari/Milan floppy driver */
   3077 		if (device_is_a(dv, "fdisa")) {
   3078 			continue;
   3079 		}
   3080 
   3081 		/* need to find the device_name_to_block_device_major stuff */
   3082 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3083 
   3084 		rf_part_found = 0; /* No raid partition as yet */
   3085 
   3086 		/* get a vnode for the raw partition of this disk */
   3087 
   3088 		wedge = device_is_a(dv, "dk");
   3089 		bminor = minor(device_unit(dv));
   3090 		dev = wedge ? makedev(bmajor, bminor) :
   3091 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3092 		if (bdevvp(dev, &vp))
   3093 			panic("RAID can't alloc vnode");
   3094 
   3095 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3096 
   3097 		if (error) {
   3098 			/* "Who cares."  Continue looking
   3099 			   for something that exists*/
   3100 			vput(vp);
   3101 			continue;
   3102 		}
   3103 
   3104 		error = getdisksize(vp, &numsecs, &secsize);
   3105 		if (error) {
   3106 			vput(vp);
   3107 			continue;
   3108 		}
   3109 		if (wedge) {
   3110 			struct dkwedge_info dkw;
   3111 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3112 			    NOCRED);
   3113 			if (error) {
   3114 				printf("RAIDframe: can't get wedge info for "
   3115 				    "dev %s (%d)\n", device_xname(dv), error);
   3116 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3117 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3118 				vput(vp);
   3119 				continue;
   3120 			}
   3121 
   3122 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3123 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3124 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3125 				vput(vp);
   3126 				continue;
   3127 			}
   3128 
   3129 			ac_list = rf_get_component(ac_list, dev, vp,
   3130 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3131 			rf_part_found = 1; /* There is a raid component on this disk */
   3132 			continue;
   3133 		}
   3134 
   3135 		/* Ok, the disk exists.  Go get the disklabel. */
   3136 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3137 		if (error) {
   3138 			/*
   3139 			 * XXX can't happen - open() would
   3140 			 * have errored out (or faked up one)
   3141 			 */
   3142 			if (error != ENOTTY)
   3143 				printf("RAIDframe: can't get label for dev "
   3144 				    "%s (%d)\n", device_xname(dv), error);
   3145 		}
   3146 
   3147 		/* don't need this any more.  We'll allocate it again
   3148 		   a little later if we really do... */
   3149 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3150 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3151 		vput(vp);
   3152 
   3153 		if (error)
   3154 			continue;
   3155 
   3156 		rf_part_found = 0; /* No raid partitions yet */
   3157 		for (i = 0; i < label.d_npartitions; i++) {
   3158 			char cname[sizeof(ac_list->devname)];
   3159 
   3160 			/* We only support partitions marked as RAID */
   3161 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3162 				continue;
   3163 
   3164 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3165 			if (bdevvp(dev, &vp))
   3166 				panic("RAID can't alloc vnode");
   3167 
   3168 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3169 			if (error) {
   3170 				/* Whatever... */
   3171 				vput(vp);
   3172 				continue;
   3173 			}
   3174 			snprintf(cname, sizeof(cname), "%s%c",
   3175 			    device_xname(dv), 'a' + i);
   3176 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3177 				label.d_partitions[i].p_size, numsecs, secsize);
   3178 			rf_part_found = 1; /* There is at least one raid partition on this disk */
   3179 		}
   3180 
   3181 		/*
   3182 		 * If there is no raid component on this disk, either in a
   3183 		 * disklabel or inside a wedge, check the raw partition as
   3184 		 * well, as it is possible to configure raid components on
   3185 		 * raw disk devices.
   3186 		 */
   3187 
   3188 		if (!rf_part_found) {
   3189 			char cname[sizeof(ac_list->devname)];
   3190 
   3191 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3192 			if (bdevvp(dev, &vp))
   3193 				panic("RAID can't alloc vnode");
   3194 
   3195 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3196 			if (error) {
   3197 				/* Whatever... */
   3198 				vput(vp);
   3199 				continue;
   3200 			}
   3201 			snprintf(cname, sizeof(cname), "%s%c",
   3202 			    device_xname(dv), 'a' + RAW_PART);
   3203 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3204 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3205 		}
   3206 	}
   3207 	deviter_release(&di);
   3208 	return ac_list;
   3209 }
   3210 
   3211 
   3212 int
   3213 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3214 {
   3215 
   3216 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3217 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3218 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3219 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3220 	    clabel->row >=0 &&
   3221 	    clabel->column >= 0 &&
   3222 	    clabel->num_rows > 0 &&
   3223 	    clabel->num_columns > 0 &&
   3224 	    clabel->row < clabel->num_rows &&
   3225 	    clabel->column < clabel->num_columns &&
   3226 	    clabel->blockSize > 0 &&
   3227 	    /*
   3228 	     * numBlocksHi may contain garbage, but it is ok since
   3229 	     * the type is unsigned.  If it is really garbage,
   3230 	     * rf_fix_old_label_size() will fix it.
   3231 	     */
   3232 	    rf_component_label_numblocks(clabel) > 0) {
   3233 		/*
   3234 		 * label looks reasonable enough...
   3235 		 * let's make sure it has no old garbage.
   3236 		 */
   3237 		if (numsecs)
   3238 			rf_fix_old_label_size(clabel, numsecs);
   3239 		return(1);
   3240 	}
   3241 	return(0);
   3242 }
   3243 
   3244 
   3245 /*
   3246  * For reasons yet unknown, some old component labels have garbage in
   3247  * the newer numBlocksHi region, and this causes lossage.  Since those
   3248  * disks will also have numsecs set to less than 32 bits of sectors,
   3249  * we can determine when this corruption has occurred, and fix it.
   3250  *
   3251  * The exact same problem, with the same unknown reason, happens to
   3252  * the partitionSizeHi member as well.
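         *
         * For example, a 1 TB component has roughly 2 * 10^9 512-byte
         * sectors, well below 2^32, so any nonzero numBlocksHi read from
         * such a component can only be garbage and is cleared here.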
   3253  */
   3254 static void
   3255 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3256 {
   3257 
   3258 	if (numsecs < ((uint64_t)1 << 32)) {
   3259 		if (clabel->numBlocksHi) {
   3260 			printf("WARNING: total sectors < 32 bits, yet "
   3261 			       "numBlocksHi set\n"
   3262 			       "WARNING: resetting numBlocksHi to zero.\n");
   3263 			clabel->numBlocksHi = 0;
   3264 		}
   3265 
   3266 		if (clabel->partitionSizeHi) {
   3267 			printf("WARNING: total sectors < 32 bits, yet "
   3268 			       "partitionSizeHi set\n"
   3269 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3270 			clabel->partitionSizeHi = 0;
   3271 		}
   3272 	}
   3273 }
   3274 
   3275 
   3276 #ifdef DEBUG
   3277 void
   3278 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3279 {
   3280 	uint64_t numBlocks;
   3281 
   3282 	numBlocks = rf_component_label_numblocks(clabel);
   3283 
   3284 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3285 	       clabel->row, clabel->column,
   3286 	       clabel->num_rows, clabel->num_columns);
   3287 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3288 	       clabel->version, clabel->serial_number,
   3289 	       clabel->mod_counter);
   3290 	printf("   Clean: %s Status: %d\n",
   3291 	       clabel->clean ? "Yes" : "No", clabel->status);
   3292 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3293 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3294 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3295 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3296 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3297 	printf("   Contains root partition: %s\n",
   3298 	       clabel->root_partition ? "Yes" : "No");
   3299 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3300 #if 0
   3301 	   printf("   Config order: %d\n", clabel->config_order);
   3302 #endif
   3303 
   3304 }
   3305 #endif
   3306 
   3307 RF_ConfigSet_t *
   3308 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3309 {
   3310 	RF_AutoConfig_t *ac;
   3311 	RF_ConfigSet_t *config_sets;
   3312 	RF_ConfigSet_t *cset;
   3313 	RF_AutoConfig_t *ac_next;
   3314 
   3315 
   3316 	config_sets = NULL;
   3317 
   3318 	/* Go through the AutoConfig list, and figure out which components
   3319 	   belong to what sets.  */
   3320 	ac = ac_list;
   3321 	while(ac!=NULL) {
   3322 		/* we're going to putz with ac->next, so save it here
   3323 		   for use at the end of the loop */
   3324 		ac_next = ac->next;
   3325 
   3326 		if (config_sets == NULL) {
   3327 			/* will need at least this one... */
   3328 			config_sets = (RF_ConfigSet_t *)
   3329 				malloc(sizeof(RF_ConfigSet_t),
   3330 				       M_RAIDFRAME, M_NOWAIT);
   3331 			if (config_sets == NULL) {
   3332 				panic("rf_create_auto_sets: No memory!");
   3333 			}
   3334 			/* this one is easy :) */
   3335 			config_sets->ac = ac;
   3336 			config_sets->next = NULL;
   3337 			config_sets->rootable = 0;
   3338 			ac->next = NULL;
   3339 		} else {
   3340 			/* which set does this component fit into? */
   3341 			cset = config_sets;
   3342 			while(cset!=NULL) {
   3343 				if (rf_does_it_fit(cset, ac)) {
   3344 					/* looks like it matches... */
   3345 					ac->next = cset->ac;
   3346 					cset->ac = ac;
   3347 					break;
   3348 				}
   3349 				cset = cset->next;
   3350 			}
   3351 			if (cset==NULL) {
   3352 				/* didn't find a match above... new set..*/
   3353 				cset = (RF_ConfigSet_t *)
   3354 					malloc(sizeof(RF_ConfigSet_t),
   3355 					       M_RAIDFRAME, M_NOWAIT);
   3356 				if (cset == NULL) {
   3357 					panic("rf_create_auto_sets: No memory!");
   3358 				}
   3359 				cset->ac = ac;
   3360 				ac->next = NULL;
   3361 				cset->next = config_sets;
   3362 				cset->rootable = 0;
   3363 				config_sets = cset;
   3364 			}
   3365 		}
   3366 		ac = ac_next;
   3367 	}
   3368 
   3369 
   3370 	return(config_sets);
   3371 }
   3372 
   3373 static int
   3374 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3375 {
   3376 	RF_ComponentLabel_t *clabel1, *clabel2;
   3377 
   3378 	/* If this one matches the *first* one in the set, that's good
   3379 	   enough, since the other members of the set would have been
   3380 	   through here too... */
   3381 	/* note that we are not checking partitionSize here..
   3382 
   3383 	   Note that we are also not checking the mod_counters here.
    3384 	   If everything else matches except the mod_counter, that's
   3385 	   good enough for this test.  We will deal with the mod_counters
   3386 	   a little later in the autoconfiguration process.
   3387 
   3388 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3389 
   3390 	   The reason we don't check for this is that failed disks
   3391 	   will have lower modification counts.  If those disks are
   3392 	   not added to the set they used to belong to, then they will
   3393 	   form their own set, which may result in 2 different sets,
   3394 	   for example, competing to be configured at raid0, and
   3395 	   perhaps competing to be the root filesystem set.  If the
   3396 	   wrong ones get configured, or both attempt to become /,
    3397 	   weird behaviour and/or serious lossage will occur.  Thus we
   3398 	   need to bring them into the fold here, and kick them out at
   3399 	   a later point.
   3400 
   3401 	*/
   3402 
   3403 	clabel1 = cset->ac->clabel;
   3404 	clabel2 = ac->clabel;
   3405 	if ((clabel1->version == clabel2->version) &&
   3406 	    (clabel1->serial_number == clabel2->serial_number) &&
   3407 	    (clabel1->num_rows == clabel2->num_rows) &&
   3408 	    (clabel1->num_columns == clabel2->num_columns) &&
   3409 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3410 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3411 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3412 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3413 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3414 	    (clabel1->blockSize == clabel2->blockSize) &&
   3415 	    rf_component_label_numblocks(clabel1) ==
   3416 	    rf_component_label_numblocks(clabel2) &&
   3417 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3418 	    (clabel1->root_partition == clabel2->root_partition) &&
   3419 	    (clabel1->last_unit == clabel2->last_unit) &&
   3420 	    (clabel1->config_order == clabel2->config_order)) {
    3421 		/* if it gets here, it almost *has* to be a match */
   3422 	} else {
   3423 		/* it's not consistent with somebody in the set..
   3424 		   punt */
   3425 		return(0);
   3426 	}
   3427 	/* all was fine.. it must fit... */
   3428 	return(1);
   3429 }
   3430 
   3431 int
   3432 rf_have_enough_components(RF_ConfigSet_t *cset)
   3433 {
   3434 	RF_AutoConfig_t *ac;
   3435 	RF_AutoConfig_t *auto_config;
   3436 	RF_ComponentLabel_t *clabel;
   3437 	int c;
   3438 	int num_cols;
   3439 	int num_missing;
   3440 	int mod_counter;
   3441 	int mod_counter_found;
   3442 	int even_pair_failed;
   3443 	char parity_type;
   3444 
   3445 
   3446 	/* check to see that we have enough 'live' components
   3447 	   of this set.  If so, we can configure it if necessary */
   3448 
   3449 	num_cols = cset->ac->clabel->num_columns;
   3450 	parity_type = cset->ac->clabel->parityConfig;
   3451 
   3452 	/* XXX Check for duplicate components!?!?!? */
   3453 
   3454 	/* Determine what the mod_counter is supposed to be for this set. */
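         	/* (The largest mod_counter found wins; components that failed
         	   earlier will carry stale, smaller values.) */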
   3455 
   3456 	mod_counter_found = 0;
   3457 	mod_counter = 0;
   3458 	ac = cset->ac;
   3459 	while(ac!=NULL) {
   3460 		if (mod_counter_found==0) {
   3461 			mod_counter = ac->clabel->mod_counter;
   3462 			mod_counter_found = 1;
   3463 		} else {
   3464 			if (ac->clabel->mod_counter > mod_counter) {
   3465 				mod_counter = ac->clabel->mod_counter;
   3466 			}
   3467 		}
   3468 		ac = ac->next;
   3469 	}
   3470 
   3471 	num_missing = 0;
   3472 	auto_config = cset->ac;
   3473 
   3474 	even_pair_failed = 0;
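         	/* Scan each column for a component stamped with the current
         	   mod_counter.  For RAID 1 a missing component is tolerated as
         	   long as its mirror partner (the other half of the even/odd
         	   pair) is present; for other levels it simply counts as
         	   missing. */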
   3475 	for(c=0; c<num_cols; c++) {
   3476 		ac = auto_config;
   3477 		while(ac!=NULL) {
   3478 			if ((ac->clabel->column == c) &&
   3479 			    (ac->clabel->mod_counter == mod_counter)) {
   3480 				/* it's this one... */
   3481 #ifdef DEBUG
   3482 				printf("Found: %s at %d\n",
   3483 				       ac->devname,c);
   3484 #endif
   3485 				break;
   3486 			}
   3487 			ac=ac->next;
   3488 		}
   3489 		if (ac==NULL) {
   3490 				/* Didn't find one here! */
   3491 				/* special case for RAID 1, especially
   3492 				   where there are more than 2
   3493 				   components (where RAIDframe treats
   3494 				   things a little differently :( ) */
   3495 			if (parity_type == '1') {
   3496 				if (c%2 == 0) { /* even component */
   3497 					even_pair_failed = 1;
    3498 				} else { /* odd component.  If
    3499 					    this one is missing,
    3500 					    and so is its even
    3501 					    partner, it's
    3502 					    "Good Night, Charlie" */
   3503 					if (even_pair_failed == 1) {
   3504 						return(0);
   3505 					}
   3506 				}
   3507 			} else {
   3508 				/* normal accounting */
   3509 				num_missing++;
   3510 			}
   3511 		}
   3512 		if ((parity_type == '1') && (c%2 == 1)) {
    3513 				/* Just did the odd component of a pair, and
    3514 				   we didn't bail.. reset the even_pair_failed
    3515 				   flag, and go on to the next pair.... */
   3516 			even_pair_failed = 0;
   3517 		}
   3518 	}
   3519 
   3520 	clabel = cset->ac->clabel;
   3521 
   3522 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3523 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3524 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3525 		/* XXX this needs to be made *much* more general */
   3526 		/* Too many failures */
   3527 		return(0);
   3528 	}
   3529 	/* otherwise, all is well, and we've got enough to take a kick
   3530 	   at autoconfiguring this set */
   3531 	return(1);
   3532 }
   3533 
   3534 void
   3535 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3536 			RF_Raid_t *raidPtr)
   3537 {
   3538 	RF_ComponentLabel_t *clabel;
   3539 	int i;
   3540 
   3541 	clabel = ac->clabel;
   3542 
   3543 	/* 1. Fill in the common stuff */
   3544 	config->numRow = clabel->num_rows = 1;
   3545 	config->numCol = clabel->num_columns;
   3546 	config->numSpare = 0; /* XXX should this be set here? */
   3547 	config->sectPerSU = clabel->sectPerSU;
   3548 	config->SUsPerPU = clabel->SUsPerPU;
   3549 	config->SUsPerRU = clabel->SUsPerRU;
   3550 	config->parityConfig = clabel->parityConfig;
   3551 	/* XXX... */
   3552 	strcpy(config->diskQueueType,"fifo");
   3553 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3554 	config->layoutSpecificSize = 0; /* XXX ?? */
   3555 
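         	/* 2. Fill in the component device names, indexed by column */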
   3556 	while(ac!=NULL) {
   3557 		/* row/col values will be in range due to the checks
   3558 		   in reasonable_label() */
   3559 		strcpy(config->devnames[0][ac->clabel->column],
   3560 		       ac->devname);
   3561 		ac = ac->next;
   3562 	}
   3563 
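         	/* 3. Clear the debug variables */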
   3564 	for(i=0;i<RF_MAXDBGV;i++) {
   3565 		config->debugVars[i][0] = 0;
   3566 	}
   3567 }
   3568 
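         /*
          * Record the new autoconfigure setting in the RAID structure and in
          * the component label of every optimal component and in-use spare,
          * flushing each updated label back to its component.  Returns the
          * value that was set.
          */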
   3569 int
   3570 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3571 {
   3572 	RF_ComponentLabel_t *clabel;
   3573 	int column;
   3574 	int sparecol;
   3575 
   3576 	raidPtr->autoconfigure = new_value;
   3577 
   3578 	for(column=0; column<raidPtr->numCol; column++) {
   3579 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3580 			clabel = raidget_component_label(raidPtr, column);
   3581 			clabel->autoconfigure = new_value;
   3582 			raidflush_component_label(raidPtr, column);
   3583 		}
   3584 	}
   3585 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3586 		sparecol = raidPtr->numCol + column;
   3587 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3588 			clabel = raidget_component_label(raidPtr, sparecol);
   3589 			clabel->autoconfigure = new_value;
   3590 			raidflush_component_label(raidPtr, sparecol);
   3591 		}
   3592 	}
   3593 	return(new_value);
   3594 }
   3595 
   3596 int
   3597 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3598 {
   3599 	RF_ComponentLabel_t *clabel;
   3600 	int column;
   3601 	int sparecol;
   3602 
   3603 	raidPtr->root_partition = new_value;
   3604 	for(column=0; column<raidPtr->numCol; column++) {
   3605 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3606 			clabel = raidget_component_label(raidPtr, column);
   3607 			clabel->root_partition = new_value;
   3608 			raidflush_component_label(raidPtr, column);
   3609 		}
   3610 	}
   3611 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3612 		sparecol = raidPtr->numCol + column;
   3613 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3614 			clabel = raidget_component_label(raidPtr, sparecol);
   3615 			clabel->root_partition = new_value;
   3616 			raidflush_component_label(raidPtr, sparecol);
   3617 		}
   3618 	}
   3619 	return(new_value);
   3620 }
   3621 
   3622 void
   3623 rf_release_all_vps(RF_ConfigSet_t *cset)
   3624 {
   3625 	RF_AutoConfig_t *ac;
   3626 
   3627 	ac = cset->ac;
   3628 	while(ac!=NULL) {
   3629 		/* Close the vp, and give it back */
   3630 		if (ac->vp) {
   3631 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3632 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3633 			vput(ac->vp);
   3634 			ac->vp = NULL;
   3635 		}
   3636 		ac = ac->next;
   3637 	}
   3638 }
   3639 
   3640 
   3641 void
   3642 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3643 {
   3644 	RF_AutoConfig_t *ac;
   3645 	RF_AutoConfig_t *next_ac;
   3646 
   3647 	ac = cset->ac;
   3648 	while(ac!=NULL) {
   3649 		next_ac = ac->next;
   3650 		/* nuke the label */
   3651 		free(ac->clabel, M_RAIDFRAME);
   3652 		/* cleanup the config structure */
   3653 		free(ac, M_RAIDFRAME);
   3654 		/* "next.." */
   3655 		ac = next_ac;
   3656 	}
   3657 	/* and, finally, nuke the config set */
   3658 	free(cset, M_RAIDFRAME);
   3659 }
   3660 
   3661 
   3662 void
   3663 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3664 {
   3665 	/* current version number */
   3666 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3667 	clabel->serial_number = raidPtr->serial_number;
   3668 	clabel->mod_counter = raidPtr->mod_counter;
   3669 
   3670 	clabel->num_rows = 1;
   3671 	clabel->num_columns = raidPtr->numCol;
   3672 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3673 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3674 
   3675 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3676 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3677 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3678 
   3679 	clabel->blockSize = raidPtr->bytesPerSector;
   3680 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3681 
   3682 	/* XXX not portable */
   3683 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3684 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3685 	clabel->autoconfigure = raidPtr->autoconfigure;
   3686 	clabel->root_partition = raidPtr->root_partition;
   3687 	clabel->last_unit = raidPtr->raidid;
   3688 	clabel->config_order = raidPtr->config_order;
   3689 
   3690 #ifndef RF_NO_PARITY_MAP
   3691 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3692 #endif
   3693 }
   3694 
   3695 int
   3696 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3697 {
   3698 	RF_Raid_t *raidPtr;
   3699 	RF_Config_t *config;
   3700 	int raidID;
   3701 	int retcode;
   3702 
   3703 #ifdef DEBUG
   3704 	printf("RAID autoconfigure\n");
   3705 #endif
   3706 
   3707 	retcode = 0;
   3708 	*unit = -1;
   3709 
   3710 	/* 1. Create a config structure */
   3711 
   3712 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3713 				       M_RAIDFRAME,
   3714 				       M_NOWAIT);
   3715 	if (config==NULL) {
   3716 		printf("Out of mem!?!?\n");
   3717 				/* XXX do something more intelligent here. */
   3718 		return(1);
   3719 	}
   3720 
   3721 	memset(config, 0, sizeof(RF_Config_t));
   3722 
   3723 	/*
    3724 	   2. Figure out what RAID ID this one is supposed to live at.
   3725 	   See if we can get the same RAID dev that it was configured
   3726 	   on last time..
   3727 	*/
   3728 
   3729 	raidID = cset->ac->clabel->last_unit;
   3730 	if ((raidID < 0) || (raidID >= numraid)) {
   3731 		/* let's not wander off into lala land. */
   3732 		raidID = numraid - 1;
   3733 	}
   3734 	if (raidPtrs[raidID]->valid != 0) {
   3735 
   3736 		/*
   3737 		   Nope... Go looking for an alternative...
    3738 		   Start high so we don't immediately claim raid0 just
    3739 		   because it happens to be free.
   3740 		*/
   3741 
   3742 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3743 			if (raidPtrs[raidID]->valid == 0) {
   3744 				/* can use this one! */
   3745 				break;
   3746 			}
   3747 		}
   3748 	}
   3749 
   3750 	if (raidID < 0) {
   3751 		/* punt... */
   3752 		printf("Unable to auto configure this set!\n");
   3753 		printf("(Out of RAID devs!)\n");
   3754 		free(config, M_RAIDFRAME);
   3755 		return(1);
   3756 	}
   3757 
   3758 #ifdef DEBUG
   3759 	printf("Configuring raid%d:\n",raidID);
   3760 #endif
   3761 
   3762 	raidPtr = raidPtrs[raidID];
   3763 
   3764 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3765 	raidPtr->raidid = raidID;
   3766 	raidPtr->openings = RAIDOUTSTANDING;
   3767 
   3768 	/* 3. Build the configuration structure */
   3769 	rf_create_configuration(cset->ac, config, raidPtr);
   3770 
   3771 	/* 4. Do the configuration */
   3772 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3773 
   3774 	if (retcode == 0) {
   3775 
   3776 		raidinit(raidPtrs[raidID]);
   3777 
   3778 		rf_markalldirty(raidPtrs[raidID]);
   3779 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3780 		if (cset->ac->clabel->root_partition==1) {
   3781 			/* everything configured just fine.  Make a note
   3782 			   that this set is eligible to be root. */
   3783 			cset->rootable = 1;
   3784 			/* XXX do this here? */
   3785 			raidPtrs[raidID]->root_partition = 1;
   3786 		}
   3787 	}
   3788 
   3789 	/* 5. Cleanup */
   3790 	free(config, M_RAIDFRAME);
   3791 
   3792 	*unit = raidID;
   3793 	return(retcode);
   3794 }
   3795 
   3796 void
   3797 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3798 {
   3799 	struct buf *bp;
   3800 
   3801 	bp = (struct buf *)desc->bp;
   3802 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3803 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3804 }
   3805 
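         /*
          * Convenience wrapper around pool_init(): create the pool,
          * pre-allocate xmin items, and set the low/high watermarks to
          * xmin/xmax respectively.
          */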
   3806 void
   3807 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3808 	     size_t xmin, size_t xmax)
   3809 {
   3810 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3811 	pool_sethiwat(p, xmax);
   3812 	pool_prime(p, xmin);
   3813 	pool_setlowat(p, xmin);
   3814 }
   3815 
   3816 /*
   3817  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3818  * if there is IO pending and if that IO could possibly be done for a
   3819  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3820  * otherwise.
   3821  *
   3822  */
   3823 
   3824 int
   3825 rf_buf_queue_check(int raidid)
   3826 {
   3827 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
   3828 	    raidPtrs[raidid]->openings > 0) {
   3829 		/* there is work to do */
   3830 		return 0;
   3831 	}
   3832 	/* default is nothing to do */
   3833 	return 1;
   3834 }
   3835 
   3836 int
   3837 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3838 {
   3839 	uint64_t numsecs;
   3840 	unsigned secsize;
   3841 	int error;
   3842 
   3843 	error = getdisksize(vp, &numsecs, &secsize);
   3844 	if (error == 0) {
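         		/* The first rf_protectedSectors of each component are
         		   reserved for RAIDframe metadata (the component label)
         		   and are not available as data blocks. */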
   3845 		diskPtr->blockSize = secsize;
   3846 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3847 		diskPtr->partitionSize = numsecs;
   3848 		return 0;
   3849 	}
   3850 	return error;
   3851 }
   3852 
   3853 static int
   3854 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3855 {
   3856 	return 1;
   3857 }
   3858 
   3859 static void
   3860 raid_attach(device_t parent, device_t self, void *aux)
   3861 {
   3862 
   3863 }
   3864 
   3865 
   3866 static int
   3867 raid_detach(device_t self, int flags)
   3868 {
   3869 	int error;
   3870 	struct raid_softc *rs = &raid_softc[device_unit(self)];
   3871 
   3872 	if ((error = raidlock(rs)) != 0)
   3873 		return (error);
   3874 
   3875 	error = raid_detach_unlocked(rs);
   3876 
   3877 	raidunlock(rs);
   3878 
   3879 	return error;
   3880 }
   3881 
   3882 static void
   3883 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3884 {
   3885 	prop_dictionary_t disk_info, odisk_info, geom;
   3886 	disk_info = prop_dictionary_create();
   3887 	geom = prop_dictionary_create();
   3888 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3889 				   raidPtr->totalSectors);
   3890 	prop_dictionary_set_uint32(geom, "sector-size",
   3891 				   raidPtr->bytesPerSector);
   3892 
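         	/* The geometry below is synthetic (a RAID set has no physical
         	   geometry): one "track" per data stripe, 4 * numCol "tracks"
         	   per cylinder, and cylinders derived from those. */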
   3893 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3894 				   raidPtr->Layout.dataSectorsPerStripe);
   3895 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3896 				   4 * raidPtr->numCol);
   3897 
   3898 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3899 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3900 	   (4 * raidPtr->numCol)));
   3901 
   3902 	prop_dictionary_set(disk_info, "geometry", geom);
   3903 	prop_object_release(geom);
   3904 	prop_dictionary_set(device_properties(rs->sc_dev),
   3905 			    "disk-info", disk_info);
   3906 	odisk_info = rs->sc_dkdev.dk_info;
   3907 	rs->sc_dkdev.dk_info = disk_info;
   3908 	if (odisk_info)
   3909 		prop_object_release(odisk_info);
   3910 }
   3911 
   3912 /*
   3913  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3914  * We end up returning whatever error was returned by the first cache flush
   3915  * that fails.
   3916  */
   3917 
   3918 int
   3919 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3920 {
   3921 	int c, sparecol;
   3922 	int e,error;
   3923 	int force = 1;
   3924 
   3925 	error = 0;
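         	/* First pass: flush the cache of every optimal data component. */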
   3926 	for (c = 0; c < raidPtr->numCol; c++) {
   3927 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3928 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3929 					  &force, FWRITE, NOCRED);
   3930 			if (e) {
   3931 				if (e != ENODEV)
   3932 					printf("raid%d: cache flush to component %s failed.\n",
   3933 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3934 				if (error == 0) {
   3935 					error = e;
   3936 				}
   3937 			}
   3938 		}
   3939 	}
   3940 
   3941 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3942 		sparecol = raidPtr->numCol + c;
   3943 		/* Need to ensure that the reconstruct actually completed! */
   3944 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3945 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3946 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3947 			if (e) {
   3948 				if (e != ENODEV)
   3949 					printf("raid%d: cache flush to component %s failed.\n",
   3950 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3951 				if (error == 0) {
   3952 					error = e;
   3953 				}
   3954 			}
   3955 		}
   3956 	}
   3957 	return error;
   3958 }
   3959