      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.288 2011/05/01 06:22:54 mrg Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.288 2011/05/01 06:22:54 mrg Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #ifdef DEBUG
    156 int     rf_kdebug_level = 0;
    157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    158 #else				/* DEBUG */
    159 #define db1_printf(a) { }
    160 #endif				/* DEBUG */
    161 
    162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
     165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 static void raidinit(RF_Raid_t *);
    183 
    184 void raidattach(int);
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 dev_type_open(raidopen);
    201 dev_type_close(raidclose);
    202 dev_type_read(raidread);
    203 dev_type_write(raidwrite);
    204 dev_type_ioctl(raidioctl);
    205 dev_type_strategy(raidstrategy);
    206 dev_type_dump(raiddump);
    207 dev_type_size(raidsize);
    208 
    209 const struct bdevsw raid_bdevsw = {
    210 	raidopen, raidclose, raidstrategy, raidioctl,
    211 	raiddump, raidsize, D_DISK
    212 };
    213 
    214 const struct cdevsw raid_cdevsw = {
    215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    217 };
    218 
    219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    220 
    221 /* XXX Not sure if the following should be replacing the raidPtrs above,
    222    or if it should be used in conjunction with that...
    223 */
    224 
    225 struct raid_softc {
    226 	device_t sc_dev;
    227 	int     sc_flags;	/* flags */
    228 	int     sc_cflags;	/* configuration flags */
    229 	uint64_t sc_size;	/* size of the raid device */
    230 	char    sc_xname[20];	/* XXX external name */
    231 	struct disk sc_dkdev;	/* generic disk device info */
    232 	struct bufq_state *buf_queue;	/* used for the device queue */
    233 };
    234 /* sc_flags */
    235 #define RAIDF_INITED	0x01	/* unit has been initialized */
    236 #define RAIDF_WLABEL	0x02	/* label area is writable */
    237 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    238 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    239 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    240 #define RAIDF_LOCKED	0x80	/* unit is locked */
    241 
    242 #define	raidunit(x)	DISKUNIT(x)
    243 int numraid = 0;
    244 
    245 extern struct cfdriver raid_cd;
    246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    247     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    248     DVF_DETACH_SHUTDOWN);
    249 
    250 /*
    251  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    252  * Be aware that large numbers can allow the driver to consume a lot of
    253  * kernel memory, especially on writes, and in degraded mode reads.
    254  *
    255  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    256  * a single 64K write will typically require 64K for the old data,
    257  * 64K for the old parity, and 64K for the new parity, for a total
    258  * of 192K (if the parity buffer is not re-used immediately).
     259  * Even if it is used immediately, that's still 128K, which when multiplied
    260  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    261  *
    262  * Now in degraded mode, for example, a 64K read on the above setup may
    263  * require data reconstruction, which will require *all* of the 4 remaining
    264  * disks to participate -- 4 * 32K/disk == 128K again.
    265  */
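/*
 * Illustrative worked example only (assuming the 5-disk, 32k-per-disk
 * stripe layout described above and the default RAIDOUTSTANDING of 6):
 * 6 requests * (64K old data + 64K old parity + 64K new parity) is
 * 1152K of internal buffers, plus 6 * 64K == 384K of incoming data,
 * i.e. roughly 1.5MB of kernel memory in flight at once.
 */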
    266 
    267 #ifndef RAIDOUTSTANDING
    268 #define RAIDOUTSTANDING   6
    269 #endif
    270 
    271 #define RAIDLABELDEV(dev)	\
    272 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    273 
    274 /* declared here, and made public, for the benefit of KVM stuff.. */
    275 struct raid_softc *raid_softc;
    276 
    277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    278 				     struct disklabel *);
    279 static void raidgetdisklabel(dev_t);
    280 static void raidmakedisklabel(struct raid_softc *);
    281 
    282 static int raidlock(struct raid_softc *);
    283 static void raidunlock(struct raid_softc *);
    284 
    285 static int raid_detach_unlocked(struct raid_softc *);
    286 
    287 static void rf_markalldirty(RF_Raid_t *);
    288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    289 
    290 void rf_ReconThread(struct rf_recon_req *);
    291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    292 void rf_CopybackThread(RF_Raid_t *raidPtr);
    293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    294 int rf_autoconfig(device_t);
    295 void rf_buildroothack(RF_ConfigSet_t *);
    296 
    297 RF_AutoConfig_t *rf_find_raid_components(void);
    298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    300 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    302 int rf_set_autoconfig(RF_Raid_t *, int);
    303 int rf_set_rootpartition(RF_Raid_t *, int);
    304 void rf_release_all_vps(RF_ConfigSet_t *);
    305 void rf_cleanup_config_set(RF_ConfigSet_t *);
    306 int rf_have_enough_components(RF_ConfigSet_t *);
    307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    309 
    310 static int raidautoconfig = 0; /* Debugging, mostly.  Set to 0 to not
    311 				  allow autoconfig to take place.
    312 				  Note that this is overridden by having
    313 				  RAID_AUTOCONFIG as an option in the
    314 				  kernel config file.  */
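/*
 * For example, autoconfiguration can be enabled at build time with a
 * kernel config line such as:
 *
 *	options 	RAID_AUTOCONFIG
 *
 * (see the RAID_AUTOCONFIG check in raidattach() below).
 */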
    315 
    316 struct RF_Pools_s rf_pools;
    317 
    318 void
    319 raidattach(int num)
    320 {
    321 	int raidID;
    322 	int i, rc;
    323 
    324 	aprint_debug("raidattach: Asked for %d units\n", num);
    325 
    326 	if (num <= 0) {
    327 #ifdef DIAGNOSTIC
    328 		panic("raidattach: count <= 0");
    329 #endif
    330 		return;
    331 	}
    332 	/* This is where all the initialization stuff gets done. */
    333 
    334 	numraid = num;
    335 
    336 	/* Make some space for requested number of units... */
    337 
    338 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    339 	if (raidPtrs == NULL) {
    340 		panic("raidPtrs is NULL!!");
    341 	}
    342 
    343 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    344 	rf_init_mutex2(&rf_sparet_wait_mutex);
    345 	rf_init_cond2(&rf_sparet_wait_cv, "sparetw");
    346 	rf_init_cond2(&rf_sparet_resp_cv, "rfgst");
    347 
    348 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    349 #endif
    350 
    351 	for (i = 0; i < num; i++)
    352 		raidPtrs[i] = NULL;
    353 	rc = rf_BootRaidframe();
    354 	if (rc == 0)
    355 		aprint_verbose("Kernelized RAIDframe activated\n");
    356 	else
    357 		panic("Serious error booting RAID!!");
    358 
     359 	/* put together some data structures like the CCD device does.. This
     360 	 * lets us lock the device and what-not when it gets opened. */
    361 
    362 	raid_softc = (struct raid_softc *)
    363 		malloc(num * sizeof(struct raid_softc),
    364 		       M_RAIDFRAME, M_NOWAIT);
    365 	if (raid_softc == NULL) {
    366 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    367 		return;
    368 	}
    369 
    370 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    371 
    372 	for (raidID = 0; raidID < num; raidID++) {
    373 		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
    374 
    375 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    376 			  (RF_Raid_t *));
    377 		if (raidPtrs[raidID] == NULL) {
    378 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    379 			numraid = raidID;
    380 			return;
    381 		}
    382 	}
    383 
    384 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    385 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    386 	}
    387 
    388 #ifdef RAID_AUTOCONFIG
    389 	raidautoconfig = 1;
    390 #endif
    391 
    392 	/*
    393 	 * Register a finalizer which will be used to auto-config RAID
    394 	 * sets once all real hardware devices have been found.
    395 	 */
    396 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    397 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    398 }
    399 
    400 int
    401 rf_autoconfig(device_t self)
    402 {
    403 	RF_AutoConfig_t *ac_list;
    404 	RF_ConfigSet_t *config_sets;
    405 
    406 	if (raidautoconfig == 0)
    407 		return (0);
    408 
    409 	/* XXX This code can only be run once. */
    410 	raidautoconfig = 0;
    411 
    412 	/* 1. locate all RAID components on the system */
    413 	aprint_debug("Searching for RAID components...\n");
    414 	ac_list = rf_find_raid_components();
    415 
    416 	/* 2. Sort them into their respective sets. */
    417 	config_sets = rf_create_auto_sets(ac_list);
    418 
    419 	/*
     420 	 * 3. Evaluate each set and configure the valid ones.
    421 	 * This gets done in rf_buildroothack().
    422 	 */
    423 	rf_buildroothack(config_sets);
    424 
    425 	return 1;
    426 }
    427 
    428 void
    429 rf_buildroothack(RF_ConfigSet_t *config_sets)
    430 {
    431 	RF_ConfigSet_t *cset;
    432 	RF_ConfigSet_t *next_cset;
    433 	int retcode;
    434 	int raidID;
    435 	int rootID;
    436 	int col;
    437 	int num_root;
    438 	char *devname;
    439 
    440 	rootID = 0;
    441 	num_root = 0;
    442 	cset = config_sets;
    443 	while (cset != NULL) {
    444 		next_cset = cset->next;
    445 		if (rf_have_enough_components(cset) &&
    446 		    cset->ac->clabel->autoconfigure==1) {
    447 			retcode = rf_auto_config_set(cset,&raidID);
    448 			if (!retcode) {
    449 				aprint_debug("raid%d: configured ok\n", raidID);
    450 				if (cset->rootable) {
    451 					rootID = raidID;
    452 					num_root++;
    453 				}
    454 			} else {
    455 				/* The autoconfig didn't work :( */
    456 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    457 				rf_release_all_vps(cset);
    458 			}
    459 		} else {
    460 			/* we're not autoconfiguring this set...
    461 			   release the associated resources */
    462 			rf_release_all_vps(cset);
    463 		}
    464 		/* cleanup */
    465 		rf_cleanup_config_set(cset);
    466 		cset = next_cset;
    467 	}
    468 
    469 	/* if the user has specified what the root device should be
    470 	   then we don't touch booted_device or boothowto... */
    471 
    472 	if (rootspec != NULL)
    473 		return;
    474 
    475 	/* we found something bootable... */
    476 
    477 	if (num_root == 1) {
    478 		booted_device = raid_softc[rootID].sc_dev;
    479 	} else if (num_root > 1) {
    480 
    481 		/*
    482 		 * Maybe the MD code can help. If it cannot, then
    483 		 * setroot() will discover that we have no
    484 		 * booted_device and will ask the user if nothing was
    485 		 * hardwired in the kernel config file
    486 		 */
    487 
    488 		if (booted_device == NULL)
    489 			cpu_rootconf();
    490 		if (booted_device == NULL)
    491 			return;
    492 
    493 		num_root = 0;
    494 		for (raidID = 0; raidID < numraid; raidID++) {
    495 			if (raidPtrs[raidID]->valid == 0)
    496 				continue;
    497 
    498 			if (raidPtrs[raidID]->root_partition == 0)
    499 				continue;
    500 
    501 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    502 				devname = raidPtrs[raidID]->Disks[col].devname;
    503 				devname += sizeof("/dev/") - 1;
    504 				if (strncmp(devname, device_xname(booted_device),
    505 					    strlen(device_xname(booted_device))) != 0)
    506 					continue;
    507 				aprint_debug("raid%d includes boot device %s\n",
    508 				       raidID, devname);
    509 				num_root++;
    510 				rootID = raidID;
    511 			}
    512 		}
    513 
    514 		if (num_root == 1) {
    515 			booted_device = raid_softc[rootID].sc_dev;
    516 		} else {
    517 			/* we can't guess.. require the user to answer... */
    518 			boothowto |= RB_ASKNAME;
    519 		}
    520 	}
    521 }
    522 
    523 
    524 int
    525 raidsize(dev_t dev)
    526 {
    527 	struct raid_softc *rs;
    528 	struct disklabel *lp;
    529 	int     part, unit, omask, size;
    530 
    531 	unit = raidunit(dev);
    532 	if (unit >= numraid)
    533 		return (-1);
    534 	rs = &raid_softc[unit];
    535 
    536 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    537 		return (-1);
    538 
    539 	part = DISKPART(dev);
    540 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    541 	lp = rs->sc_dkdev.dk_label;
    542 
    543 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    544 		return (-1);
    545 
    546 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    547 		size = -1;
    548 	else
    549 		size = lp->d_partitions[part].p_size *
    550 		    (lp->d_secsize / DEV_BSIZE);
    551 
    552 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    553 		return (-1);
    554 
    555 	return (size);
    556 
    557 }
    558 
    559 int
    560 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    561 {
    562 	int     unit = raidunit(dev);
    563 	struct raid_softc *rs;
    564 	const struct bdevsw *bdev;
    565 	struct disklabel *lp;
    566 	RF_Raid_t *raidPtr;
    567 	daddr_t offset;
    568 	int     part, c, sparecol, j, scol, dumpto;
    569 	int     error = 0;
    570 
    571 	if (unit >= numraid)
    572 		return (ENXIO);
    573 
    574 	rs = &raid_softc[unit];
    575 	raidPtr = raidPtrs[unit];
    576 
    577 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    578 		return ENXIO;
    579 
    580 	/* we only support dumping to RAID 1 sets */
    581 	if (raidPtr->Layout.numDataCol != 1 ||
    582 	    raidPtr->Layout.numParityCol != 1)
    583 		return EINVAL;
    584 
    585 
    586 	if ((error = raidlock(rs)) != 0)
    587 		return error;
    588 
    589 	if (size % DEV_BSIZE != 0) {
    590 		error = EINVAL;
    591 		goto out;
    592 	}
    593 
    594 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    595 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    596 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    597 		    size / DEV_BSIZE, rs->sc_size);
    598 		error = EINVAL;
    599 		goto out;
    600 	}
    601 
    602 	part = DISKPART(dev);
    603 	lp = rs->sc_dkdev.dk_label;
    604 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    605 
    606 	/* figure out what device is alive.. */
    607 
    608 	/*
    609 	   Look for a component to dump to.  The preference for the
    610 	   component to dump to is as follows:
    611 	   1) the master
    612 	   2) a used_spare of the master
    613 	   3) the slave
    614 	   4) a used_spare of the slave
    615 	*/
    616 
    617 	dumpto = -1;
    618 	for (c = 0; c < raidPtr->numCol; c++) {
    619 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    620 			/* this might be the one */
    621 			dumpto = c;
    622 			break;
    623 		}
    624 	}
    625 
    626 	/*
    627 	   At this point we have possibly selected a live master or a
    628 	   live slave.  We now check to see if there is a spared
    629 	   master (or a spared slave), if we didn't find a live master
    630 	   or a live slave.
    631 	*/
    632 
    633 	for (c = 0; c < raidPtr->numSpare; c++) {
    634 		sparecol = raidPtr->numCol + c;
    635 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    636 			/* How about this one? */
    637 			scol = -1;
    638 			for(j=0;j<raidPtr->numCol;j++) {
    639 				if (raidPtr->Disks[j].spareCol == sparecol) {
    640 					scol = j;
    641 					break;
    642 				}
    643 			}
    644 			if (scol == 0) {
    645 				/*
    646 				   We must have found a spared master!
    647 				   We'll take that over anything else
    648 				   found so far.  (We couldn't have
    649 				   found a real master before, since
    650 				   this is a used spare, and it's
    651 				   saying that it's replacing the
    652 				   master.)  On reboot (with
    653 				   autoconfiguration turned on)
    654 				   sparecol will become the 1st
    655 				   component (component0) of this set.
    656 				*/
    657 				dumpto = sparecol;
    658 				break;
    659 			} else if (scol != -1) {
    660 				/*
    661 				   Must be a spared slave.  We'll dump
     662 				   to that if we haven't found anything
    663 				   else so far.
    664 				*/
    665 				if (dumpto == -1)
    666 					dumpto = sparecol;
    667 			}
    668 		}
    669 	}
    670 
    671 	if (dumpto == -1) {
    672 		/* we couldn't find any live components to dump to!?!?
    673 		 */
    674 		error = EINVAL;
    675 		goto out;
    676 	}
    677 
    678 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    679 
    680 	/*
    681 	   Note that blkno is relative to this particular partition.
    682 	   By adding the offset of this partition in the RAID
    683 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    684 	   value that is relative to the partition used for the
    685 	   underlying component.
    686 	*/
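	/*
	 * Worked example with hypothetical numbers: if this partition
	 * starts at p_offset 63 and RF_PROTECTED_SECTORS is 64, a dump of
	 * partition-relative block 100 lands at component block
	 * 100 + 63 + 64 == 227.
	 */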
    687 
    688 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    689 				blkno + offset, va, size);
    690 
    691 out:
    692 	raidunlock(rs);
    693 
    694 	return error;
    695 }
    696 /* ARGSUSED */
    697 int
    698 raidopen(dev_t dev, int flags, int fmt,
    699     struct lwp *l)
    700 {
    701 	int     unit = raidunit(dev);
    702 	struct raid_softc *rs;
    703 	struct disklabel *lp;
    704 	int     part, pmask;
    705 	int     error = 0;
    706 
    707 	if (unit >= numraid)
    708 		return (ENXIO);
    709 	rs = &raid_softc[unit];
    710 
    711 	if ((error = raidlock(rs)) != 0)
    712 		return (error);
    713 
    714 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    715 		error = EBUSY;
    716 		goto bad;
    717 	}
    718 
    719 	lp = rs->sc_dkdev.dk_label;
    720 
    721 	part = DISKPART(dev);
    722 
    723 	/*
    724 	 * If there are wedges, and this is not RAW_PART, then we
    725 	 * need to fail.
    726 	 */
    727 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    728 		error = EBUSY;
    729 		goto bad;
    730 	}
    731 	pmask = (1 << part);
    732 
    733 	if ((rs->sc_flags & RAIDF_INITED) &&
    734 	    (rs->sc_dkdev.dk_openmask == 0))
    735 		raidgetdisklabel(dev);
    736 
    737 	/* make sure that this partition exists */
    738 
    739 	if (part != RAW_PART) {
    740 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    741 		    ((part >= lp->d_npartitions) ||
    742 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    743 			error = ENXIO;
    744 			goto bad;
    745 		}
    746 	}
    747 	/* Prevent this unit from being unconfigured while open. */
    748 	switch (fmt) {
    749 	case S_IFCHR:
    750 		rs->sc_dkdev.dk_copenmask |= pmask;
    751 		break;
    752 
    753 	case S_IFBLK:
    754 		rs->sc_dkdev.dk_bopenmask |= pmask;
    755 		break;
    756 	}
    757 
    758 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    759 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    760 		/* First one... mark things as dirty... Note that we *MUST*
    761 		 have done a configure before this.  I DO NOT WANT TO BE
    762 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    763 		 THAT THEY BELONG TOGETHER!!!!! */
    764 		/* XXX should check to see if we're only open for reading
    765 		   here... If so, we needn't do this, but then need some
    766 		   other way of keeping track of what's happened.. */
    767 
    768 		rf_markalldirty(raidPtrs[unit]);
    769 	}
    770 
    771 
    772 	rs->sc_dkdev.dk_openmask =
    773 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    774 
    775 bad:
    776 	raidunlock(rs);
    777 
    778 	return (error);
    779 
    780 
    781 }
    782 /* ARGSUSED */
    783 int
    784 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    785 {
    786 	int     unit = raidunit(dev);
    787 	struct raid_softc *rs;
    788 	int     error = 0;
    789 	int     part;
    790 
    791 	if (unit >= numraid)
    792 		return (ENXIO);
    793 	rs = &raid_softc[unit];
    794 
    795 	if ((error = raidlock(rs)) != 0)
    796 		return (error);
    797 
    798 	part = DISKPART(dev);
    799 
    800 	/* ...that much closer to allowing unconfiguration... */
    801 	switch (fmt) {
    802 	case S_IFCHR:
    803 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    804 		break;
    805 
    806 	case S_IFBLK:
    807 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    808 		break;
    809 	}
    810 	rs->sc_dkdev.dk_openmask =
    811 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    812 
    813 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    814 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     815 		/* Last one... device is not unconfigured yet.
     816 		   (If RAIDF_INITED were not set, device shutdown
     817 		   would have taken care of setting the clean bits.)
     818 		   Mark things as clean... */
    819 
    820 		rf_update_component_labels(raidPtrs[unit],
    821 						 RF_FINAL_COMPONENT_UPDATE);
    822 
    823 		/* If the kernel is shutting down, it will detach
    824 		 * this RAID set soon enough.
    825 		 */
    826 	}
    827 
    828 	raidunlock(rs);
    829 	return (0);
    830 
    831 }
    832 
    833 void
    834 raidstrategy(struct buf *bp)
    835 {
    836 	unsigned int raidID = raidunit(bp->b_dev);
    837 	RF_Raid_t *raidPtr;
    838 	struct raid_softc *rs = &raid_softc[raidID];
    839 	int     wlabel;
    840 
     841 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    842 		bp->b_error = ENXIO;
    843 		goto done;
    844 	}
    845 	if (raidID >= numraid || !raidPtrs[raidID]) {
    846 		bp->b_error = ENODEV;
    847 		goto done;
    848 	}
    849 	raidPtr = raidPtrs[raidID];
    850 	if (!raidPtr->valid) {
    851 		bp->b_error = ENODEV;
    852 		goto done;
    853 	}
    854 	if (bp->b_bcount == 0) {
    855 		db1_printf(("b_bcount is zero..\n"));
    856 		goto done;
    857 	}
    858 
    859 	/*
    860 	 * Do bounds checking and adjust transfer.  If there's an
    861 	 * error, the bounds check will flag that for us.
    862 	 */
    863 
    864 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    865 	if (DISKPART(bp->b_dev) == RAW_PART) {
     866 		uint64_t size; /* device size in DEV_BSIZE units */
    867 
    868 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    869 			size = raidPtr->totalSectors <<
    870 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    871 		} else {
    872 			size = raidPtr->totalSectors >>
    873 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    874 		}
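		/*
		 * Illustrative only: with 512-byte sectors,
		 * logBytesPerSector equals DEV_BSHIFT (9) and size is just
		 * totalSectors; with hypothetical 4096-byte sectors it
		 * would be totalSectors << 3.
		 */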
    875 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    876 			goto done;
    877 		}
    878 	} else {
    879 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    880 			db1_printf(("Bounds check failed!!:%d %d\n",
    881 				(int) bp->b_blkno, (int) wlabel));
    882 			goto done;
    883 		}
    884 	}
    885 
    886 	rf_lock_mutex2(raidPtr->iodone_lock);
    887 
    888 	bp->b_resid = 0;
    889 
    890 	/* stuff it onto our queue */
    891 	bufq_put(rs->buf_queue, bp);
    892 
     893 	/* schedule the IO to happen at the next convenient time */
    894 	rf_signal_cond2(raidPtr->iodone_cv);
    895 	rf_unlock_mutex2(raidPtr->iodone_lock);
    896 
    897 	return;
    898 
    899 done:
    900 	bp->b_resid = bp->b_bcount;
    901 	biodone(bp);
    902 }
    903 /* ARGSUSED */
    904 int
    905 raidread(dev_t dev, struct uio *uio, int flags)
    906 {
    907 	int     unit = raidunit(dev);
    908 	struct raid_softc *rs;
    909 
    910 	if (unit >= numraid)
    911 		return (ENXIO);
    912 	rs = &raid_softc[unit];
    913 
    914 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    915 		return (ENXIO);
    916 
    917 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    918 
    919 }
    920 /* ARGSUSED */
    921 int
    922 raidwrite(dev_t dev, struct uio *uio, int flags)
    923 {
    924 	int     unit = raidunit(dev);
    925 	struct raid_softc *rs;
    926 
    927 	if (unit >= numraid)
    928 		return (ENXIO);
    929 	rs = &raid_softc[unit];
    930 
    931 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    932 		return (ENXIO);
    933 
    934 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    935 
    936 }
    937 
    938 static int
    939 raid_detach_unlocked(struct raid_softc *rs)
    940 {
    941 	int error;
    942 	RF_Raid_t *raidPtr;
    943 
    944 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
    945 
    946 	/*
    947 	 * If somebody has a partition mounted, we shouldn't
    948 	 * shutdown.
    949 	 */
    950 	if (rs->sc_dkdev.dk_openmask != 0)
    951 		return EBUSY;
    952 
    953 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    954 		;	/* not initialized: nothing to do */
    955 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    956 		return error;
    957 	else
    958 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    959 
    960 	/* Detach the disk. */
    961 	dkwedge_delall(&rs->sc_dkdev);
    962 	disk_detach(&rs->sc_dkdev);
    963 	disk_destroy(&rs->sc_dkdev);
    964 
    965 	return 0;
    966 }
    967 
    968 int
    969 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    970 {
    971 	int     unit = raidunit(dev);
    972 	int     error = 0;
    973 	int     part, pmask;
    974 	cfdata_t cf;
    975 	struct raid_softc *rs;
    976 	RF_Config_t *k_cfg, *u_cfg;
    977 	RF_Raid_t *raidPtr;
    978 	RF_RaidDisk_t *diskPtr;
    979 	RF_AccTotals_t *totals;
    980 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    981 	u_char *specific_buf;
    982 	int retcode = 0;
    983 	int column;
    984 /*	int raidid; */
    985 	struct rf_recon_req *rrcopy, *rr;
    986 	RF_ComponentLabel_t *clabel;
    987 	RF_ComponentLabel_t *ci_label;
    988 	RF_ComponentLabel_t **clabel_ptr;
    989 	RF_SingleComponent_t *sparePtr,*componentPtr;
    990 	RF_SingleComponent_t component;
    991 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    992 	int i, j, d;
    993 #ifdef __HAVE_OLD_DISKLABEL
    994 	struct disklabel newlabel;
    995 #endif
    996 	struct dkwedge_info *dkw;
    997 
    998 	if (unit >= numraid)
    999 		return (ENXIO);
   1000 	rs = &raid_softc[unit];
   1001 	raidPtr = raidPtrs[unit];
   1002 
   1003 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1004 		(int) DISKPART(dev), (int) unit, cmd));
   1005 
   1006 	/* Must be open for writes for these commands... */
   1007 	switch (cmd) {
   1008 #ifdef DIOCGSECTORSIZE
   1009 	case DIOCGSECTORSIZE:
   1010 		*(u_int *)data = raidPtr->bytesPerSector;
   1011 		return 0;
   1012 	case DIOCGMEDIASIZE:
   1013 		*(off_t *)data =
   1014 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1015 		return 0;
   1016 #endif
   1017 	case DIOCSDINFO:
   1018 	case DIOCWDINFO:
   1019 #ifdef __HAVE_OLD_DISKLABEL
   1020 	case ODIOCWDINFO:
   1021 	case ODIOCSDINFO:
   1022 #endif
   1023 	case DIOCWLABEL:
   1024 	case DIOCAWEDGE:
   1025 	case DIOCDWEDGE:
   1026 		if ((flag & FWRITE) == 0)
   1027 			return (EBADF);
   1028 	}
   1029 
   1030 	/* Must be initialized for these... */
   1031 	switch (cmd) {
   1032 	case DIOCGDINFO:
   1033 	case DIOCSDINFO:
   1034 	case DIOCWDINFO:
   1035 #ifdef __HAVE_OLD_DISKLABEL
   1036 	case ODIOCGDINFO:
   1037 	case ODIOCWDINFO:
   1038 	case ODIOCSDINFO:
   1039 	case ODIOCGDEFLABEL:
   1040 #endif
   1041 	case DIOCGPART:
   1042 	case DIOCWLABEL:
   1043 	case DIOCGDEFLABEL:
   1044 	case DIOCAWEDGE:
   1045 	case DIOCDWEDGE:
   1046 	case DIOCLWEDGES:
   1047 	case DIOCCACHESYNC:
   1048 	case RAIDFRAME_SHUTDOWN:
   1049 	case RAIDFRAME_REWRITEPARITY:
   1050 	case RAIDFRAME_GET_INFO:
   1051 	case RAIDFRAME_RESET_ACCTOTALS:
   1052 	case RAIDFRAME_GET_ACCTOTALS:
   1053 	case RAIDFRAME_KEEP_ACCTOTALS:
   1054 	case RAIDFRAME_GET_SIZE:
   1055 	case RAIDFRAME_FAIL_DISK:
   1056 	case RAIDFRAME_COPYBACK:
   1057 	case RAIDFRAME_CHECK_RECON_STATUS:
   1058 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1059 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1060 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1061 	case RAIDFRAME_ADD_HOT_SPARE:
   1062 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1063 	case RAIDFRAME_INIT_LABELS:
   1064 	case RAIDFRAME_REBUILD_IN_PLACE:
   1065 	case RAIDFRAME_CHECK_PARITY:
   1066 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1067 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1068 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1069 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1070 	case RAIDFRAME_SET_AUTOCONFIG:
   1071 	case RAIDFRAME_SET_ROOT:
   1072 	case RAIDFRAME_DELETE_COMPONENT:
   1073 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1074 	case RAIDFRAME_PARITYMAP_STATUS:
   1075 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1076 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1077 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1078 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1079 			return (ENXIO);
   1080 	}
   1081 
   1082 	switch (cmd) {
   1083 #ifdef COMPAT_50
   1084 	case RAIDFRAME_GET_INFO50:
   1085 		return rf_get_info50(raidPtr, data);
   1086 
   1087 	case RAIDFRAME_CONFIGURE50:
   1088 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1089 			return retcode;
   1090 		goto config;
   1091 #endif
   1092 		/* configure the system */
   1093 	case RAIDFRAME_CONFIGURE:
   1094 
   1095 		if (raidPtr->valid) {
   1096 			/* There is a valid RAID set running on this unit! */
   1097 			printf("raid%d: Device already configured!\n",unit);
   1098 			return(EINVAL);
   1099 		}
   1100 
   1101 		/* copy-in the configuration information */
   1102 		/* data points to a pointer to the configuration structure */
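		/*
		 * Illustrative userland sketch (hypothetical variable names;
		 * raidctl(8) is the normal consumer): since the ioctl
		 * argument is the address of an RF_Config_t pointer, a
		 * caller would do something like
		 *
		 *	RF_Config_t cfg, *cfg_ptr = &cfg;
		 *	... fill in cfg ...
		 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfg_ptr);
		 *
		 * which is why "data" is dereferenced once here before the
		 * copyin() of the structure itself.
		 */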
   1103 
   1104 		u_cfg = *((RF_Config_t **) data);
   1105 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1106 		if (k_cfg == NULL) {
   1107 			return (ENOMEM);
   1108 		}
   1109 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1110 		if (retcode) {
   1111 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1112 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1113 				retcode));
   1114 			return (retcode);
   1115 		}
   1116 		goto config;
   1117 	config:
   1118 		/* allocate a buffer for the layout-specific data, and copy it
   1119 		 * in */
   1120 		if (k_cfg->layoutSpecificSize) {
   1121 			if (k_cfg->layoutSpecificSize > 10000) {
   1122 				/* sanity check */
   1123 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1124 				return (EINVAL);
   1125 			}
   1126 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1127 			    (u_char *));
   1128 			if (specific_buf == NULL) {
   1129 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1130 				return (ENOMEM);
   1131 			}
   1132 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1133 			    k_cfg->layoutSpecificSize);
   1134 			if (retcode) {
   1135 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1136 				RF_Free(specific_buf,
   1137 					k_cfg->layoutSpecificSize);
   1138 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1139 					retcode));
   1140 				return (retcode);
   1141 			}
   1142 		} else
   1143 			specific_buf = NULL;
   1144 		k_cfg->layoutSpecific = specific_buf;
   1145 
   1146 		/* should do some kind of sanity check on the configuration.
   1147 		 * Store the sum of all the bytes in the last byte? */
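		/*
		 * A minimal sketch (not enabled; purely illustrative) of the
		 * byte-sum check suggested above, assuming userland stored
		 * the sum of the preceding bytes in the last byte of the
		 * structure.  In practice the sum would have to be taken
		 * before the layoutSpecific pointer is overwritten above.
		 */
#if 0
		{
			const u_char *cp = (const u_char *)k_cfg;
			u_char sum = 0;
			size_t n;

			for (n = 0; n < sizeof(RF_Config_t) - 1; n++)
				sum += cp[n];
			if (sum != cp[sizeof(RF_Config_t) - 1]) {
				/* bad checksum: free buffers and bail */
				if (k_cfg->layoutSpecificSize)
					RF_Free(specific_buf,
						k_cfg->layoutSpecificSize);
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (EINVAL);
			}
		}
#endif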
   1148 
   1149 		/* configure the system */
   1150 
   1151 		/*
   1152 		 * Clear the entire RAID descriptor, just to make sure
   1153 		 *  there is no stale data left in the case of a
   1154 		 *  reconfiguration
   1155 		 */
   1156 		memset(raidPtr, 0, sizeof(*raidPtr));
   1157 		raidPtr->raidid = unit;
   1158 
   1159 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1160 
   1161 		if (retcode == 0) {
   1162 
   1163 			/* allow this many simultaneous IO's to
   1164 			   this RAID device */
   1165 			raidPtr->openings = RAIDOUTSTANDING;
   1166 
   1167 			raidinit(raidPtr);
   1168 			rf_markalldirty(raidPtr);
   1169 		}
   1170 		/* free the buffers.  No return code here. */
   1171 		if (k_cfg->layoutSpecificSize) {
   1172 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1173 		}
   1174 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1175 
   1176 		return (retcode);
   1177 
   1178 		/* shutdown the system */
   1179 	case RAIDFRAME_SHUTDOWN:
   1180 
   1181 		part = DISKPART(dev);
   1182 		pmask = (1 << part);
   1183 
   1184 		if ((error = raidlock(rs)) != 0)
   1185 			return (error);
   1186 
   1187 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1188 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1189 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1190 			retcode = EBUSY;
   1191 		else {
   1192 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1193 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1194 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1195 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1196 			retcode = 0;
   1197 		}
   1198 
   1199 		raidunlock(rs);
   1200 
   1201 		if (retcode != 0)
   1202 			return retcode;
   1203 
   1204 		/* free the pseudo device attach bits */
   1205 
   1206 		cf = device_cfdata(rs->sc_dev);
   1207 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1208 			free(cf, M_RAIDFRAME);
   1209 
   1210 		return (retcode);
   1211 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1212 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1213 		/* need to read the component label for the disk indicated
   1214 		   by row,column in clabel */
   1215 
   1216 		/*
   1217 		 * Perhaps there should be an option to skip the in-core
   1218 		 * copy and hit the disk, as with disklabel(8).
   1219 		 */
   1220 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1221 
   1222 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1223 
   1224 		if (retcode) {
   1225 			RF_Free(clabel, sizeof(*clabel));
   1226 			return retcode;
   1227 		}
   1228 
   1229 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1230 
   1231 		column = clabel->column;
   1232 
   1233 		if ((column < 0) || (column >= raidPtr->numCol +
   1234 		    raidPtr->numSpare)) {
   1235 			RF_Free(clabel, sizeof(*clabel));
   1236 			return EINVAL;
   1237 		}
   1238 
   1239 		RF_Free(clabel, sizeof(*clabel));
   1240 
   1241 		clabel = raidget_component_label(raidPtr, column);
   1242 
   1243 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1244 
   1245 #if 0
   1246 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1247 		clabel = (RF_ComponentLabel_t *) data;
   1248 
   1249 		/* XXX check the label for valid stuff... */
   1250 		/* Note that some things *should not* get modified --
   1251 		   the user should be re-initing the labels instead of
   1252 		   trying to patch things.
   1253 		   */
   1254 
   1255 		raidid = raidPtr->raidid;
   1256 #ifdef DEBUG
   1257 		printf("raid%d: Got component label:\n", raidid);
   1258 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1259 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1260 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1261 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1262 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1263 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1264 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1265 #endif
   1266 		clabel->row = 0;
   1267 		column = clabel->column;
   1268 
   1269 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1270 			return(EINVAL);
   1271 		}
   1272 
   1273 		/* XXX this isn't allowed to do anything for now :-) */
   1274 
   1275 		/* XXX and before it is, we need to fill in the rest
   1276 		   of the fields!?!?!?! */
   1277 		memcpy(raidget_component_label(raidPtr, column),
   1278 		    clabel, sizeof(*clabel));
   1279 		raidflush_component_label(raidPtr, column);
   1280 		return (0);
   1281 #endif
   1282 
   1283 	case RAIDFRAME_INIT_LABELS:
   1284 		clabel = (RF_ComponentLabel_t *) data;
   1285 		/*
   1286 		   we only want the serial number from
   1287 		   the above.  We get all the rest of the information
   1288 		   from the config that was used to create this RAID
   1289 		   set.
   1290 		   */
   1291 
   1292 		raidPtr->serial_number = clabel->serial_number;
   1293 
   1294 		for(column=0;column<raidPtr->numCol;column++) {
   1295 			diskPtr = &raidPtr->Disks[column];
   1296 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1297 				ci_label = raidget_component_label(raidPtr,
   1298 				    column);
   1299 				/* Zeroing this is important. */
   1300 				memset(ci_label, 0, sizeof(*ci_label));
   1301 				raid_init_component_label(raidPtr, ci_label);
   1302 				ci_label->serial_number =
   1303 				    raidPtr->serial_number;
    1304 				ci_label->row = 0; /* we don't pretend to support more */
   1305 				rf_component_label_set_partitionsize(ci_label,
   1306 				    diskPtr->partitionSize);
   1307 				ci_label->column = column;
   1308 				raidflush_component_label(raidPtr, column);
   1309 			}
   1310 			/* XXXjld what about the spares? */
   1311 		}
   1312 
   1313 		return (retcode);
   1314 	case RAIDFRAME_SET_AUTOCONFIG:
   1315 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1316 		printf("raid%d: New autoconfig value is: %d\n",
   1317 		       raidPtr->raidid, d);
   1318 		*(int *) data = d;
   1319 		return (retcode);
   1320 
   1321 	case RAIDFRAME_SET_ROOT:
   1322 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1323 		printf("raid%d: New rootpartition value is: %d\n",
   1324 		       raidPtr->raidid, d);
   1325 		*(int *) data = d;
   1326 		return (retcode);
   1327 
   1328 		/* initialize all parity */
   1329 	case RAIDFRAME_REWRITEPARITY:
   1330 
   1331 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1332 			/* Parity for RAID 0 is trivially correct */
   1333 			raidPtr->parity_good = RF_RAID_CLEAN;
   1334 			return(0);
   1335 		}
   1336 
   1337 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1338 			/* Re-write is already in progress! */
   1339 			return(EINVAL);
   1340 		}
   1341 
   1342 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1343 					   rf_RewriteParityThread,
   1344 					   raidPtr,"raid_parity");
   1345 		return (retcode);
   1346 
   1347 
   1348 	case RAIDFRAME_ADD_HOT_SPARE:
   1349 		sparePtr = (RF_SingleComponent_t *) data;
   1350 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1351 		retcode = rf_add_hot_spare(raidPtr, &component);
   1352 		return(retcode);
   1353 
   1354 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1355 		return(retcode);
   1356 
   1357 	case RAIDFRAME_DELETE_COMPONENT:
   1358 		componentPtr = (RF_SingleComponent_t *)data;
   1359 		memcpy( &component, componentPtr,
   1360 			sizeof(RF_SingleComponent_t));
   1361 		retcode = rf_delete_component(raidPtr, &component);
   1362 		return(retcode);
   1363 
   1364 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1365 		componentPtr = (RF_SingleComponent_t *)data;
   1366 		memcpy( &component, componentPtr,
   1367 			sizeof(RF_SingleComponent_t));
   1368 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1369 		return(retcode);
   1370 
   1371 	case RAIDFRAME_REBUILD_IN_PLACE:
   1372 
   1373 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1374 			/* Can't do this on a RAID 0!! */
   1375 			return(EINVAL);
   1376 		}
   1377 
   1378 		if (raidPtr->recon_in_progress == 1) {
   1379 			/* a reconstruct is already in progress! */
   1380 			return(EINVAL);
   1381 		}
   1382 
   1383 		componentPtr = (RF_SingleComponent_t *) data;
   1384 		memcpy( &component, componentPtr,
   1385 			sizeof(RF_SingleComponent_t));
   1386 		component.row = 0; /* we don't support any more */
   1387 		column = component.column;
   1388 
   1389 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1390 			return(EINVAL);
   1391 		}
   1392 
   1393 		RF_LOCK_MUTEX(raidPtr->mutex);
   1394 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1395 		    (raidPtr->numFailures > 0)) {
   1396 			/* XXX 0 above shouldn't be constant!!! */
   1397 			/* some component other than this has failed.
   1398 			   Let's not make things worse than they already
   1399 			   are... */
   1400 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1401 			       raidPtr->raidid);
   1402 			printf("raid%d:     Col: %d   Too many failures.\n",
   1403 			       raidPtr->raidid, column);
   1404 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1405 			return (EINVAL);
   1406 		}
   1407 		if (raidPtr->Disks[column].status ==
   1408 		    rf_ds_reconstructing) {
   1409 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1410 			       raidPtr->raidid);
    1411 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1412 
   1413 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1414 			return (EINVAL);
   1415 		}
   1416 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1417 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1418 			return (EINVAL);
   1419 		}
   1420 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1421 
   1422 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1423 		if (rrcopy == NULL)
   1424 			return(ENOMEM);
   1425 
   1426 		rrcopy->raidPtr = (void *) raidPtr;
   1427 		rrcopy->col = column;
   1428 
   1429 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1430 					   rf_ReconstructInPlaceThread,
   1431 					   rrcopy,"raid_reconip");
   1432 		return(retcode);
   1433 
   1434 	case RAIDFRAME_GET_INFO:
   1435 		if (!raidPtr->valid)
   1436 			return (ENODEV);
   1437 		ucfgp = (RF_DeviceConfig_t **) data;
   1438 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1439 			  (RF_DeviceConfig_t *));
   1440 		if (d_cfg == NULL)
   1441 			return (ENOMEM);
   1442 		d_cfg->rows = 1; /* there is only 1 row now */
   1443 		d_cfg->cols = raidPtr->numCol;
   1444 		d_cfg->ndevs = raidPtr->numCol;
   1445 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1446 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1447 			return (ENOMEM);
   1448 		}
   1449 		d_cfg->nspares = raidPtr->numSpare;
   1450 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1451 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1452 			return (ENOMEM);
   1453 		}
   1454 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1455 		d = 0;
   1456 		for (j = 0; j < d_cfg->cols; j++) {
   1457 			d_cfg->devs[d] = raidPtr->Disks[j];
   1458 			d++;
   1459 		}
   1460 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1461 			d_cfg->spares[i] = raidPtr->Disks[j];
   1462 		}
   1463 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1464 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1465 
   1466 		return (retcode);
   1467 
   1468 	case RAIDFRAME_CHECK_PARITY:
   1469 		*(int *) data = raidPtr->parity_good;
   1470 		return (0);
   1471 
   1472 	case RAIDFRAME_PARITYMAP_STATUS:
   1473 		if (rf_paritymap_ineligible(raidPtr))
   1474 			return EINVAL;
   1475 		rf_paritymap_status(raidPtr->parity_map,
   1476 		    (struct rf_pmstat *)data);
   1477 		return 0;
   1478 
   1479 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1480 		if (rf_paritymap_ineligible(raidPtr))
   1481 			return EINVAL;
   1482 		if (raidPtr->parity_map == NULL)
   1483 			return ENOENT; /* ??? */
   1484 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1485 			(struct rf_pmparams *)data, 1))
   1486 			return EINVAL;
   1487 		return 0;
   1488 
   1489 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1490 		if (rf_paritymap_ineligible(raidPtr))
   1491 			return EINVAL;
   1492 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1493 		return 0;
   1494 
   1495 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1496 		if (rf_paritymap_ineligible(raidPtr))
   1497 			return EINVAL;
   1498 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1499 		/* XXX should errors be passed up? */
   1500 		return 0;
   1501 
   1502 	case RAIDFRAME_RESET_ACCTOTALS:
   1503 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1504 		return (0);
   1505 
   1506 	case RAIDFRAME_GET_ACCTOTALS:
   1507 		totals = (RF_AccTotals_t *) data;
   1508 		*totals = raidPtr->acc_totals;
   1509 		return (0);
   1510 
   1511 	case RAIDFRAME_KEEP_ACCTOTALS:
   1512 		raidPtr->keep_acc_totals = *(int *)data;
   1513 		return (0);
   1514 
   1515 	case RAIDFRAME_GET_SIZE:
   1516 		*(int *) data = raidPtr->totalSectors;
   1517 		return (0);
   1518 
   1519 		/* fail a disk & optionally start reconstruction */
   1520 	case RAIDFRAME_FAIL_DISK:
   1521 
   1522 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1523 			/* Can't do this on a RAID 0!! */
   1524 			return(EINVAL);
   1525 		}
   1526 
   1527 		rr = (struct rf_recon_req *) data;
   1528 		rr->row = 0;
   1529 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1530 			return (EINVAL);
   1531 
   1532 
   1533 		RF_LOCK_MUTEX(raidPtr->mutex);
   1534 		if (raidPtr->status == rf_rs_reconstructing) {
   1535 			/* you can't fail a disk while we're reconstructing! */
   1536 			/* XXX wrong for RAID6 */
   1537 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1538 			return (EINVAL);
   1539 		}
   1540 		if ((raidPtr->Disks[rr->col].status ==
   1541 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1542 			/* some other component has failed.  Let's not make
   1543 			   things worse. XXX wrong for RAID6 */
   1544 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1545 			return (EINVAL);
   1546 		}
   1547 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1548 			/* Can't fail a spared disk! */
   1549 			RF_UNLOCK_MUTEX(raidPtr->mutex);
   1550 			return (EINVAL);
   1551 		}
   1552 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1553 
   1554 		/* make a copy of the recon request so that we don't rely on
   1555 		 * the user's buffer */
   1556 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1557 		if (rrcopy == NULL)
   1558 			return(ENOMEM);
   1559 		memcpy(rrcopy, rr, sizeof(*rr));
   1560 		rrcopy->raidPtr = (void *) raidPtr;
   1561 
   1562 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1563 					   rf_ReconThread,
   1564 					   rrcopy,"raid_recon");
   1565 		return (0);
   1566 
   1567 		/* invoke a copyback operation after recon on whatever disk
   1568 		 * needs it, if any */
   1569 	case RAIDFRAME_COPYBACK:
   1570 
   1571 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1572 			/* This makes no sense on a RAID 0!! */
   1573 			return(EINVAL);
   1574 		}
   1575 
   1576 		if (raidPtr->copyback_in_progress == 1) {
   1577 			/* Copyback is already in progress! */
   1578 			return(EINVAL);
   1579 		}
   1580 
   1581 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1582 					   rf_CopybackThread,
   1583 					   raidPtr,"raid_copyback");
   1584 		return (retcode);
   1585 
   1586 		/* return the percentage completion of reconstruction */
   1587 	case RAIDFRAME_CHECK_RECON_STATUS:
   1588 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1589 			/* This makes no sense on a RAID 0, so tell the
   1590 			   user it's done. */
   1591 			*(int *) data = 100;
   1592 			return(0);
   1593 		}
   1594 		if (raidPtr->status != rf_rs_reconstructing)
   1595 			*(int *) data = 100;
   1596 		else {
   1597 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1598 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1599 			} else {
   1600 				*(int *) data = 0;
   1601 			}
   1602 		}
   1603 		return (0);
   1604 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1605 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1606 		if (raidPtr->status != rf_rs_reconstructing) {
   1607 			progressInfo.remaining = 0;
   1608 			progressInfo.completed = 100;
   1609 			progressInfo.total = 100;
   1610 		} else {
   1611 			progressInfo.total =
   1612 				raidPtr->reconControl->numRUsTotal;
   1613 			progressInfo.completed =
   1614 				raidPtr->reconControl->numRUsComplete;
   1615 			progressInfo.remaining = progressInfo.total -
   1616 				progressInfo.completed;
   1617 		}
   1618 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1619 				  sizeof(RF_ProgressInfo_t));
   1620 		return (retcode);
   1621 
   1622 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1623 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1624 			/* This makes no sense on a RAID 0, so tell the
   1625 			   user it's done. */
   1626 			*(int *) data = 100;
   1627 			return(0);
   1628 		}
   1629 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1630 			*(int *) data = 100 *
   1631 				raidPtr->parity_rewrite_stripes_done /
   1632 				raidPtr->Layout.numStripe;
   1633 		} else {
   1634 			*(int *) data = 100;
   1635 		}
   1636 		return (0);
   1637 
   1638 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1639 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1640 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1641 			progressInfo.total = raidPtr->Layout.numStripe;
   1642 			progressInfo.completed =
   1643 				raidPtr->parity_rewrite_stripes_done;
   1644 			progressInfo.remaining = progressInfo.total -
   1645 				progressInfo.completed;
   1646 		} else {
   1647 			progressInfo.remaining = 0;
   1648 			progressInfo.completed = 100;
   1649 			progressInfo.total = 100;
   1650 		}
   1651 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1652 				  sizeof(RF_ProgressInfo_t));
   1653 		return (retcode);
   1654 
   1655 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1656 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1657 			/* This makes no sense on a RAID 0 */
   1658 			*(int *) data = 100;
   1659 			return(0);
   1660 		}
   1661 		if (raidPtr->copyback_in_progress == 1) {
   1662 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1663 				raidPtr->Layout.numStripe;
   1664 		} else {
   1665 			*(int *) data = 100;
   1666 		}
   1667 		return (0);
   1668 
   1669 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1670 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1671 		if (raidPtr->copyback_in_progress == 1) {
   1672 			progressInfo.total = raidPtr->Layout.numStripe;
   1673 			progressInfo.completed =
   1674 				raidPtr->copyback_stripes_done;
   1675 			progressInfo.remaining = progressInfo.total -
   1676 				progressInfo.completed;
   1677 		} else {
   1678 			progressInfo.remaining = 0;
   1679 			progressInfo.completed = 100;
   1680 			progressInfo.total = 100;
   1681 		}
   1682 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1683 				  sizeof(RF_ProgressInfo_t));
   1684 		return (retcode);
   1685 
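		/*
		 * Usage sketch for the *_STATUS_EXT ioctls above (for
		 * illustration only; raidctl(8) is the usual consumer).
		 * The caller passes in a pointer to its own pointer and the
		 * progress structure is copied out through it:
		 *
		 *	RF_ProgressInfo_t info, *infop = &info;
		 *	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS_EXT,
		 *	    &infop) == 0)
		 *		printf("%ju of %ju units done\n",
		 *		    (uintmax_t)info.completed,
		 *		    (uintmax_t)info.total);
		 */
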
    1686 		/* the sparetable daemon calls this to wait for the kernel to
    1687 		 * need a spare table; this ioctl does not return until a
    1688 		 * spare table is needed.
    1689 		 * XXX calling mpsleep here in the ioctl code is almost
    1690 		 * certainly wrong and evil.  I should either compute the
    1691 		 * spare table in the kernel, or have a different interface
    1692 		 * (a different character device) for delivering the table. */
   1693 #if 0
   1694 	case RAIDFRAME_SPARET_WAIT:
   1695 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1696 		while (!rf_sparet_wait_queue)
   1697 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1698 		waitreq = rf_sparet_wait_queue;
   1699 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1700 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1701 
   1702 		/* structure assignment */
   1703 		*((RF_SparetWait_t *) data) = *waitreq;
   1704 
   1705 		RF_Free(waitreq, sizeof(*waitreq));
   1706 		return (0);
   1707 
   1708 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1709 		 * code in it that will cause the daemon to exit */
   1710 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1711 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1712 		waitreq->fcol = -1;
   1713 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1714 		waitreq->next = rf_sparet_wait_queue;
   1715 		rf_sparet_wait_queue = waitreq;
    1716 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1717 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1718 		return (0);
   1719 
   1720 		/* used by the spare table daemon to deliver a spare table
   1721 		 * into the kernel */
   1722 	case RAIDFRAME_SEND_SPARET:
   1723 
   1724 		/* install the spare table */
   1725 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1726 
   1727 		/* respond to the requestor.  the return status of the spare
   1728 		 * table installation is passed in the "fcol" field */
   1729 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1730 		waitreq->fcol = retcode;
   1731 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1732 		waitreq->next = rf_sparet_resp_queue;
   1733 		rf_sparet_resp_queue = waitreq;
   1734 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1735 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1736 
   1737 		return (retcode);
   1738 #endif
   1739 
   1740 	default:
   1741 		break; /* fall through to the os-specific code below */
   1742 
   1743 	}
   1744 
   1745 	if (!raidPtr->valid)
   1746 		return (EINVAL);
   1747 
   1748 	/*
   1749 	 * Add support for "regular" device ioctls here.
   1750 	 */
   1751 
   1752 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1753 	if (error != EPASSTHROUGH)
   1754 		return (error);
   1755 
   1756 	switch (cmd) {
   1757 	case DIOCGDINFO:
   1758 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1759 		break;
   1760 #ifdef __HAVE_OLD_DISKLABEL
   1761 	case ODIOCGDINFO:
   1762 		newlabel = *(rs->sc_dkdev.dk_label);
   1763 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1764 			return ENOTTY;
   1765 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1766 		break;
   1767 #endif
   1768 
   1769 	case DIOCGPART:
   1770 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1771 		((struct partinfo *) data)->part =
   1772 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1773 		break;
   1774 
   1775 	case DIOCWDINFO:
   1776 	case DIOCSDINFO:
   1777 #ifdef __HAVE_OLD_DISKLABEL
   1778 	case ODIOCWDINFO:
   1779 	case ODIOCSDINFO:
   1780 #endif
   1781 	{
   1782 		struct disklabel *lp;
   1783 #ifdef __HAVE_OLD_DISKLABEL
   1784 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1785 			memset(&newlabel, 0, sizeof newlabel);
   1786 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1787 			lp = &newlabel;
   1788 		} else
   1789 #endif
   1790 		lp = (struct disklabel *)data;
   1791 
   1792 		if ((error = raidlock(rs)) != 0)
   1793 			return (error);
   1794 
   1795 		rs->sc_flags |= RAIDF_LABELLING;
   1796 
   1797 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1798 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1799 		if (error == 0) {
   1800 			if (cmd == DIOCWDINFO
   1801 #ifdef __HAVE_OLD_DISKLABEL
   1802 			    || cmd == ODIOCWDINFO
   1803 #endif
   1804 			   )
   1805 				error = writedisklabel(RAIDLABELDEV(dev),
   1806 				    raidstrategy, rs->sc_dkdev.dk_label,
   1807 				    rs->sc_dkdev.dk_cpulabel);
   1808 		}
   1809 		rs->sc_flags &= ~RAIDF_LABELLING;
   1810 
   1811 		raidunlock(rs);
   1812 
   1813 		if (error)
   1814 			return (error);
   1815 		break;
   1816 	}
   1817 
   1818 	case DIOCWLABEL:
   1819 		if (*(int *) data != 0)
   1820 			rs->sc_flags |= RAIDF_WLABEL;
   1821 		else
   1822 			rs->sc_flags &= ~RAIDF_WLABEL;
   1823 		break;
   1824 
   1825 	case DIOCGDEFLABEL:
   1826 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1827 		break;
   1828 
   1829 #ifdef __HAVE_OLD_DISKLABEL
   1830 	case ODIOCGDEFLABEL:
   1831 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1832 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1833 			return ENOTTY;
   1834 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1835 		break;
   1836 #endif
   1837 
   1838 	case DIOCAWEDGE:
   1839 	case DIOCDWEDGE:
   1840 	    	dkw = (void *)data;
   1841 
   1842 		/* If the ioctl happens here, the parent is us. */
   1843 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1844 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1845 
   1846 	case DIOCLWEDGES:
   1847 		return dkwedge_list(&rs->sc_dkdev,
   1848 		    (struct dkwedge_list *)data, l);
   1849 	case DIOCCACHESYNC:
   1850 		return rf_sync_component_caches(raidPtr);
   1851 	default:
   1852 		retcode = ENOTTY;
   1853 	}
   1854 	return (retcode);
   1855 
   1856 }
   1857 
   1858 
   1859 /* raidinit -- complete the rest of the initialization for the
   1860    RAIDframe device.  */
   1861 
   1862 
   1863 static void
   1864 raidinit(RF_Raid_t *raidPtr)
   1865 {
   1866 	cfdata_t cf;
   1867 	struct raid_softc *rs;
   1868 	int     unit;
   1869 
   1870 	unit = raidPtr->raidid;
   1871 
   1872 	rs = &raid_softc[unit];
   1873 
   1874 	/* XXX should check return code first... */
   1875 	rs->sc_flags |= RAIDF_INITED;
   1876 
   1877 	/* XXX doesn't check bounds. */
   1878 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1879 
   1880 	/* attach the pseudo device */
   1881 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1882 	cf->cf_name = raid_cd.cd_name;
   1883 	cf->cf_atname = raid_cd.cd_name;
   1884 	cf->cf_unit = unit;
   1885 	cf->cf_fstate = FSTATE_STAR;
   1886 
   1887 	rs->sc_dev = config_attach_pseudo(cf);
   1888 
   1889 	if (rs->sc_dev == NULL) {
   1890 		printf("raid%d: config_attach_pseudo failed\n",
   1891 		    raidPtr->raidid);
   1892 		rs->sc_flags &= ~RAIDF_INITED;
   1893 		free(cf, M_RAIDFRAME);
   1894 		return;
   1895 	}
   1896 
   1897 	/* disk_attach actually creates space for the CPU disklabel, among
   1898 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1899 	 * with disklabels. */
   1900 
   1901 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1902 	disk_attach(&rs->sc_dkdev);
   1903 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1904 
   1905 	/* XXX There may be a weird interaction here between this, and
   1906 	 * protectedSectors, as used in RAIDframe.  */
   1907 
   1908 	rs->sc_size = raidPtr->totalSectors;
   1909 
   1910 	dkwedge_discover(&rs->sc_dkdev);
   1911 
   1912 	rf_set_properties(rs, raidPtr);
   1913 
   1914 }
   1915 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1916 /* wake up the daemon & tell it to get us a spare table
   1917  * XXX
   1918  * the entries in the queues should be tagged with the raidPtr
   1919  * so that in the extremely rare case that two recons happen at once,
    1920  * we know for which device we're requesting a spare table
   1921  * XXX
   1922  *
   1923  * XXX This code is not currently used. GO
   1924  */
   1925 int
   1926 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1927 {
   1928 	int     retcode;
   1929 
   1930 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1931 	req->next = rf_sparet_wait_queue;
   1932 	rf_sparet_wait_queue = req;
    1933 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1934 
    1935 	/* the condvar wait drops and reacquires rf_sparet_wait_mutex */
    1936 	while (!rf_sparet_resp_queue) {
    1937 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1938 	}
   1939 	req = rf_sparet_resp_queue;
   1940 	rf_sparet_resp_queue = req->next;
   1941 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1942 
   1943 	retcode = req->fcol;
   1944 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1945 					 * alloc'd */
   1946 	return (retcode);
   1947 }
   1948 #endif
   1949 
   1950 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1951  * bp & passes it down.
    1952  * any calls originating in the kernel must use non-blocking I/O.
    1953  * We do some extra sanity checking to return "appropriate" error values for
   1954  * certain conditions (to make some standard utilities work)
   1955  *
   1956  * Formerly known as: rf_DoAccessKernel
   1957  */
   1958 void
   1959 raidstart(RF_Raid_t *raidPtr)
   1960 {
   1961 	RF_SectorCount_t num_blocks, pb, sum;
   1962 	RF_RaidAddr_t raid_addr;
   1963 	struct partition *pp;
   1964 	daddr_t blocknum;
   1965 	int     unit;
   1966 	struct raid_softc *rs;
   1967 	int     do_async;
   1968 	struct buf *bp;
   1969 	int rc;
   1970 
   1971 	unit = raidPtr->raidid;
   1972 	rs = &raid_softc[unit];
   1973 
   1974 	/* quick check to see if anything has died recently */
   1975 	RF_LOCK_MUTEX(raidPtr->mutex);
   1976 	if (raidPtr->numNewFailures > 0) {
   1977 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1978 		rf_update_component_labels(raidPtr,
   1979 					   RF_NORMAL_COMPONENT_UPDATE);
   1980 		RF_LOCK_MUTEX(raidPtr->mutex);
   1981 		raidPtr->numNewFailures--;
   1982 	}
   1983 
   1984 	/* Check to see if we're at the limit... */
   1985 	while (raidPtr->openings > 0) {
   1986 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   1987 
   1988 		/* get the next item, if any, from the queue */
   1989 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   1990 			/* nothing more to do */
   1991 			return;
   1992 		}
   1993 
   1994 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   1995 		 * partition.. Need to make it absolute to the underlying
   1996 		 * device.. */
   1997 
   1998 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   1999 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2000 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2001 			blocknum += pp->p_offset;
   2002 		}
   2003 
   2004 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2005 			    (int) blocknum));
   2006 
   2007 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2008 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2009 
   2010 		/* *THIS* is where we adjust what block we're going to...
   2011 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2012 		raid_addr = blocknum;
   2013 
   2014 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2015 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2016 		sum = raid_addr + num_blocks + pb;
   2017 		if (1 || rf_debugKernelAccess) {
   2018 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2019 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2020 				    (int) pb, (int) bp->b_resid));
   2021 		}
   2022 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2023 		    || (sum < num_blocks) || (sum < pb)) {
   2024 			bp->b_error = ENOSPC;
   2025 			bp->b_resid = bp->b_bcount;
   2026 			biodone(bp);
   2027 			RF_LOCK_MUTEX(raidPtr->mutex);
   2028 			continue;
   2029 		}
   2030 		/*
   2031 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2032 		 */
   2033 
   2034 		if (bp->b_bcount & raidPtr->sectorMask) {
   2035 			bp->b_error = EINVAL;
   2036 			bp->b_resid = bp->b_bcount;
   2037 			biodone(bp);
   2038 			RF_LOCK_MUTEX(raidPtr->mutex);
   2039 			continue;
   2040 
   2041 		}
   2042 		db1_printf(("Calling DoAccess..\n"));
   2043 
   2044 
   2045 		RF_LOCK_MUTEX(raidPtr->mutex);
   2046 		raidPtr->openings--;
   2047 		RF_UNLOCK_MUTEX(raidPtr->mutex);
   2048 
   2049 		/*
   2050 		 * Everything is async.
   2051 		 */
   2052 		do_async = 1;
   2053 
   2054 		disk_busy(&rs->sc_dkdev);
   2055 
   2056 		/* XXX we're still at splbio() here... do we *really*
   2057 		   need to be? */
   2058 
   2059 		/* don't ever condition on bp->b_flags & B_WRITE.
   2060 		 * always condition on B_READ instead */
   2061 
   2062 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2063 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2064 				 do_async, raid_addr, num_blocks,
   2065 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2066 
   2067 		if (rc) {
   2068 			bp->b_error = rc;
   2069 			bp->b_resid = bp->b_bcount;
   2070 			biodone(bp);
   2071 			/* continue loop */
   2072 		}
   2073 
   2074 		RF_LOCK_MUTEX(raidPtr->mutex);
   2075 	}
   2076 	RF_UNLOCK_MUTEX(raidPtr->mutex);
   2077 }
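
/*
 * A worked example of the block translation in raidstart() above, with
 * purely illustrative numbers: for a buffer with bp->b_blkno == 64 (in
 * DEV_BSIZE units, DEV_BSHIFT == 9) on a set with 2048-byte sectors
 * (logBytesPerSector == 11), the raid address is (64 << 9) >> 11 == 16,
 * to which the partition offset is added for non-RAW_PART opens.
 */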
   2078 
   2079 
   2080 
   2081 
   2082 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2083 
   2084 int
   2085 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2086 {
   2087 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2088 	struct buf *bp;
   2089 
   2090 	req->queue = queue;
   2091 	bp = req->bp;
   2092 
   2093 	switch (req->type) {
   2094 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2095 		/* XXX need to do something extra here.. */
   2096 		/* I'm leaving this in, as I've never actually seen it used,
   2097 		 * and I'd like folks to report it... GO */
    2098 		printf("WAKEUP CALLED\n");
   2099 		queue->numOutstanding++;
   2100 
   2101 		bp->b_flags = 0;
   2102 		bp->b_private = req;
   2103 
   2104 		KernelWakeupFunc(bp);
   2105 		break;
   2106 
   2107 	case RF_IO_TYPE_READ:
   2108 	case RF_IO_TYPE_WRITE:
   2109 #if RF_ACC_TRACE > 0
   2110 		if (req->tracerec) {
   2111 			RF_ETIMER_START(req->tracerec->timer);
   2112 		}
   2113 #endif
   2114 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2115 		    op, queue->rf_cinfo->ci_dev,
   2116 		    req->sectorOffset, req->numSector,
   2117 		    req->buf, KernelWakeupFunc, (void *) req,
   2118 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2119 
   2120 		if (rf_debugKernelAccess) {
   2121 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2122 				(long) bp->b_blkno));
   2123 		}
   2124 		queue->numOutstanding++;
   2125 		queue->last_deq_sector = req->sectorOffset;
   2126 		/* acc wouldn't have been let in if there were any pending
   2127 		 * reqs at any other priority */
   2128 		queue->curPriority = req->priority;
   2129 
   2130 		db1_printf(("Going for %c to unit %d col %d\n",
   2131 			    req->type, queue->raidPtr->raidid,
   2132 			    queue->col));
   2133 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2134 			(int) req->sectorOffset, (int) req->numSector,
   2135 			(int) (req->numSector <<
   2136 			    queue->raidPtr->logBytesPerSector),
   2137 			(int) queue->raidPtr->logBytesPerSector));
   2138 
   2139 		/*
   2140 		 * XXX: drop lock here since this can block at
   2141 		 * least with backing SCSI devices.  Retake it
   2142 		 * to minimize fuss with calling interfaces.
   2143 		 */
   2144 
   2145 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2146 		bdev_strategy(bp);
   2147 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2148 		break;
   2149 
   2150 	default:
   2151 		panic("bad req->type in rf_DispatchKernelIO");
   2152 	}
   2153 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2154 
   2155 	return (0);
   2156 }
    2157 /* this is the callback function associated with an I/O invoked from
   2158    kernel code.
   2159  */
   2160 static void
   2161 KernelWakeupFunc(struct buf *bp)
   2162 {
   2163 	RF_DiskQueueData_t *req = NULL;
   2164 	RF_DiskQueue_t *queue;
   2165 
   2166 	db1_printf(("recovering the request queue:\n"));
   2167 
   2168 	req = bp->b_private;
   2169 
   2170 	queue = (RF_DiskQueue_t *) req->queue;
   2171 
   2172 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2173 
   2174 #if RF_ACC_TRACE > 0
   2175 	if (req->tracerec) {
   2176 		RF_ETIMER_STOP(req->tracerec->timer);
   2177 		RF_ETIMER_EVAL(req->tracerec->timer);
   2178 		rf_lock_mutex2(rf_tracing_mutex);
   2179 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2180 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2181 		req->tracerec->num_phys_ios++;
   2182 		rf_unlock_mutex2(rf_tracing_mutex);
   2183 	}
   2184 #endif
   2185 
   2186 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2187 	 * ballistic, and mark the component as hosed... */
   2188 
   2189 	if (bp->b_error != 0) {
   2190 		/* Mark the disk as dead */
   2191 		/* but only mark it once... */
   2192 		/* and only if it wouldn't leave this RAID set
   2193 		   completely broken */
   2194 		if (((queue->raidPtr->Disks[queue->col].status ==
   2195 		      rf_ds_optimal) ||
   2196 		     (queue->raidPtr->Disks[queue->col].status ==
   2197 		      rf_ds_used_spare)) &&
   2198 		     (queue->raidPtr->numFailures <
   2199 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2200 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2201 			       queue->raidPtr->raidid,
   2202 			       queue->raidPtr->Disks[queue->col].devname);
   2203 			queue->raidPtr->Disks[queue->col].status =
   2204 			    rf_ds_failed;
   2205 			queue->raidPtr->status = rf_rs_degraded;
   2206 			queue->raidPtr->numFailures++;
   2207 			queue->raidPtr->numNewFailures++;
   2208 		} else {	/* Disk is already dead... */
   2209 			/* printf("Disk already marked as dead!\n"); */
   2210 		}
   2211 
   2212 	}
   2213 
   2214 	/* Fill in the error value */
   2215 	req->error = bp->b_error;
   2216 
   2217 	/* Drop this one on the "finished" queue... */
   2218 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2219 
   2220 	/* Let the raidio thread know there is work to be done. */
   2221 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2222 
   2223 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2224 }
   2225 
   2226 
   2227 /*
   2228  * initialize a buf structure for doing an I/O in the kernel.
   2229  */
   2230 static void
   2231 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2232        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2233        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2234        struct proc *b_proc)
   2235 {
   2236 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2237 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2238 	bp->b_oflags = 0;
   2239 	bp->b_cflags = 0;
   2240 	bp->b_bcount = numSect << logBytesPerSector;
   2241 	bp->b_bufsize = bp->b_bcount;
   2242 	bp->b_error = 0;
   2243 	bp->b_dev = dev;
   2244 	bp->b_data = bf;
   2245 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2246 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2247 	if (bp->b_bcount == 0) {
   2248 		panic("bp->b_bcount is zero in InitBP!!");
   2249 	}
   2250 	bp->b_proc = b_proc;
   2251 	bp->b_iodone = cbFunc;
   2252 	bp->b_private = cbArg;
   2253 }
   2254 
   2255 static void
   2256 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2257 		    struct disklabel *lp)
   2258 {
   2259 	memset(lp, 0, sizeof(*lp));
   2260 
   2261 	/* fabricate a label... */
   2262 	lp->d_secperunit = raidPtr->totalSectors;
   2263 	lp->d_secsize = raidPtr->bytesPerSector;
   2264 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2265 	lp->d_ntracks = 4 * raidPtr->numCol;
   2266 	lp->d_ncylinders = raidPtr->totalSectors /
   2267 		(lp->d_nsectors * lp->d_ntracks);
   2268 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2269 
   2270 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2271 	lp->d_type = DTYPE_RAID;
   2272 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2273 	lp->d_rpm = 3600;
   2274 	lp->d_interleave = 1;
   2275 	lp->d_flags = 0;
   2276 
   2277 	lp->d_partitions[RAW_PART].p_offset = 0;
   2278 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2279 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2280 	lp->d_npartitions = RAW_PART + 1;
   2281 
   2282 	lp->d_magic = DISKMAGIC;
   2283 	lp->d_magic2 = DISKMAGIC;
    2284 	lp->d_checksum = dkcksum(lp);
   2285 
   2286 }
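
/*
 * The geometry fabricated above is arbitrary.  For example (illustrative
 * numbers only), a set with dataSectorsPerStripe == 128 and numCol == 5
 * gets d_nsectors == 128, d_ntracks == 20, d_secpercyl == 2560 and
 * d_ncylinders == totalSectors / 2560.
 */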
   2287 /*
   2288  * Read the disklabel from the raid device.  If one is not present, fake one
   2289  * up.
   2290  */
   2291 static void
   2292 raidgetdisklabel(dev_t dev)
   2293 {
   2294 	int     unit = raidunit(dev);
   2295 	struct raid_softc *rs = &raid_softc[unit];
   2296 	const char   *errstring;
   2297 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2298 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2299 	RF_Raid_t *raidPtr;
   2300 
   2301 	db1_printf(("Getting the disklabel...\n"));
   2302 
   2303 	memset(clp, 0, sizeof(*clp));
   2304 
   2305 	raidPtr = raidPtrs[unit];
   2306 
   2307 	raidgetdefaultlabel(raidPtr, rs, lp);
   2308 
   2309 	/*
   2310 	 * Call the generic disklabel extraction routine.
   2311 	 */
   2312 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2313 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2314 	if (errstring)
   2315 		raidmakedisklabel(rs);
   2316 	else {
   2317 		int     i;
   2318 		struct partition *pp;
   2319 
   2320 		/*
   2321 		 * Sanity check whether the found disklabel is valid.
   2322 		 *
    2323 		 * This is necessary since the total size of the raid device
    2324 		 * may vary when the interleave is changed even though exactly
    2325 		 * the same components are used, and the old disklabel may be
    2326 		 * used if one is found.
   2327 		 */
   2328 		if (lp->d_secperunit != rs->sc_size)
   2329 			printf("raid%d: WARNING: %s: "
   2330 			    "total sector size in disklabel (%" PRIu32 ") != "
   2331 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2332 			    lp->d_secperunit, rs->sc_size);
   2333 		for (i = 0; i < lp->d_npartitions; i++) {
   2334 			pp = &lp->d_partitions[i];
   2335 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2336 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2337 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2338 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2339 		}
   2340 	}
   2341 
   2342 }
   2343 /*
   2344  * Take care of things one might want to take care of in the event
   2345  * that a disklabel isn't present.
   2346  */
   2347 static void
   2348 raidmakedisklabel(struct raid_softc *rs)
   2349 {
   2350 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2351 	db1_printf(("Making a label..\n"));
   2352 
   2353 	/*
   2354 	 * For historical reasons, if there's no disklabel present
   2355 	 * the raw partition must be marked FS_BSDFFS.
   2356 	 */
   2357 
   2358 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2359 
   2360 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2361 
   2362 	lp->d_checksum = dkcksum(lp);
   2363 }
   2364 /*
   2365  * Wait interruptibly for an exclusive lock.
   2366  *
   2367  * XXX
   2368  * Several drivers do this; it should be abstracted and made MP-safe.
   2369  * (Hmm... where have we seen this warning before :->  GO )
   2370  */
   2371 static int
   2372 raidlock(struct raid_softc *rs)
   2373 {
   2374 	int     error;
   2375 
   2376 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2377 		rs->sc_flags |= RAIDF_WANTED;
   2378 		if ((error =
   2379 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2380 			return (error);
   2381 	}
   2382 	rs->sc_flags |= RAIDF_LOCKED;
   2383 	return (0);
   2384 }
   2385 /*
   2386  * Unlock and wake up any waiters.
   2387  */
   2388 static void
   2389 raidunlock(struct raid_softc *rs)
   2390 {
   2391 
   2392 	rs->sc_flags &= ~RAIDF_LOCKED;
   2393 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2394 		rs->sc_flags &= ~RAIDF_WANTED;
   2395 		wakeup(rs);
   2396 	}
   2397 }
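
/*
 * Sketch of the expected raidlock()/raidunlock() pairing, as used by the
 * disklabel ioctls above (for illustration only):
 *
 *	if ((error = raidlock(rs)) != 0)
 *		return (error);
 *	... fiddle with rs->sc_dkdev ...
 *	raidunlock(rs);
 */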
   2398 
   2399 
   2400 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2401 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2402 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2403 
   2404 static daddr_t
   2405 rf_component_info_offset(void)
   2406 {
   2407 
   2408 	return RF_COMPONENT_INFO_OFFSET;
   2409 }
   2410 
   2411 static daddr_t
   2412 rf_component_info_size(unsigned secsize)
   2413 {
   2414 	daddr_t info_size;
   2415 
   2416 	KASSERT(secsize);
   2417 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2418 		info_size = secsize;
   2419 	else
   2420 		info_size = RF_COMPONENT_INFO_SIZE;
   2421 
   2422 	return info_size;
   2423 }
   2424 
   2425 static daddr_t
   2426 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2427 {
   2428 	daddr_t map_offset;
   2429 
   2430 	KASSERT(raidPtr->bytesPerSector);
   2431 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2432 		map_offset = raidPtr->bytesPerSector;
   2433 	else
   2434 		map_offset = RF_COMPONENT_INFO_SIZE;
   2435 	map_offset += rf_component_info_offset();
   2436 
   2437 	return map_offset;
   2438 }
   2439 
   2440 static daddr_t
   2441 rf_parity_map_size(RF_Raid_t *raidPtr)
   2442 {
   2443 	daddr_t map_size;
   2444 
   2445 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2446 		map_size = raidPtr->bytesPerSector;
   2447 	else
   2448 		map_size = RF_PARITY_MAP_SIZE;
   2449 
   2450 	return map_size;
   2451 }
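
/*
 * The helpers above pin down the per-component metadata layout.  As an
 * illustration, with 512-byte sectors the component label region starts at
 * byte offset RF_COMPONENT_INFO_OFFSET (16384) and spans
 * RF_COMPONENT_INFO_SIZE (1024) bytes, so rf_parity_map_offset() places the
 * parity map at byte 17408; sector sizes larger than these constants round
 * the corresponding region up to one full sector.
 */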
   2452 
   2453 int
   2454 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2455 {
   2456 	RF_ComponentLabel_t *clabel;
   2457 
   2458 	clabel = raidget_component_label(raidPtr, col);
   2459 	clabel->clean = RF_RAID_CLEAN;
   2460 	raidflush_component_label(raidPtr, col);
   2461 	return(0);
   2462 }
   2463 
   2464 
   2465 int
   2466 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2467 {
   2468 	RF_ComponentLabel_t *clabel;
   2469 
   2470 	clabel = raidget_component_label(raidPtr, col);
   2471 	clabel->clean = RF_RAID_DIRTY;
   2472 	raidflush_component_label(raidPtr, col);
   2473 	return(0);
   2474 }
   2475 
   2476 int
   2477 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2478 {
   2479 	KASSERT(raidPtr->bytesPerSector);
   2480 	return raidread_component_label(raidPtr->bytesPerSector,
   2481 	    raidPtr->Disks[col].dev,
   2482 	    raidPtr->raid_cinfo[col].ci_vp,
   2483 	    &raidPtr->raid_cinfo[col].ci_label);
   2484 }
   2485 
   2486 RF_ComponentLabel_t *
   2487 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2488 {
   2489 	return &raidPtr->raid_cinfo[col].ci_label;
   2490 }
   2491 
   2492 int
   2493 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2494 {
   2495 	RF_ComponentLabel_t *label;
   2496 
   2497 	label = &raidPtr->raid_cinfo[col].ci_label;
   2498 	label->mod_counter = raidPtr->mod_counter;
   2499 #ifndef RF_NO_PARITY_MAP
   2500 	label->parity_map_modcount = label->mod_counter;
   2501 #endif
   2502 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2503 	    raidPtr->Disks[col].dev,
   2504 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2505 }
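
/*
 * Note on the label helpers above: raidfetch_component_label() reads the
 * on-disk copy into raid_cinfo[col].ci_label; after that, callers follow a
 * get/modify/flush pattern, e.g. (as raidmarkdirty() above does):
 *
 *	clabel = raidget_component_label(raidPtr, col);
 *	clabel->clean = RF_RAID_DIRTY;
 *	raidflush_component_label(raidPtr, col);
 */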
   2506 
   2507 
   2508 static int
   2509 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2510     RF_ComponentLabel_t *clabel)
   2511 {
   2512 	return raidread_component_area(dev, b_vp, clabel,
   2513 	    sizeof(RF_ComponentLabel_t),
   2514 	    rf_component_info_offset(),
   2515 	    rf_component_info_size(secsize));
   2516 }
   2517 
   2518 /* ARGSUSED */
   2519 static int
   2520 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2521     size_t msize, daddr_t offset, daddr_t dsize)
   2522 {
   2523 	struct buf *bp;
   2524 	const struct bdevsw *bdev;
   2525 	int error;
   2526 
   2527 	/* XXX should probably ensure that we don't try to do this if
   2528 	   someone has changed rf_protected_sectors. */
   2529 
   2530 	if (b_vp == NULL) {
   2531 		/* For whatever reason, this component is not valid.
   2532 		   Don't try to read a component label from it. */
   2533 		return(EINVAL);
   2534 	}
   2535 
   2536 	/* get a block of the appropriate size... */
   2537 	bp = geteblk((int)dsize);
   2538 	bp->b_dev = dev;
   2539 
   2540 	/* get our ducks in a row for the read */
   2541 	bp->b_blkno = offset / DEV_BSIZE;
   2542 	bp->b_bcount = dsize;
   2543 	bp->b_flags |= B_READ;
   2544  	bp->b_resid = dsize;
   2545 
   2546 	bdev = bdevsw_lookup(bp->b_dev);
   2547 	if (bdev == NULL)
   2548 		return (ENXIO);
   2549 	(*bdev->d_strategy)(bp);
   2550 
   2551 	error = biowait(bp);
   2552 
   2553 	if (!error) {
   2554 		memcpy(data, bp->b_data, msize);
   2555 	}
   2556 
   2557 	brelse(bp, 0);
   2558 	return(error);
   2559 }
   2560 
   2561 
   2562 static int
   2563 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2564     RF_ComponentLabel_t *clabel)
   2565 {
   2566 	return raidwrite_component_area(dev, b_vp, clabel,
   2567 	    sizeof(RF_ComponentLabel_t),
   2568 	    rf_component_info_offset(),
   2569 	    rf_component_info_size(secsize), 0);
   2570 }
   2571 
   2572 /* ARGSUSED */
   2573 static int
   2574 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2575     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2576 {
   2577 	struct buf *bp;
   2578 	const struct bdevsw *bdev;
   2579 	int error;
   2580 
   2581 	/* get a block of the appropriate size... */
   2582 	bp = geteblk((int)dsize);
   2583 	bp->b_dev = dev;
   2584 
   2585 	/* get our ducks in a row for the write */
   2586 	bp->b_blkno = offset / DEV_BSIZE;
   2587 	bp->b_bcount = dsize;
   2588 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2589  	bp->b_resid = dsize;
   2590 
   2591 	memset(bp->b_data, 0, dsize);
   2592 	memcpy(bp->b_data, data, msize);
   2593 
   2594 	bdev = bdevsw_lookup(bp->b_dev);
   2595 	if (bdev == NULL)
   2596 		return (ENXIO);
   2597 	(*bdev->d_strategy)(bp);
   2598 	if (asyncp)
   2599 		return 0;
   2600 	error = biowait(bp);
   2601 	brelse(bp, 0);
   2602 	if (error) {
   2603 #if 1
   2604 		printf("Failed to write RAID component info!\n");
   2605 #endif
   2606 	}
   2607 
   2608 	return(error);
   2609 }
   2610 
   2611 void
   2612 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2613 {
   2614 	int c;
   2615 
   2616 	for (c = 0; c < raidPtr->numCol; c++) {
   2617 		/* Skip dead disks. */
   2618 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2619 			continue;
   2620 		/* XXXjld: what if an error occurs here? */
   2621 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2622 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2623 		    RF_PARITYMAP_NBYTE,
   2624 		    rf_parity_map_offset(raidPtr),
   2625 		    rf_parity_map_size(raidPtr), 0);
   2626 	}
   2627 }
   2628 
   2629 void
   2630 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2631 {
   2632 	struct rf_paritymap_ondisk tmp;
   2633 	int c,first;
   2634 
   2635 	first=1;
   2636 	for (c = 0; c < raidPtr->numCol; c++) {
   2637 		/* Skip dead disks. */
   2638 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2639 			continue;
   2640 		raidread_component_area(raidPtr->Disks[c].dev,
   2641 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2642 		    RF_PARITYMAP_NBYTE,
   2643 		    rf_parity_map_offset(raidPtr),
   2644 		    rf_parity_map_size(raidPtr));
   2645 		if (first) {
   2646 			memcpy(map, &tmp, sizeof(*map));
   2647 			first = 0;
   2648 		} else {
   2649 			rf_paritymap_merge(map, &tmp);
   2650 		}
   2651 	}
   2652 }
   2653 
   2654 void
   2655 rf_markalldirty(RF_Raid_t *raidPtr)
   2656 {
   2657 	RF_ComponentLabel_t *clabel;
   2658 	int sparecol;
   2659 	int c;
   2660 	int j;
   2661 	int scol = -1;
   2662 
   2663 	raidPtr->mod_counter++;
   2664 	for (c = 0; c < raidPtr->numCol; c++) {
   2665 		/* we don't want to touch (at all) a disk that has
   2666 		   failed */
   2667 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2668 			clabel = raidget_component_label(raidPtr, c);
   2669 			if (clabel->status == rf_ds_spared) {
   2670 				/* XXX do something special...
   2671 				   but whatever you do, don't
   2672 				   try to access it!! */
   2673 			} else {
   2674 				raidmarkdirty(raidPtr, c);
   2675 			}
   2676 		}
   2677 	}
   2678 
   2679 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2680 		sparecol = raidPtr->numCol + c;
   2681 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
    2682 			/*
    2683 			 * We claim this disk is "optimal" if it's
    2684 			 * rf_ds_used_spare, as that means it should be
    2685 			 * directly substitutable for the disk it replaced.
    2686 			 * We note that too...
    2687 			 */
   2690 
   2691 			for(j=0;j<raidPtr->numCol;j++) {
   2692 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2693 					scol = j;
   2694 					break;
   2695 				}
   2696 			}
   2697 
   2698 			clabel = raidget_component_label(raidPtr, sparecol);
   2699 			/* make sure status is noted */
   2700 
   2701 			raid_init_component_label(raidPtr, clabel);
   2702 
   2703 			clabel->row = 0;
   2704 			clabel->column = scol;
   2705 			/* Note: we *don't* change status from rf_ds_used_spare
   2706 			   to rf_ds_optimal */
   2707 			/* clabel.status = rf_ds_optimal; */
   2708 
   2709 			raidmarkdirty(raidPtr, sparecol);
   2710 		}
   2711 	}
   2712 }
   2713 
   2714 
   2715 void
   2716 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2717 {
   2718 	RF_ComponentLabel_t *clabel;
   2719 	int sparecol;
   2720 	int c;
   2721 	int j;
   2722 	int scol;
   2723 
   2724 	scol = -1;
   2725 
   2726 	/* XXX should do extra checks to make sure things really are clean,
   2727 	   rather than blindly setting the clean bit... */
   2728 
   2729 	raidPtr->mod_counter++;
   2730 
   2731 	for (c = 0; c < raidPtr->numCol; c++) {
   2732 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2733 			clabel = raidget_component_label(raidPtr, c);
   2734 			/* make sure status is noted */
   2735 			clabel->status = rf_ds_optimal;
   2736 
   2737 			/* note what unit we are configured as */
   2738 			clabel->last_unit = raidPtr->raidid;
   2739 
   2740 			raidflush_component_label(raidPtr, c);
   2741 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2742 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2743 					raidmarkclean(raidPtr, c);
   2744 				}
   2745 			}
   2746 		}
   2747 		/* else we don't touch it.. */
   2748 	}
   2749 
   2750 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2751 		sparecol = raidPtr->numCol + c;
   2752 		/* Need to ensure that the reconstruct actually completed! */
   2753 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
    2754 			/*
    2755 			 * We claim this disk is "optimal" if it's
    2756 			 * rf_ds_used_spare, as that means it should be
    2757 			 * directly substitutable for the disk it replaced.
    2758 			 * We note that too...
    2759 			 */
   2762 
   2763 			for(j=0;j<raidPtr->numCol;j++) {
   2764 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2765 					scol = j;
   2766 					break;
   2767 				}
   2768 			}
   2769 
   2770 			/* XXX shouldn't *really* need this... */
   2771 			clabel = raidget_component_label(raidPtr, sparecol);
   2772 			/* make sure status is noted */
   2773 
   2774 			raid_init_component_label(raidPtr, clabel);
   2775 
   2776 			clabel->column = scol;
   2777 			clabel->status = rf_ds_optimal;
   2778 			clabel->last_unit = raidPtr->raidid;
   2779 
   2780 			raidflush_component_label(raidPtr, sparecol);
   2781 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2782 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2783 					raidmarkclean(raidPtr, sparecol);
   2784 				}
   2785 			}
   2786 		}
   2787 	}
   2788 }
   2789 
   2790 void
   2791 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2792 {
   2793 
   2794 	if (vp != NULL) {
   2795 		if (auto_configured == 1) {
   2796 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2797 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2798 			vput(vp);
   2799 
   2800 		} else {
   2801 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2802 		}
   2803 	}
   2804 }
   2805 
   2806 
   2807 void
   2808 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2809 {
   2810 	int r,c;
   2811 	struct vnode *vp;
   2812 	int acd;
   2813 
   2814 
   2815 	/* We take this opportunity to close the vnodes like we should.. */
   2816 
   2817 	for (c = 0; c < raidPtr->numCol; c++) {
   2818 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2819 		acd = raidPtr->Disks[c].auto_configured;
   2820 		rf_close_component(raidPtr, vp, acd);
   2821 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2822 		raidPtr->Disks[c].auto_configured = 0;
   2823 	}
   2824 
   2825 	for (r = 0; r < raidPtr->numSpare; r++) {
   2826 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2827 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2828 		rf_close_component(raidPtr, vp, acd);
   2829 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2830 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2831 	}
   2832 }
   2833 
   2834 
   2835 void
   2836 rf_ReconThread(struct rf_recon_req *req)
   2837 {
   2838 	int     s;
   2839 	RF_Raid_t *raidPtr;
   2840 
   2841 	s = splbio();
   2842 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2843 	raidPtr->recon_in_progress = 1;
   2844 
   2845 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2846 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2847 
   2848 	RF_Free(req, sizeof(*req));
   2849 
   2850 	raidPtr->recon_in_progress = 0;
   2851 	splx(s);
   2852 
   2853 	/* That's all... */
   2854 	kthread_exit(0);	/* does not return */
   2855 }
   2856 
   2857 void
   2858 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2859 {
   2860 	int retcode;
   2861 	int s;
   2862 
   2863 	raidPtr->parity_rewrite_stripes_done = 0;
   2864 	raidPtr->parity_rewrite_in_progress = 1;
   2865 	s = splbio();
   2866 	retcode = rf_RewriteParity(raidPtr);
   2867 	splx(s);
   2868 	if (retcode) {
   2869 		printf("raid%d: Error re-writing parity (%d)!\n",
   2870 		    raidPtr->raidid, retcode);
   2871 	} else {
   2872 		/* set the clean bit!  If we shutdown correctly,
   2873 		   the clean bit on each component label will get
   2874 		   set */
   2875 		raidPtr->parity_good = RF_RAID_CLEAN;
   2876 	}
   2877 	raidPtr->parity_rewrite_in_progress = 0;
   2878 
   2879 	/* Anyone waiting for us to stop?  If so, inform them... */
   2880 	if (raidPtr->waitShutdown) {
   2881 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2882 	}
   2883 
   2884 	/* That's all... */
   2885 	kthread_exit(0);	/* does not return */
   2886 }
   2887 
   2888 
   2889 void
   2890 rf_CopybackThread(RF_Raid_t *raidPtr)
   2891 {
   2892 	int s;
   2893 
   2894 	raidPtr->copyback_in_progress = 1;
   2895 	s = splbio();
   2896 	rf_CopybackReconstructedData(raidPtr);
   2897 	splx(s);
   2898 	raidPtr->copyback_in_progress = 0;
   2899 
   2900 	/* That's all... */
   2901 	kthread_exit(0);	/* does not return */
   2902 }
   2903 
   2904 
   2905 void
   2906 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2907 {
   2908 	int s;
   2909 	RF_Raid_t *raidPtr;
   2910 
   2911 	s = splbio();
   2912 	raidPtr = req->raidPtr;
   2913 	raidPtr->recon_in_progress = 1;
   2914 	rf_ReconstructInPlace(raidPtr, req->col);
   2915 	RF_Free(req, sizeof(*req));
   2916 	raidPtr->recon_in_progress = 0;
   2917 	splx(s);
   2918 
   2919 	/* That's all... */
   2920 	kthread_exit(0);	/* does not return */
   2921 }
   2922 
   2923 static RF_AutoConfig_t *
   2924 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2925     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2926     unsigned secsize)
   2927 {
   2928 	int good_one = 0;
   2929 	RF_ComponentLabel_t *clabel;
   2930 	RF_AutoConfig_t *ac;
   2931 
   2932 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2933 	if (clabel == NULL) {
   2934 oomem:
   2935 		    while(ac_list) {
   2936 			    ac = ac_list;
   2937 			    if (ac->clabel)
   2938 				    free(ac->clabel, M_RAIDFRAME);
   2939 			    ac_list = ac_list->next;
   2940 			    free(ac, M_RAIDFRAME);
   2941 		    }
   2942 		    printf("RAID auto config: out of memory!\n");
   2943 		    return NULL; /* XXX probably should panic? */
   2944 	}
   2945 
   2946 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2947 		/* Got the label.  Does it look reasonable? */
   2948 		if (rf_reasonable_label(clabel, numsecs) &&
   2949 		    (rf_component_label_partitionsize(clabel) <= size)) {
   2950 #ifdef DEBUG
   2951 			printf("Component on: %s: %llu\n",
   2952 				cname, (unsigned long long)size);
   2953 			rf_print_component_label(clabel);
   2954 #endif
   2955 			/* if it's reasonable, add it, else ignore it. */
   2956 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   2957 				M_NOWAIT);
   2958 			if (ac == NULL) {
   2959 				free(clabel, M_RAIDFRAME);
   2960 				goto oomem;
   2961 			}
   2962 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   2963 			ac->dev = dev;
   2964 			ac->vp = vp;
   2965 			ac->clabel = clabel;
   2966 			ac->next = ac_list;
   2967 			ac_list = ac;
   2968 			good_one = 1;
   2969 		}
   2970 	}
   2971 	if (!good_one) {
   2972 		/* cleanup */
   2973 		free(clabel, M_RAIDFRAME);
   2974 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2975 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2976 		vput(vp);
   2977 	}
   2978 	return ac_list;
   2979 }
   2980 
   2981 RF_AutoConfig_t *
   2982 rf_find_raid_components(void)
   2983 {
   2984 	struct vnode *vp;
   2985 	struct disklabel label;
   2986 	device_t dv;
   2987 	deviter_t di;
   2988 	dev_t dev;
   2989 	int bmajor, bminor, wedge;
   2990 	int error;
   2991 	int i;
   2992 	RF_AutoConfig_t *ac_list;
   2993 	uint64_t numsecs;
   2994 	unsigned secsize;
   2995 
   2996 	/* initialize the AutoConfig list */
   2997 	ac_list = NULL;
   2998 
   2999 	/* we begin by trolling through *all* the devices on the system */
   3000 
   3001 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3002 	     dv = deviter_next(&di)) {
   3003 
   3004 		/* we are only interested in disks... */
   3005 		if (device_class(dv) != DV_DISK)
   3006 			continue;
   3007 
   3008 		/* we don't care about floppies... */
   3009 		if (device_is_a(dv, "fd")) {
   3010 			continue;
   3011 		}
   3012 
   3013 		/* we don't care about CD's... */
   3014 		if (device_is_a(dv, "cd")) {
   3015 			continue;
   3016 		}
   3017 
   3018 		/* we don't care about md's... */
   3019 		if (device_is_a(dv, "md")) {
   3020 			continue;
   3021 		}
   3022 
   3023 		/* hdfd is the Atari/Hades floppy driver */
   3024 		if (device_is_a(dv, "hdfd")) {
   3025 			continue;
   3026 		}
   3027 
   3028 		/* fdisa is the Atari/Milan floppy driver */
   3029 		if (device_is_a(dv, "fdisa")) {
   3030 			continue;
   3031 		}
   3032 
   3033 		/* need to find the device_name_to_block_device_major stuff */
   3034 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3035 
   3036 		/* get a vnode for the raw partition of this disk */
   3037 
   3038 		wedge = device_is_a(dv, "dk");
   3039 		bminor = minor(device_unit(dv));
   3040 		dev = wedge ? makedev(bmajor, bminor) :
   3041 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3042 		if (bdevvp(dev, &vp))
   3043 			panic("RAID can't alloc vnode");
   3044 
   3045 		error = VOP_OPEN(vp, FREAD, NOCRED);
   3046 
   3047 		if (error) {
   3048 			/* "Who cares."  Continue looking
    3049 			   for something that exists */
   3050 			vput(vp);
   3051 			continue;
   3052 		}
   3053 
   3054 		error = getdisksize(vp, &numsecs, &secsize);
   3055 		if (error) {
   3056 			vput(vp);
   3057 			continue;
   3058 		}
   3059 		if (wedge) {
   3060 			struct dkwedge_info dkw;
   3061 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3062 			    NOCRED);
   3063 			if (error) {
   3064 				printf("RAIDframe: can't get wedge info for "
   3065 				    "dev %s (%d)\n", device_xname(dv), error);
   3066 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3067 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3068 				vput(vp);
   3069 				continue;
   3070 			}
   3071 
   3072 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3073 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3074 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3075 				vput(vp);
   3076 				continue;
   3077 			}
   3078 
   3079 			ac_list = rf_get_component(ac_list, dev, vp,
   3080 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3081 			continue;
   3082 		}
   3083 
   3084 		/* Ok, the disk exists.  Go get the disklabel. */
   3085 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3086 		if (error) {
   3087 			/*
   3088 			 * XXX can't happen - open() would
   3089 			 * have errored out (or faked up one)
   3090 			 */
   3091 			if (error != ENOTTY)
   3092 				printf("RAIDframe: can't get label for dev "
   3093 				    "%s (%d)\n", device_xname(dv), error);
   3094 		}
   3095 
   3096 		/* don't need this any more.  We'll allocate it again
   3097 		   a little later if we really do... */
   3098 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3099 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3100 		vput(vp);
   3101 
   3102 		if (error)
   3103 			continue;
   3104 
   3105 		for (i = 0; i < label.d_npartitions; i++) {
   3106 			char cname[sizeof(ac_list->devname)];
   3107 
   3108 			/* We only support partitions marked as RAID */
   3109 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3110 				continue;
   3111 
   3112 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3113 			if (bdevvp(dev, &vp))
   3114 				panic("RAID can't alloc vnode");
   3115 
   3116 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3117 			if (error) {
   3118 				/* Whatever... */
   3119 				vput(vp);
   3120 				continue;
   3121 			}
   3122 			snprintf(cname, sizeof(cname), "%s%c",
   3123 			    device_xname(dv), 'a' + i);
   3124 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3125 				label.d_partitions[i].p_size, numsecs, secsize);
   3126 		}
   3127 	}
   3128 	deviter_release(&di);
   3129 	return ac_list;
   3130 }
   3131 
   3132 
   3133 static int
   3134 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3135 {
   3136 
   3137 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3138 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3139 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3140 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3141 	    clabel->row >=0 &&
   3142 	    clabel->column >= 0 &&
   3143 	    clabel->num_rows > 0 &&
   3144 	    clabel->num_columns > 0 &&
   3145 	    clabel->row < clabel->num_rows &&
   3146 	    clabel->column < clabel->num_columns &&
   3147 	    clabel->blockSize > 0 &&
   3148 	    /*
   3149 	     * numBlocksHi may contain garbage, but it is ok since
   3150 	     * the type is unsigned.  If it is really garbage,
   3151 	     * rf_fix_old_label_size() will fix it.
   3152 	     */
   3153 	    rf_component_label_numblocks(clabel) > 0) {
   3154 		/*
   3155 		 * label looks reasonable enough...
   3156 		 * let's make sure it has no old garbage.
   3157 		 */
   3158 		rf_fix_old_label_size(clabel, numsecs);
   3159 		return(1);
   3160 	}
   3161 	return(0);
   3162 }
   3163 
   3164 
   3165 /*
   3166  * For reasons yet unknown, some old component labels have garbage in
   3167  * the newer numBlocksHi region, and this causes lossage.  Since those
   3168  * disks will also have numsecs set to less than 32 bits of sectors,
    3169  * we can determine when this corruption has occurred, and fix it.
   3170  *
   3171  * The exact same problem, with the same unknown reason, happens to
   3172  * the partitionSizeHi member as well.
   3173  */
   3174 static void
   3175 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3176 {
   3177 
   3178 	if (numsecs < ((uint64_t)1 << 32)) {
   3179 		if (clabel->numBlocksHi) {
   3180 			printf("WARNING: total sectors < 32 bits, yet "
   3181 			       "numBlocksHi set\n"
   3182 			       "WARNING: resetting numBlocksHi to zero.\n");
   3183 			clabel->numBlocksHi = 0;
   3184 		}
   3185 
   3186 		if (clabel->partitionSizeHi) {
   3187 			printf("WARNING: total sectors < 32 bits, yet "
   3188 			       "partitionSizeHi set\n"
   3189 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3190 			clabel->partitionSizeHi = 0;
   3191 		}
   3192 	}
   3193 }
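
/*
 * For example (illustrative numbers), a 2 TB component with 512-byte
 * sectors reports numsecs below 2^32, so any non-zero numBlocksHi or
 * partitionSizeHi found in its label must be stale garbage and is cleared
 * by rf_fix_old_label_size() above.
 */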
   3194 
   3195 
   3196 #ifdef DEBUG
   3197 void
   3198 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3199 {
   3200 	uint64_t numBlocks;
   3201 
   3202 	numBlocks = rf_component_label_numblocks(clabel);
   3203 
   3204 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3205 	       clabel->row, clabel->column,
   3206 	       clabel->num_rows, clabel->num_columns);
   3207 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3208 	       clabel->version, clabel->serial_number,
   3209 	       clabel->mod_counter);
   3210 	printf("   Clean: %s Status: %d\n",
   3211 	       clabel->clean ? "Yes" : "No", clabel->status);
   3212 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3213 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3214 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3215 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3216 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3217 	printf("   Contains root partition: %s\n",
   3218 	       clabel->root_partition ? "Yes" : "No");
   3219 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3220 #if 0
   3221 	   printf("   Config order: %d\n", clabel->config_order);
   3222 #endif
   3223 
   3224 }
   3225 #endif
   3226 
   3227 RF_ConfigSet_t *
   3228 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3229 {
   3230 	RF_AutoConfig_t *ac;
   3231 	RF_ConfigSet_t *config_sets;
   3232 	RF_ConfigSet_t *cset;
   3233 	RF_AutoConfig_t *ac_next;
   3234 
   3235 
   3236 	config_sets = NULL;
   3237 
   3238 	/* Go through the AutoConfig list, and figure out which components
   3239 	   belong to what sets.  */
   3240 	ac = ac_list;
   3241 	while(ac!=NULL) {
   3242 		/* we're going to putz with ac->next, so save it here
   3243 		   for use at the end of the loop */
   3244 		ac_next = ac->next;
   3245 
   3246 		if (config_sets == NULL) {
   3247 			/* will need at least this one... */
   3248 			config_sets = (RF_ConfigSet_t *)
   3249 				malloc(sizeof(RF_ConfigSet_t),
   3250 				       M_RAIDFRAME, M_NOWAIT);
   3251 			if (config_sets == NULL) {
   3252 				panic("rf_create_auto_sets: No memory!");
   3253 			}
   3254 			/* this one is easy :) */
   3255 			config_sets->ac = ac;
   3256 			config_sets->next = NULL;
   3257 			config_sets->rootable = 0;
   3258 			ac->next = NULL;
   3259 		} else {
   3260 			/* which set does this component fit into? */
   3261 			cset = config_sets;
   3262 			while(cset!=NULL) {
   3263 				if (rf_does_it_fit(cset, ac)) {
   3264 					/* looks like it matches... */
   3265 					ac->next = cset->ac;
   3266 					cset->ac = ac;
   3267 					break;
   3268 				}
   3269 				cset = cset->next;
   3270 			}
   3271 			if (cset==NULL) {
   3272 				/* didn't find a match above... new set..*/
   3273 				cset = (RF_ConfigSet_t *)
   3274 					malloc(sizeof(RF_ConfigSet_t),
   3275 					       M_RAIDFRAME, M_NOWAIT);
   3276 				if (cset == NULL) {
   3277 					panic("rf_create_auto_sets: No memory!");
   3278 				}
   3279 				cset->ac = ac;
   3280 				ac->next = NULL;
   3281 				cset->next = config_sets;
   3282 				cset->rootable = 0;
   3283 				config_sets = cset;
   3284 			}
   3285 		}
   3286 		ac = ac_next;
   3287 	}
   3288 
   3289 
   3290 	return(config_sets);
   3291 }
   3292 
   3293 static int
   3294 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3295 {
   3296 	RF_ComponentLabel_t *clabel1, *clabel2;
   3297 
   3298 	/* If this one matches the *first* one in the set, that's good
   3299 	   enough, since the other members of the set would have been
   3300 	   through here too... */
   3301 	/* note that we are not checking partitionSize here..
   3302 
   3303 	   Note that we are also not checking the mod_counters here.
    3304 	   If everything else matches except the mod_counter, that's
   3305 	   good enough for this test.  We will deal with the mod_counters
   3306 	   a little later in the autoconfiguration process.
   3307 
   3308 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3309 
   3310 	   The reason we don't check for this is that failed disks
   3311 	   will have lower modification counts.  If those disks are
   3312 	   not added to the set they used to belong to, then they will
   3313 	   form their own set, which may result in 2 different sets,
   3314 	   for example, competing to be configured at raid0, and
   3315 	   perhaps competing to be the root filesystem set.  If the
   3316 	   wrong ones get configured, or both attempt to become /,
    3317 	   weird behaviour and/or serious lossage will occur.  Thus we
   3318 	   need to bring them into the fold here, and kick them out at
   3319 	   a later point.
   3320 
   3321 	*/
   3322 
   3323 	clabel1 = cset->ac->clabel;
   3324 	clabel2 = ac->clabel;
   3325 	if ((clabel1->version == clabel2->version) &&
   3326 	    (clabel1->serial_number == clabel2->serial_number) &&
   3327 	    (clabel1->num_rows == clabel2->num_rows) &&
   3328 	    (clabel1->num_columns == clabel2->num_columns) &&
   3329 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3330 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3331 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3332 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3333 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3334 	    (clabel1->blockSize == clabel2->blockSize) &&
   3335 	    rf_component_label_numblocks(clabel1) ==
   3336 	    rf_component_label_numblocks(clabel2) &&
   3337 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3338 	    (clabel1->root_partition == clabel2->root_partition) &&
   3339 	    (clabel1->last_unit == clabel2->last_unit) &&
   3340 	    (clabel1->config_order == clabel2->config_order)) {
    3341 		/* if it gets here, it almost *has* to be a match */
   3342 	} else {
   3343 		/* it's not consistent with somebody in the set..
   3344 		   punt */
   3345 		return(0);
   3346 	}
   3347 	/* all was fine.. it must fit... */
   3348 	return(1);
   3349 }
   3350 
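/*
 * rf_have_enough_components -- decide whether a configuration set has
 * enough live components to be autoconfigured.  The highest mod_counter
 * in the set is taken as authoritative, and a column whose component
 * does not carry that mod_counter counts as missing.  RAID 1 sets are
 * handled by even/odd pairs (a pair may lose one member, but not both);
 * otherwise RAID 0 tolerates no missing components and RAID 4/5 tolerate
 * at most one.  Returns 1 if the set is configurable, 0 otherwise.
 */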
   3351 int
   3352 rf_have_enough_components(RF_ConfigSet_t *cset)
   3353 {
   3354 	RF_AutoConfig_t *ac;
   3355 	RF_AutoConfig_t *auto_config;
   3356 	RF_ComponentLabel_t *clabel;
   3357 	int c;
   3358 	int num_cols;
   3359 	int num_missing;
   3360 	int mod_counter;
   3361 	int mod_counter_found;
   3362 	int even_pair_failed;
   3363 	char parity_type;
   3364 
   3365 
   3366 	/* check to see that we have enough 'live' components
   3367 	   of this set.  If so, we can configure it if necessary */
   3368 
   3369 	num_cols = cset->ac->clabel->num_columns;
   3370 	parity_type = cset->ac->clabel->parityConfig;
   3371 
   3372 	/* XXX Check for duplicate components!?!?!? */
   3373 
   3374 	/* Determine what the mod_counter is supposed to be for this set. */
   3375 
   3376 	mod_counter_found = 0;
   3377 	mod_counter = 0;
   3378 	ac = cset->ac;
   3379 	while(ac!=NULL) {
   3380 		if (mod_counter_found==0) {
   3381 			mod_counter = ac->clabel->mod_counter;
   3382 			mod_counter_found = 1;
   3383 		} else {
   3384 			if (ac->clabel->mod_counter > mod_counter) {
   3385 				mod_counter = ac->clabel->mod_counter;
   3386 			}
   3387 		}
   3388 		ac = ac->next;
   3389 	}
   3390 
   3391 	num_missing = 0;
   3392 	auto_config = cset->ac;
   3393 
   3394 	even_pair_failed = 0;
   3395 	for(c=0; c<num_cols; c++) {
   3396 		ac = auto_config;
   3397 		while(ac!=NULL) {
   3398 			if ((ac->clabel->column == c) &&
   3399 			    (ac->clabel->mod_counter == mod_counter)) {
   3400 				/* it's this one... */
   3401 #ifdef DEBUG
   3402 				printf("Found: %s at %d\n",
   3403 				       ac->devname,c);
   3404 #endif
   3405 				break;
   3406 			}
   3407 			ac=ac->next;
   3408 		}
   3409 		if (ac==NULL) {
   3410 				/* Didn't find one here! */
   3411 				/* special case for RAID 1, especially
   3412 				   where there are more than 2
   3413 				   components (where RAIDframe treats
   3414 				   things a little differently :( ) */
   3415 			if (parity_type == '1') {
   3416 				if (c%2 == 0) { /* even component */
   3417 					even_pair_failed = 1;
    3418 				} else { /* odd component.  If
    3419 					    this one is missing,
    3420 					    and so is its even
    3421 					    partner, it's
    3422 					    "Good Night, Charlie" */
   3423 					if (even_pair_failed == 1) {
   3424 						return(0);
   3425 					}
   3426 				}
   3427 			} else {
   3428 				/* normal accounting */
   3429 				num_missing++;
   3430 			}
   3431 		}
   3432 		if ((parity_type == '1') && (c%2 == 1)) {
    3433 				/* Just finished the odd component of a pair
    3434 				   without bailing.. reset the even_pair_failed
    3435 				   flag, and go on to the next pair.... */
   3436 			even_pair_failed = 0;
   3437 		}
   3438 	}
   3439 
   3440 	clabel = cset->ac->clabel;
   3441 
   3442 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3443 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3444 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3445 		/* XXX this needs to be made *much* more general */
   3446 		/* Too many failures */
   3447 		return(0);
   3448 	}
   3449 	/* otherwise, all is well, and we've got enough to take a kick
   3450 	   at autoconfiguring this set */
   3451 	return(1);
   3452 }
   3453 
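/*
 * rf_create_configuration -- build an RF_Config_t for an auto-configured
 * set: the geometry comes from the first component label, the device
 * name for each column comes from the auto-config entries, and the disk
 * queue type is hardwired to "fifo".
 */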
   3454 void
   3455 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3456 			RF_Raid_t *raidPtr)
   3457 {
   3458 	RF_ComponentLabel_t *clabel;
   3459 	int i;
   3460 
   3461 	clabel = ac->clabel;
   3462 
   3463 	/* 1. Fill in the common stuff */
   3464 	config->numRow = clabel->num_rows = 1;
   3465 	config->numCol = clabel->num_columns;
   3466 	config->numSpare = 0; /* XXX should this be set here? */
   3467 	config->sectPerSU = clabel->sectPerSU;
   3468 	config->SUsPerPU = clabel->SUsPerPU;
   3469 	config->SUsPerRU = clabel->SUsPerRU;
   3470 	config->parityConfig = clabel->parityConfig;
   3471 	/* XXX... */
   3472 	strcpy(config->diskQueueType,"fifo");
   3473 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3474 	config->layoutSpecificSize = 0; /* XXX ?? */
   3475 
   3476 	while(ac!=NULL) {
   3477 		/* row/col values will be in range due to the checks
   3478 		   in reasonable_label() */
   3479 		strcpy(config->devnames[0][ac->clabel->column],
   3480 		       ac->devname);
   3481 		ac = ac->next;
   3482 	}
   3483 
   3484 	for(i=0;i<RF_MAXDBGV;i++) {
   3485 		config->debugVars[i][0] = 0;
   3486 	}
   3487 }
   3488 
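/*
 * rf_set_autoconfig -- set the autoconfigure flag for this set.  The new
 * value is written into the component label of every optimal component
 * and every used spare, and each label is flushed back to disk.  Returns
 * the new value.  (Typically this is driven from userland, e.g. by
 * raidctl(8), via the corresponding RAIDframe ioctl.)
 */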
   3489 int
   3490 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3491 {
   3492 	RF_ComponentLabel_t *clabel;
   3493 	int column;
   3494 	int sparecol;
   3495 
   3496 	raidPtr->autoconfigure = new_value;
   3497 
   3498 	for(column=0; column<raidPtr->numCol; column++) {
   3499 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3500 			clabel = raidget_component_label(raidPtr, column);
   3501 			clabel->autoconfigure = new_value;
   3502 			raidflush_component_label(raidPtr, column);
   3503 		}
   3504 	}
   3505 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3506 		sparecol = raidPtr->numCol + column;
   3507 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3508 			clabel = raidget_component_label(raidPtr, sparecol);
   3509 			clabel->autoconfigure = new_value;
   3510 			raidflush_component_label(raidPtr, sparecol);
   3511 		}
   3512 	}
   3513 	return(new_value);
   3514 }
   3515 
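/*
 * rf_set_rootpartition -- as rf_set_autoconfig(), but for the
 * root_partition flag: propagate the new value to the component labels
 * of all optimal components and used spares, flush them, and return it.
 */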
   3516 int
   3517 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3518 {
   3519 	RF_ComponentLabel_t *clabel;
   3520 	int column;
   3521 	int sparecol;
   3522 
   3523 	raidPtr->root_partition = new_value;
   3524 	for(column=0; column<raidPtr->numCol; column++) {
   3525 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3526 			clabel = raidget_component_label(raidPtr, column);
   3527 			clabel->root_partition = new_value;
   3528 			raidflush_component_label(raidPtr, column);
   3529 		}
   3530 	}
   3531 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3532 		sparecol = raidPtr->numCol + column;
   3533 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3534 			clabel = raidget_component_label(raidPtr, sparecol);
   3535 			clabel->root_partition = new_value;
   3536 			raidflush_component_label(raidPtr, sparecol);
   3537 		}
   3538 	}
   3539 	return(new_value);
   3540 }
   3541 
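/*
 * rf_release_all_vps -- close and release the vnode of every component
 * in a configuration set that still has one open.
 */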
   3542 void
   3543 rf_release_all_vps(RF_ConfigSet_t *cset)
   3544 {
   3545 	RF_AutoConfig_t *ac;
   3546 
   3547 	ac = cset->ac;
   3548 	while(ac!=NULL) {
   3549 		/* Close the vp, and give it back */
   3550 		if (ac->vp) {
   3551 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3552 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3553 			vput(ac->vp);
   3554 			ac->vp = NULL;
   3555 		}
   3556 		ac = ac->next;
   3557 	}
   3558 }
   3559 
   3560 
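/*
 * rf_cleanup_config_set -- free the component labels and auto-config
 * entries belonging to a configuration set, and then the set itself.
 */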
   3561 void
   3562 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3563 {
   3564 	RF_AutoConfig_t *ac;
   3565 	RF_AutoConfig_t *next_ac;
   3566 
   3567 	ac = cset->ac;
   3568 	while(ac!=NULL) {
   3569 		next_ac = ac->next;
   3570 		/* nuke the label */
   3571 		free(ac->clabel, M_RAIDFRAME);
   3572 		/* cleanup the config structure */
   3573 		free(ac, M_RAIDFRAME);
   3574 		/* "next.." */
   3575 		ac = next_ac;
   3576 	}
   3577 	/* and, finally, nuke the config set */
   3578 	free(cset, M_RAIDFRAME);
   3579 }
   3580 
   3581 
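/*
 * raid_init_component_label -- initialize a component label from the
 * current state of the RAID set: geometry, serial number and mod
 * counter, autoconfigure/root flags, and (unless RF_NO_PARITY_MAP is
 * defined) the parity map portion of the label.
 */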
   3582 void
   3583 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3584 {
   3585 	/* current version number */
   3586 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3587 	clabel->serial_number = raidPtr->serial_number;
   3588 	clabel->mod_counter = raidPtr->mod_counter;
   3589 
   3590 	clabel->num_rows = 1;
   3591 	clabel->num_columns = raidPtr->numCol;
   3592 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3593 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3594 
   3595 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3596 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3597 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3598 
   3599 	clabel->blockSize = raidPtr->bytesPerSector;
   3600 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3601 
   3602 	/* XXX not portable */
   3603 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3604 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3605 	clabel->autoconfigure = raidPtr->autoconfigure;
   3606 	clabel->root_partition = raidPtr->root_partition;
   3607 	clabel->last_unit = raidPtr->raidid;
   3608 	clabel->config_order = raidPtr->config_order;
   3609 
   3610 #ifndef RF_NO_PARITY_MAP
   3611 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3612 #endif
   3613 }
   3614 
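/*
 * rf_auto_config_set -- autoconfigure one configuration set:
 *	1. allocate and zero an RF_Config_t;
 *	2. pick a unit number, preferring the set's last_unit and
 *	   otherwise scanning downward for a free raid device;
 *	3. build the configuration with rf_create_configuration();
 *	4. configure with rf_Configure() and, on success, initialize the
 *	   device, mark all components dirty, and note whether the set
 *	   is eligible to contain the root filesystem.
 * Returns 0 on success, with the chosen unit number stored in *unit.
 */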
   3615 int
   3616 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3617 {
   3618 	RF_Raid_t *raidPtr;
   3619 	RF_Config_t *config;
   3620 	int raidID;
   3621 	int retcode;
   3622 
   3623 #ifdef DEBUG
   3624 	printf("RAID autoconfigure\n");
   3625 #endif
   3626 
   3627 	retcode = 0;
   3628 	*unit = -1;
   3629 
   3630 	/* 1. Create a config structure */
   3631 
   3632 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3633 				       M_RAIDFRAME,
   3634 				       M_NOWAIT);
   3635 	if (config==NULL) {
   3636 		printf("Out of mem!?!?\n");
   3637 				/* XXX do something more intelligent here. */
   3638 		return(1);
   3639 	}
   3640 
   3641 	memset(config, 0, sizeof(RF_Config_t));
   3642 
   3643 	/*
    3644 	   2. Figure out what RAID ID this one is supposed to live at.
    3645 	   See if we can get the same RAID device that it was configured
   3646 	   on last time..
   3647 	*/
   3648 
   3649 	raidID = cset->ac->clabel->last_unit;
   3650 	if ((raidID < 0) || (raidID >= numraid)) {
   3651 		/* let's not wander off into lala land. */
   3652 		raidID = numraid - 1;
   3653 	}
   3654 	if (raidPtrs[raidID]->valid != 0) {
   3655 
   3656 		/*
   3657 		   Nope... Go looking for an alternative...
   3658 		   Start high so we don't immediately use raid0 if that's
   3659 		   not taken.
   3660 		*/
   3661 
   3662 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3663 			if (raidPtrs[raidID]->valid == 0) {
   3664 				/* can use this one! */
   3665 				break;
   3666 			}
   3667 		}
   3668 	}
   3669 
   3670 	if (raidID < 0) {
   3671 		/* punt... */
   3672 		printf("Unable to auto configure this set!\n");
   3673 		printf("(Out of RAID devs!)\n");
   3674 		free(config, M_RAIDFRAME);
   3675 		return(1);
   3676 	}
   3677 
   3678 #ifdef DEBUG
   3679 	printf("Configuring raid%d:\n",raidID);
   3680 #endif
   3681 
   3682 	raidPtr = raidPtrs[raidID];
   3683 
   3684 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3685 	raidPtr->raidid = raidID;
   3686 	raidPtr->openings = RAIDOUTSTANDING;
   3687 
   3688 	/* 3. Build the configuration structure */
   3689 	rf_create_configuration(cset->ac, config, raidPtr);
   3690 
   3691 	/* 4. Do the configuration */
   3692 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3693 
   3694 	if (retcode == 0) {
   3695 
   3696 		raidinit(raidPtrs[raidID]);
   3697 
   3698 		rf_markalldirty(raidPtrs[raidID]);
   3699 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3700 		if (cset->ac->clabel->root_partition==1) {
   3701 			/* everything configured just fine.  Make a note
   3702 			   that this set is eligible to be root. */
   3703 			cset->rootable = 1;
   3704 			/* XXX do this here? */
   3705 			raidPtrs[raidID]->root_partition = 1;
   3706 		}
   3707 	}
   3708 
   3709 	/* 5. Cleanup */
   3710 	free(config, M_RAIDFRAME);
   3711 
   3712 	*unit = raidID;
   3713 	return(retcode);
   3714 }
   3715 
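/*
 * rf_disk_unbusy -- account a completed access descriptor against the
 * disk statistics of the owning raid device.
 */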
   3716 void
   3717 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3718 {
   3719 	struct buf *bp;
   3720 
   3721 	bp = (struct buf *)desc->bp;
   3722 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3723 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3724 }
   3725 
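/*
 * rf_pool_init -- convenience wrapper around pool_init(9): create a
 * pool at IPL_BIO, prime it with xmin items, and use xmin/xmax as the
 * low and high watermarks.  An illustrative (hypothetical) call for a
 * pool of RF_Foo_t structures might look like:
 *
 *	rf_pool_init(&foo_pool, sizeof(RF_Foo_t), "rf_foopl", 8, 64);
 */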
   3726 void
   3727 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3728 	     size_t xmin, size_t xmax)
   3729 {
   3730 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3731 	pool_sethiwat(p, xmax);
   3732 	pool_prime(p, xmin);
   3733 	pool_setlowat(p, xmin);
   3734 }
   3735 
   3736 /*
   3737  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3738  * if there is IO pending and if that IO could possibly be done for a
   3739  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3740  * otherwise.
   3741  *
   3742  */
   3743 
   3744 int
   3745 rf_buf_queue_check(int raidid)
   3746 {
   3747 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
   3748 	    raidPtrs[raidid]->openings > 0) {
   3749 		/* there is work to do */
   3750 		return 0;
   3751 	}
   3752 	/* default is nothing to do */
   3753 	return 1;
   3754 }
   3755 
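/*
 * rf_getdisksize -- obtain the size and sector size of a component via
 * getdisksize() and fill in the RF_RaidDisk_t: blockSize, the full
 * partition size, and numBlocks (the partition size less
 * rf_protectedSectors, which RAIDframe reserves for its own use).
 * Returns 0 on success, or the error from getdisksize().
 */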
   3756 int
   3757 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
   3758 {
   3759 	uint64_t numsecs;
   3760 	unsigned secsize;
   3761 	int error;
   3762 
   3763 	error = getdisksize(vp, &numsecs, &secsize);
   3764 	if (error == 0) {
   3765 		diskPtr->blockSize = secsize;
   3766 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3767 		diskPtr->partitionSize = numsecs;
   3768 		return 0;
   3769 	}
   3770 	return error;
   3771 }
   3772 
   3773 static int
   3774 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3775 {
   3776 	return 1;
   3777 }
   3778 
   3779 static void
   3780 raid_attach(device_t parent, device_t self, void *aux)
   3781 {
   3782 
   3783 }
   3784 
   3785 
   3786 static int
   3787 raid_detach(device_t self, int flags)
   3788 {
   3789 	int error;
   3790 	struct raid_softc *rs = &raid_softc[device_unit(self)];
   3791 
   3792 	if ((error = raidlock(rs)) != 0)
   3793 		return (error);
   3794 
   3795 	error = raid_detach_unlocked(rs);
   3796 
   3797 	raidunlock(rs);
   3798 
   3799 	return error;
   3800 }
   3801 
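/*
 * rf_set_properties -- attach a synthetic "disk-info" geometry
 * dictionary to the raid device: the real totals (sectors-per-unit,
 * sector-size) plus a fabricated track/cylinder layout derived from the
 * stripe geometry, since a RAID set has no physical geometry of its own.
 */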
   3802 static void
   3803 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3804 {
   3805 	prop_dictionary_t disk_info, odisk_info, geom;
   3806 	disk_info = prop_dictionary_create();
   3807 	geom = prop_dictionary_create();
   3808 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3809 				   raidPtr->totalSectors);
   3810 	prop_dictionary_set_uint32(geom, "sector-size",
   3811 				   raidPtr->bytesPerSector);
   3812 
   3813 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3814 				   raidPtr->Layout.dataSectorsPerStripe);
   3815 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3816 				   4 * raidPtr->numCol);
   3817 
   3818 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3819 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3820 	   (4 * raidPtr->numCol)));
   3821 
   3822 	prop_dictionary_set(disk_info, "geometry", geom);
   3823 	prop_object_release(geom);
   3824 	prop_dictionary_set(device_properties(rs->sc_dev),
   3825 			    "disk-info", disk_info);
   3826 	odisk_info = rs->sc_dkdev.dk_info;
   3827 	rs->sc_dkdev.dk_info = disk_info;
   3828 	if (odisk_info)
   3829 		prop_object_release(odisk_info);
   3830 }
   3831 
   3832 /*
   3833  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3834  * We end up returning whatever error was returned by the first cache flush
   3835  * that fails.
   3836  */
   3837 
   3838 int
   3839 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3840 {
   3841 	int c, sparecol;
   3842 	int e,error;
   3843 	int force = 1;
   3844 
   3845 	error = 0;
   3846 	for (c = 0; c < raidPtr->numCol; c++) {
   3847 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3848 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3849 					  &force, FWRITE, NOCRED);
   3850 			if (e) {
   3851 				if (e != ENODEV)
   3852 					printf("raid%d: cache flush to component %s failed.\n",
   3853 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3854 				if (error == 0) {
   3855 					error = e;
   3856 				}
   3857 			}
   3858 		}
   3859 	}
   3860 
   3861 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3862 		sparecol = raidPtr->numCol + c;
   3863 		/* Need to ensure that the reconstruct actually completed! */
   3864 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3865 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3866 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3867 			if (e) {
   3868 				if (e != ENODEV)
   3869 					printf("raid%d: cache flush to component %s failed.\n",
   3870 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3871 				if (error == 0) {
   3872 					error = e;
   3873 				}
   3874 			}
   3875 		}
   3876 	}
   3877 	return error;
   3878 }
   3879