Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.294.2.3
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.294.2.3 2012/10/30 17:21:59 yamt Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.294.2.3 2012/10/30 17:21:59 yamt Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
#ifdef DEBUG
/* Debug verbosity: db1_printf() only prints while this is greater than 0. */
int     rf_kdebug_level = 0;
/*
 * db1_printf((fmt, args...)) -- level-1 debug printf.
 *
 * The single macro argument is a complete, parenthesized printf()
 * argument list, hence the doubled parentheses at every call site.
 * The expansion is wrapped in do { } while (0) so that it behaves as
 * exactly one statement: it cannot capture a following "else", and
 * "if (x) db1_printf((...)); else ..." compiles in both configurations.
 */
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */
    161 
    162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 static void raidinit(RF_Raid_t *);
    183 
    184 void raidattach(int);
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 dev_type_open(raidopen);
    201 dev_type_close(raidclose);
    202 dev_type_read(raidread);
    203 dev_type_write(raidwrite);
    204 dev_type_ioctl(raidioctl);
    205 dev_type_strategy(raidstrategy);
    206 dev_type_dump(raiddump);
    207 dev_type_size(raidsize);
    208 
    209 const struct bdevsw raid_bdevsw = {
    210 	raidopen, raidclose, raidstrategy, raidioctl,
    211 	raiddump, raidsize, D_DISK
    212 };
    213 
    214 const struct cdevsw raid_cdevsw = {
    215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    217 };
    218 
    219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    220 
    221 /* XXX Not sure if the following should be replacing the raidPtrs above,
    222    or if it should be used in conjunction with that...
    223 */
    224 
    225 struct raid_softc {
    226 	device_t sc_dev;
    227 	int     sc_flags;	/* flags */
    228 	int     sc_cflags;	/* configuration flags */
    229 	uint64_t sc_size;	/* size of the raid device */
    230 	char    sc_xname[20];	/* XXX external name */
    231 	struct disk sc_dkdev;	/* generic disk device info */
    232 	struct bufq_state *buf_queue;	/* used for the device queue */
    233 };
    234 /* sc_flags */
    235 #define RAIDF_INITED	0x01	/* unit has been initialized */
    236 #define RAIDF_WLABEL	0x02	/* label area is writable */
    237 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    238 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    239 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    240 #define RAIDF_LOCKED	0x80	/* unit is locked */
    241 
    242 #define	raidunit(x)	DISKUNIT(x)
    243 int numraid = 0;
    244 
    245 extern struct cfdriver raid_cd;
    246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    247     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    248     DVF_DETACH_SHUTDOWN);
    249 
    250 /*
    251  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    252  * Be aware that large numbers can allow the driver to consume a lot of
    253  * kernel memory, especially on writes, and in degraded mode reads.
    254  *
    255  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    256  * a single 64K write will typically require 64K for the old data,
    257  * 64K for the old parity, and 64K for the new parity, for a total
    258  * of 192K (if the parity buffer is not re-used immediately).
    259  * Even if it is used immediately, that's still 128K, which when multiplied
    260  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    261  *
    262  * Now in degraded mode, for example, a 64K read on the above setup may
    263  * require data reconstruction, which will require *all* of the 4 remaining
    264  * disks to participate -- 4 * 32K/disk == 128K again.
    265  */
    266 
#if !defined(RAIDOUTSTANDING)
#define RAIDOUTSTANDING   6
#endif

/* dev_t naming the raw partition (RAW_PART) of the unit that `dev' is on. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* Deliberately not static: made public for the benefit of KVM stuff. */
struct raid_softc *raid_softc;
    276 
    277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    278 				     struct disklabel *);
    279 static void raidgetdisklabel(dev_t);
    280 static void raidmakedisklabel(struct raid_softc *);
    281 
    282 static int raidlock(struct raid_softc *);
    283 static void raidunlock(struct raid_softc *);
    284 
    285 static int raid_detach_unlocked(struct raid_softc *);
    286 
    287 static void rf_markalldirty(RF_Raid_t *);
    288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    289 
    290 void rf_ReconThread(struct rf_recon_req *);
    291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    292 void rf_CopybackThread(RF_Raid_t *raidPtr);
    293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    294 int rf_autoconfig(device_t);
    295 void rf_buildroothack(RF_ConfigSet_t *);
    296 
    297 RF_AutoConfig_t *rf_find_raid_components(void);
    298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    302 int rf_set_autoconfig(RF_Raid_t *, int);
    303 int rf_set_rootpartition(RF_Raid_t *, int);
    304 void rf_release_all_vps(RF_ConfigSet_t *);
    305 void rf_cleanup_config_set(RF_ConfigSet_t *);
    306 int rf_have_enough_components(RF_ConfigSet_t *);
    307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    309 
    310 /*
    311  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    312  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    313  * in the kernel config file.
    314  */
    315 #ifdef RAID_AUTOCONFIG
    316 int raidautoconfig = 1;
    317 #else
    318 int raidautoconfig = 0;
    319 #endif
    320 static bool raidautoconfigdone = false;
    321 
    322 struct RF_Pools_s rf_pools;
    323 
    324 void
    325 raidattach(int num)
    326 {
    327 	int raidID;
    328 	int i, rc;
    329 
    330 	aprint_debug("raidattach: Asked for %d units\n", num);
    331 
    332 	if (num <= 0) {
    333 #ifdef DIAGNOSTIC
    334 		panic("raidattach: count <= 0");
    335 #endif
    336 		return;
    337 	}
    338 	/* This is where all the initialization stuff gets done. */
    339 
    340 	numraid = num;
    341 
    342 	/* Make some space for requested number of units... */
    343 
    344 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    345 	if (raidPtrs == NULL) {
    346 		panic("raidPtrs is NULL!!");
    347 	}
    348 
    349 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    350 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    351 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    352 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    353 
    354 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    355 #endif
    356 
    357 	for (i = 0; i < num; i++)
    358 		raidPtrs[i] = NULL;
    359 	rc = rf_BootRaidframe();
    360 	if (rc == 0)
    361 		aprint_verbose("Kernelized RAIDframe activated\n");
    362 	else
    363 		panic("Serious error booting RAID!!");
    364 
    365 	/* put together some datastructures like the CCD device does.. This
    366 	 * lets us lock the device and what-not when it gets opened. */
    367 
    368 	raid_softc = (struct raid_softc *)
    369 		malloc(num * sizeof(struct raid_softc),
    370 		       M_RAIDFRAME, M_NOWAIT);
    371 	if (raid_softc == NULL) {
    372 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    373 		return;
    374 	}
    375 
    376 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    377 
    378 	for (raidID = 0; raidID < num; raidID++) {
    379 		bufq_alloc(&raid_softc[raidID].buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    380 
    381 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    382 			  (RF_Raid_t *));
    383 		if (raidPtrs[raidID] == NULL) {
    384 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    385 			numraid = raidID;
    386 			return;
    387 		}
    388 	}
    389 
    390 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    391 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    392 	}
    393 
    394 	raidautoconfigdone = false;
    395 
    396 	/*
    397 	 * Register a finalizer which will be used to auto-config RAID
    398 	 * sets once all real hardware devices have been found.
    399 	 */
    400 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    401 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    402 }
    403 
    404 int
    405 rf_autoconfig(device_t self)
    406 {
    407 	RF_AutoConfig_t *ac_list;
    408 	RF_ConfigSet_t *config_sets;
    409 
    410 	if (!raidautoconfig || raidautoconfigdone == true)
    411 		return (0);
    412 
    413 	/* XXX This code can only be run once. */
    414 	raidautoconfigdone = true;
    415 
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set andconfigure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 void
    433 rf_buildroothack(RF_ConfigSet_t *config_sets)
    434 {
    435 	RF_ConfigSet_t *cset;
    436 	RF_ConfigSet_t *next_cset;
    437 	int retcode;
    438 	int raidID;
    439 	int rootID;
    440 	int col;
    441 	int num_root;
    442 	char *devname;
    443 
    444 	rootID = 0;
    445 	num_root = 0;
    446 	cset = config_sets;
    447 	while (cset != NULL) {
    448 		next_cset = cset->next;
    449 		if (rf_have_enough_components(cset) &&
    450 		    cset->ac->clabel->autoconfigure==1) {
    451 			retcode = rf_auto_config_set(cset,&raidID);
    452 			if (!retcode) {
    453 				aprint_debug("raid%d: configured ok\n", raidID);
    454 				if (cset->rootable) {
    455 					rootID = raidID;
    456 					num_root++;
    457 				}
    458 			} else {
    459 				/* The autoconfig didn't work :( */
    460 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    461 				rf_release_all_vps(cset);
    462 			}
    463 		} else {
    464 			/* we're not autoconfiguring this set...
    465 			   release the associated resources */
    466 			rf_release_all_vps(cset);
    467 		}
    468 		/* cleanup */
    469 		rf_cleanup_config_set(cset);
    470 		cset = next_cset;
    471 	}
    472 
    473 	/* if the user has specified what the root device should be
    474 	   then we don't touch booted_device or boothowto... */
    475 
    476 	if (rootspec != NULL)
    477 		return;
    478 
    479 	/* we found something bootable... */
    480 
    481 	if (num_root == 1) {
    482 		if (raid_softc[rootID].sc_dkdev.dk_nwedges != 0) {
    483 			/* XXX: How do we find the real root partition? */
    484 			char cname[sizeof(cset->ac->devname)];
    485 			snprintf(cname, sizeof(cname), "%s%c",
    486 			    device_xname(raid_softc[rootID].sc_dev), 'a');
    487 			booted_device = dkwedge_find_by_wname(cname);
    488 		} else
    489 			booted_device = raid_softc[rootID].sc_dev;
    490 	} else if (num_root > 1) {
    491 
    492 		/*
    493 		 * Maybe the MD code can help. If it cannot, then
    494 		 * setroot() will discover that we have no
    495 		 * booted_device and will ask the user if nothing was
    496 		 * hardwired in the kernel config file
    497 		 */
    498 
    499 		if (booted_device == NULL)
    500 			cpu_rootconf();
    501 		if (booted_device == NULL)
    502 			return;
    503 
    504 		num_root = 0;
    505 		for (raidID = 0; raidID < numraid; raidID++) {
    506 			if (raidPtrs[raidID]->valid == 0)
    507 				continue;
    508 
    509 			if (raidPtrs[raidID]->root_partition == 0)
    510 				continue;
    511 
    512 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    513 				devname = raidPtrs[raidID]->Disks[col].devname;
    514 				devname += sizeof("/dev/") - 1;
    515 				if (strncmp(devname, device_xname(booted_device),
    516 					    strlen(device_xname(booted_device))) != 0)
    517 					continue;
    518 				aprint_debug("raid%d includes boot device %s\n",
    519 				       raidID, devname);
    520 				num_root++;
    521 				rootID = raidID;
    522 			}
    523 		}
    524 
    525 		if (num_root == 1) {
    526 			booted_device = raid_softc[rootID].sc_dev;
    527 		} else {
    528 			/* we can't guess.. require the user to answer... */
    529 			boothowto |= RB_ASKNAME;
    530 		}
    531 	}
    532 }
    533 
    534 
    535 int
    536 raidsize(dev_t dev)
    537 {
    538 	struct raid_softc *rs;
    539 	struct disklabel *lp;
    540 	int     part, unit, omask, size;
    541 
    542 	unit = raidunit(dev);
    543 	if (unit >= numraid)
    544 		return (-1);
    545 	rs = &raid_softc[unit];
    546 
    547 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    548 		return (-1);
    549 
    550 	part = DISKPART(dev);
    551 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    552 	lp = rs->sc_dkdev.dk_label;
    553 
    554 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    555 		return (-1);
    556 
    557 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    558 		size = -1;
    559 	else
    560 		size = lp->d_partitions[part].p_size *
    561 		    (lp->d_secsize / DEV_BSIZE);
    562 
    563 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    564 		return (-1);
    565 
    566 	return (size);
    567 
    568 }
    569 
    570 int
    571 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    572 {
    573 	int     unit = raidunit(dev);
    574 	struct raid_softc *rs;
    575 	const struct bdevsw *bdev;
    576 	struct disklabel *lp;
    577 	RF_Raid_t *raidPtr;
    578 	daddr_t offset;
    579 	int     part, c, sparecol, j, scol, dumpto;
    580 	int     error = 0;
    581 
    582 	if (unit >= numraid)
    583 		return (ENXIO);
    584 
    585 	rs = &raid_softc[unit];
    586 	raidPtr = raidPtrs[unit];
    587 
    588 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    589 		return ENXIO;
    590 
    591 	/* we only support dumping to RAID 1 sets */
    592 	if (raidPtr->Layout.numDataCol != 1 ||
    593 	    raidPtr->Layout.numParityCol != 1)
    594 		return EINVAL;
    595 
    596 
    597 	if ((error = raidlock(rs)) != 0)
    598 		return error;
    599 
    600 	if (size % DEV_BSIZE != 0) {
    601 		error = EINVAL;
    602 		goto out;
    603 	}
    604 
    605 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    606 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    607 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    608 		    size / DEV_BSIZE, rs->sc_size);
    609 		error = EINVAL;
    610 		goto out;
    611 	}
    612 
    613 	part = DISKPART(dev);
    614 	lp = rs->sc_dkdev.dk_label;
    615 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    616 
    617 	/* figure out what device is alive.. */
    618 
    619 	/*
    620 	   Look for a component to dump to.  The preference for the
    621 	   component to dump to is as follows:
    622 	   1) the master
    623 	   2) a used_spare of the master
    624 	   3) the slave
    625 	   4) a used_spare of the slave
    626 	*/
    627 
    628 	dumpto = -1;
    629 	for (c = 0; c < raidPtr->numCol; c++) {
    630 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    631 			/* this might be the one */
    632 			dumpto = c;
    633 			break;
    634 		}
    635 	}
    636 
    637 	/*
    638 	   At this point we have possibly selected a live master or a
    639 	   live slave.  We now check to see if there is a spared
    640 	   master (or a spared slave), if we didn't find a live master
    641 	   or a live slave.
    642 	*/
    643 
    644 	for (c = 0; c < raidPtr->numSpare; c++) {
    645 		sparecol = raidPtr->numCol + c;
    646 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    647 			/* How about this one? */
    648 			scol = -1;
    649 			for(j=0;j<raidPtr->numCol;j++) {
    650 				if (raidPtr->Disks[j].spareCol == sparecol) {
    651 					scol = j;
    652 					break;
    653 				}
    654 			}
    655 			if (scol == 0) {
    656 				/*
    657 				   We must have found a spared master!
    658 				   We'll take that over anything else
    659 				   found so far.  (We couldn't have
    660 				   found a real master before, since
    661 				   this is a used spare, and it's
    662 				   saying that it's replacing the
    663 				   master.)  On reboot (with
    664 				   autoconfiguration turned on)
    665 				   sparecol will become the 1st
    666 				   component (component0) of this set.
    667 				*/
    668 				dumpto = sparecol;
    669 				break;
    670 			} else if (scol != -1) {
    671 				/*
    672 				   Must be a spared slave.  We'll dump
    673 				   to that if we havn't found anything
    674 				   else so far.
    675 				*/
    676 				if (dumpto == -1)
    677 					dumpto = sparecol;
    678 			}
    679 		}
    680 	}
    681 
    682 	if (dumpto == -1) {
    683 		/* we couldn't find any live components to dump to!?!?
    684 		 */
    685 		error = EINVAL;
    686 		goto out;
    687 	}
    688 
    689 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    690 
    691 	/*
    692 	   Note that blkno is relative to this particular partition.
    693 	   By adding the offset of this partition in the RAID
    694 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    695 	   value that is relative to the partition used for the
    696 	   underlying component.
    697 	*/
    698 
    699 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    700 				blkno + offset, va, size);
    701 
    702 out:
    703 	raidunlock(rs);
    704 
    705 	return error;
    706 }
    707 /* ARGSUSED */
    708 int
    709 raidopen(dev_t dev, int flags, int fmt,
    710     struct lwp *l)
    711 {
    712 	int     unit = raidunit(dev);
    713 	struct raid_softc *rs;
    714 	struct disklabel *lp;
    715 	int     part, pmask;
    716 	int     error = 0;
    717 
    718 	if (unit >= numraid)
    719 		return (ENXIO);
    720 	rs = &raid_softc[unit];
    721 
    722 	if ((error = raidlock(rs)) != 0)
    723 		return (error);
    724 
    725 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    726 		error = EBUSY;
    727 		goto bad;
    728 	}
    729 
    730 	lp = rs->sc_dkdev.dk_label;
    731 
    732 	part = DISKPART(dev);
    733 
    734 	/*
    735 	 * If there are wedges, and this is not RAW_PART, then we
    736 	 * need to fail.
    737 	 */
    738 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    739 		error = EBUSY;
    740 		goto bad;
    741 	}
    742 	pmask = (1 << part);
    743 
    744 	if ((rs->sc_flags & RAIDF_INITED) &&
    745 	    (rs->sc_dkdev.dk_openmask == 0))
    746 		raidgetdisklabel(dev);
    747 
    748 	/* make sure that this partition exists */
    749 
    750 	if (part != RAW_PART) {
    751 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    752 		    ((part >= lp->d_npartitions) ||
    753 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    754 			error = ENXIO;
    755 			goto bad;
    756 		}
    757 	}
    758 	/* Prevent this unit from being unconfigured while open. */
    759 	switch (fmt) {
    760 	case S_IFCHR:
    761 		rs->sc_dkdev.dk_copenmask |= pmask;
    762 		break;
    763 
    764 	case S_IFBLK:
    765 		rs->sc_dkdev.dk_bopenmask |= pmask;
    766 		break;
    767 	}
    768 
    769 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    770 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    771 		/* First one... mark things as dirty... Note that we *MUST*
    772 		 have done a configure before this.  I DO NOT WANT TO BE
    773 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    774 		 THAT THEY BELONG TOGETHER!!!!! */
    775 		/* XXX should check to see if we're only open for reading
    776 		   here... If so, we needn't do this, but then need some
    777 		   other way of keeping track of what's happened.. */
    778 
    779 		rf_markalldirty(raidPtrs[unit]);
    780 	}
    781 
    782 
    783 	rs->sc_dkdev.dk_openmask =
    784 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    785 
    786 bad:
    787 	raidunlock(rs);
    788 
    789 	return (error);
    790 
    791 
    792 }
    793 /* ARGSUSED */
    794 int
    795 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    796 {
    797 	int     unit = raidunit(dev);
    798 	struct raid_softc *rs;
    799 	int     error = 0;
    800 	int     part;
    801 
    802 	if (unit >= numraid)
    803 		return (ENXIO);
    804 	rs = &raid_softc[unit];
    805 
    806 	if ((error = raidlock(rs)) != 0)
    807 		return (error);
    808 
    809 	part = DISKPART(dev);
    810 
    811 	/* ...that much closer to allowing unconfiguration... */
    812 	switch (fmt) {
    813 	case S_IFCHR:
    814 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    815 		break;
    816 
    817 	case S_IFBLK:
    818 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    819 		break;
    820 	}
    821 	rs->sc_dkdev.dk_openmask =
    822 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    823 
    824 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    825 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    826 		/* Last one... device is not unconfigured yet.
    827 		   Device shutdown has taken care of setting the
    828 		   clean bits if RAIDF_INITED is not set
    829 		   mark things as clean... */
    830 
    831 		rf_update_component_labels(raidPtrs[unit],
    832 						 RF_FINAL_COMPONENT_UPDATE);
    833 
    834 		/* If the kernel is shutting down, it will detach
    835 		 * this RAID set soon enough.
    836 		 */
    837 	}
    838 
    839 	raidunlock(rs);
    840 	return (0);
    841 
    842 }
    843 
    844 void
    845 raidstrategy(struct buf *bp)
    846 {
    847 	unsigned int raidID = raidunit(bp->b_dev);
    848 	RF_Raid_t *raidPtr;
    849 	struct raid_softc *rs = &raid_softc[raidID];
    850 	int     wlabel;
    851 
    852 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
    853 		bp->b_error = ENXIO;
    854 		goto done;
    855 	}
    856 	if (raidID >= numraid || !raidPtrs[raidID]) {
    857 		bp->b_error = ENODEV;
    858 		goto done;
    859 	}
    860 	raidPtr = raidPtrs[raidID];
    861 	if (!raidPtr->valid) {
    862 		bp->b_error = ENODEV;
    863 		goto done;
    864 	}
    865 	if (bp->b_bcount == 0) {
    866 		db1_printf(("b_bcount is zero..\n"));
    867 		goto done;
    868 	}
    869 
    870 	/*
    871 	 * Do bounds checking and adjust transfer.  If there's an
    872 	 * error, the bounds check will flag that for us.
    873 	 */
    874 
    875 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    876 	if (DISKPART(bp->b_dev) == RAW_PART) {
    877 		uint64_t size; /* device size in DEV_BSIZE unit */
    878 
    879 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    880 			size = raidPtr->totalSectors <<
    881 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    882 		} else {
    883 			size = raidPtr->totalSectors >>
    884 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    885 		}
    886 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    887 			goto done;
    888 		}
    889 	} else {
    890 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    891 			db1_printf(("Bounds check failed!!:%d %d\n",
    892 				(int) bp->b_blkno, (int) wlabel));
    893 			goto done;
    894 		}
    895 	}
    896 
    897 	rf_lock_mutex2(raidPtr->iodone_lock);
    898 
    899 	bp->b_resid = 0;
    900 
    901 	/* stuff it onto our queue */
    902 	bufq_put(rs->buf_queue, bp);
    903 
    904 	/* scheduled the IO to happen at the next convenient time */
    905 	rf_signal_cond2(raidPtr->iodone_cv);
    906 	rf_unlock_mutex2(raidPtr->iodone_lock);
    907 
    908 	return;
    909 
    910 done:
    911 	bp->b_resid = bp->b_bcount;
    912 	biodone(bp);
    913 }
    914 /* ARGSUSED */
    915 int
    916 raidread(dev_t dev, struct uio *uio, int flags)
    917 {
    918 	int     unit = raidunit(dev);
    919 	struct raid_softc *rs;
    920 
    921 	if (unit >= numraid)
    922 		return (ENXIO);
    923 	rs = &raid_softc[unit];
    924 
    925 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    926 		return (ENXIO);
    927 
    928 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    929 
    930 }
    931 /* ARGSUSED */
    932 int
    933 raidwrite(dev_t dev, struct uio *uio, int flags)
    934 {
    935 	int     unit = raidunit(dev);
    936 	struct raid_softc *rs;
    937 
    938 	if (unit >= numraid)
    939 		return (ENXIO);
    940 	rs = &raid_softc[unit];
    941 
    942 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    943 		return (ENXIO);
    944 
    945 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    946 
    947 }
    948 
    949 static int
    950 raid_detach_unlocked(struct raid_softc *rs)
    951 {
    952 	int error;
    953 	RF_Raid_t *raidPtr;
    954 
    955 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
    956 
    957 	/*
    958 	 * If somebody has a partition mounted, we shouldn't
    959 	 * shutdown.
    960 	 */
    961 	if (rs->sc_dkdev.dk_openmask != 0)
    962 		return EBUSY;
    963 
    964 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    965 		;	/* not initialized: nothing to do */
    966 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    967 		return error;
    968 	else
    969 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    970 
    971 	/* Detach the disk. */
    972 	dkwedge_delall(&rs->sc_dkdev);
    973 	disk_detach(&rs->sc_dkdev);
    974 	disk_destroy(&rs->sc_dkdev);
    975 
    976 	aprint_normal_dev(rs->sc_dev, "detached\n");
    977 
    978 	return 0;
    979 }
    980 
    981 int
    982 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    983 {
    984 	int     unit = raidunit(dev);
    985 	int     error = 0;
    986 	int     part, pmask, s;
    987 	cfdata_t cf;
    988 	struct raid_softc *rs;
    989 	RF_Config_t *k_cfg, *u_cfg;
    990 	RF_Raid_t *raidPtr;
    991 	RF_RaidDisk_t *diskPtr;
    992 	RF_AccTotals_t *totals;
    993 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    994 	u_char *specific_buf;
    995 	int retcode = 0;
    996 	int column;
    997 /*	int raidid; */
    998 	struct rf_recon_req *rrcopy, *rr;
    999 	RF_ComponentLabel_t *clabel;
   1000 	RF_ComponentLabel_t *ci_label;
   1001 	RF_ComponentLabel_t **clabel_ptr;
   1002 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1003 	RF_SingleComponent_t component;
   1004 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1005 	int i, j, d;
   1006 #ifdef __HAVE_OLD_DISKLABEL
   1007 	struct disklabel newlabel;
   1008 #endif
   1009 	struct dkwedge_info *dkw;
   1010 
   1011 	if (unit >= numraid)
   1012 		return (ENXIO);
   1013 	rs = &raid_softc[unit];
   1014 	raidPtr = raidPtrs[unit];
   1015 
   1016 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1017 		(int) DISKPART(dev), (int) unit, cmd));
   1018 
   1019 	/* Must be open for writes for these commands... */
   1020 	switch (cmd) {
   1021 #ifdef DIOCGSECTORSIZE
   1022 	case DIOCGSECTORSIZE:
   1023 		*(u_int *)data = raidPtr->bytesPerSector;
   1024 		return 0;
   1025 	case DIOCGMEDIASIZE:
   1026 		*(off_t *)data =
   1027 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1028 		return 0;
   1029 #endif
   1030 	case DIOCSDINFO:
   1031 	case DIOCWDINFO:
   1032 #ifdef __HAVE_OLD_DISKLABEL
   1033 	case ODIOCWDINFO:
   1034 	case ODIOCSDINFO:
   1035 #endif
   1036 	case DIOCWLABEL:
   1037 	case DIOCAWEDGE:
   1038 	case DIOCDWEDGE:
   1039 	case DIOCSSTRATEGY:
   1040 		if ((flag & FWRITE) == 0)
   1041 			return (EBADF);
   1042 	}
   1043 
   1044 	/* Must be initialized for these... */
   1045 	switch (cmd) {
   1046 	case DIOCGDINFO:
   1047 	case DIOCSDINFO:
   1048 	case DIOCWDINFO:
   1049 #ifdef __HAVE_OLD_DISKLABEL
   1050 	case ODIOCGDINFO:
   1051 	case ODIOCWDINFO:
   1052 	case ODIOCSDINFO:
   1053 	case ODIOCGDEFLABEL:
   1054 #endif
   1055 	case DIOCGPART:
   1056 	case DIOCWLABEL:
   1057 	case DIOCGDEFLABEL:
   1058 	case DIOCAWEDGE:
   1059 	case DIOCDWEDGE:
   1060 	case DIOCLWEDGES:
   1061 	case DIOCCACHESYNC:
   1062 	case RAIDFRAME_SHUTDOWN:
   1063 	case RAIDFRAME_REWRITEPARITY:
   1064 	case RAIDFRAME_GET_INFO:
   1065 	case RAIDFRAME_RESET_ACCTOTALS:
   1066 	case RAIDFRAME_GET_ACCTOTALS:
   1067 	case RAIDFRAME_KEEP_ACCTOTALS:
   1068 	case RAIDFRAME_GET_SIZE:
   1069 	case RAIDFRAME_FAIL_DISK:
   1070 	case RAIDFRAME_COPYBACK:
   1071 	case RAIDFRAME_CHECK_RECON_STATUS:
   1072 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1073 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1074 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1075 	case RAIDFRAME_ADD_HOT_SPARE:
   1076 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1077 	case RAIDFRAME_INIT_LABELS:
   1078 	case RAIDFRAME_REBUILD_IN_PLACE:
   1079 	case RAIDFRAME_CHECK_PARITY:
   1080 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1081 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1082 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1083 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1084 	case RAIDFRAME_SET_AUTOCONFIG:
   1085 	case RAIDFRAME_SET_ROOT:
   1086 	case RAIDFRAME_DELETE_COMPONENT:
   1087 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1088 	case RAIDFRAME_PARITYMAP_STATUS:
   1089 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1090 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1091 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1092 	case DIOCGSTRATEGY:
   1093 	case DIOCSSTRATEGY:
   1094 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1095 			return (ENXIO);
   1096 	}
   1097 
   1098 	switch (cmd) {
   1099 #ifdef COMPAT_50
   1100 	case RAIDFRAME_GET_INFO50:
   1101 		return rf_get_info50(raidPtr, data);
   1102 
   1103 	case RAIDFRAME_CONFIGURE50:
   1104 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1105 			return retcode;
   1106 		goto config;
   1107 #endif
   1108 		/* configure the system */
   1109 	case RAIDFRAME_CONFIGURE:
   1110 
   1111 		if (raidPtr->valid) {
   1112 			/* There is a valid RAID set running on this unit! */
   1113 			printf("raid%d: Device already configured!\n",unit);
   1114 			return(EINVAL);
   1115 		}
   1116 
   1117 		/* copy-in the configuration information */
   1118 		/* data points to a pointer to the configuration structure */
   1119 
   1120 		u_cfg = *((RF_Config_t **) data);
   1121 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1122 		if (k_cfg == NULL) {
   1123 			return (ENOMEM);
   1124 		}
   1125 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1126 		if (retcode) {
   1127 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1128 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1129 				retcode));
   1130 			return (retcode);
   1131 		}
   1132 		goto config;
   1133 	config:
   1134 		/* allocate a buffer for the layout-specific data, and copy it
   1135 		 * in */
   1136 		if (k_cfg->layoutSpecificSize) {
   1137 			if (k_cfg->layoutSpecificSize > 10000) {
   1138 				/* sanity check */
   1139 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1140 				return (EINVAL);
   1141 			}
   1142 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1143 			    (u_char *));
   1144 			if (specific_buf == NULL) {
   1145 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1146 				return (ENOMEM);
   1147 			}
   1148 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1149 			    k_cfg->layoutSpecificSize);
   1150 			if (retcode) {
   1151 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1152 				RF_Free(specific_buf,
   1153 					k_cfg->layoutSpecificSize);
   1154 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1155 					retcode));
   1156 				return (retcode);
   1157 			}
   1158 		} else
   1159 			specific_buf = NULL;
   1160 		k_cfg->layoutSpecific = specific_buf;
   1161 
   1162 		/* should do some kind of sanity check on the configuration.
   1163 		 * Store the sum of all the bytes in the last byte? */
   1164 
   1165 		/* configure the system */
   1166 
   1167 		/*
   1168 		 * Clear the entire RAID descriptor, just to make sure
   1169 		 *  there is no stale data left in the case of a
   1170 		 *  reconfiguration
   1171 		 */
   1172 		memset(raidPtr, 0, sizeof(*raidPtr));
   1173 		raidPtr->raidid = unit;
   1174 
   1175 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1176 
   1177 		if (retcode == 0) {
   1178 
   1179 			/* allow this many simultaneous IO's to
   1180 			   this RAID device */
   1181 			raidPtr->openings = RAIDOUTSTANDING;
   1182 
   1183 			raidinit(raidPtr);
   1184 			rf_markalldirty(raidPtr);
   1185 		}
   1186 		/* free the buffers.  No return code here. */
   1187 		if (k_cfg->layoutSpecificSize) {
   1188 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1189 		}
   1190 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1191 
   1192 		return (retcode);
   1193 
   1194 		/* shutdown the system */
   1195 	case RAIDFRAME_SHUTDOWN:
   1196 
   1197 		part = DISKPART(dev);
   1198 		pmask = (1 << part);
   1199 
   1200 		if ((error = raidlock(rs)) != 0)
   1201 			return (error);
   1202 
   1203 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1204 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1205 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1206 			retcode = EBUSY;
   1207 		else {
   1208 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1209 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1210 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1211 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1212 			retcode = 0;
   1213 		}
   1214 
   1215 		raidunlock(rs);
   1216 
   1217 		if (retcode != 0)
   1218 			return retcode;
   1219 
   1220 		/* free the pseudo device attach bits */
   1221 
   1222 		cf = device_cfdata(rs->sc_dev);
   1223 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1224 			free(cf, M_RAIDFRAME);
   1225 
   1226 		return (retcode);
   1227 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1228 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1229 		/* need to read the component label for the disk indicated
   1230 		   by row,column in clabel */
   1231 
   1232 		/*
   1233 		 * Perhaps there should be an option to skip the in-core
   1234 		 * copy and hit the disk, as with disklabel(8).
   1235 		 */
   1236 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1237 
   1238 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1239 
   1240 		if (retcode) {
   1241 			RF_Free(clabel, sizeof(*clabel));
   1242 			return retcode;
   1243 		}
   1244 
   1245 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1246 
   1247 		column = clabel->column;
   1248 
   1249 		if ((column < 0) || (column >= raidPtr->numCol +
   1250 		    raidPtr->numSpare)) {
   1251 			RF_Free(clabel, sizeof(*clabel));
   1252 			return EINVAL;
   1253 		}
   1254 
   1255 		RF_Free(clabel, sizeof(*clabel));
   1256 
   1257 		clabel = raidget_component_label(raidPtr, column);
   1258 
   1259 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1260 
   1261 #if 0
   1262 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1263 		clabel = (RF_ComponentLabel_t *) data;
   1264 
   1265 		/* XXX check the label for valid stuff... */
   1266 		/* Note that some things *should not* get modified --
   1267 		   the user should be re-initing the labels instead of
   1268 		   trying to patch things.
   1269 		   */
   1270 
   1271 		raidid = raidPtr->raidid;
   1272 #ifdef DEBUG
   1273 		printf("raid%d: Got component label:\n", raidid);
   1274 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1275 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1276 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1277 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1278 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1279 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1280 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1281 #endif
   1282 		clabel->row = 0;
   1283 		column = clabel->column;
   1284 
   1285 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1286 			return(EINVAL);
   1287 		}
   1288 
   1289 		/* XXX this isn't allowed to do anything for now :-) */
   1290 
   1291 		/* XXX and before it is, we need to fill in the rest
   1292 		   of the fields!?!?!?! */
   1293 		memcpy(raidget_component_label(raidPtr, column),
   1294 		    clabel, sizeof(*clabel));
   1295 		raidflush_component_label(raidPtr, column);
   1296 		return (0);
   1297 #endif
   1298 
   1299 	case RAIDFRAME_INIT_LABELS:
   1300 		clabel = (RF_ComponentLabel_t *) data;
   1301 		/*
   1302 		   we only want the serial number from
   1303 		   the above.  We get all the rest of the information
   1304 		   from the config that was used to create this RAID
   1305 		   set.
   1306 		   */
   1307 
   1308 		raidPtr->serial_number = clabel->serial_number;
   1309 
   1310 		for(column=0;column<raidPtr->numCol;column++) {
   1311 			diskPtr = &raidPtr->Disks[column];
   1312 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1313 				ci_label = raidget_component_label(raidPtr,
   1314 				    column);
   1315 				/* Zeroing this is important. */
   1316 				memset(ci_label, 0, sizeof(*ci_label));
   1317 				raid_init_component_label(raidPtr, ci_label);
   1318 				ci_label->serial_number =
   1319 				    raidPtr->serial_number;
   1320 				ci_label->row = 0; /* we dont' pretend to support more */
   1321 				rf_component_label_set_partitionsize(ci_label,
   1322 				    diskPtr->partitionSize);
   1323 				ci_label->column = column;
   1324 				raidflush_component_label(raidPtr, column);
   1325 			}
   1326 			/* XXXjld what about the spares? */
   1327 		}
   1328 
   1329 		return (retcode);
   1330 	case RAIDFRAME_SET_AUTOCONFIG:
   1331 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1332 		printf("raid%d: New autoconfig value is: %d\n",
   1333 		       raidPtr->raidid, d);
   1334 		*(int *) data = d;
   1335 		return (retcode);
   1336 
   1337 	case RAIDFRAME_SET_ROOT:
   1338 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1339 		printf("raid%d: New rootpartition value is: %d\n",
   1340 		       raidPtr->raidid, d);
   1341 		*(int *) data = d;
   1342 		return (retcode);
   1343 
   1344 		/* initialize all parity */
   1345 	case RAIDFRAME_REWRITEPARITY:
   1346 
   1347 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1348 			/* Parity for RAID 0 is trivially correct */
   1349 			raidPtr->parity_good = RF_RAID_CLEAN;
   1350 			return(0);
   1351 		}
   1352 
   1353 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1354 			/* Re-write is already in progress! */
   1355 			return(EINVAL);
   1356 		}
   1357 
   1358 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1359 					   rf_RewriteParityThread,
   1360 					   raidPtr,"raid_parity");
   1361 		return (retcode);
   1362 
   1363 
   1364 	case RAIDFRAME_ADD_HOT_SPARE:
   1365 		sparePtr = (RF_SingleComponent_t *) data;
   1366 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1367 		retcode = rf_add_hot_spare(raidPtr, &component);
   1368 		return(retcode);
   1369 
   1370 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1371 		return(retcode);
   1372 
   1373 	case RAIDFRAME_DELETE_COMPONENT:
   1374 		componentPtr = (RF_SingleComponent_t *)data;
   1375 		memcpy( &component, componentPtr,
   1376 			sizeof(RF_SingleComponent_t));
   1377 		retcode = rf_delete_component(raidPtr, &component);
   1378 		return(retcode);
   1379 
   1380 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1381 		componentPtr = (RF_SingleComponent_t *)data;
   1382 		memcpy( &component, componentPtr,
   1383 			sizeof(RF_SingleComponent_t));
   1384 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1385 		return(retcode);
   1386 
   1387 	case RAIDFRAME_REBUILD_IN_PLACE:
   1388 
   1389 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1390 			/* Can't do this on a RAID 0!! */
   1391 			return(EINVAL);
   1392 		}
   1393 
   1394 		if (raidPtr->recon_in_progress == 1) {
   1395 			/* a reconstruct is already in progress! */
   1396 			return(EINVAL);
   1397 		}
   1398 
   1399 		componentPtr = (RF_SingleComponent_t *) data;
   1400 		memcpy( &component, componentPtr,
   1401 			sizeof(RF_SingleComponent_t));
   1402 		component.row = 0; /* we don't support any more */
   1403 		column = component.column;
   1404 
   1405 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1406 			return(EINVAL);
   1407 		}
   1408 
   1409 		rf_lock_mutex2(raidPtr->mutex);
   1410 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1411 		    (raidPtr->numFailures > 0)) {
   1412 			/* XXX 0 above shouldn't be constant!!! */
   1413 			/* some component other than this has failed.
   1414 			   Let's not make things worse than they already
   1415 			   are... */
   1416 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1417 			       raidPtr->raidid);
   1418 			printf("raid%d:     Col: %d   Too many failures.\n",
   1419 			       raidPtr->raidid, column);
   1420 			rf_unlock_mutex2(raidPtr->mutex);
   1421 			return (EINVAL);
   1422 		}
   1423 		if (raidPtr->Disks[column].status ==
   1424 		    rf_ds_reconstructing) {
   1425 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1426 			       raidPtr->raidid);
   1427 			printf("raid%d:    Col: %d   Reconstruction already occuring!\n", raidPtr->raidid, column);
   1428 
   1429 			rf_unlock_mutex2(raidPtr->mutex);
   1430 			return (EINVAL);
   1431 		}
   1432 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1433 			rf_unlock_mutex2(raidPtr->mutex);
   1434 			return (EINVAL);
   1435 		}
   1436 		rf_unlock_mutex2(raidPtr->mutex);
   1437 
   1438 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1439 		if (rrcopy == NULL)
   1440 			return(ENOMEM);
   1441 
   1442 		rrcopy->raidPtr = (void *) raidPtr;
   1443 		rrcopy->col = column;
   1444 
   1445 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1446 					   rf_ReconstructInPlaceThread,
   1447 					   rrcopy,"raid_reconip");
   1448 		return(retcode);
   1449 
   1450 	case RAIDFRAME_GET_INFO:
   1451 		if (!raidPtr->valid)
   1452 			return (ENODEV);
   1453 		ucfgp = (RF_DeviceConfig_t **) data;
   1454 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1455 			  (RF_DeviceConfig_t *));
   1456 		if (d_cfg == NULL)
   1457 			return (ENOMEM);
   1458 		d_cfg->rows = 1; /* there is only 1 row now */
   1459 		d_cfg->cols = raidPtr->numCol;
   1460 		d_cfg->ndevs = raidPtr->numCol;
   1461 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1462 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1463 			return (ENOMEM);
   1464 		}
   1465 		d_cfg->nspares = raidPtr->numSpare;
   1466 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1467 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1468 			return (ENOMEM);
   1469 		}
   1470 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1471 		d = 0;
   1472 		for (j = 0; j < d_cfg->cols; j++) {
   1473 			d_cfg->devs[d] = raidPtr->Disks[j];
   1474 			d++;
   1475 		}
   1476 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1477 			d_cfg->spares[i] = raidPtr->Disks[j];
   1478 		}
   1479 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1480 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1481 
   1482 		return (retcode);
   1483 
   1484 	case RAIDFRAME_CHECK_PARITY:
   1485 		*(int *) data = raidPtr->parity_good;
   1486 		return (0);
   1487 
   1488 	case RAIDFRAME_PARITYMAP_STATUS:
   1489 		if (rf_paritymap_ineligible(raidPtr))
   1490 			return EINVAL;
   1491 		rf_paritymap_status(raidPtr->parity_map,
   1492 		    (struct rf_pmstat *)data);
   1493 		return 0;
   1494 
   1495 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1496 		if (rf_paritymap_ineligible(raidPtr))
   1497 			return EINVAL;
   1498 		if (raidPtr->parity_map == NULL)
   1499 			return ENOENT; /* ??? */
   1500 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1501 			(struct rf_pmparams *)data, 1))
   1502 			return EINVAL;
   1503 		return 0;
   1504 
   1505 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1506 		if (rf_paritymap_ineligible(raidPtr))
   1507 			return EINVAL;
   1508 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1509 		return 0;
   1510 
   1511 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1512 		if (rf_paritymap_ineligible(raidPtr))
   1513 			return EINVAL;
   1514 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1515 		/* XXX should errors be passed up? */
   1516 		return 0;
   1517 
   1518 	case RAIDFRAME_RESET_ACCTOTALS:
   1519 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1520 		return (0);
   1521 
   1522 	case RAIDFRAME_GET_ACCTOTALS:
   1523 		totals = (RF_AccTotals_t *) data;
   1524 		*totals = raidPtr->acc_totals;
   1525 		return (0);
   1526 
   1527 	case RAIDFRAME_KEEP_ACCTOTALS:
   1528 		raidPtr->keep_acc_totals = *(int *)data;
   1529 		return (0);
   1530 
   1531 	case RAIDFRAME_GET_SIZE:
   1532 		*(int *) data = raidPtr->totalSectors;
   1533 		return (0);
   1534 
   1535 		/* fail a disk & optionally start reconstruction */
   1536 	case RAIDFRAME_FAIL_DISK:
   1537 
   1538 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1539 			/* Can't do this on a RAID 0!! */
   1540 			return(EINVAL);
   1541 		}
   1542 
   1543 		rr = (struct rf_recon_req *) data;
   1544 		rr->row = 0;
   1545 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1546 			return (EINVAL);
   1547 
   1548 
   1549 		rf_lock_mutex2(raidPtr->mutex);
   1550 		if (raidPtr->status == rf_rs_reconstructing) {
   1551 			/* you can't fail a disk while we're reconstructing! */
   1552 			/* XXX wrong for RAID6 */
   1553 			rf_unlock_mutex2(raidPtr->mutex);
   1554 			return (EINVAL);
   1555 		}
   1556 		if ((raidPtr->Disks[rr->col].status ==
   1557 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1558 			/* some other component has failed.  Let's not make
   1559 			   things worse. XXX wrong for RAID6 */
   1560 			rf_unlock_mutex2(raidPtr->mutex);
   1561 			return (EINVAL);
   1562 		}
   1563 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1564 			/* Can't fail a spared disk! */
   1565 			rf_unlock_mutex2(raidPtr->mutex);
   1566 			return (EINVAL);
   1567 		}
   1568 		rf_unlock_mutex2(raidPtr->mutex);
   1569 
   1570 		/* make a copy of the recon request so that we don't rely on
   1571 		 * the user's buffer */
   1572 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1573 		if (rrcopy == NULL)
   1574 			return(ENOMEM);
   1575 		memcpy(rrcopy, rr, sizeof(*rr));
   1576 		rrcopy->raidPtr = (void *) raidPtr;
   1577 
   1578 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1579 					   rf_ReconThread,
   1580 					   rrcopy,"raid_recon");
   1581 		return (0);
   1582 
   1583 		/* invoke a copyback operation after recon on whatever disk
   1584 		 * needs it, if any */
   1585 	case RAIDFRAME_COPYBACK:
   1586 
   1587 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1588 			/* This makes no sense on a RAID 0!! */
   1589 			return(EINVAL);
   1590 		}
   1591 
   1592 		if (raidPtr->copyback_in_progress == 1) {
   1593 			/* Copyback is already in progress! */
   1594 			return(EINVAL);
   1595 		}
   1596 
   1597 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1598 					   rf_CopybackThread,
   1599 					   raidPtr,"raid_copyback");
   1600 		return (retcode);
   1601 
   1602 		/* return the percentage completion of reconstruction */
   1603 	case RAIDFRAME_CHECK_RECON_STATUS:
   1604 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1605 			/* This makes no sense on a RAID 0, so tell the
   1606 			   user it's done. */
   1607 			*(int *) data = 100;
   1608 			return(0);
   1609 		}
   1610 		if (raidPtr->status != rf_rs_reconstructing)
   1611 			*(int *) data = 100;
   1612 		else {
   1613 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1614 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1615 			} else {
   1616 				*(int *) data = 0;
   1617 			}
   1618 		}
   1619 		return (0);
   1620 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1621 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1622 		if (raidPtr->status != rf_rs_reconstructing) {
   1623 			progressInfo.remaining = 0;
   1624 			progressInfo.completed = 100;
   1625 			progressInfo.total = 100;
   1626 		} else {
   1627 			progressInfo.total =
   1628 				raidPtr->reconControl->numRUsTotal;
   1629 			progressInfo.completed =
   1630 				raidPtr->reconControl->numRUsComplete;
   1631 			progressInfo.remaining = progressInfo.total -
   1632 				progressInfo.completed;
   1633 		}
   1634 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1635 				  sizeof(RF_ProgressInfo_t));
   1636 		return (retcode);
   1637 
   1638 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1639 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1640 			/* This makes no sense on a RAID 0, so tell the
   1641 			   user it's done. */
   1642 			*(int *) data = 100;
   1643 			return(0);
   1644 		}
   1645 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1646 			*(int *) data = 100 *
   1647 				raidPtr->parity_rewrite_stripes_done /
   1648 				raidPtr->Layout.numStripe;
   1649 		} else {
   1650 			*(int *) data = 100;
   1651 		}
   1652 		return (0);
   1653 
   1654 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1655 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1656 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1657 			progressInfo.total = raidPtr->Layout.numStripe;
   1658 			progressInfo.completed =
   1659 				raidPtr->parity_rewrite_stripes_done;
   1660 			progressInfo.remaining = progressInfo.total -
   1661 				progressInfo.completed;
   1662 		} else {
   1663 			progressInfo.remaining = 0;
   1664 			progressInfo.completed = 100;
   1665 			progressInfo.total = 100;
   1666 		}
   1667 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1668 				  sizeof(RF_ProgressInfo_t));
   1669 		return (retcode);
   1670 
   1671 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1672 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1673 			/* This makes no sense on a RAID 0 */
   1674 			*(int *) data = 100;
   1675 			return(0);
   1676 		}
   1677 		if (raidPtr->copyback_in_progress == 1) {
   1678 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1679 				raidPtr->Layout.numStripe;
   1680 		} else {
   1681 			*(int *) data = 100;
   1682 		}
   1683 		return (0);
   1684 
   1685 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1686 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1687 		if (raidPtr->copyback_in_progress == 1) {
   1688 			progressInfo.total = raidPtr->Layout.numStripe;
   1689 			progressInfo.completed =
   1690 				raidPtr->copyback_stripes_done;
   1691 			progressInfo.remaining = progressInfo.total -
   1692 				progressInfo.completed;
   1693 		} else {
   1694 			progressInfo.remaining = 0;
   1695 			progressInfo.completed = 100;
   1696 			progressInfo.total = 100;
   1697 		}
   1698 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1699 				  sizeof(RF_ProgressInfo_t));
   1700 		return (retcode);
   1701 
   1702 		/* the sparetable daemon calls this to wait for the kernel to
   1703 		 * need a spare table. this ioctl does not return until a
   1704 		 * spare table is needed. XXX -- calling mpsleep here in the
   1705 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1706 		 * -- I should either compute the spare table in the kernel,
   1707 		 * or have a different -- XXX XXX -- interface (a different
   1708 		 * character device) for delivering the table     -- XXX */
   1709 #if 0
   1710 	case RAIDFRAME_SPARET_WAIT:
   1711 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1712 		while (!rf_sparet_wait_queue)
   1713 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1714 		waitreq = rf_sparet_wait_queue;
   1715 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1716 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1717 
   1718 		/* structure assignment */
   1719 		*((RF_SparetWait_t *) data) = *waitreq;
   1720 
   1721 		RF_Free(waitreq, sizeof(*waitreq));
   1722 		return (0);
   1723 
   1724 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1725 		 * code in it that will cause the dameon to exit */
   1726 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1727 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1728 		waitreq->fcol = -1;
   1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1730 		waitreq->next = rf_sparet_wait_queue;
   1731 		rf_sparet_wait_queue = waitreq;
   1732 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1733 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1734 		return (0);
   1735 
   1736 		/* used by the spare table daemon to deliver a spare table
   1737 		 * into the kernel */
   1738 	case RAIDFRAME_SEND_SPARET:
   1739 
   1740 		/* install the spare table */
   1741 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1742 
   1743 		/* respond to the requestor.  the return status of the spare
   1744 		 * table installation is passed in the "fcol" field */
   1745 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1746 		waitreq->fcol = retcode;
   1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1748 		waitreq->next = rf_sparet_resp_queue;
   1749 		rf_sparet_resp_queue = waitreq;
   1750 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1752 
   1753 		return (retcode);
   1754 #endif
   1755 
   1756 	default:
   1757 		break; /* fall through to the os-specific code below */
   1758 
   1759 	}
   1760 
   1761 	if (!raidPtr->valid)
   1762 		return (EINVAL);
   1763 
   1764 	/*
   1765 	 * Add support for "regular" device ioctls here.
   1766 	 */
   1767 
   1768 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1769 	if (error != EPASSTHROUGH)
   1770 		return (error);
   1771 
   1772 	switch (cmd) {
   1773 	case DIOCGDINFO:
   1774 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1775 		break;
   1776 #ifdef __HAVE_OLD_DISKLABEL
   1777 	case ODIOCGDINFO:
   1778 		newlabel = *(rs->sc_dkdev.dk_label);
   1779 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1780 			return ENOTTY;
   1781 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1782 		break;
   1783 #endif
   1784 
   1785 	case DIOCGPART:
   1786 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1787 		((struct partinfo *) data)->part =
   1788 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1789 		break;
   1790 
   1791 	case DIOCWDINFO:
   1792 	case DIOCSDINFO:
   1793 #ifdef __HAVE_OLD_DISKLABEL
   1794 	case ODIOCWDINFO:
   1795 	case ODIOCSDINFO:
   1796 #endif
   1797 	{
   1798 		struct disklabel *lp;
   1799 #ifdef __HAVE_OLD_DISKLABEL
   1800 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1801 			memset(&newlabel, 0, sizeof newlabel);
   1802 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1803 			lp = &newlabel;
   1804 		} else
   1805 #endif
   1806 		lp = (struct disklabel *)data;
   1807 
   1808 		if ((error = raidlock(rs)) != 0)
   1809 			return (error);
   1810 
   1811 		rs->sc_flags |= RAIDF_LABELLING;
   1812 
   1813 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1814 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1815 		if (error == 0) {
   1816 			if (cmd == DIOCWDINFO
   1817 #ifdef __HAVE_OLD_DISKLABEL
   1818 			    || cmd == ODIOCWDINFO
   1819 #endif
   1820 			   )
   1821 				error = writedisklabel(RAIDLABELDEV(dev),
   1822 				    raidstrategy, rs->sc_dkdev.dk_label,
   1823 				    rs->sc_dkdev.dk_cpulabel);
   1824 		}
   1825 		rs->sc_flags &= ~RAIDF_LABELLING;
   1826 
   1827 		raidunlock(rs);
   1828 
   1829 		if (error)
   1830 			return (error);
   1831 		break;
   1832 	}
   1833 
   1834 	case DIOCWLABEL:
   1835 		if (*(int *) data != 0)
   1836 			rs->sc_flags |= RAIDF_WLABEL;
   1837 		else
   1838 			rs->sc_flags &= ~RAIDF_WLABEL;
   1839 		break;
   1840 
   1841 	case DIOCGDEFLABEL:
   1842 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1843 		break;
   1844 
   1845 #ifdef __HAVE_OLD_DISKLABEL
   1846 	case ODIOCGDEFLABEL:
   1847 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1848 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1849 			return ENOTTY;
   1850 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1851 		break;
   1852 #endif
   1853 
   1854 	case DIOCAWEDGE:
   1855 	case DIOCDWEDGE:
   1856 	    	dkw = (void *)data;
   1857 
   1858 		/* If the ioctl happens here, the parent is us. */
   1859 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1860 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1861 
   1862 	case DIOCLWEDGES:
   1863 		return dkwedge_list(&rs->sc_dkdev,
   1864 		    (struct dkwedge_list *)data, l);
   1865 	case DIOCCACHESYNC:
   1866 		return rf_sync_component_caches(raidPtr);
   1867 
   1868 	case DIOCGSTRATEGY:
   1869 	    {
   1870 		struct disk_strategy *dks = (void *)data;
   1871 
   1872 		s = splbio();
   1873 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1874 		    sizeof(dks->dks_name));
   1875 		splx(s);
   1876 		dks->dks_paramlen = 0;
   1877 
   1878 		return 0;
   1879 	    }
   1880 
   1881 	case DIOCSSTRATEGY:
   1882 	    {
   1883 		struct disk_strategy *dks = (void *)data;
   1884 		struct bufq_state *new;
   1885 		struct bufq_state *old;
   1886 
   1887 		if (dks->dks_param != NULL) {
   1888 			return EINVAL;
   1889 		}
   1890 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1891 		error = bufq_alloc(&new, dks->dks_name,
   1892 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1893 		if (error) {
   1894 			return error;
   1895 		}
   1896 		s = splbio();
   1897 		old = rs->buf_queue;
   1898 		bufq_move(new, old);
   1899 		rs->buf_queue = new;
   1900 		splx(s);
   1901 		bufq_free(old);
   1902 
   1903 		return 0;
   1904 	    }
   1905 
   1906 	default:
   1907 		retcode = ENOTTY;
   1908 	}
   1909 	return (retcode);
   1910 
   1911 }
   1912 
   1913 
   1914 /* raidinit -- complete the rest of the initialization for the
   1915    RAIDframe device.  */
   1916 
   1917 
   1918 static void
   1919 raidinit(RF_Raid_t *raidPtr)
   1920 {
   1921 	cfdata_t cf;
   1922 	struct raid_softc *rs;
   1923 	int     unit;
   1924 
   1925 	unit = raidPtr->raidid;
   1926 
   1927 	rs = &raid_softc[unit];
   1928 
   1929 	/* XXX should check return code first... */
   1930 	rs->sc_flags |= RAIDF_INITED;
   1931 
   1932 	/* XXX doesn't check bounds. */
   1933 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1934 
   1935 	/* attach the pseudo device */
   1936 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1937 	cf->cf_name = raid_cd.cd_name;
   1938 	cf->cf_atname = raid_cd.cd_name;
   1939 	cf->cf_unit = unit;
   1940 	cf->cf_fstate = FSTATE_STAR;
   1941 
   1942 	rs->sc_dev = config_attach_pseudo(cf);
   1943 
   1944 	if (rs->sc_dev == NULL) {
   1945 		printf("raid%d: config_attach_pseudo failed\n",
   1946 		    raidPtr->raidid);
   1947 		rs->sc_flags &= ~RAIDF_INITED;
   1948 		free(cf, M_RAIDFRAME);
   1949 		return;
   1950 	}
   1951 
   1952 	/* disk_attach actually creates space for the CPU disklabel, among
   1953 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1954 	 * with disklabels. */
   1955 
   1956 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1957 	disk_attach(&rs->sc_dkdev);
   1958 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1959 
   1960 	/* XXX There may be a weird interaction here between this, and
   1961 	 * protectedSectors, as used in RAIDframe.  */
   1962 
   1963 	rs->sc_size = raidPtr->totalSectors;
   1964 
   1965 	dkwedge_discover(&rs->sc_dkdev);
   1966 
   1967 	rf_set_properties(rs, raidPtr);
   1968 
   1969 }
   1970 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1971 /* wake up the daemon & tell it to get us a spare table
   1972  * XXX
   1973  * the entries in the queues should be tagged with the raidPtr
   1974  * so that in the extremely rare case that two recons happen at once,
   1975  * we know for which device were requesting a spare table
   1976  * XXX
   1977  *
   1978  * XXX This code is not currently used. GO
   1979  */
   1980 int
   1981 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1982 {
   1983 	int     retcode;
   1984 
   1985 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1986 	req->next = rf_sparet_wait_queue;
   1987 	rf_sparet_wait_queue = req;
   1988 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1989 
   1990 	/* mpsleep unlocks the mutex */
   1991 	while (!rf_sparet_resp_queue) {
   1992 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1993 	}
   1994 	req = rf_sparet_resp_queue;
   1995 	rf_sparet_resp_queue = req->next;
   1996 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1997 
   1998 	retcode = req->fcol;
   1999 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2000 					 * alloc'd */
   2001 	return (retcode);
   2002 }
   2003 #endif
   2004 
   2005 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2006  * bp & passes it down.
   2007  * any calls originating in the kernel must use non-blocking I/O
   2008  * do some extra sanity checking to return "appropriate" error values for
   2009  * certain conditions (to make some standard utilities work)
   2010  *
   2011  * Formerly known as: rf_DoAccessKernel
   2012  */
   2013 void
   2014 raidstart(RF_Raid_t *raidPtr)
   2015 {
   2016 	RF_SectorCount_t num_blocks, pb, sum;
   2017 	RF_RaidAddr_t raid_addr;
   2018 	struct partition *pp;
   2019 	daddr_t blocknum;
   2020 	int     unit;
   2021 	struct raid_softc *rs;
   2022 	int     do_async;
   2023 	struct buf *bp;
   2024 	int rc;
   2025 
   2026 	unit = raidPtr->raidid;
   2027 	rs = &raid_softc[unit];
   2028 
   2029 	/* quick check to see if anything has died recently */
   2030 	rf_lock_mutex2(raidPtr->mutex);
   2031 	if (raidPtr->numNewFailures > 0) {
   2032 		rf_unlock_mutex2(raidPtr->mutex);
   2033 		rf_update_component_labels(raidPtr,
   2034 					   RF_NORMAL_COMPONENT_UPDATE);
   2035 		rf_lock_mutex2(raidPtr->mutex);
   2036 		raidPtr->numNewFailures--;
   2037 	}
   2038 
   2039 	/* Check to see if we're at the limit... */
   2040 	while (raidPtr->openings > 0) {
   2041 		rf_unlock_mutex2(raidPtr->mutex);
   2042 
   2043 		/* get the next item, if any, from the queue */
   2044 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2045 			/* nothing more to do */
   2046 			return;
   2047 		}
   2048 
   2049 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2050 		 * partition.. Need to make it absolute to the underlying
   2051 		 * device.. */
   2052 
   2053 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2054 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2055 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2056 			blocknum += pp->p_offset;
   2057 		}
   2058 
   2059 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2060 			    (int) blocknum));
   2061 
   2062 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2063 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2064 
   2065 		/* *THIS* is where we adjust what block we're going to...
   2066 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2067 		raid_addr = blocknum;
   2068 
   2069 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2070 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2071 		sum = raid_addr + num_blocks + pb;
   2072 		if (1 || rf_debugKernelAccess) {
   2073 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2074 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2075 				    (int) pb, (int) bp->b_resid));
   2076 		}
   2077 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2078 		    || (sum < num_blocks) || (sum < pb)) {
   2079 			bp->b_error = ENOSPC;
   2080 			bp->b_resid = bp->b_bcount;
   2081 			biodone(bp);
   2082 			rf_lock_mutex2(raidPtr->mutex);
   2083 			continue;
   2084 		}
   2085 		/*
   2086 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2087 		 */
   2088 
   2089 		if (bp->b_bcount & raidPtr->sectorMask) {
   2090 			bp->b_error = EINVAL;
   2091 			bp->b_resid = bp->b_bcount;
   2092 			biodone(bp);
   2093 			rf_lock_mutex2(raidPtr->mutex);
   2094 			continue;
   2095 
   2096 		}
   2097 		db1_printf(("Calling DoAccess..\n"));
   2098 
   2099 
   2100 		rf_lock_mutex2(raidPtr->mutex);
   2101 		raidPtr->openings--;
   2102 		rf_unlock_mutex2(raidPtr->mutex);
   2103 
   2104 		/*
   2105 		 * Everything is async.
   2106 		 */
   2107 		do_async = 1;
   2108 
   2109 		disk_busy(&rs->sc_dkdev);
   2110 
   2111 		/* XXX we're still at splbio() here... do we *really*
   2112 		   need to be? */
   2113 
   2114 		/* don't ever condition on bp->b_flags & B_WRITE.
   2115 		 * always condition on B_READ instead */
   2116 
   2117 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2118 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2119 				 do_async, raid_addr, num_blocks,
   2120 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2121 
   2122 		if (rc) {
   2123 			bp->b_error = rc;
   2124 			bp->b_resid = bp->b_bcount;
   2125 			biodone(bp);
   2126 			/* continue loop */
   2127 		}
   2128 
   2129 		rf_lock_mutex2(raidPtr->mutex);
   2130 	}
   2131 	rf_unlock_mutex2(raidPtr->mutex);
   2132 }
   2133 
   2134 
   2135 
   2136 
   2137 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2138 
   2139 int
   2140 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2141 {
   2142 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2143 	struct buf *bp;
   2144 
   2145 	req->queue = queue;
   2146 	bp = req->bp;
   2147 
   2148 	switch (req->type) {
   2149 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2150 		/* XXX need to do something extra here.. */
   2151 		/* I'm leaving this in, as I've never actually seen it used,
   2152 		 * and I'd like folks to report it... GO */
   2153 		printf(("WAKEUP CALLED\n"));
   2154 		queue->numOutstanding++;
   2155 
   2156 		bp->b_flags = 0;
   2157 		bp->b_private = req;
   2158 
   2159 		KernelWakeupFunc(bp);
   2160 		break;
   2161 
   2162 	case RF_IO_TYPE_READ:
   2163 	case RF_IO_TYPE_WRITE:
   2164 #if RF_ACC_TRACE > 0
   2165 		if (req->tracerec) {
   2166 			RF_ETIMER_START(req->tracerec->timer);
   2167 		}
   2168 #endif
   2169 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2170 		    op, queue->rf_cinfo->ci_dev,
   2171 		    req->sectorOffset, req->numSector,
   2172 		    req->buf, KernelWakeupFunc, (void *) req,
   2173 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2174 
   2175 		if (rf_debugKernelAccess) {
   2176 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2177 				(long) bp->b_blkno));
   2178 		}
   2179 		queue->numOutstanding++;
   2180 		queue->last_deq_sector = req->sectorOffset;
   2181 		/* acc wouldn't have been let in if there were any pending
   2182 		 * reqs at any other priority */
   2183 		queue->curPriority = req->priority;
   2184 
   2185 		db1_printf(("Going for %c to unit %d col %d\n",
   2186 			    req->type, queue->raidPtr->raidid,
   2187 			    queue->col));
   2188 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2189 			(int) req->sectorOffset, (int) req->numSector,
   2190 			(int) (req->numSector <<
   2191 			    queue->raidPtr->logBytesPerSector),
   2192 			(int) queue->raidPtr->logBytesPerSector));
   2193 
   2194 		/*
   2195 		 * XXX: drop lock here since this can block at
   2196 		 * least with backing SCSI devices.  Retake it
   2197 		 * to minimize fuss with calling interfaces.
   2198 		 */
   2199 
   2200 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2201 		bdev_strategy(bp);
   2202 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2203 		break;
   2204 
   2205 	default:
   2206 		panic("bad req->type in rf_DispatchKernelIO");
   2207 	}
   2208 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2209 
   2210 	return (0);
   2211 }
   2212 /* this is the callback function associated with a I/O invoked from
   2213    kernel code.
   2214  */
   2215 static void
   2216 KernelWakeupFunc(struct buf *bp)
   2217 {
   2218 	RF_DiskQueueData_t *req = NULL;
   2219 	RF_DiskQueue_t *queue;
   2220 
   2221 	db1_printf(("recovering the request queue:\n"));
   2222 
   2223 	req = bp->b_private;
   2224 
   2225 	queue = (RF_DiskQueue_t *) req->queue;
   2226 
   2227 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2228 
   2229 #if RF_ACC_TRACE > 0
   2230 	if (req->tracerec) {
   2231 		RF_ETIMER_STOP(req->tracerec->timer);
   2232 		RF_ETIMER_EVAL(req->tracerec->timer);
   2233 		rf_lock_mutex2(rf_tracing_mutex);
   2234 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2235 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2236 		req->tracerec->num_phys_ios++;
   2237 		rf_unlock_mutex2(rf_tracing_mutex);
   2238 	}
   2239 #endif
   2240 
   2241 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2242 	 * ballistic, and mark the component as hosed... */
   2243 
   2244 	if (bp->b_error != 0) {
   2245 		/* Mark the disk as dead */
   2246 		/* but only mark it once... */
   2247 		/* and only if it wouldn't leave this RAID set
   2248 		   completely broken */
   2249 		if (((queue->raidPtr->Disks[queue->col].status ==
   2250 		      rf_ds_optimal) ||
   2251 		     (queue->raidPtr->Disks[queue->col].status ==
   2252 		      rf_ds_used_spare)) &&
   2253 		     (queue->raidPtr->numFailures <
   2254 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2255 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2256 			       queue->raidPtr->raidid,
   2257 			       queue->raidPtr->Disks[queue->col].devname);
   2258 			queue->raidPtr->Disks[queue->col].status =
   2259 			    rf_ds_failed;
   2260 			queue->raidPtr->status = rf_rs_degraded;
   2261 			queue->raidPtr->numFailures++;
   2262 			queue->raidPtr->numNewFailures++;
   2263 		} else {	/* Disk is already dead... */
   2264 			/* printf("Disk already marked as dead!\n"); */
   2265 		}
   2266 
   2267 	}
   2268 
   2269 	/* Fill in the error value */
   2270 	req->error = bp->b_error;
   2271 
   2272 	/* Drop this one on the "finished" queue... */
   2273 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2274 
   2275 	/* Let the raidio thread know there is work to be done. */
   2276 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2277 
   2278 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2279 }
   2280 
   2281 
   2282 /*
   2283  * initialize a buf structure for doing an I/O in the kernel.
   2284  */
   2285 static void
   2286 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2287        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2288        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2289        struct proc *b_proc)
   2290 {
   2291 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2292 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2293 	bp->b_oflags = 0;
   2294 	bp->b_cflags = 0;
   2295 	bp->b_bcount = numSect << logBytesPerSector;
   2296 	bp->b_bufsize = bp->b_bcount;
   2297 	bp->b_error = 0;
   2298 	bp->b_dev = dev;
   2299 	bp->b_data = bf;
   2300 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2301 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2302 	if (bp->b_bcount == 0) {
   2303 		panic("bp->b_bcount is zero in InitBP!!");
   2304 	}
   2305 	bp->b_proc = b_proc;
   2306 	bp->b_iodone = cbFunc;
   2307 	bp->b_private = cbArg;
   2308 }
   2309 
   2310 static void
   2311 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2312 		    struct disklabel *lp)
   2313 {
   2314 	memset(lp, 0, sizeof(*lp));
   2315 
   2316 	/* fabricate a label... */
   2317 	lp->d_secperunit = raidPtr->totalSectors;
   2318 	lp->d_secsize = raidPtr->bytesPerSector;
   2319 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2320 	lp->d_ntracks = 4 * raidPtr->numCol;
   2321 	lp->d_ncylinders = raidPtr->totalSectors /
   2322 		(lp->d_nsectors * lp->d_ntracks);
   2323 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2324 
   2325 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2326 	lp->d_type = DTYPE_RAID;
   2327 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2328 	lp->d_rpm = 3600;
   2329 	lp->d_interleave = 1;
   2330 	lp->d_flags = 0;
   2331 
   2332 	lp->d_partitions[RAW_PART].p_offset = 0;
   2333 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2334 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2335 	lp->d_npartitions = RAW_PART + 1;
   2336 
   2337 	lp->d_magic = DISKMAGIC;
   2338 	lp->d_magic2 = DISKMAGIC;
   2339 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2340 
   2341 }
   2342 /*
   2343  * Read the disklabel from the raid device.  If one is not present, fake one
   2344  * up.
   2345  */
   2346 static void
   2347 raidgetdisklabel(dev_t dev)
   2348 {
   2349 	int     unit = raidunit(dev);
   2350 	struct raid_softc *rs = &raid_softc[unit];
   2351 	const char   *errstring;
   2352 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2353 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2354 	RF_Raid_t *raidPtr;
   2355 
   2356 	db1_printf(("Getting the disklabel...\n"));
   2357 
   2358 	memset(clp, 0, sizeof(*clp));
   2359 
   2360 	raidPtr = raidPtrs[unit];
   2361 
   2362 	raidgetdefaultlabel(raidPtr, rs, lp);
   2363 
   2364 	/*
   2365 	 * Call the generic disklabel extraction routine.
   2366 	 */
   2367 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2368 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2369 	if (errstring)
   2370 		raidmakedisklabel(rs);
   2371 	else {
   2372 		int     i;
   2373 		struct partition *pp;
   2374 
   2375 		/*
   2376 		 * Sanity check whether the found disklabel is valid.
   2377 		 *
   2378 		 * This is necessary since total size of the raid device
   2379 		 * may vary when an interleave is changed even though exactly
   2380 		 * same components are used, and old disklabel may used
   2381 		 * if that is found.
   2382 		 */
   2383 		if (lp->d_secperunit != rs->sc_size)
   2384 			printf("raid%d: WARNING: %s: "
   2385 			    "total sector size in disklabel (%" PRIu32 ") != "
   2386 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2387 			    lp->d_secperunit, rs->sc_size);
   2388 		for (i = 0; i < lp->d_npartitions; i++) {
   2389 			pp = &lp->d_partitions[i];
   2390 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2391 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2392 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2393 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2394 		}
   2395 	}
   2396 
   2397 }
   2398 /*
   2399  * Take care of things one might want to take care of in the event
   2400  * that a disklabel isn't present.
   2401  */
   2402 static void
   2403 raidmakedisklabel(struct raid_softc *rs)
   2404 {
   2405 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2406 	db1_printf(("Making a label..\n"));
   2407 
   2408 	/*
   2409 	 * For historical reasons, if there's no disklabel present
   2410 	 * the raw partition must be marked FS_BSDFFS.
   2411 	 */
   2412 
   2413 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2414 
   2415 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2416 
   2417 	lp->d_checksum = dkcksum(lp);
   2418 }
   2419 /*
   2420  * Wait interruptibly for an exclusive lock.
   2421  *
   2422  * XXX
   2423  * Several drivers do this; it should be abstracted and made MP-safe.
   2424  * (Hmm... where have we seen this warning before :->  GO )
   2425  */
   2426 static int
   2427 raidlock(struct raid_softc *rs)
   2428 {
   2429 	int     error;
   2430 
   2431 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2432 		rs->sc_flags |= RAIDF_WANTED;
   2433 		if ((error =
   2434 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2435 			return (error);
   2436 	}
   2437 	rs->sc_flags |= RAIDF_LOCKED;
   2438 	return (0);
   2439 }
   2440 /*
   2441  * Unlock and wake up any waiters.
   2442  */
   2443 static void
   2444 raidunlock(struct raid_softc *rs)
   2445 {
   2446 
   2447 	rs->sc_flags &= ~RAIDF_LOCKED;
   2448 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2449 		rs->sc_flags &= ~RAIDF_WANTED;
   2450 		wakeup(rs);
   2451 	}
   2452 }
   2453 
   2454 
   2455 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2456 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2457 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2458 
   2459 static daddr_t
   2460 rf_component_info_offset(void)
   2461 {
   2462 
   2463 	return RF_COMPONENT_INFO_OFFSET;
   2464 }
   2465 
   2466 static daddr_t
   2467 rf_component_info_size(unsigned secsize)
   2468 {
   2469 	daddr_t info_size;
   2470 
   2471 	KASSERT(secsize);
   2472 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2473 		info_size = secsize;
   2474 	else
   2475 		info_size = RF_COMPONENT_INFO_SIZE;
   2476 
   2477 	return info_size;
   2478 }
   2479 
   2480 static daddr_t
   2481 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2482 {
   2483 	daddr_t map_offset;
   2484 
   2485 	KASSERT(raidPtr->bytesPerSector);
   2486 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2487 		map_offset = raidPtr->bytesPerSector;
   2488 	else
   2489 		map_offset = RF_COMPONENT_INFO_SIZE;
   2490 	map_offset += rf_component_info_offset();
   2491 
   2492 	return map_offset;
   2493 }
   2494 
   2495 static daddr_t
   2496 rf_parity_map_size(RF_Raid_t *raidPtr)
   2497 {
   2498 	daddr_t map_size;
   2499 
   2500 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2501 		map_size = raidPtr->bytesPerSector;
   2502 	else
   2503 		map_size = RF_PARITY_MAP_SIZE;
   2504 
   2505 	return map_size;
   2506 }
   2507 
   2508 int
   2509 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2510 {
   2511 	RF_ComponentLabel_t *clabel;
   2512 
   2513 	clabel = raidget_component_label(raidPtr, col);
   2514 	clabel->clean = RF_RAID_CLEAN;
   2515 	raidflush_component_label(raidPtr, col);
   2516 	return(0);
   2517 }
   2518 
   2519 
   2520 int
   2521 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2522 {
   2523 	RF_ComponentLabel_t *clabel;
   2524 
   2525 	clabel = raidget_component_label(raidPtr, col);
   2526 	clabel->clean = RF_RAID_DIRTY;
   2527 	raidflush_component_label(raidPtr, col);
   2528 	return(0);
   2529 }
   2530 
   2531 int
   2532 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2533 {
   2534 	KASSERT(raidPtr->bytesPerSector);
   2535 	return raidread_component_label(raidPtr->bytesPerSector,
   2536 	    raidPtr->Disks[col].dev,
   2537 	    raidPtr->raid_cinfo[col].ci_vp,
   2538 	    &raidPtr->raid_cinfo[col].ci_label);
   2539 }
   2540 
   2541 RF_ComponentLabel_t *
   2542 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2543 {
   2544 	return &raidPtr->raid_cinfo[col].ci_label;
   2545 }
   2546 
   2547 int
   2548 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2549 {
   2550 	RF_ComponentLabel_t *label;
   2551 
   2552 	label = &raidPtr->raid_cinfo[col].ci_label;
   2553 	label->mod_counter = raidPtr->mod_counter;
   2554 #ifndef RF_NO_PARITY_MAP
   2555 	label->parity_map_modcount = label->mod_counter;
   2556 #endif
   2557 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2558 	    raidPtr->Disks[col].dev,
   2559 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2560 }
   2561 
   2562 
   2563 static int
   2564 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2565     RF_ComponentLabel_t *clabel)
   2566 {
   2567 	return raidread_component_area(dev, b_vp, clabel,
   2568 	    sizeof(RF_ComponentLabel_t),
   2569 	    rf_component_info_offset(),
   2570 	    rf_component_info_size(secsize));
   2571 }
   2572 
   2573 /* ARGSUSED */
   2574 static int
   2575 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2576     size_t msize, daddr_t offset, daddr_t dsize)
   2577 {
   2578 	struct buf *bp;
   2579 	const struct bdevsw *bdev;
   2580 	int error;
   2581 
   2582 	/* XXX should probably ensure that we don't try to do this if
   2583 	   someone has changed rf_protected_sectors. */
   2584 
   2585 	if (b_vp == NULL) {
   2586 		/* For whatever reason, this component is not valid.
   2587 		   Don't try to read a component label from it. */
   2588 		return(EINVAL);
   2589 	}
   2590 
   2591 	/* get a block of the appropriate size... */
   2592 	bp = geteblk((int)dsize);
   2593 	bp->b_dev = dev;
   2594 
   2595 	/* get our ducks in a row for the read */
   2596 	bp->b_blkno = offset / DEV_BSIZE;
   2597 	bp->b_bcount = dsize;
   2598 	bp->b_flags |= B_READ;
   2599  	bp->b_resid = dsize;
   2600 
   2601 	bdev = bdevsw_lookup(bp->b_dev);
   2602 	if (bdev == NULL)
   2603 		return (ENXIO);
   2604 	(*bdev->d_strategy)(bp);
   2605 
   2606 	error = biowait(bp);
   2607 
   2608 	if (!error) {
   2609 		memcpy(data, bp->b_data, msize);
   2610 	}
   2611 
   2612 	brelse(bp, 0);
   2613 	return(error);
   2614 }
   2615 
   2616 
   2617 static int
   2618 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2619     RF_ComponentLabel_t *clabel)
   2620 {
   2621 	return raidwrite_component_area(dev, b_vp, clabel,
   2622 	    sizeof(RF_ComponentLabel_t),
   2623 	    rf_component_info_offset(),
   2624 	    rf_component_info_size(secsize), 0);
   2625 }
   2626 
   2627 /* ARGSUSED */
   2628 static int
   2629 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2630     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2631 {
   2632 	struct buf *bp;
   2633 	const struct bdevsw *bdev;
   2634 	int error;
   2635 
   2636 	/* get a block of the appropriate size... */
   2637 	bp = geteblk((int)dsize);
   2638 	bp->b_dev = dev;
   2639 
   2640 	/* get our ducks in a row for the write */
   2641 	bp->b_blkno = offset / DEV_BSIZE;
   2642 	bp->b_bcount = dsize;
   2643 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2644  	bp->b_resid = dsize;
   2645 
   2646 	memset(bp->b_data, 0, dsize);
   2647 	memcpy(bp->b_data, data, msize);
   2648 
   2649 	bdev = bdevsw_lookup(bp->b_dev);
   2650 	if (bdev == NULL)
   2651 		return (ENXIO);
   2652 	(*bdev->d_strategy)(bp);
   2653 	if (asyncp)
   2654 		return 0;
   2655 	error = biowait(bp);
   2656 	brelse(bp, 0);
   2657 	if (error) {
   2658 #if 1
   2659 		printf("Failed to write RAID component info!\n");
   2660 #endif
   2661 	}
   2662 
   2663 	return(error);
   2664 }
   2665 
   2666 void
   2667 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2668 {
   2669 	int c;
   2670 
   2671 	for (c = 0; c < raidPtr->numCol; c++) {
   2672 		/* Skip dead disks. */
   2673 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2674 			continue;
   2675 		/* XXXjld: what if an error occurs here? */
   2676 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2677 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2678 		    RF_PARITYMAP_NBYTE,
   2679 		    rf_parity_map_offset(raidPtr),
   2680 		    rf_parity_map_size(raidPtr), 0);
   2681 	}
   2682 }
   2683 
   2684 void
   2685 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2686 {
   2687 	struct rf_paritymap_ondisk tmp;
   2688 	int c,first;
   2689 
   2690 	first=1;
   2691 	for (c = 0; c < raidPtr->numCol; c++) {
   2692 		/* Skip dead disks. */
   2693 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2694 			continue;
   2695 		raidread_component_area(raidPtr->Disks[c].dev,
   2696 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2697 		    RF_PARITYMAP_NBYTE,
   2698 		    rf_parity_map_offset(raidPtr),
   2699 		    rf_parity_map_size(raidPtr));
   2700 		if (first) {
   2701 			memcpy(map, &tmp, sizeof(*map));
   2702 			first = 0;
   2703 		} else {
   2704 			rf_paritymap_merge(map, &tmp);
   2705 		}
   2706 	}
   2707 }
   2708 
   2709 void
   2710 rf_markalldirty(RF_Raid_t *raidPtr)
   2711 {
   2712 	RF_ComponentLabel_t *clabel;
   2713 	int sparecol;
   2714 	int c;
   2715 	int j;
   2716 	int scol = -1;
   2717 
   2718 	raidPtr->mod_counter++;
   2719 	for (c = 0; c < raidPtr->numCol; c++) {
   2720 		/* we don't want to touch (at all) a disk that has
   2721 		   failed */
   2722 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2723 			clabel = raidget_component_label(raidPtr, c);
   2724 			if (clabel->status == rf_ds_spared) {
   2725 				/* XXX do something special...
   2726 				   but whatever you do, don't
   2727 				   try to access it!! */
   2728 			} else {
   2729 				raidmarkdirty(raidPtr, c);
   2730 			}
   2731 		}
   2732 	}
   2733 
   2734 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2735 		sparecol = raidPtr->numCol + c;
   2736 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2737 			/*
   2738 
   2739 			   we claim this disk is "optimal" if it's
   2740 			   rf_ds_used_spare, as that means it should be
   2741 			   directly substitutable for the disk it replaced.
   2742 			   We note that too...
   2743 
   2744 			 */
   2745 
   2746 			for(j=0;j<raidPtr->numCol;j++) {
   2747 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2748 					scol = j;
   2749 					break;
   2750 				}
   2751 			}
   2752 
   2753 			clabel = raidget_component_label(raidPtr, sparecol);
   2754 			/* make sure status is noted */
   2755 
   2756 			raid_init_component_label(raidPtr, clabel);
   2757 
   2758 			clabel->row = 0;
   2759 			clabel->column = scol;
   2760 			/* Note: we *don't* change status from rf_ds_used_spare
   2761 			   to rf_ds_optimal */
   2762 			/* clabel.status = rf_ds_optimal; */
   2763 
   2764 			raidmarkdirty(raidPtr, sparecol);
   2765 		}
   2766 	}
   2767 }
   2768 
   2769 
   2770 void
   2771 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2772 {
   2773 	RF_ComponentLabel_t *clabel;
   2774 	int sparecol;
   2775 	int c;
   2776 	int j;
   2777 	int scol;
   2778 
   2779 	scol = -1;
   2780 
   2781 	/* XXX should do extra checks to make sure things really are clean,
   2782 	   rather than blindly setting the clean bit... */
   2783 
   2784 	raidPtr->mod_counter++;
   2785 
   2786 	for (c = 0; c < raidPtr->numCol; c++) {
   2787 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2788 			clabel = raidget_component_label(raidPtr, c);
   2789 			/* make sure status is noted */
   2790 			clabel->status = rf_ds_optimal;
   2791 
   2792 			/* note what unit we are configured as */
   2793 			clabel->last_unit = raidPtr->raidid;
   2794 
   2795 			raidflush_component_label(raidPtr, c);
   2796 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2797 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2798 					raidmarkclean(raidPtr, c);
   2799 				}
   2800 			}
   2801 		}
   2802 		/* else we don't touch it.. */
   2803 	}
   2804 
   2805 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2806 		sparecol = raidPtr->numCol + c;
   2807 		/* Need to ensure that the reconstruct actually completed! */
   2808 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2809 			/*
   2810 
   2811 			   we claim this disk is "optimal" if it's
   2812 			   rf_ds_used_spare, as that means it should be
   2813 			   directly substitutable for the disk it replaced.
   2814 			   We note that too...
   2815 
   2816 			 */
   2817 
   2818 			for(j=0;j<raidPtr->numCol;j++) {
   2819 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2820 					scol = j;
   2821 					break;
   2822 				}
   2823 			}
   2824 
   2825 			/* XXX shouldn't *really* need this... */
   2826 			clabel = raidget_component_label(raidPtr, sparecol);
   2827 			/* make sure status is noted */
   2828 
   2829 			raid_init_component_label(raidPtr, clabel);
   2830 
   2831 			clabel->column = scol;
   2832 			clabel->status = rf_ds_optimal;
   2833 			clabel->last_unit = raidPtr->raidid;
   2834 
   2835 			raidflush_component_label(raidPtr, sparecol);
   2836 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2837 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2838 					raidmarkclean(raidPtr, sparecol);
   2839 				}
   2840 			}
   2841 		}
   2842 	}
   2843 }
   2844 
   2845 void
   2846 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2847 {
   2848 
   2849 	if (vp != NULL) {
   2850 		if (auto_configured == 1) {
   2851 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2852 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2853 			vput(vp);
   2854 
   2855 		} else {
   2856 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2857 		}
   2858 	}
   2859 }
   2860 
   2861 
   2862 void
   2863 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2864 {
   2865 	int r,c;
   2866 	struct vnode *vp;
   2867 	int acd;
   2868 
   2869 
   2870 	/* We take this opportunity to close the vnodes like we should.. */
   2871 
   2872 	for (c = 0; c < raidPtr->numCol; c++) {
   2873 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2874 		acd = raidPtr->Disks[c].auto_configured;
   2875 		rf_close_component(raidPtr, vp, acd);
   2876 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2877 		raidPtr->Disks[c].auto_configured = 0;
   2878 	}
   2879 
   2880 	for (r = 0; r < raidPtr->numSpare; r++) {
   2881 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2882 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2883 		rf_close_component(raidPtr, vp, acd);
   2884 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2885 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2886 	}
   2887 }
   2888 
   2889 
   2890 void
   2891 rf_ReconThread(struct rf_recon_req *req)
   2892 {
   2893 	int     s;
   2894 	RF_Raid_t *raidPtr;
   2895 
   2896 	s = splbio();
   2897 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2898 	raidPtr->recon_in_progress = 1;
   2899 
   2900 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2901 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2902 
   2903 	RF_Free(req, sizeof(*req));
   2904 
   2905 	raidPtr->recon_in_progress = 0;
   2906 	splx(s);
   2907 
   2908 	/* That's all... */
   2909 	kthread_exit(0);	/* does not return */
   2910 }
   2911 
   2912 void
   2913 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2914 {
   2915 	int retcode;
   2916 	int s;
   2917 
   2918 	raidPtr->parity_rewrite_stripes_done = 0;
   2919 	raidPtr->parity_rewrite_in_progress = 1;
   2920 	s = splbio();
   2921 	retcode = rf_RewriteParity(raidPtr);
   2922 	splx(s);
   2923 	if (retcode) {
   2924 		printf("raid%d: Error re-writing parity (%d)!\n",
   2925 		    raidPtr->raidid, retcode);
   2926 	} else {
   2927 		/* set the clean bit!  If we shutdown correctly,
   2928 		   the clean bit on each component label will get
   2929 		   set */
   2930 		raidPtr->parity_good = RF_RAID_CLEAN;
   2931 	}
   2932 	raidPtr->parity_rewrite_in_progress = 0;
   2933 
   2934 	/* Anyone waiting for us to stop?  If so, inform them... */
   2935 	if (raidPtr->waitShutdown) {
   2936 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2937 	}
   2938 
   2939 	/* That's all... */
   2940 	kthread_exit(0);	/* does not return */
   2941 }
   2942 
   2943 
   2944 void
   2945 rf_CopybackThread(RF_Raid_t *raidPtr)
   2946 {
   2947 	int s;
   2948 
   2949 	raidPtr->copyback_in_progress = 1;
   2950 	s = splbio();
   2951 	rf_CopybackReconstructedData(raidPtr);
   2952 	splx(s);
   2953 	raidPtr->copyback_in_progress = 0;
   2954 
   2955 	/* That's all... */
   2956 	kthread_exit(0);	/* does not return */
   2957 }
   2958 
   2959 
   2960 void
   2961 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2962 {
   2963 	int s;
   2964 	RF_Raid_t *raidPtr;
   2965 
   2966 	s = splbio();
   2967 	raidPtr = req->raidPtr;
   2968 	raidPtr->recon_in_progress = 1;
   2969 	rf_ReconstructInPlace(raidPtr, req->col);
   2970 	RF_Free(req, sizeof(*req));
   2971 	raidPtr->recon_in_progress = 0;
   2972 	splx(s);
   2973 
   2974 	/* That's all... */
   2975 	kthread_exit(0);	/* does not return */
   2976 }
   2977 
   2978 static RF_AutoConfig_t *
   2979 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2980     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2981     unsigned secsize)
   2982 {
   2983 	int good_one = 0;
   2984 	RF_ComponentLabel_t *clabel;
   2985 	RF_AutoConfig_t *ac;
   2986 
   2987 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2988 	if (clabel == NULL) {
   2989 oomem:
   2990 		    while(ac_list) {
   2991 			    ac = ac_list;
   2992 			    if (ac->clabel)
   2993 				    free(ac->clabel, M_RAIDFRAME);
   2994 			    ac_list = ac_list->next;
   2995 			    free(ac, M_RAIDFRAME);
   2996 		    }
   2997 		    printf("RAID auto config: out of memory!\n");
   2998 		    return NULL; /* XXX probably should panic? */
   2999 	}
   3000 
   3001 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3002 		/* Got the label.  Does it look reasonable? */
   3003 		if (rf_reasonable_label(clabel, numsecs) &&
   3004 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3005 #ifdef DEBUG
   3006 			printf("Component on: %s: %llu\n",
   3007 				cname, (unsigned long long)size);
   3008 			rf_print_component_label(clabel);
   3009 #endif
   3010 			/* if it's reasonable, add it, else ignore it. */
   3011 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3012 				M_NOWAIT);
   3013 			if (ac == NULL) {
   3014 				free(clabel, M_RAIDFRAME);
   3015 				goto oomem;
   3016 			}
   3017 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3018 			ac->dev = dev;
   3019 			ac->vp = vp;
   3020 			ac->clabel = clabel;
   3021 			ac->next = ac_list;
   3022 			ac_list = ac;
   3023 			good_one = 1;
   3024 		}
   3025 	}
   3026 	if (!good_one) {
   3027 		/* cleanup */
   3028 		free(clabel, M_RAIDFRAME);
   3029 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3030 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3031 		vput(vp);
   3032 	}
   3033 	return ac_list;
   3034 }
   3035 
   3036 RF_AutoConfig_t *
   3037 rf_find_raid_components(void)
   3038 {
   3039 	struct vnode *vp;
   3040 	struct disklabel label;
   3041 	device_t dv;
   3042 	deviter_t di;
   3043 	dev_t dev;
   3044 	int bmajor, bminor, wedge, rf_part_found;
   3045 	int error;
   3046 	int i;
   3047 	RF_AutoConfig_t *ac_list;
   3048 	uint64_t numsecs;
   3049 	unsigned secsize;
   3050 
   3051 	/* initialize the AutoConfig list */
   3052 	ac_list = NULL;
   3053 
   3054 	/* we begin by trolling through *all* the devices on the system */
   3055 
   3056 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3057 	     dv = deviter_next(&di)) {
   3058 
   3059 		/* we are only interested in disks... */
   3060 		if (device_class(dv) != DV_DISK)
   3061 			continue;
   3062 
   3063 		/* we don't care about floppies... */
   3064 		if (device_is_a(dv, "fd")) {
   3065 			continue;
   3066 		}
   3067 
   3068 		/* we don't care about CD's... */
   3069 		if (device_is_a(dv, "cd")) {
   3070 			continue;
   3071 		}
   3072 
   3073 		/* we don't care about md's... */
   3074 		if (device_is_a(dv, "md")) {
   3075 			continue;
   3076 		}
   3077 
   3078 		/* hdfd is the Atari/Hades floppy driver */
   3079 		if (device_is_a(dv, "hdfd")) {
   3080 			continue;
   3081 		}
   3082 
   3083 		/* fdisa is the Atari/Milan floppy driver */
   3084 		if (device_is_a(dv, "fdisa")) {
   3085 			continue;
   3086 		}
   3087 
   3088 		/* need to find the device_name_to_block_device_major stuff */
   3089 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3090 
   3091 		rf_part_found = 0; /*No raid partition as yet*/
   3092 
   3093 		/* get a vnode for the raw partition of this disk */
   3094 
   3095 		wedge = device_is_a(dv, "dk");
   3096 		bminor = minor(device_unit(dv));
   3097 		dev = wedge ? makedev(bmajor, bminor) :
   3098 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3099 		if (bdevvp(dev, &vp))
   3100 			panic("RAID can't alloc vnode");
   3101 
   3102 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3103 
   3104 		if (error) {
   3105 			/* "Who cares."  Continue looking
   3106 			   for something that exists*/
   3107 			vput(vp);
   3108 			continue;
   3109 		}
   3110 
   3111 		error = getdisksize(vp, &numsecs, &secsize);
   3112 		if (error) {
   3113 			vput(vp);
   3114 			continue;
   3115 		}
   3116 		if (wedge) {
   3117 			struct dkwedge_info dkw;
   3118 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3119 			    NOCRED);
   3120 			if (error) {
   3121 				printf("RAIDframe: can't get wedge info for "
   3122 				    "dev %s (%d)\n", device_xname(dv), error);
   3123 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3124 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3125 				vput(vp);
   3126 				continue;
   3127 			}
   3128 
   3129 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3130 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3131 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3132 				vput(vp);
   3133 				continue;
   3134 			}
   3135 
   3136 			ac_list = rf_get_component(ac_list, dev, vp,
   3137 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3138 			rf_part_found = 1; /*There is a raid component on this disk*/
   3139 			continue;
   3140 		}
   3141 
   3142 		/* Ok, the disk exists.  Go get the disklabel. */
   3143 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3144 		if (error) {
   3145 			/*
   3146 			 * XXX can't happen - open() would
   3147 			 * have errored out (or faked up one)
   3148 			 */
   3149 			if (error != ENOTTY)
   3150 				printf("RAIDframe: can't get label for dev "
   3151 				    "%s (%d)\n", device_xname(dv), error);
   3152 		}
   3153 
   3154 		/* don't need this any more.  We'll allocate it again
   3155 		   a little later if we really do... */
   3156 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3157 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3158 		vput(vp);
   3159 
   3160 		if (error)
   3161 			continue;
   3162 
   3163 		rf_part_found = 0; /*No raid partitions yet*/
   3164 		for (i = 0; i < label.d_npartitions; i++) {
   3165 			char cname[sizeof(ac_list->devname)];
   3166 
   3167 			/* We only support partitions marked as RAID */
   3168 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3169 				continue;
   3170 
   3171 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3172 			if (bdevvp(dev, &vp))
   3173 				panic("RAID can't alloc vnode");
   3174 
   3175 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3176 			if (error) {
   3177 				/* Whatever... */
   3178 				vput(vp);
   3179 				continue;
   3180 			}
   3181 			snprintf(cname, sizeof(cname), "%s%c",
   3182 			    device_xname(dv), 'a' + i);
   3183 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3184 				label.d_partitions[i].p_size, numsecs, secsize);
   3185 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   3186 		}
   3187 
   3188 		/*
   3189 		 *If there is no raid component on this disk, either in a
   3190 		 *disklabel or inside a wedge, check the raw partition as well,
   3191 		 *as it is possible to configure raid components on raw disk
   3192 		 *devices.
   3193 		 */
   3194 
   3195 		if (!rf_part_found) {
   3196 			char cname[sizeof(ac_list->devname)];
   3197 
   3198 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3199 			if (bdevvp(dev, &vp))
   3200 				panic("RAID can't alloc vnode");
   3201 
   3202 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3203 			if (error) {
   3204 				/* Whatever... */
   3205 				vput(vp);
   3206 				continue;
   3207 			}
   3208 			snprintf(cname, sizeof(cname), "%s%c",
   3209 			    device_xname(dv), 'a' + RAW_PART);
   3210 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3211 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3212 		}
   3213 	}
   3214 	deviter_release(&di);
   3215 	return ac_list;
   3216 }
   3217 
   3218 
   3219 int
   3220 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3221 {
   3222 
   3223 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3224 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3225 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3226 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3227 	    clabel->row >=0 &&
   3228 	    clabel->column >= 0 &&
   3229 	    clabel->num_rows > 0 &&
   3230 	    clabel->num_columns > 0 &&
   3231 	    clabel->row < clabel->num_rows &&
   3232 	    clabel->column < clabel->num_columns &&
   3233 	    clabel->blockSize > 0 &&
   3234 	    /*
   3235 	     * numBlocksHi may contain garbage, but it is ok since
   3236 	     * the type is unsigned.  If it is really garbage,
   3237 	     * rf_fix_old_label_size() will fix it.
   3238 	     */
   3239 	    rf_component_label_numblocks(clabel) > 0) {
   3240 		/*
   3241 		 * label looks reasonable enough...
   3242 		 * let's make sure it has no old garbage.
   3243 		 */
   3244 		if (numsecs)
   3245 			rf_fix_old_label_size(clabel, numsecs);
   3246 		return(1);
   3247 	}
   3248 	return(0);
   3249 }
   3250 
   3251 
   3252 /*
   3253  * For reasons yet unknown, some old component labels have garbage in
   3254  * the newer numBlocksHi region, and this causes lossage.  Since those
   3255  * disks will also have numsecs set to less than 32 bits of sectors,
   3256  * we can determine when this corruption has occured, and fix it.
   3257  *
   3258  * The exact same problem, with the same unknown reason, happens to
   3259  * the partitionSizeHi member as well.
   3260  */
   3261 static void
   3262 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3263 {
   3264 
   3265 	if (numsecs < ((uint64_t)1 << 32)) {
   3266 		if (clabel->numBlocksHi) {
   3267 			printf("WARNING: total sectors < 32 bits, yet "
   3268 			       "numBlocksHi set\n"
   3269 			       "WARNING: resetting numBlocksHi to zero.\n");
   3270 			clabel->numBlocksHi = 0;
   3271 		}
   3272 
   3273 		if (clabel->partitionSizeHi) {
   3274 			printf("WARNING: total sectors < 32 bits, yet "
   3275 			       "partitionSizeHi set\n"
   3276 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3277 			clabel->partitionSizeHi = 0;
   3278 		}
   3279 	}
   3280 }
   3281 
   3282 
#ifdef DEBUG
/*
 * rf_print_component_label: dump the interesting fields of a component
 * label to the console.  Only built when DEBUG is defined.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	const char *is_clean = clabel->clean ? "Yes" : "No";
	const char *is_auto = clabel->autoconfigure ? "Yes" : "No";
	const char *is_root = clabel->root_partition ? "Yes" : "No";
	uint64_t nblocks = rf_component_label_numblocks(clabel);

	/* position of the component within the set */
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);

	/* identity and state */
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n", is_clean, clabel->status);

	/* layout */
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, nblocks);

	/* configuration hints */
	printf("   Autoconfig: %s\n", is_auto);
	printf("   Contains root partition: %s\n", is_root);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3313 
   3314 RF_ConfigSet_t *
   3315 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3316 {
   3317 	RF_AutoConfig_t *ac;
   3318 	RF_ConfigSet_t *config_sets;
   3319 	RF_ConfigSet_t *cset;
   3320 	RF_AutoConfig_t *ac_next;
   3321 
   3322 
   3323 	config_sets = NULL;
   3324 
   3325 	/* Go through the AutoConfig list, and figure out which components
   3326 	   belong to what sets.  */
   3327 	ac = ac_list;
   3328 	while(ac!=NULL) {
   3329 		/* we're going to putz with ac->next, so save it here
   3330 		   for use at the end of the loop */
   3331 		ac_next = ac->next;
   3332 
   3333 		if (config_sets == NULL) {
   3334 			/* will need at least this one... */
   3335 			config_sets = (RF_ConfigSet_t *)
   3336 				malloc(sizeof(RF_ConfigSet_t),
   3337 				       M_RAIDFRAME, M_NOWAIT);
   3338 			if (config_sets == NULL) {
   3339 				panic("rf_create_auto_sets: No memory!");
   3340 			}
   3341 			/* this one is easy :) */
   3342 			config_sets->ac = ac;
   3343 			config_sets->next = NULL;
   3344 			config_sets->rootable = 0;
   3345 			ac->next = NULL;
   3346 		} else {
   3347 			/* which set does this component fit into? */
   3348 			cset = config_sets;
   3349 			while(cset!=NULL) {
   3350 				if (rf_does_it_fit(cset, ac)) {
   3351 					/* looks like it matches... */
   3352 					ac->next = cset->ac;
   3353 					cset->ac = ac;
   3354 					break;
   3355 				}
   3356 				cset = cset->next;
   3357 			}
   3358 			if (cset==NULL) {
   3359 				/* didn't find a match above... new set..*/
   3360 				cset = (RF_ConfigSet_t *)
   3361 					malloc(sizeof(RF_ConfigSet_t),
   3362 					       M_RAIDFRAME, M_NOWAIT);
   3363 				if (cset == NULL) {
   3364 					panic("rf_create_auto_sets: No memory!");
   3365 				}
   3366 				cset->ac = ac;
   3367 				ac->next = NULL;
   3368 				cset->next = config_sets;
   3369 				cset->rootable = 0;
   3370 				config_sets = cset;
   3371 			}
   3372 		}
   3373 		ac = ac_next;
   3374 	}
   3375 
   3376 
   3377 	return(config_sets);
   3378 }
   3379 
   3380 static int
   3381 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3382 {
   3383 	RF_ComponentLabel_t *clabel1, *clabel2;
   3384 
   3385 	/* If this one matches the *first* one in the set, that's good
   3386 	   enough, since the other members of the set would have been
   3387 	   through here too... */
   3388 	/* note that we are not checking partitionSize here..
   3389 
   3390 	   Note that we are also not checking the mod_counters here.
   3391 	   If everything else matches execpt the mod_counter, that's
   3392 	   good enough for this test.  We will deal with the mod_counters
   3393 	   a little later in the autoconfiguration process.
   3394 
   3395 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3396 
   3397 	   The reason we don't check for this is that failed disks
   3398 	   will have lower modification counts.  If those disks are
   3399 	   not added to the set they used to belong to, then they will
   3400 	   form their own set, which may result in 2 different sets,
   3401 	   for example, competing to be configured at raid0, and
   3402 	   perhaps competing to be the root filesystem set.  If the
   3403 	   wrong ones get configured, or both attempt to become /,
   3404 	   weird behaviour and or serious lossage will occur.  Thus we
   3405 	   need to bring them into the fold here, and kick them out at
   3406 	   a later point.
   3407 
   3408 	*/
   3409 
   3410 	clabel1 = cset->ac->clabel;
   3411 	clabel2 = ac->clabel;
   3412 	if ((clabel1->version == clabel2->version) &&
   3413 	    (clabel1->serial_number == clabel2->serial_number) &&
   3414 	    (clabel1->num_rows == clabel2->num_rows) &&
   3415 	    (clabel1->num_columns == clabel2->num_columns) &&
   3416 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3417 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3418 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3419 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3420 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3421 	    (clabel1->blockSize == clabel2->blockSize) &&
   3422 	    rf_component_label_numblocks(clabel1) ==
   3423 	    rf_component_label_numblocks(clabel2) &&
   3424 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3425 	    (clabel1->root_partition == clabel2->root_partition) &&
   3426 	    (clabel1->last_unit == clabel2->last_unit) &&
   3427 	    (clabel1->config_order == clabel2->config_order)) {
   3428 		/* if it get's here, it almost *has* to be a match */
   3429 	} else {
   3430 		/* it's not consistent with somebody in the set..
   3431 		   punt */
   3432 		return(0);
   3433 	}
   3434 	/* all was fine.. it must fit... */
   3435 	return(1);
   3436 }
   3437 
   3438 int
   3439 rf_have_enough_components(RF_ConfigSet_t *cset)
   3440 {
   3441 	RF_AutoConfig_t *ac;
   3442 	RF_AutoConfig_t *auto_config;
   3443 	RF_ComponentLabel_t *clabel;
   3444 	int c;
   3445 	int num_cols;
   3446 	int num_missing;
   3447 	int mod_counter;
   3448 	int mod_counter_found;
   3449 	int even_pair_failed;
   3450 	char parity_type;
   3451 
   3452 
   3453 	/* check to see that we have enough 'live' components
   3454 	   of this set.  If so, we can configure it if necessary */
   3455 
   3456 	num_cols = cset->ac->clabel->num_columns;
   3457 	parity_type = cset->ac->clabel->parityConfig;
   3458 
   3459 	/* XXX Check for duplicate components!?!?!? */
   3460 
   3461 	/* Determine what the mod_counter is supposed to be for this set. */
   3462 
   3463 	mod_counter_found = 0;
   3464 	mod_counter = 0;
   3465 	ac = cset->ac;
   3466 	while(ac!=NULL) {
   3467 		if (mod_counter_found==0) {
   3468 			mod_counter = ac->clabel->mod_counter;
   3469 			mod_counter_found = 1;
   3470 		} else {
   3471 			if (ac->clabel->mod_counter > mod_counter) {
   3472 				mod_counter = ac->clabel->mod_counter;
   3473 			}
   3474 		}
   3475 		ac = ac->next;
   3476 	}
   3477 
   3478 	num_missing = 0;
   3479 	auto_config = cset->ac;
   3480 
   3481 	even_pair_failed = 0;
   3482 	for(c=0; c<num_cols; c++) {
   3483 		ac = auto_config;
   3484 		while(ac!=NULL) {
   3485 			if ((ac->clabel->column == c) &&
   3486 			    (ac->clabel->mod_counter == mod_counter)) {
   3487 				/* it's this one... */
   3488 #ifdef DEBUG
   3489 				printf("Found: %s at %d\n",
   3490 				       ac->devname,c);
   3491 #endif
   3492 				break;
   3493 			}
   3494 			ac=ac->next;
   3495 		}
   3496 		if (ac==NULL) {
   3497 				/* Didn't find one here! */
   3498 				/* special case for RAID 1, especially
   3499 				   where there are more than 2
   3500 				   components (where RAIDframe treats
   3501 				   things a little differently :( ) */
   3502 			if (parity_type == '1') {
   3503 				if (c%2 == 0) { /* even component */
   3504 					even_pair_failed = 1;
   3505 				} else { /* odd component.  If
   3506 					    we're failed, and
   3507 					    so is the even
   3508 					    component, it's
   3509 					    "Good Night, Charlie" */
   3510 					if (even_pair_failed == 1) {
   3511 						return(0);
   3512 					}
   3513 				}
   3514 			} else {
   3515 				/* normal accounting */
   3516 				num_missing++;
   3517 			}
   3518 		}
   3519 		if ((parity_type == '1') && (c%2 == 1)) {
   3520 				/* Just did an even component, and we didn't
   3521 				   bail.. reset the even_pair_failed flag,
   3522 				   and go on to the next component.... */
   3523 			even_pair_failed = 0;
   3524 		}
   3525 	}
   3526 
   3527 	clabel = cset->ac->clabel;
   3528 
   3529 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3530 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3531 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3532 		/* XXX this needs to be made *much* more general */
   3533 		/* Too many failures */
   3534 		return(0);
   3535 	}
   3536 	/* otherwise, all is well, and we've got enough to take a kick
   3537 	   at autoconfiguring this set */
   3538 	return(1);
   3539 }
   3540 
   3541 void
   3542 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3543 			RF_Raid_t *raidPtr)
   3544 {
   3545 	RF_ComponentLabel_t *clabel;
   3546 	int i;
   3547 
   3548 	clabel = ac->clabel;
   3549 
   3550 	/* 1. Fill in the common stuff */
   3551 	config->numRow = clabel->num_rows = 1;
   3552 	config->numCol = clabel->num_columns;
   3553 	config->numSpare = 0; /* XXX should this be set here? */
   3554 	config->sectPerSU = clabel->sectPerSU;
   3555 	config->SUsPerPU = clabel->SUsPerPU;
   3556 	config->SUsPerRU = clabel->SUsPerRU;
   3557 	config->parityConfig = clabel->parityConfig;
   3558 	/* XXX... */
   3559 	strcpy(config->diskQueueType,"fifo");
   3560 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3561 	config->layoutSpecificSize = 0; /* XXX ?? */
   3562 
   3563 	while(ac!=NULL) {
   3564 		/* row/col values will be in range due to the checks
   3565 		   in reasonable_label() */
   3566 		strcpy(config->devnames[0][ac->clabel->column],
   3567 		       ac->devname);
   3568 		ac = ac->next;
   3569 	}
   3570 
   3571 	for(i=0;i<RF_MAXDBGV;i++) {
   3572 		config->debugVars[i][0] = 0;
   3573 	}
   3574 }
   3575 
   3576 int
   3577 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3578 {
   3579 	RF_ComponentLabel_t *clabel;
   3580 	int column;
   3581 	int sparecol;
   3582 
   3583 	raidPtr->autoconfigure = new_value;
   3584 
   3585 	for(column=0; column<raidPtr->numCol; column++) {
   3586 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3587 			clabel = raidget_component_label(raidPtr, column);
   3588 			clabel->autoconfigure = new_value;
   3589 			raidflush_component_label(raidPtr, column);
   3590 		}
   3591 	}
   3592 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3593 		sparecol = raidPtr->numCol + column;
   3594 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3595 			clabel = raidget_component_label(raidPtr, sparecol);
   3596 			clabel->autoconfigure = new_value;
   3597 			raidflush_component_label(raidPtr, sparecol);
   3598 		}
   3599 	}
   3600 	return(new_value);
   3601 }
   3602 
   3603 int
   3604 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3605 {
   3606 	RF_ComponentLabel_t *clabel;
   3607 	int column;
   3608 	int sparecol;
   3609 
   3610 	raidPtr->root_partition = new_value;
   3611 	for(column=0; column<raidPtr->numCol; column++) {
   3612 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3613 			clabel = raidget_component_label(raidPtr, column);
   3614 			clabel->root_partition = new_value;
   3615 			raidflush_component_label(raidPtr, column);
   3616 		}
   3617 	}
   3618 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3619 		sparecol = raidPtr->numCol + column;
   3620 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3621 			clabel = raidget_component_label(raidPtr, sparecol);
   3622 			clabel->root_partition = new_value;
   3623 			raidflush_component_label(raidPtr, sparecol);
   3624 		}
   3625 	}
   3626 	return(new_value);
   3627 }
   3628 
   3629 void
   3630 rf_release_all_vps(RF_ConfigSet_t *cset)
   3631 {
   3632 	RF_AutoConfig_t *ac;
   3633 
   3634 	ac = cset->ac;
   3635 	while(ac!=NULL) {
   3636 		/* Close the vp, and give it back */
   3637 		if (ac->vp) {
   3638 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3639 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3640 			vput(ac->vp);
   3641 			ac->vp = NULL;
   3642 		}
   3643 		ac = ac->next;
   3644 	}
   3645 }
   3646 
   3647 
   3648 void
   3649 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3650 {
   3651 	RF_AutoConfig_t *ac;
   3652 	RF_AutoConfig_t *next_ac;
   3653 
   3654 	ac = cset->ac;
   3655 	while(ac!=NULL) {
   3656 		next_ac = ac->next;
   3657 		/* nuke the label */
   3658 		free(ac->clabel, M_RAIDFRAME);
   3659 		/* cleanup the config structure */
   3660 		free(ac, M_RAIDFRAME);
   3661 		/* "next.." */
   3662 		ac = next_ac;
   3663 	}
   3664 	/* and, finally, nuke the config set */
   3665 	free(cset, M_RAIDFRAME);
   3666 }
   3667 
   3668 
   3669 void
   3670 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3671 {
   3672 	/* current version number */
   3673 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3674 	clabel->serial_number = raidPtr->serial_number;
   3675 	clabel->mod_counter = raidPtr->mod_counter;
   3676 
   3677 	clabel->num_rows = 1;
   3678 	clabel->num_columns = raidPtr->numCol;
   3679 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3680 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3681 
   3682 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3683 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3684 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3685 
   3686 	clabel->blockSize = raidPtr->bytesPerSector;
   3687 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3688 
   3689 	/* XXX not portable */
   3690 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3691 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3692 	clabel->autoconfigure = raidPtr->autoconfigure;
   3693 	clabel->root_partition = raidPtr->root_partition;
   3694 	clabel->last_unit = raidPtr->raidid;
   3695 	clabel->config_order = raidPtr->config_order;
   3696 
   3697 #ifndef RF_NO_PARITY_MAP
   3698 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3699 #endif
   3700 }
   3701 
   3702 int
   3703 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3704 {
   3705 	RF_Raid_t *raidPtr;
   3706 	RF_Config_t *config;
   3707 	int raidID;
   3708 	int retcode;
   3709 
   3710 #ifdef DEBUG
   3711 	printf("RAID autoconfigure\n");
   3712 #endif
   3713 
   3714 	retcode = 0;
   3715 	*unit = -1;
   3716 
   3717 	/* 1. Create a config structure */
   3718 
   3719 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3720 				       M_RAIDFRAME,
   3721 				       M_NOWAIT);
   3722 	if (config==NULL) {
   3723 		printf("Out of mem!?!?\n");
   3724 				/* XXX do something more intelligent here. */
   3725 		return(1);
   3726 	}
   3727 
   3728 	memset(config, 0, sizeof(RF_Config_t));
   3729 
   3730 	/*
   3731 	   2. Figure out what RAID ID this one is supposed to live at
   3732 	   See if we can get the same RAID dev that it was configured
   3733 	   on last time..
   3734 	*/
   3735 
   3736 	raidID = cset->ac->clabel->last_unit;
   3737 	if ((raidID < 0) || (raidID >= numraid)) {
   3738 		/* let's not wander off into lala land. */
   3739 		raidID = numraid - 1;
   3740 	}
   3741 	if (raidPtrs[raidID]->valid != 0) {
   3742 
   3743 		/*
   3744 		   Nope... Go looking for an alternative...
   3745 		   Start high so we don't immediately use raid0 if that's
   3746 		   not taken.
   3747 		*/
   3748 
   3749 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3750 			if (raidPtrs[raidID]->valid == 0) {
   3751 				/* can use this one! */
   3752 				break;
   3753 			}
   3754 		}
   3755 	}
   3756 
   3757 	if (raidID < 0) {
   3758 		/* punt... */
   3759 		printf("Unable to auto configure this set!\n");
   3760 		printf("(Out of RAID devs!)\n");
   3761 		free(config, M_RAIDFRAME);
   3762 		return(1);
   3763 	}
   3764 
   3765 #ifdef DEBUG
   3766 	printf("Configuring raid%d:\n",raidID);
   3767 #endif
   3768 
   3769 	raidPtr = raidPtrs[raidID];
   3770 
   3771 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3772 	raidPtr->raidid = raidID;
   3773 	raidPtr->openings = RAIDOUTSTANDING;
   3774 
   3775 	/* 3. Build the configuration structure */
   3776 	rf_create_configuration(cset->ac, config, raidPtr);
   3777 
   3778 	/* 4. Do the configuration */
   3779 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3780 
   3781 	if (retcode == 0) {
   3782 
   3783 		raidinit(raidPtrs[raidID]);
   3784 
   3785 		rf_markalldirty(raidPtrs[raidID]);
   3786 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3787 		if (cset->ac->clabel->root_partition==1) {
   3788 			/* everything configured just fine.  Make a note
   3789 			   that this set is eligible to be root. */
   3790 			cset->rootable = 1;
   3791 			/* XXX do this here? */
   3792 			raidPtrs[raidID]->root_partition = 1;
   3793 		}
   3794 	}
   3795 
   3796 	/* 5. Cleanup */
   3797 	free(config, M_RAIDFRAME);
   3798 
   3799 	*unit = raidID;
   3800 	return(retcode);
   3801 }
   3802 
   3803 void
   3804 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3805 {
   3806 	struct buf *bp;
   3807 
   3808 	bp = (struct buf *)desc->bp;
   3809 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3810 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3811 }
   3812 
   3813 void
   3814 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3815 	     size_t xmin, size_t xmax)
   3816 {
   3817 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3818 	pool_sethiwat(p, xmax);
   3819 	pool_prime(p, xmin);
   3820 	pool_setlowat(p, xmin);
   3821 }
   3822 
   3823 /*
   3824  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3825  * if there is IO pending and if that IO could possibly be done for a
   3826  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3827  * otherwise.
   3828  *
   3829  */
   3830 
   3831 int
   3832 rf_buf_queue_check(int raidid)
   3833 {
   3834 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
   3835 	    raidPtrs[raidid]->openings > 0) {
   3836 		/* there is work to do */
   3837 		return 0;
   3838 	}
   3839 	/* default is nothing to do */
   3840 	return 1;
   3841 }
   3842 
   3843 int
   3844 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3845 {
   3846 	uint64_t numsecs;
   3847 	unsigned secsize;
   3848 	int error;
   3849 
   3850 	error = getdisksize(vp, &numsecs, &secsize);
   3851 	if (error == 0) {
   3852 		diskPtr->blockSize = secsize;
   3853 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3854 		diskPtr->partitionSize = numsecs;
   3855 		return 0;
   3856 	}
   3857 	return error;
   3858 }
   3859 
   3860 static int
   3861 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3862 {
   3863 	return 1;
   3864 }
   3865 
   3866 static void
   3867 raid_attach(device_t parent, device_t self, void *aux)
   3868 {
   3869 
   3870 }
   3871 
   3872 
   3873 static int
   3874 raid_detach(device_t self, int flags)
   3875 {
   3876 	int error;
   3877 	struct raid_softc *rs = &raid_softc[device_unit(self)];
   3878 
   3879 	if ((error = raidlock(rs)) != 0)
   3880 		return (error);
   3881 
   3882 	error = raid_detach_unlocked(rs);
   3883 
   3884 	raidunlock(rs);
   3885 
   3886 	return error;
   3887 }
   3888 
   3889 static void
   3890 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3891 {
   3892 	prop_dictionary_t disk_info, odisk_info, geom;
   3893 	disk_info = prop_dictionary_create();
   3894 	geom = prop_dictionary_create();
   3895 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3896 				   raidPtr->totalSectors);
   3897 	prop_dictionary_set_uint32(geom, "sector-size",
   3898 				   raidPtr->bytesPerSector);
   3899 
   3900 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3901 				   raidPtr->Layout.dataSectorsPerStripe);
   3902 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3903 				   4 * raidPtr->numCol);
   3904 
   3905 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3906 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3907 	   (4 * raidPtr->numCol)));
   3908 
   3909 	prop_dictionary_set(disk_info, "geometry", geom);
   3910 	prop_object_release(geom);
   3911 	prop_dictionary_set(device_properties(rs->sc_dev),
   3912 			    "disk-info", disk_info);
   3913 	odisk_info = rs->sc_dkdev.dk_info;
   3914 	rs->sc_dkdev.dk_info = disk_info;
   3915 	if (odisk_info)
   3916 		prop_object_release(odisk_info);
   3917 }
   3918 
   3919 /*
   3920  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3921  * We end up returning whatever error was returned by the first cache flush
   3922  * that fails.
   3923  */
   3924 
   3925 int
   3926 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3927 {
   3928 	int c, sparecol;
   3929 	int e,error;
   3930 	int force = 1;
   3931 
   3932 	error = 0;
   3933 	for (c = 0; c < raidPtr->numCol; c++) {
   3934 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3935 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3936 					  &force, FWRITE, NOCRED);
   3937 			if (e) {
   3938 				if (e != ENODEV)
   3939 					printf("raid%d: cache flush to component %s failed.\n",
   3940 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3941 				if (error == 0) {
   3942 					error = e;
   3943 				}
   3944 			}
   3945 		}
   3946 	}
   3947 
   3948 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3949 		sparecol = raidPtr->numCol + c;
   3950 		/* Need to ensure that the reconstruct actually completed! */
   3951 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3952 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3953 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3954 			if (e) {
   3955 				if (e != ENODEV)
   3956 					printf("raid%d: cache flush to component %s failed.\n",
   3957 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3958 				if (error == 0) {
   3959 					error = e;
   3960 				}
   3961 			}
   3962 		}
   3963 	}
   3964 	return error;
   3965 }
   3966