Home | History | Annotate | Line # | Download | only in raidframe
rf_disks.c revision 1.14
      1 /*	$NetBSD: rf_disks.c,v 1.14 2000/01/09 01:29:28 oster Exp $	*/
      2 /*-
      3  * Copyright (c) 1999 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This code is derived from software contributed to The NetBSD Foundation
      7  * by Greg Oster
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  * 3. All advertising materials mentioning features or use of this software
     18  *    must display the following acknowledgement:
     19  *        This product includes software developed by the NetBSD
     20  *        Foundation, Inc. and its contributors.
     21  * 4. Neither the name of The NetBSD Foundation nor the names of its
     22  *    contributors may be used to endorse or promote products derived
     23  *    from this software without specific prior written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 
     38 /*
     39  * Copyright (c) 1995 Carnegie-Mellon University.
     40  * All rights reserved.
     41  *
     42  * Author: Mark Holland
     43  *
     44  * Permission to use, copy, modify and distribute this software and
     45  * its documentation is hereby granted, provided that both the copyright
     46  * notice and this permission notice appear in all copies of the
     47  * software, derivative works or modified versions, and any portions
     48  * thereof, and that both notices appear in supporting documentation.
     49  *
     50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     53  *
     54  * Carnegie Mellon requests users of this software to return to
     55  *
     56  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     57  *  School of Computer Science
     58  *  Carnegie Mellon University
     59  *  Pittsburgh PA 15213-3890
     60  *
     61  * any improvements or extensions that they make and grant Carnegie the
     62  * rights to redistribute these changes.
     63  */
     64 
     65 /***************************************************************
     66  * rf_disks.c -- code to perform operations on the actual disks
     67  ***************************************************************/
     68 
     69 #include "rf_types.h"
     70 #include "rf_raid.h"
     71 #include "rf_alloclist.h"
     72 #include "rf_utils.h"
     73 #include "rf_configure.h"
     74 #include "rf_general.h"
     75 #include "rf_options.h"
     76 #include "rf_kintf.h"
     77 
     78 #include <sys/types.h>
     79 #include <sys/param.h>
     80 #include <sys/systm.h>
     81 #include <sys/proc.h>
     82 #include <sys/ioctl.h>
     83 #include <sys/fcntl.h>
     84 #include <sys/vnode.h>
     85 
     86 /* XXX these should be in a header file somewhere */
     87 void rf_UnconfigureVnodes( RF_Raid_t * );
     88 int rf_CheckLabels( RF_Raid_t *, RF_Config_t *);
     89 
/*
 * Debug printf helpers: print only when rf_diskDebug is non-zero.
 * DPRINTF6 takes a format plus five arguments, DPRINTF7 a format plus six.
 *
 * Each is wrapped in do { } while (0) so that it expands to exactly one
 * statement.  A bare `if (...) printf(...)' expansion would capture a
 * following `else' when the macro is used as the body of an unbraced `if'.
 */
#define DPRINTF6(a,b,c,d,e,f) \
	do { if (rf_diskDebug) printf(a,b,c,d,e,f); } while (0)
#define DPRINTF7(a,b,c,d,e,f,g) \
	do { if (rf_diskDebug) printf(a,b,c,d,e,f,g); } while (0)
     92 
     93 /**************************************************************************
     94  *
     95  * initialize the disks comprising the array
     96  *
     97  * We want the spare disks to have regular row,col numbers so that we can
 * easily substitute a spare for a failed disk.  But, the driver code assumes
     99  * throughout that the array contains numRow by numCol _non-spare_ disks, so
    100  * it's not clear how to fit in the spares.  This is an unfortunate holdover
    101  * from raidSim.  The quick and dirty fix is to make row zero bigger than the
    102  * rest, and put all the spares in it.  This probably needs to get changed
    103  * eventually.
    104  *
    105  **************************************************************************/
    106 
    107 int
    108 rf_ConfigureDisks( listp, raidPtr, cfgPtr )
    109 	RF_ShutdownList_t **listp;
    110 	RF_Raid_t *raidPtr;
    111 	RF_Config_t *cfgPtr;
    112 {
    113 	RF_RaidDisk_t **disks;
    114 	RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL;
    115 	RF_RowCol_t r, c;
    116 	int bs, ret;
    117 	unsigned i, count, foundone = 0, numFailuresThisRow;
    118 	int num_rows_done, num_cols_done;
    119 	int force;
    120 
    121 	num_rows_done = 0;
    122 	num_cols_done = 0;
    123 	force = cfgPtr->force;
    124 
    125 	RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *),
    126 			(RF_RaidDisk_t **), raidPtr->cleanupList);
    127 	if (disks == NULL) {
    128 		ret = ENOMEM;
    129 		goto fail;
    130 	}
    131 	raidPtr->Disks = disks;
    132 
    133 	/* get space for the device-specific stuff... */
    134 	RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow,
    135 	    sizeof(struct raidcinfo *), (struct raidcinfo **),
    136 	    raidPtr->cleanupList);
    137 	if (raidPtr->raid_cinfo == NULL) {
    138 		ret = ENOMEM;
    139 		goto fail;
    140 	}
    141 	for (r = 0; r < raidPtr->numRow; r++) {
    142 		numFailuresThisRow = 0;
    143 		/* We allocate RF_MAXSPARE on the first row so that we
    144 		   have room to do hot-swapping of spares */
    145 		RF_CallocAndAdd(disks[r], raidPtr->numCol
    146 				+ ((r == 0) ? RF_MAXSPARE : 0),
    147 				sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *),
    148 				raidPtr->cleanupList);
    149 		if (disks[r] == NULL) {
    150 			ret = ENOMEM;
    151 			goto fail;
    152 		}
    153 		/* get more space for device specific stuff.. */
    154 		RF_CallocAndAdd(raidPtr->raid_cinfo[r],
    155 		    raidPtr->numCol + ((r == 0) ? raidPtr->numSpare : 0),
    156 		    sizeof(struct raidcinfo), (struct raidcinfo *),
    157 		    raidPtr->cleanupList);
    158 		if (raidPtr->raid_cinfo[r] == NULL) {
    159 			ret = ENOMEM;
    160 			goto fail;
    161 		}
    162 		for (c = 0; c < raidPtr->numCol; c++) {
    163 			ret = rf_ConfigureDisk(raidPtr,
    164 					       &cfgPtr->devnames[r][c][0],
    165 					       &disks[r][c], r, c);
    166 			if (ret)
    167 				goto fail;
    168 
    169 			if (disks[r][c].status == rf_ds_optimal) {
    170 				raidread_component_label(
    171 					 raidPtr->raid_cinfo[r][c].ci_dev,
    172 					 raidPtr->raid_cinfo[r][c].ci_vp,
    173 					 &raidPtr->raid_cinfo[r][c].ci_label);
    174 			}
    175 
    176 			if (disks[r][c].status != rf_ds_optimal) {
    177 				numFailuresThisRow++;
    178 			} else {
    179 				if (disks[r][c].numBlocks < min_numblks)
    180 					min_numblks = disks[r][c].numBlocks;
    181 				DPRINTF7("Disk at row %d col %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",
    182 				    r, c, disks[r][c].devname,
    183 				    (long int) disks[r][c].numBlocks,
    184 				    disks[r][c].blockSize,
    185 				    (long int) disks[r][c].numBlocks *
    186 					 disks[r][c].blockSize / 1024 / 1024);
    187 			}
    188 			num_cols_done++;
    189 		}
    190 		/* XXX fix for n-fault tolerant */
    191 		/* XXX this should probably check to see how many failures
    192 		   we can handle for this configuration! */
    193 		if (numFailuresThisRow > 0)
    194 			raidPtr->status[r] = rf_rs_degraded;
    195 		num_rows_done++;
    196 	}
    197 
    198 	/* all disks must be the same size & have the same block size, bs must
    199 	 * be a power of 2 */
    200 	bs = 0;
    201 	for (foundone = r = 0; !foundone && r < raidPtr->numRow; r++) {
    202 		for (c = 0; !foundone && c < raidPtr->numCol; c++) {
    203 			if (disks[r][c].status == rf_ds_optimal) {
    204 				bs = disks[r][c].blockSize;
    205 				foundone = 1;
    206 			}
    207 		}
    208 	}
    209 	if (!foundone) {
    210 		RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n");
    211 		ret = EINVAL;
    212 		goto fail;
    213 	}
    214 	for (count = 0, i = 1; i; i <<= 1)
    215 		if (bs & i)
    216 			count++;
    217 	if (count != 1) {
    218 		RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n", bs);
    219 		ret = EINVAL;
    220 		goto fail;
    221 	}
    222 
    223 	if (rf_CheckLabels( raidPtr, cfgPtr )) {
    224 		printf("raid%d: There were fatal errors\n", raidPtr->raidid);
    225 		if (force != 0) {
    226 			printf("raid%d: Fatal errors being ignored.\n",
    227 			       raidPtr->raidid);
    228 		} else {
    229 			ret = EINVAL;
    230 			goto fail;
    231 		}
    232 	}
    233 
    234 	for (r = 0; r < raidPtr->numRow; r++) {
    235 		for (c = 0; c < raidPtr->numCol; c++) {
    236 			if (disks[r][c].status == rf_ds_optimal) {
    237 				if (disks[r][c].blockSize != bs) {
    238 					RF_ERRORMSG2("Error: block size of disk at r %d c %d different from disk at r 0 c 0\n", r, c);
    239 					ret = EINVAL;
    240 					goto fail;
    241 				}
    242 				if (disks[r][c].numBlocks != min_numblks) {
    243 					RF_ERRORMSG3("WARNING: truncating disk at r %d c %d to %d blocks\n",
    244 					    r, c, (int) min_numblks);
    245 					disks[r][c].numBlocks = min_numblks;
    246 				}
    247 			}
    248 		}
    249 	}
    250 
    251 	raidPtr->sectorsPerDisk = min_numblks;
    252 	raidPtr->logBytesPerSector = ffs(bs) - 1;
    253 	raidPtr->bytesPerSector = bs;
    254 	raidPtr->sectorMask = bs - 1;
    255 	return (0);
    256 
    257 fail:
    258 
    259 	rf_UnconfigureVnodes( raidPtr );
    260 
    261 	return (ret);
    262 }
    263 
    264 
    265 /****************************************************************************
    266  * set up the data structures describing the spare disks in the array
    267  * recall from the above comment that the spare disk descriptors are stored
    268  * in row zero, which is specially expanded to hold them.
    269  ****************************************************************************/
    270 int
    271 rf_ConfigureSpareDisks( listp, raidPtr, cfgPtr )
    272 	RF_ShutdownList_t ** listp;
    273 	RF_Raid_t * raidPtr;
    274 	RF_Config_t * cfgPtr;
    275 {
    276 	int     i, ret;
    277 	unsigned int bs;
    278 	RF_RaidDisk_t *disks;
    279 	int     num_spares_done;
    280 
    281 	num_spares_done = 0;
    282 
    283 	/* The space for the spares should have already been allocated by
    284 	 * ConfigureDisks() */
    285 
    286 	disks = &raidPtr->Disks[0][raidPtr->numCol];
    287 	for (i = 0; i < raidPtr->numSpare; i++) {
    288 		ret = rf_ConfigureDisk(raidPtr, &cfgPtr->spare_names[i][0],
    289 				       &disks[i], 0, raidPtr->numCol + i);
    290 		if (ret)
    291 			goto fail;
    292 		if (disks[i].status != rf_ds_optimal) {
    293 			RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
    294 				     &cfgPtr->spare_names[i][0]);
    295 		} else {
    296 			disks[i].status = rf_ds_spare;	/* change status to
    297 							 * spare */
    298 			DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", i,
    299 			    disks[i].devname,
    300 			    (long int) disks[i].numBlocks, disks[i].blockSize,
    301 			    (long int) disks[i].numBlocks *
    302 				 disks[i].blockSize / 1024 / 1024);
    303 		}
    304 		num_spares_done++;
    305 	}
    306 
    307 	/* check sizes and block sizes on spare disks */
    308 	bs = 1 << raidPtr->logBytesPerSector;
    309 	for (i = 0; i < raidPtr->numSpare; i++) {
    310 		if (disks[i].blockSize != bs) {
    311 			RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[i].blockSize, disks[i].devname, bs);
    312 			ret = EINVAL;
    313 			goto fail;
    314 		}
    315 		if (disks[i].numBlocks < raidPtr->sectorsPerDisk) {
    316 			RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
    317 				     disks[i].devname, disks[i].blockSize,
    318 				     (long int) raidPtr->sectorsPerDisk);
    319 			ret = EINVAL;
    320 			goto fail;
    321 		} else
    322 			if (disks[i].numBlocks > raidPtr->sectorsPerDisk) {
    323 				RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[i].devname, (long int) raidPtr->sectorsPerDisk);
    324 
    325 				disks[i].numBlocks = raidPtr->sectorsPerDisk;
    326 			}
    327 	}
    328 
    329 	return (0);
    330 
    331 fail:
    332 
    333 	/* Release the hold on the main components.  We've failed to allocate
    334 	 * a spare, and since we're failing, we need to free things..
    335 
    336 	 XXX failing to allocate a spare is *not* that big of a deal...
    337 	 We *can* survive without it, if need be, esp. if we get hot
    338 	 adding working.
    339 
    340 	 If we don't fail out here, then we need a way to remove this spare...
    341 	 that should be easier to do here than if we are "live"...
    342 
    343 	 */
    344 
    345 	rf_UnconfigureVnodes( raidPtr );
    346 
    347 	return (ret);
    348 }
    349 
    350 
    351 
    352 /* configure a single disk in the array */
    353 int
    354 rf_ConfigureDisk(raidPtr, buf, diskPtr, row, col)
    355 	RF_Raid_t *raidPtr;
    356 	char   *buf;
    357 	RF_RaidDisk_t *diskPtr;
    358 	RF_RowCol_t row;
    359 	RF_RowCol_t col;
    360 {
    361 	char   *p;
    362 	int     retcode;
    363 
    364 	struct partinfo dpart;
    365 	struct vnode *vp;
    366 	struct vattr va;
    367 	struct proc *proc;
    368 	int     error;
    369 
    370 	retcode = 0;
    371 	p = rf_find_non_white(buf);
    372 	if (p[strlen(p) - 1] == '\n') {
    373 		/* strip off the newline */
    374 		p[strlen(p) - 1] = '\0';
    375 	}
    376 	(void) strcpy(diskPtr->devname, p);
    377 
    378 	proc = raidPtr->engine_thread;
    379 
    380 	/* Let's start by claiming the component is fine and well... */
    381 	diskPtr->status = rf_ds_optimal;
    382 
    383 	raidPtr->raid_cinfo[row][col].ci_vp = NULL;
    384 	raidPtr->raid_cinfo[row][col].ci_dev = NULL;
    385 
    386 	error = raidlookup(diskPtr->devname, proc, &vp);
    387 	if (error) {
    388 		printf("raidlookup on device: %s failed!\n", diskPtr->devname);
    389 		if (error == ENXIO) {
    390 			/* the component isn't there... must be dead :-( */
    391 			diskPtr->status = rf_ds_failed;
    392 		} else {
    393 			return (error);
    394 		}
    395 	}
    396 	if (diskPtr->status == rf_ds_optimal) {
    397 
    398 		if ((error = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
    399 			return (error);
    400 		}
    401 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
    402 				  FREAD, proc->p_ucred, proc);
    403 		if (error) {
    404 			return (error);
    405 		}
    406 
    407 		diskPtr->blockSize = dpart.disklab->d_secsize;
    408 
    409 		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
    410 
    411 		raidPtr->raid_cinfo[row][col].ci_vp = vp;
    412 		raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
    413 
    414 		diskPtr->dev = va.va_rdev;
    415 
    416 		/* we allow the user to specify that only a fraction of the
    417 		 * disks should be used this is just for debug:  it speeds up
    418 		 * the parity scan */
    419 		diskPtr->numBlocks = diskPtr->numBlocks *
    420 			rf_sizePercentage / 100;
    421 	}
    422 	return (0);
    423 }
    424 
    425 static void rf_print_label_status( RF_Raid_t *, int, int, char *,
    426 				  RF_ComponentLabel_t *);
    427 
    428 static void
    429 rf_print_label_status( raidPtr, row, column, dev_name, ci_label )
    430 	RF_Raid_t *raidPtr;
    431 	int row;
    432 	int column;
    433 	char *dev_name;
    434 	RF_ComponentLabel_t *ci_label;
    435 {
    436 
    437 	printf("raid%d: Component %s being configured at row: %d col: %d\n",
    438 	       raidPtr->raidid, dev_name, row, column );
    439 	printf("         Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
    440 	       ci_label->row, ci_label->column,
    441 	       ci_label->num_rows, ci_label->num_columns);
    442 	printf("         Version: %d Serial Number: %d Mod Counter: %d\n",
    443 	       ci_label->version, ci_label->serial_number,
    444 	       ci_label->mod_counter);
    445 	printf("         Clean: %s Status: %d\n",
    446 	       ci_label->clean ? "Yes" : "No", ci_label->status );
    447 }
    448 
    449 static int rf_check_label_vitals( RF_Raid_t *, int, int, char *,
    450 				  RF_ComponentLabel_t *, int, int );
    451 static int rf_check_label_vitals( raidPtr, row, column, dev_name, ci_label,
    452 				  serial_number, mod_counter )
    453 	RF_Raid_t *raidPtr;
    454 	int row;
    455 	int column;
    456 	char *dev_name;
    457 	RF_ComponentLabel_t *ci_label;
    458 	int serial_number;
    459 	int mod_counter;
    460 {
    461 	int fatal_error = 0;
    462 
    463 	if (serial_number != ci_label->serial_number) {
    464 		printf("%s has a different serial number: %d %d\n",
    465 		       dev_name, serial_number, ci_label->serial_number);
    466 		fatal_error = 1;
    467 	}
    468 	if (mod_counter != ci_label->mod_counter) {
    469 		printf("%s has a different modfication count: %d %d\n",
    470 		       dev_name, mod_counter, ci_label->mod_counter);
    471 	}
    472 
    473 	if (row != ci_label->row) {
    474 		printf("Row out of alignment for: %s\n", dev_name);
    475 		fatal_error = 1;
    476 	}
    477 	if (column != ci_label->column) {
    478 		printf("Column out of alignment for: %s\n", dev_name);
    479 		fatal_error = 1;
    480 	}
    481 	if (raidPtr->numRow != ci_label->num_rows) {
    482 		printf("Number of rows do not match for: %s\n", dev_name);
    483 		fatal_error = 1;
    484 	}
    485 	if (raidPtr->numCol != ci_label->num_columns) {
    486 		printf("Number of columns do not match for: %s\n", dev_name);
    487 		fatal_error = 1;
    488 	}
    489 	if (ci_label->clean == 0) {
    490 		/* it's not clean, but that's not fatal */
    491 		printf("%s is not clean!\n", dev_name);
    492 	}
    493 	return(fatal_error);
    494 }
    495 
    496 
    497 /*
    498 
    499    rf_CheckLabels() - check all the component labels for consistency.
    500    Return an error if there is anything major amiss.
    501 
    502  */
    503 
    504 int
    505 rf_CheckLabels( raidPtr, cfgPtr )
    506 	RF_Raid_t *raidPtr;
    507 	RF_Config_t *cfgPtr;
    508 {
    509 	int r,c;
    510 	char *dev_name;
    511 	RF_ComponentLabel_t *ci_label;
    512 	int serial_number = 0;
    513 	int mod_number = 0;
    514 	int fatal_error = 0;
    515 	int mod_values[4];
    516 	int mod_count[4];
    517 	int ser_values[4];
    518 	int ser_count[4];
    519 	int num_ser;
    520 	int num_mod;
    521 	int i;
    522 	int found;
    523 	int hosed_row;
    524 	int hosed_column;
    525 	int too_fatal;
    526 	int parity_good;
    527 	int force;
    528 
    529 	hosed_row = -1;
    530 	hosed_column = -1;
    531 	too_fatal = 0;
    532 	force = cfgPtr->force;
    533 
    534 	/*
    535 	   We're going to try to be a little intelligent here.  If one
    536 	   component's label is bogus, and we can identify that it's the
    537 	   *only* one that's gone, we'll mark it as "failed" and allow
    538 	   the configuration to proceed.  This will be the *only* case
    539 	   that we'll proceed if there would be (otherwise) fatal errors.
    540 
    541 	   Basically we simply keep a count of how many components had
    542 	   what serial number.  If all but one agree, we simply mark
    543 	   the disagreeing component as being failed, and allow
    544 	   things to come up "normally".
    545 
    546 	   We do this first for serial numbers, and then for "mod_counter".
    547 
    548 	 */
    549 
    550 	num_ser = 0;
    551 	num_mod = 0;
    552 	for (r = 0; r < raidPtr->numRow && !fatal_error ; r++) {
    553 		for (c = 0; c < raidPtr->numCol; c++) {
    554 			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    555 			found=0;
    556 			for(i=0;i<num_ser;i++) {
    557 				if (ser_values[i] == ci_label->serial_number) {
    558 					ser_count[i]++;
    559 					found=1;
    560 					break;
    561 				}
    562 			}
    563 			if (!found) {
    564 				ser_values[num_ser] = ci_label->serial_number;
    565 				ser_count[num_ser] = 1;
    566 				num_ser++;
    567 				if (num_ser>2) {
    568 					fatal_error = 1;
    569 					break;
    570 				}
    571 			}
    572 			found=0;
    573 			for(i=0;i<num_mod;i++) {
    574 				if (mod_values[i] == ci_label->mod_counter) {
    575 					mod_count[i]++;
    576 					found=1;
    577 					break;
    578 				}
    579 			}
    580 			if (!found) {
    581 			        mod_values[num_mod] = ci_label->mod_counter;
    582 				mod_count[num_mod] = 1;
    583 				num_mod++;
    584 				if (num_mod>2) {
    585 					fatal_error = 1;
    586 					break;
    587 				}
    588 			}
    589 		}
    590 	}
    591 #if DEBUG
    592 	printf("raid%d: Summary of serial numbers:\n", raidPtr->raidid);
    593 	for(i=0;i<num_ser;i++) {
    594 		printf("%d %d\n", ser_values[i], ser_count[i]);
    595 	}
    596 	printf("raid%d: Summary of mod counters:\n", raidPtr->raidid);
    597 	for(i=0;i<num_mod;i++) {
    598 		printf("%d %d\n", mod_values[i], mod_count[i]);
    599 	}
    600 #endif
    601 	serial_number = ser_values[0];
    602 	if (num_ser == 2) {
    603 		if ((ser_count[0] == 1) || (ser_count[1] == 1)) {
    604 			/* Locate the maverick component */
    605 			if (ser_count[1] > ser_count[0]) {
    606 				serial_number = ser_values[1];
    607 			}
    608 			for (r = 0; r < raidPtr->numRow; r++) {
    609 				for (c = 0; c < raidPtr->numCol; c++) {
    610 				ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    611 					if (serial_number !=
    612 					    ci_label->serial_number) {
    613 						hosed_row = r;
    614 						hosed_column = c;
    615 						break;
    616 					}
    617 				}
    618 			}
    619 			printf("Hosed component: %s\n",
    620 			       &cfgPtr->devnames[hosed_row][hosed_column][0]);
    621 			if (!force) {
    622 				/* we'll fail this component, as if there are
    623 				   other major errors, we arn't forcing things
    624 				   and we'll abort the config anyways */
    625 				raidPtr->Disks[hosed_row][hosed_column].status
    626 					= rf_ds_failed;
    627 				raidPtr->numFailures++;
    628 				raidPtr->status[hosed_row] = rf_rs_degraded;
    629 			}
    630 		} else {
    631 			too_fatal = 1;
    632 		}
    633 		if (cfgPtr->parityConfig == '0') {
    634 			/* We've identified two different serial numbers.
    635 			   RAID 0 can't cope with that, so we'll punt */
    636 			too_fatal = 1;
    637 		}
    638 
    639 	}
    640 
    641 	/* record the serial number for later.  If we bail later, setting
    642 	   this doesn't matter, otherwise we've got the best guess at the
    643 	   correct serial number */
    644 	raidPtr->serial_number = serial_number;
    645 
    646 	mod_number = mod_values[0];
    647 	if (num_mod == 2) {
    648 		if ((mod_count[0] == 1) || (mod_count[1] == 1)) {
    649 			/* Locate the maverick component */
    650 			if (mod_count[1] > mod_count[0]) {
    651 				mod_number = mod_values[1];
    652 			} else if (mod_count[1] < mod_count[0]) {
    653 				mod_number = mod_values[0];
    654 			} else {
    655 				/* counts of different modification values
    656 				   are the same.   Assume greater value is
    657 				   the correct one, all other things
    658 				   considered */
    659 				if (mod_values[0] > mod_values[1]) {
    660 					mod_number = mod_values[0];
    661 				} else {
    662 					mod_number = mod_values[1];
    663 				}
    664 
    665 			}
    666 			for (r = 0; r < raidPtr->numRow && !too_fatal ; r++) {
    667 				for (c = 0; c < raidPtr->numCol; c++) {
    668 					ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    669 					if (mod_number !=
    670 					    ci_label->mod_counter) {
    671 						if ( ( hosed_row == r ) &&
    672 						     ( hosed_column == c )) {
    673 							/* same one.  Can
    674 							   deal with it.  */
    675 						} else {
    676 							hosed_row = r;
    677 							hosed_column = c;
    678 							if (num_ser != 1) {
    679 								too_fatal = 1;
    680 								break;
    681 							}
    682 						}
    683 					}
    684 				}
    685 			}
    686 			printf("Hosed component: %s\n",
    687 			       &cfgPtr->devnames[hosed_row][hosed_column][0]);
    688 			if (!force) {
    689 				/* we'll fail this component, as if there are
    690 				   other major errors, we arn't forcing things
    691 				   and we'll abort the config anyways */
    692 				if (raidPtr->Disks[hosed_row][hosed_column].status != rf_ds_failed) {
    693 					raidPtr->Disks[hosed_row][hosed_column].status
    694 						= rf_ds_failed;
    695 					raidPtr->numFailures++;
    696 					raidPtr->status[hosed_row] = rf_rs_degraded;
    697 				}
    698 			}
    699 		} else {
    700 			too_fatal = 1;
    701 		}
    702 		if (cfgPtr->parityConfig == '0') {
    703 			/* We've identified two different mod counters.
    704 			   RAID 0 can't cope with that, so we'll punt */
    705 			too_fatal = 1;
    706 		}
    707 	}
    708 
    709 	raidPtr->mod_counter = mod_number;
    710 
    711 	if (too_fatal) {
    712 		/* we've had both a serial number mismatch, and a mod_counter
    713 		   mismatch -- and they involved two different components!!
    714 		   Bail -- make things fail so that the user must force
    715 		   the issue... */
    716 		hosed_row = -1;
    717 		hosed_column = -1;
    718 	}
    719 
    720 	if (num_ser > 2) {
    721 		printf("raid%d: Too many different serial numbers!\n",
    722 		       raidPtr->raidid);
    723 	}
    724 
    725 	if (num_mod > 2) {
    726 		printf("raid%d: Too many different mod counters!\n",
    727 		       raidPtr->raidid);
    728 	}
    729 
    730 	/* we start by assuming the parity will be good, and flee from
    731 	   that notion at the slightest sign of trouble */
    732 
    733 	parity_good = RF_RAID_CLEAN;
    734 	for (r = 0; r < raidPtr->numRow; r++) {
    735 		for (c = 0; c < raidPtr->numCol; c++) {
    736 			dev_name = &cfgPtr->devnames[r][c][0];
    737 			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    738 
    739 			if ((r == hosed_row) && (c == hosed_column)) {
    740 				printf("raid%d: Ignoring %s\n",
    741 				       raidPtr->raidid, dev_name);
    742 			} else {
    743 				rf_print_label_status( raidPtr, r, c,
    744 						       dev_name, ci_label );
    745 				if (rf_check_label_vitals( raidPtr, r, c,
    746 							   dev_name, ci_label,
    747 							   serial_number,
    748 							   mod_number )) {
    749 					fatal_error = 1;
    750 				}
    751 				if (ci_label->clean != RF_RAID_CLEAN) {
    752 					parity_good = RF_RAID_DIRTY;
    753 				}
    754 			}
    755 		}
    756 	}
    757 	if (fatal_error) {
    758 		parity_good = RF_RAID_DIRTY;
    759 	}
    760 
    761 	/* we note the state of the parity */
    762 	raidPtr->parity_good = parity_good;
    763 
    764 	return(fatal_error);
    765 }
    766 
    767 int config_disk_queue(RF_Raid_t *, RF_DiskQueue_t *, RF_RowCol_t,
    768 		      RF_RowCol_t, RF_DiskQueueSW_t *,
    769 		      RF_SectorCount_t, dev_t, int,
    770 		      RF_ShutdownList_t **,
    771 		      RF_AllocListElem_t *);
    772 int rf_add_hot_spare(RF_Raid_t *, RF_SingleComponent_t *);
    773 int
    774 rf_add_hot_spare(raidPtr, sparePtr)
    775 	RF_Raid_t *raidPtr;
    776 	RF_SingleComponent_t *sparePtr;
    777 {
    778 	RF_RaidDisk_t *disks;
    779 	RF_DiskQueue_t *spareQueues;
    780 	int ret;
    781 	unsigned int bs;
    782 	int spare_number;
    783 
    784 	printf("Just in rf_add_hot_spare: %d\n",raidPtr->numSpare);
    785 	printf("Num col: %d\n",raidPtr->numCol);
    786 	if (raidPtr->numSpare >= RF_MAXSPARE) {
    787 		RF_ERRORMSG1("Too many spares: %d\n", raidPtr->numSpare);
    788 		return(EINVAL);
    789 	}
    790 
    791 	RF_LOCK_MUTEX(raidPtr->mutex);
    792 
    793 	/* the beginning of the spares... */
    794 	disks = &raidPtr->Disks[0][raidPtr->numCol];
    795 
    796 	spare_number = raidPtr->numSpare;
    797 
    798 	ret = rf_ConfigureDisk(raidPtr, sparePtr->component_name,
    799 			       &disks[spare_number], 0,
    800 			       raidPtr->numCol + spare_number);
    801 
    802 	if (ret)
    803 		goto fail;
    804 	if (disks[spare_number].status != rf_ds_optimal) {
    805 		RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
    806 			     sparePtr->component_name);
    807 		ret=EINVAL;
    808 		goto fail;
    809 	} else {
    810 		disks[spare_number].status = rf_ds_spare;
    811 		DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", spare_number,
    812 			 disks[spare_number].devname,
    813 			 (long int) disks[spare_number].numBlocks,
    814 			 disks[spare_number].blockSize,
    815 			 (long int) disks[spare_number].numBlocks *
    816 			 disks[spare_number].blockSize / 1024 / 1024);
    817 	}
    818 
    819 
    820 	/* check sizes and block sizes on the spare disk */
    821 	bs = 1 << raidPtr->logBytesPerSector;
    822 	if (disks[spare_number].blockSize != bs) {
    823 		RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[spare_number].blockSize, disks[spare_number].devname, bs);
    824 		ret = EINVAL;
    825 		goto fail;
    826 	}
    827 	if (disks[spare_number].numBlocks < raidPtr->sectorsPerDisk) {
    828 		RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
    829 			     disks[spare_number].devname,
    830 			     disks[spare_number].blockSize,
    831 			     (long int) raidPtr->sectorsPerDisk);
    832 		ret = EINVAL;
    833 		goto fail;
    834 	} else {
    835 		if (disks[spare_number].numBlocks >
    836 		    raidPtr->sectorsPerDisk) {
    837 			RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[spare_number].devname,
    838 				     (long int) raidPtr->sectorsPerDisk);
    839 
    840 			disks[spare_number].numBlocks = raidPtr->sectorsPerDisk;
    841 		}
    842 	}
    843 
    844 	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
    845 	ret = config_disk_queue( raidPtr, &spareQueues[spare_number],
    846 				 0, raidPtr->numCol + spare_number,
    847 				 raidPtr->Queues[0][0].qPtr, /* XXX */
    848 				 raidPtr->sectorsPerDisk,
    849 				 raidPtr->Disks[0][raidPtr->numCol + spare_number].dev,
    850 				 raidPtr->Queues[0][0].maxOutstanding, /* XXX */
    851 				 &raidPtr->shutdownList,
    852 				 raidPtr->cleanupList);
    853 
    854 
    855 	raidPtr->numSpare++;
    856 	RF_UNLOCK_MUTEX(raidPtr->mutex);
    857 	return (0);
    858 
    859 fail:
    860 	RF_UNLOCK_MUTEX(raidPtr->mutex);
    861 	return(ret);
    862 }
    863 
    864 int
    865 rf_remove_hot_spare(raidPtr,sparePtr)
    866 	RF_Raid_t *raidPtr;
    867 	RF_SingleComponent_t *sparePtr;
    868 {
    869 	int spare_number;
    870 
    871 
    872 	if (raidPtr->numSpare==0) {
    873 		printf("No spares to remove!\n");
    874 		return(EINVAL);
    875 	}
    876 
    877 	spare_number = sparePtr->column;
    878 
    879 	return(EINVAL); /* XXX not implemented yet */
    880 #if 0
    881 	if (spare_number < 0 || spare_number > raidPtr->numSpare) {
    882 		return(EINVAL);
    883 	}
    884 
    885 	/* verify that this spare isn't in use... */
    886 
    887 
    888 
    889 
    890 	/* it's gone.. */
    891 
    892 	raidPtr->numSpare--;
    893 
    894 	return(0);
    895 #endif
    896 }
    897 
    898 
    899