Home | History | Annotate | Line # | Download | only in raidframe
rf_disks.c revision 1.8
      1 /*	$NetBSD: rf_disks.c,v 1.8 1999/03/18 03:02:38 oster Exp $	*/
      2 /*-
      3  * Copyright (c) 1999 The NetBSD Foundation, Inc.
      4  * All rights reserved.
      5  *
      6  * This code is derived from software contributed to The NetBSD Foundation
      7  * by Greg Oster
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  * 3. All advertising materials mentioning features or use of this software
     18  *    must display the following acknowledgement:
     19  *        This product includes software developed by the NetBSD
     20  *        Foundation, Inc. and its contributors.
     21  * 4. Neither the name of The NetBSD Foundation nor the names of its
     22  *    contributors may be used to endorse or promote products derived
     23  *    from this software without specific prior written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 
     38 /*
     39  * Copyright (c) 1995 Carnegie-Mellon University.
     40  * All rights reserved.
     41  *
     42  * Author: Mark Holland
     43  *
     44  * Permission to use, copy, modify and distribute this software and
     45  * its documentation is hereby granted, provided that both the copyright
     46  * notice and this permission notice appear in all copies of the
     47  * software, derivative works or modified versions, and any portions
     48  * thereof, and that both notices appear in supporting documentation.
     49  *
     50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     53  *
     54  * Carnegie Mellon requests users of this software to return to
     55  *
     56  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     57  *  School of Computer Science
     58  *  Carnegie Mellon University
     59  *  Pittsburgh PA 15213-3890
     60  *
     61  * any improvements or extensions that they make and grant Carnegie the
     62  * rights to redistribute these changes.
     63  */
     64 
     65 /***************************************************************
     66  * rf_disks.c -- code to perform operations on the actual disks
     67  ***************************************************************/
     68 
     69 #include "rf_types.h"
     70 #include "rf_raid.h"
     71 #include "rf_alloclist.h"
     72 #include "rf_utils.h"
     73 #include "rf_configure.h"
     74 #include "rf_general.h"
     75 #include "rf_options.h"
     76 #include "rf_sys.h"
     77 
     78 #include <sys/types.h>
     79 #include <sys/param.h>
     80 #include <sys/systm.h>
     81 #include <sys/proc.h>
     82 #include <sys/ioctl.h>
     83 #include <sys/fcntl.h>
     84 #include <sys/vnode.h>
     85 
     86 /* XXX these should be in a header file somewhere */
     87 int raidlookup __P((char *, struct proc * p, struct vnode **));
     88 int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
     89 int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
     90 void rf_UnconfigureVnodes( RF_Raid_t * );
     91 int rf_CheckLabels( RF_Raid_t *, RF_Config_t *);
     92 
/*
 * Debug printf helpers: emit the message only when rf_diskDebug is non-zero.
 * The do { } while (0) wrapper makes each macro expand to a single statement,
 * so it is safe in an unbraced if/else (a bare "if (...) printf(...)" would
 * capture a following "else").
 */
#define DPRINTF6(a,b,c,d,e,f) \
	do { if (rf_diskDebug) printf(a,b,c,d,e,f); } while (0)
#define DPRINTF7(a,b,c,d,e,f,g) \
	do { if (rf_diskDebug) printf(a,b,c,d,e,f,g); } while (0)
     95 
     96 /**************************************************************************
     97  *
     98  * initialize the disks comprising the array
     99  *
    100  * We want the spare disks to have regular row,col numbers so that we can
 * easily substitute a spare for a failed disk.  But, the driver code assumes
    102  * throughout that the array contains numRow by numCol _non-spare_ disks, so
    103  * it's not clear how to fit in the spares.  This is an unfortunate holdover
    104  * from raidSim.  The quick and dirty fix is to make row zero bigger than the
    105  * rest, and put all the spares in it.  This probably needs to get changed
    106  * eventually.
    107  *
    108  **************************************************************************/
    109 
    110 int
    111 rf_ConfigureDisks( listp, raidPtr, cfgPtr )
    112 	RF_ShutdownList_t **listp;
    113 	RF_Raid_t *raidPtr;
    114 	RF_Config_t *cfgPtr;
    115 {
    116 	RF_RaidDisk_t **disks;
    117 	RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL;
    118 	RF_RowCol_t r, c;
    119 	int bs, ret;
    120 	unsigned i, count, foundone = 0, numFailuresThisRow;
    121 	int num_rows_done, num_cols_done;
    122 	int force;
    123 
    124 	num_rows_done = 0;
    125 	num_cols_done = 0;
    126 	force = cfgPtr->force;
    127 
    128 	RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *),
    129 			(RF_RaidDisk_t **), raidPtr->cleanupList);
    130 	if (disks == NULL) {
    131 		ret = ENOMEM;
    132 		goto fail;
    133 	}
    134 	raidPtr->Disks = disks;
    135 
    136 	/* get space for the device-specific stuff... */
    137 	RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow,
    138 	    sizeof(struct raidcinfo *), (struct raidcinfo **),
    139 	    raidPtr->cleanupList);
    140 	if (raidPtr->raid_cinfo == NULL) {
    141 		ret = ENOMEM;
    142 		goto fail;
    143 	}
    144 	for (r = 0; r < raidPtr->numRow; r++) {
    145 		numFailuresThisRow = 0;
    146 		/* We allocate RF_MAXSPARE on the first row so that we
    147 		   have room to do hot-swapping of spares */
    148 		RF_CallocAndAdd(disks[r], raidPtr->numCol
    149 				+ ((r == 0) ? RF_MAXSPARE : 0),
    150 				sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *),
    151 				raidPtr->cleanupList);
    152 		if (disks[r] == NULL) {
    153 			ret = ENOMEM;
    154 			goto fail;
    155 		}
    156 		/* get more space for device specific stuff.. */
    157 		RF_CallocAndAdd(raidPtr->raid_cinfo[r],
    158 		    raidPtr->numCol + ((r == 0) ? raidPtr->numSpare : 0),
    159 		    sizeof(struct raidcinfo), (struct raidcinfo *),
    160 		    raidPtr->cleanupList);
    161 		if (raidPtr->raid_cinfo[r] == NULL) {
    162 			ret = ENOMEM;
    163 			goto fail;
    164 		}
    165 		for (c = 0; c < raidPtr->numCol; c++) {
    166 			ret = rf_ConfigureDisk(raidPtr,
    167 					       &cfgPtr->devnames[r][c][0],
    168 					       &disks[r][c], r, c);
    169 			if (ret)
    170 				goto fail;
    171 
    172 			if (disks[r][c].status == rf_ds_optimal) {
    173 				raidread_component_label(
    174 					 raidPtr->raid_cinfo[r][c].ci_dev,
    175 					 raidPtr->raid_cinfo[r][c].ci_vp,
    176 					 &raidPtr->raid_cinfo[r][c].ci_label);
    177 			}
    178 
    179 			if (disks[r][c].status != rf_ds_optimal) {
    180 				numFailuresThisRow++;
    181 			} else {
    182 				if (disks[r][c].numBlocks < min_numblks)
    183 					min_numblks = disks[r][c].numBlocks;
    184 				DPRINTF7("Disk at row %d col %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",
    185 				    r, c, disks[r][c].devname,
    186 				    (long int) disks[r][c].numBlocks,
    187 				    disks[r][c].blockSize,
    188 				    (long int) disks[r][c].numBlocks *
    189 					 disks[r][c].blockSize / 1024 / 1024);
    190 			}
    191 			num_cols_done++;
    192 		}
    193 		/* XXX fix for n-fault tolerant */
    194 		/* XXX this should probably check to see how many failures
    195 		   we can handle for this configuration! */
    196 		if (numFailuresThisRow > 0)
    197 			raidPtr->status[r] = rf_rs_degraded;
    198 		num_rows_done++;
    199 	}
    200 
    201 	/* all disks must be the same size & have the same block size, bs must
    202 	 * be a power of 2 */
    203 	bs = 0;
    204 	for (foundone = r = 0; !foundone && r < raidPtr->numRow; r++) {
    205 		for (c = 0; !foundone && c < raidPtr->numCol; c++) {
    206 			if (disks[r][c].status == rf_ds_optimal) {
    207 				bs = disks[r][c].blockSize;
    208 				foundone = 1;
    209 			}
    210 		}
    211 	}
    212 	if (!foundone) {
    213 		RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n");
    214 		ret = EINVAL;
    215 		goto fail;
    216 	}
    217 	for (count = 0, i = 1; i; i <<= 1)
    218 		if (bs & i)
    219 			count++;
    220 	if (count != 1) {
    221 		RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n", bs);
    222 		ret = EINVAL;
    223 		goto fail;
    224 	}
    225 
    226 	if (rf_CheckLabels( raidPtr, cfgPtr )) {
    227 		printf("raid%d: There were fatal errors\n", raidPtr->raidid);
    228 		if (force != 0) {
    229 			printf("raid%d: Fatal errors being ignored.\n",
    230 			       raidPtr->raidid);
    231 		} else {
    232 			ret = EINVAL;
    233 			goto fail;
    234 		}
    235 	}
    236 
    237 	for (r = 0; r < raidPtr->numRow; r++) {
    238 		for (c = 0; c < raidPtr->numCol; c++) {
    239 			if (disks[r][c].status == rf_ds_optimal) {
    240 				if (disks[r][c].blockSize != bs) {
    241 					RF_ERRORMSG2("Error: block size of disk at r %d c %d different from disk at r 0 c 0\n", r, c);
    242 					ret = EINVAL;
    243 					goto fail;
    244 				}
    245 				if (disks[r][c].numBlocks != min_numblks) {
    246 					RF_ERRORMSG3("WARNING: truncating disk at r %d c %d to %d blocks\n",
    247 					    r, c, (int) min_numblks);
    248 					disks[r][c].numBlocks = min_numblks;
    249 				}
    250 			}
    251 		}
    252 	}
    253 
    254 	raidPtr->sectorsPerDisk = min_numblks;
    255 	raidPtr->logBytesPerSector = ffs(bs) - 1;
    256 	raidPtr->bytesPerSector = bs;
    257 	raidPtr->sectorMask = bs - 1;
    258 	return (0);
    259 
    260 fail:
    261 
    262 	rf_UnconfigureVnodes( raidPtr );
    263 
    264 	return (ret);
    265 }
    266 
    267 
    268 /****************************************************************************
    269  * set up the data structures describing the spare disks in the array
    270  * recall from the above comment that the spare disk descriptors are stored
    271  * in row zero, which is specially expanded to hold them.
    272  ****************************************************************************/
    273 int
    274 rf_ConfigureSpareDisks( listp, raidPtr, cfgPtr )
    275 	RF_ShutdownList_t ** listp;
    276 	RF_Raid_t * raidPtr;
    277 	RF_Config_t * cfgPtr;
    278 {
    279 	int     i, ret;
    280 	unsigned int bs;
    281 	RF_RaidDisk_t *disks;
    282 	int     num_spares_done;
    283 
    284 	num_spares_done = 0;
    285 
    286 	/* The space for the spares should have already been allocated by
    287 	 * ConfigureDisks() */
    288 
    289 	disks = &raidPtr->Disks[0][raidPtr->numCol];
    290 	for (i = 0; i < raidPtr->numSpare; i++) {
    291 		ret = rf_ConfigureDisk(raidPtr, &cfgPtr->spare_names[i][0],
    292 				       &disks[i], 0, raidPtr->numCol + i);
    293 		if (ret)
    294 			goto fail;
    295 		if (disks[i].status != rf_ds_optimal) {
    296 			RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
    297 				     &cfgPtr->spare_names[i][0]);
    298 		} else {
    299 			disks[i].status = rf_ds_spare;	/* change status to
    300 							 * spare */
    301 			DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", i,
    302 			    disks[i].devname,
    303 			    (long int) disks[i].numBlocks, disks[i].blockSize,
    304 			    (long int) disks[i].numBlocks *
    305 				 disks[i].blockSize / 1024 / 1024);
    306 		}
    307 		num_spares_done++;
    308 	}
    309 
    310 	/* check sizes and block sizes on spare disks */
    311 	bs = 1 << raidPtr->logBytesPerSector;
    312 	for (i = 0; i < raidPtr->numSpare; i++) {
    313 		if (disks[i].blockSize != bs) {
    314 			RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[i].blockSize, disks[i].devname, bs);
    315 			ret = EINVAL;
    316 			goto fail;
    317 		}
    318 		if (disks[i].numBlocks < raidPtr->sectorsPerDisk) {
    319 			RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
    320 				     disks[i].devname, disks[i].blockSize,
    321 				     (long int) raidPtr->sectorsPerDisk);
    322 			ret = EINVAL;
    323 			goto fail;
    324 		} else
    325 			if (disks[i].numBlocks > raidPtr->sectorsPerDisk) {
    326 				RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[i].devname, (long int) raidPtr->sectorsPerDisk);
    327 
    328 				disks[i].numBlocks = raidPtr->sectorsPerDisk;
    329 			}
    330 	}
    331 
    332 	return (0);
    333 
    334 fail:
    335 
    336 	/* Release the hold on the main components.  We've failed to allocate
    337 	 * a spare, and since we're failing, we need to free things..
    338 
    339 	 XXX failing to allocate a spare is *not* that big of a deal...
    340 	 We *can* survive without it, if need be, esp. if we get hot
    341 	 adding working.
    342 
    343 	 If we don't fail out here, then we need a way to remove this spare...
    344 	 that should be easier to do here than if we are "live"...
    345 
    346 	 */
    347 
    348 	rf_UnconfigureVnodes( raidPtr );
    349 
    350 	return (ret);
    351 }
    352 
    353 
    354 
    355 /* configure a single disk in the array */
    356 int
    357 rf_ConfigureDisk(raidPtr, buf, diskPtr, row, col)
    358 	RF_Raid_t *raidPtr;
    359 	char   *buf;
    360 	RF_RaidDisk_t *diskPtr;
    361 	RF_RowCol_t row;
    362 	RF_RowCol_t col;
    363 {
    364 	char   *p;
    365 	int     retcode;
    366 
    367 	struct partinfo dpart;
    368 	struct vnode *vp;
    369 	struct vattr va;
    370 	struct proc *proc;
    371 	int     error;
    372 
    373 	retcode = 0;
    374 	p = rf_find_non_white(buf);
    375 	if (p[strlen(p) - 1] == '\n') {
    376 		/* strip off the newline */
    377 		p[strlen(p) - 1] = '\0';
    378 	}
    379 	(void) strcpy(diskPtr->devname, p);
    380 
    381 	proc = raidPtr->proc;	/* XXX Yes, this is not nice.. */
    382 
    383 	/* Let's start by claiming the component is fine and well... */
    384 	diskPtr->status = rf_ds_optimal;
    385 
    386 	raidPtr->raid_cinfo[row][col].ci_vp = NULL;
    387 	raidPtr->raid_cinfo[row][col].ci_dev = NULL;
    388 
    389 	error = raidlookup(diskPtr->devname, proc, &vp);
    390 	if (error) {
    391 		printf("raidlookup on device: %s failed!\n", diskPtr->devname);
    392 		if (error == ENXIO) {
    393 			/* the component isn't there... must be dead :-( */
    394 			diskPtr->status = rf_ds_failed;
    395 		} else {
    396 			return (error);
    397 		}
    398 	}
    399 	if (diskPtr->status == rf_ds_optimal) {
    400 
    401 		if ((error = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
    402 			return (error);
    403 		}
    404 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
    405 				  FREAD, proc->p_ucred, proc);
    406 		if (error) {
    407 			return (error);
    408 		}
    409 
    410 		diskPtr->blockSize = dpart.disklab->d_secsize;
    411 
    412 		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
    413 
    414 		raidPtr->raid_cinfo[row][col].ci_vp = vp;
    415 		raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
    416 
    417 		diskPtr->dev = va.va_rdev;
    418 
    419 		/* we allow the user to specify that only a fraction of the
    420 		 * disks should be used this is just for debug:  it speeds up
    421 		 * the parity scan */
    422 		diskPtr->numBlocks = diskPtr->numBlocks *
    423 			rf_sizePercentage / 100;
    424 	}
    425 	return (0);
    426 }
    427 
    428 static void rf_print_label_status( RF_Raid_t *, int, int, char *,
    429 				  RF_ComponentLabel_t *);
    430 
    431 static void
    432 rf_print_label_status( raidPtr, row, column, dev_name, ci_label )
    433 	RF_Raid_t *raidPtr;
    434 	int row;
    435 	int column;
    436 	char *dev_name;
    437 	RF_ComponentLabel_t *ci_label;
    438 {
    439 
    440 	printf("raid%d: Component %s being configured at row: %d col: %d\n",
    441 	       raidPtr->raidid, dev_name, row, column );
    442 	printf("         Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
    443 	       ci_label->row, ci_label->column,
    444 	       ci_label->num_rows, ci_label->num_columns);
    445 	printf("         Version: %d Serial Number: %d Mod Counter: %d\n",
    446 	       ci_label->version, ci_label->serial_number,
    447 	       ci_label->mod_counter);
    448 	printf("         Clean: %d Status: %d\n",
    449 	       ci_label->clean, ci_label->status );
    450 }
    451 
    452 static int rf_check_label_vitals( RF_Raid_t *, int, int, char *,
    453 				  RF_ComponentLabel_t *, int, int );
    454 static int rf_check_label_vitals( raidPtr, row, column, dev_name, ci_label,
    455 				  serial_number, mod_counter )
    456 	RF_Raid_t *raidPtr;
    457 	int row;
    458 	int column;
    459 	char *dev_name;
    460 	RF_ComponentLabel_t *ci_label;
    461 	int serial_number;
    462 	int mod_counter;
    463 {
    464 	int fatal_error = 0;
    465 
    466 	if (serial_number != ci_label->serial_number) {
    467 		printf("%s has a different serial number: %d %d\n",
    468 		       dev_name, serial_number, ci_label->serial_number);
    469 		fatal_error = 1;
    470 	}
    471 	if (mod_counter != ci_label->mod_counter) {
    472 		printf("%s has a different modfication count: %d %d\n",
    473 		       dev_name, mod_counter, ci_label->mod_counter);
    474 	}
    475 
    476 	if (row != ci_label->row) {
    477 		printf("Row out of alignment for: %s\n", dev_name);
    478 		fatal_error = 1;
    479 	}
    480 	if (column != ci_label->column) {
    481 		printf("Column out of alignment for: %s\n", dev_name);
    482 		fatal_error = 1;
    483 	}
    484 	if (raidPtr->numRow != ci_label->num_rows) {
    485 		printf("Number of rows do not match for: %s\n", dev_name);
    486 		fatal_error = 1;
    487 	}
    488 	if (raidPtr->numCol != ci_label->num_columns) {
    489 		printf("Number of columns do not match for: %s\n", dev_name);
    490 		fatal_error = 1;
    491 	}
    492 	if (ci_label->clean == 0) {
    493 		/* it's not clean, but that's not fatal */
    494 		printf("%s is not clean!\n", dev_name);
    495 	}
    496 	return(fatal_error);
    497 }
    498 
    499 
    500 /*
    501 
    502    rf_CheckLabels() - check all the component labels for consistency.
    503    Return an error if there is anything major amiss.
    504 
    505  */
    506 
    507 int
    508 rf_CheckLabels( raidPtr, cfgPtr )
    509 	RF_Raid_t *raidPtr;
    510 	RF_Config_t *cfgPtr;
    511 {
    512 	int r,c;
    513 	char *dev_name;
    514 	RF_ComponentLabel_t *ci_label;
    515 	int serial_number = 0;
    516 	int mod_number = 0;
    517 	int fatal_error = 0;
    518 	int mod_values[4];
    519 	int mod_count[4];
    520 	int ser_values[4];
    521 	int ser_count[4];
    522 	int num_ser;
    523 	int num_mod;
    524 	int i;
    525 	int found;
    526 	int hosed_row;
    527 	int hosed_column;
    528 	int too_fatal;
    529 	int parity_good;
    530 	int force;
    531 
    532 	hosed_row = -1;
    533 	hosed_column = -1;
    534 	too_fatal = 0;
    535 	force = cfgPtr->force;
    536 
    537 	/*
    538 	   We're going to try to be a little intelligent here.  If one
    539 	   component's label is bogus, and we can identify that it's the
    540 	   *only* one that's gone, we'll mark it as "failed" and allow
    541 	   the configuration to proceed.  This will be the *only* case
    542 	   that we'll proceed if there would be (otherwise) fatal errors.
    543 
    544 	   Basically we simply keep a count of how many components had
    545 	   what serial number.  If all but one agree, we simply mark
    546 	   the disagreeing component as being failed, and allow
    547 	   things to come up "normally".
    548 
    549 	   We do this first for serial numbers, and then for "mod_counter".
    550 
    551 	 */
    552 
    553 	num_ser = 0;
    554 	num_mod = 0;
    555 	for (r = 0; r < raidPtr->numRow && !fatal_error ; r++) {
    556 		for (c = 0; c < raidPtr->numCol; c++) {
    557 			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    558 			found=0;
    559 			for(i=0;i<num_ser;i++) {
    560 				if (ser_values[i] == ci_label->serial_number) {
    561 					ser_count[i]++;
    562 					found=1;
    563 					break;
    564 				}
    565 			}
    566 			if (!found) {
    567 				ser_values[num_ser] = ci_label->serial_number;
    568 				ser_count[num_ser] = 1;
    569 				num_ser++;
    570 				if (num_ser>2) {
    571 					fatal_error = 1;
    572 					break;
    573 				}
    574 			}
    575 			found=0;
    576 			for(i=0;i<num_mod;i++) {
    577 				if (mod_values[i] == ci_label->mod_counter) {
    578 					mod_count[i]++;
    579 					found=1;
    580 					break;
    581 				}
    582 			}
    583 			if (!found) {
    584 			        mod_values[num_mod] = ci_label->mod_counter;
    585 				mod_count[num_mod] = 1;
    586 				num_mod++;
    587 				if (num_mod>2) {
    588 					fatal_error = 1;
    589 					break;
    590 				}
    591 			}
    592 		}
    593 	}
    594 #if DEBUG
    595 	printf("raid%d: Summary of serial numbers:\n", raidPtr->raidid);
    596 	for(i=0;i<num_ser;i++) {
    597 		printf("%d %d\n", ser_values[i], ser_count[i]);
    598 	}
    599 	printf("raid%d: Summary of mod counters:\n", raidPtr->raidid);
    600 	for(i=0;i<num_mod;i++) {
    601 		printf("%d %d\n", mod_values[i], mod_count[i]);
    602 	}
    603 #endif
    604 	serial_number = ser_values[0];
    605 	if (num_ser == 2) {
    606 		if ((ser_count[0] == 1) || (ser_count[1] == 1)) {
    607 			/* Locate the maverick component */
    608 			if (ser_count[1] > ser_count[0]) {
    609 				serial_number = ser_values[1];
    610 			}
    611 			for (r = 0; r < raidPtr->numRow; r++) {
    612 				for (c = 0; c < raidPtr->numCol; c++) {
    613 				ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    614 					if (serial_number !=
    615 					    ci_label->serial_number) {
    616 						hosed_row = r;
    617 						hosed_column = c;
    618 						break;
    619 					}
    620 				}
    621 			}
    622 			printf("Hosed component: %s\n",
    623 			       &cfgPtr->devnames[hosed_row][hosed_column][0]);
    624 			if (!force) {
    625 				/* we'll fail this component, as if there are
    626 				   other major errors, we arn't forcing things
    627 				   and we'll abort the config anyways */
    628 				raidPtr->Disks[hosed_row][hosed_column].status
    629 					= rf_ds_failed;
    630 				raidPtr->numFailures++;
    631 				raidPtr->status[hosed_row] = rf_rs_degraded;
    632 			}
    633 		} else {
    634 			too_fatal = 1;
    635 		}
    636 		if (cfgPtr->parityConfig == '0') {
    637 			/* We've identified two different serial numbers.
    638 			   RAID 0 can't cope with that, so we'll punt */
    639 			too_fatal = 1;
    640 		}
    641 
    642 	}
    643 
    644 	/* record the serial number for later.  If we bail later, setting
    645 	   this doesn't matter, otherwise we've got the best guess at the
    646 	   correct serial number */
    647 	raidPtr->serial_number = serial_number;
    648 
    649 	mod_number = mod_values[0];
    650 	if (num_mod == 2) {
    651 		if ((mod_count[0] == 1) || (mod_count[1] == 1)) {
    652 			/* Locate the maverick component */
    653 			if (mod_count[1] > mod_count[0]) {
    654 				mod_number = mod_values[1];
    655 			} else if (mod_count[1] < mod_count[0]) {
    656 				mod_number = mod_values[0];
    657 			} else {
    658 				/* counts of different modification values
    659 				   are the same.   Assume greater value is
    660 				   the correct one, all other things
    661 				   considered */
    662 				if (mod_values[0] > mod_values[1]) {
    663 					mod_number = mod_values[0];
    664 				} else {
    665 					mod_number = mod_values[1];
    666 				}
    667 
    668 			}
    669 			for (r = 0; r < raidPtr->numRow && !too_fatal ; r++) {
    670 				for (c = 0; c < raidPtr->numCol; c++) {
    671 					ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    672 					if (mod_number !=
    673 					    ci_label->mod_counter) {
    674 						if ( ( hosed_row == r ) &&
    675 						     ( hosed_column == c )) {
    676 							/* same one.  Can
    677 							   deal with it.  */
    678 						} else {
    679 							hosed_row = r;
    680 							hosed_column = c;
    681 							if (num_ser != 1) {
    682 								too_fatal = 1;
    683 								break;
    684 							}
    685 						}
    686 					}
    687 				}
    688 			}
    689 			printf("Hosed component: %s\n",
    690 			       &cfgPtr->devnames[hosed_row][hosed_column][0]);
    691 			if (!force) {
    692 				/* we'll fail this component, as if there are
    693 				   other major errors, we arn't forcing things
    694 				   and we'll abort the config anyways */
    695 				raidPtr->Disks[hosed_row][hosed_column].status
    696 					= rf_ds_failed;
    697 				raidPtr->numFailures++;
    698 				raidPtr->status[hosed_row] = rf_rs_degraded;
    699 			}
    700 		} else {
    701 			too_fatal = 1;
    702 		}
    703 		if (cfgPtr->parityConfig == '0') {
    704 			/* We've identified two different mod counters.
    705 			   RAID 0 can't cope with that, so we'll punt */
    706 			too_fatal = 1;
    707 		}
    708 	}
    709 
    710 	raidPtr->mod_counter = mod_number;
    711 
    712 	if (too_fatal) {
    713 		/* we've had both a serial number mismatch, and a mod_counter
    714 		   mismatch -- and they involved two different components!!
    715 		   Bail -- make things fail so that the user must force
    716 		   the issue... */
    717 		hosed_row = -1;
    718 		hosed_column = -1;
    719 	}
    720 
    721 	if (num_ser > 2) {
    722 		printf("raid%d: Too many different serial numbers!\n",
    723 		       raidPtr->raidid);
    724 	}
    725 
    726 	if (num_mod > 2) {
    727 		printf("raid%d: Too many different mod counters!\n",
    728 		       raidPtr->raidid);
    729 	}
    730 
    731 	/* we start by assuming the parity will be good, and flee from
    732 	   that notion at the slightest sign of trouble */
    733 
    734 	parity_good = RF_RAID_CLEAN;
    735 	for (r = 0; r < raidPtr->numRow; r++) {
    736 		for (c = 0; c < raidPtr->numCol; c++) {
    737 			dev_name = &cfgPtr->devnames[r][c][0];
    738 			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
    739 
    740 			if ((r == hosed_row) && (c == hosed_column)) {
    741 				printf("raid%d: Ignoring %s\n",
    742 				       raidPtr->raidid, dev_name);
    743 			} else {
    744 				rf_print_label_status( raidPtr, r, c,
    745 						       dev_name, ci_label );
    746 				if (rf_check_label_vitals( raidPtr, r, c,
    747 							   dev_name, ci_label,
    748 							   serial_number,
    749 							   mod_number )) {
    750 					fatal_error = 1;
    751 				}
    752 				if (ci_label->clean != RF_RAID_CLEAN) {
    753 					parity_good = RF_RAID_DIRTY;
    754 				}
    755 			}
    756 		}
    757 	}
    758 	if (fatal_error) {
    759 		parity_good = RF_RAID_DIRTY;
    760 	}
    761 
    762 	/* we note the state of the parity */
    763 	raidPtr->parity_good = parity_good;
    764 
    765 	return(fatal_error);
    766 }
    767 
    768 
    769 int rf_add_hot_spare(RF_Raid_t *, RF_SingleComponent_t *);
    770 int
    771 rf_add_hot_spare(raidPtr, sparePtr)
    772 	RF_Raid_t *raidPtr;
    773 	RF_SingleComponent_t *sparePtr;
    774 {
    775 	RF_RaidDisk_t *disks;
    776 	int ret;
    777 	unsigned int bs;
    778 	int spare_number;
    779 
    780 	printf("Just in rf_add_hot_spare: %d\n",raidPtr->numSpare);
    781 	printf("Num col: %d\n",raidPtr->numCol);
    782 	if (raidPtr->numSpare >= RF_MAXSPARE) {
    783 		RF_ERRORMSG1("Too many spares: %d\n", raidPtr->numSpare);
    784 		return(EINVAL);
    785 	}
    786 
    787 	/* the beginning of the spares... */
    788 	disks = &raidPtr->Disks[0][raidPtr->numCol];
    789 
    790 	spare_number = raidPtr->numSpare;
    791 
    792 	ret = rf_ConfigureDisk(raidPtr, sparePtr->component_name,
    793 			       &disks[spare_number], 0,
    794 			       raidPtr->numCol + spare_number);
    795 
    796 	if (ret)
    797 		goto fail;
    798 	if (disks[spare_number].status != rf_ds_optimal) {
    799 		RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
    800 			     sparePtr->component_name);
    801 		ret=EINVAL;
    802 		goto fail;
    803 	} else {
    804 		disks[spare_number].status = rf_ds_spare;
    805 		DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", spare_number,
    806 			 disks[spare_number].devname,
    807 			 (long int) disks[spare_number].numBlocks,
    808 			 disks[spare_number].blockSize,
    809 			 (long int) disks[spare_number].numBlocks *
    810 			 disks[spare_number].blockSize / 1024 / 1024);
    811 	}
    812 
    813 
    814 	/* check sizes and block sizes on the spare disk */
    815 	bs = 1 << raidPtr->logBytesPerSector;
    816 	if (disks[spare_number].blockSize != bs) {
    817 		RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[spare_number].blockSize, disks[spare_number].devname, bs);
    818 		ret = EINVAL;
    819 		goto fail;
    820 	}
    821 	if (disks[spare_number].numBlocks < raidPtr->sectorsPerDisk) {
    822 		RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
    823 			     disks[spare_number].devname,
    824 			     disks[spare_number].blockSize,
    825 			     (long int) raidPtr->sectorsPerDisk);
    826 		ret = EINVAL;
    827 		goto fail;
    828 	} else {
    829 		if (disks[spare_number].numBlocks >
    830 		    raidPtr->sectorsPerDisk) {
    831 			RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[spare_number].devname,
    832 				     (long int) raidPtr->sectorsPerDisk);
    833 
    834 			disks[spare_number].numBlocks = raidPtr->sectorsPerDisk;
    835 		}
    836 	}
    837 
    838 	raidPtr->numSpare++;
    839 
    840 	return (0);
    841 
    842 fail:
    843 	return(ret);
    844 }
    845 
    846 int
    847 rf_remove_hot_spare(raidPtr,sparePtr)
    848 	RF_Raid_t *raidPtr;
    849 	RF_SingleComponent_t *sparePtr;
    850 {
    851 	int spare_number;
    852 
    853 
    854 	if (raidPtr->numSpare==0) {
    855 		printf("No spares to remove!\n");
    856 		return(EINVAL);
    857 	}
    858 
    859 	spare_number = sparePtr->column;
    860 
    861 	return(EINVAL); /* XXX not implemented yet */
    862 #if 0
    863 	if (spare_number < 0 || spare_number > raidPtr->numSpare) {
    864 		return(EINVAL);
    865 	}
    866 
    867 	/* verify that this spare isn't in use... */
    868 
    869 
    870 
    871 
    872 	/* it's gone.. */
    873 
    874 	raidPtr->numSpare--;
    875 
    876 	return(0);
    877 #endif
    878 }
    879 
    880 
    881