rf_reconstruct.c revision 1.105.4.5 1 /* $NetBSD: rf_reconstruct.c,v 1.105.4.5 2012/02/24 17:58:44 sborrill Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.105.4.5 2012/02/24 17:58:44 sborrill Exp $");
37
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/errno.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <dev/raidframe/raidframevar.h>
48
49 #include "rf_raid.h"
50 #include "rf_reconutil.h"
51 #include "rf_revent.h"
52 #include "rf_reconbuffer.h"
53 #include "rf_acctrace.h"
54 #include "rf_etimer.h"
55 #include "rf_dag.h"
56 #include "rf_desc.h"
57 #include "rf_debugprint.h"
58 #include "rf_general.h"
59 #include "rf_driver.h"
60 #include "rf_utils.h"
61 #include "rf_shutdown.h"
62
63 #include "rf_kintf.h"
64
65 /* setting these to -1 causes them to be set to their default values if not set by debug options */
66
/*
 * Reconstruction debug tracing.
 *
 * When RF_DEBUG_RECON is non-zero, DprintfN(fmt, a1..aN) forwards the format
 * string and up to seven arguments to rf_debug_printf() if rf_reconDebug is
 * set; every argument is widened to unsigned long and passed as a void *.
 * When RF_DEBUG_RECON is zero the macros expand to an empty statement and
 * their arguments are not evaluated.
 *
 * Every macro expands to a single do { } while (0) statement so that it is
 * safe in unbraced if/else bodies; callers must terminate the invocation
 * with a semicolon.
 */
#if RF_DEBUG_RECON

/* Widen one trace argument the way rf_debug_printf() expects it. */
#define RF_RECON_DARG(x) ((void *)((unsigned long)(x)))

/* Common body: emit the trace line only when recon debugging is enabled. */
#define RF_RECON_DPRINTF(s,a,b,c,d,e,f,g,h) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), (a), (b), (c), (d), (e), (f), (g), (h)); \
	} while (0)

#define Dprintf(s) \
	RF_RECON_DPRINTF(s, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
#define Dprintf1(s,a) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), NULL, NULL, NULL, NULL, NULL, NULL, NULL)
#define Dprintf2(s,a,b) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), NULL, NULL, NULL, NULL, NULL, NULL)
#define Dprintf3(s,a,b,c) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), NULL, NULL, NULL, NULL, NULL)
#define Dprintf4(s,a,b,c,d) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), NULL, NULL, NULL, NULL)
#define Dprintf5(s,a,b,c,d,e) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), RF_RECON_DARG(e), NULL, NULL, NULL)
#define Dprintf6(s,a,b,c,d,e,f) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), RF_RECON_DARG(e), RF_RECON_DARG(f), NULL, NULL)
#define Dprintf7(s,a,b,c,d,e,f,g) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), RF_RECON_DARG(e), RF_RECON_DARG(f), RF_RECON_DARG(g), NULL)

#define DDprintf1(s,a) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), NULL, NULL, NULL, NULL, NULL, NULL, NULL)
#define DDprintf2(s,a,b) \
	RF_RECON_DPRINTF(s, RF_RECON_DARG(a), RF_RECON_DARG(b), NULL, NULL, NULL, NULL, NULL, NULL)

#else /* RF_DEBUG_RECON */

#define Dprintf(s) do { } while (0)
#define Dprintf1(s,a) do { } while (0)
#define Dprintf2(s,a,b) do { } while (0)
#define Dprintf3(s,a,b,c) do { } while (0)
#define Dprintf4(s,a,b,c,d) do { } while (0)
#define Dprintf5(s,a,b,c,d,e) do { } while (0)
#define Dprintf6(s,a,b,c,d,e,f) do { } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) do { } while (0)

#define DDprintf1(s,a) do { } while (0)
#define DDprintf2(s,a,b) do { } while (0)

#endif /* RF_DEBUG_RECON */
95
/*
 * Status codes that the main loop in rf_ContinueReconstructFailedDisk()
 * tests on the value returned by ProcessReconEvent().
 */
#define RF_RECON_DONE_READS 1	/* a column has no more reads to issue */
#define RF_RECON_READ_ERROR 2	/* read failed: flags an error, column counted as done */
#define RF_RECON_WRITE_ERROR 3	/* write failed: flags an error, counted as a finished write */
#define RF_RECON_READ_STOPPED 4	/* reading stopped for a column; counted as done */
#define RF_RECON_WRITE_DONE 5	/* one reconstruction write completed */

/*
 * The two sizing arguments handed to rf_pool_init() for the recon-buffer
 * pool in rf_ConfigureReconstruction() (MIN is passed first, then MAX).
 */
#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16
104
/*
 * Forward declarations for the file-local helpers of the reconstruction
 * engine.
 */

/* Reconstruction descriptor allocation and release. */
static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
    RF_RaidDisk_t *, int, RF_RowCol_t);
static void FreeReconDesc(RF_RaidReconDesc_t *);
/* Event dispatch; its return value is one of the RF_RECON_* codes or 0. */
static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
/* Per-column read scheduling. */
static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
static int TryToRead(RF_Raid_t *, RF_RowCol_t);
static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
    RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
    RF_SectorNum_t *);
/* Write scheduling and I/O completion callbacks. */
static int IssueNextWriteRequest(RF_Raid_t *);
static int ReconReadDoneProc(void *, int);
static int ReconWriteDoneProc(void *, int);
/* Head-separation control. */
static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
    RF_RowCol_t, RF_HeadSepLimit_t,
    RF_ReconUnitNum_t);
/* Forced / blocked reconstruction handling. */
static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
    RF_ReconParityStripeStatus_t *,
    RF_PerDiskReconCtrl_t *,
    RF_RowCol_t, RF_StripeNum_t,
    RF_ReconUnitNum_t);
static void ForceReconReadDoneProc(void *, int);
/* Shutdown hook registered by rf_ConfigureReconstruction(). */
static void rf_ShutdownReconstruction(void *);
128
/*
 * Node of a singly linked list that pairs a callback with an opaque
 * argument.
 * NOTE(review): presumably these callbacks run when a reconstruction
 * finishes; the code that registers and invokes them is outside this view.
 */
struct RF_ReconDoneProc_s {
	void (*proc) (RF_Raid_t *, void *);	/* callback: (array, arg) */
	void *arg;				/* opaque argument stored alongside proc */
	RF_ReconDoneProc_t *next;		/* next node in the list */
};
134
/**************************************************************************
 *
 * rf_ConfigureReconstruction() sets up the parameters that will be used
 * by the reconstruction process.  Currently there are none, except for
 * those that the layout-specific configuration routine (e.g.
 * rf_ConfigureDeclustered) sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 **************************************************************************/
144 static void
145 rf_ShutdownReconstruction(void *ignored)
146 {
147 pool_destroy(&rf_pools.reconbuffer);
148 }
149
150 int
151 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
152 {
153
154 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
155 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
156 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
157
158 return (0);
159 }
160
161 static RF_RaidReconDesc_t *
162 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
163 RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
164 RF_RowCol_t scol)
165 {
166
167 RF_RaidReconDesc_t *reconDesc;
168
169 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
170 (RF_RaidReconDesc_t *));
171 reconDesc->raidPtr = raidPtr;
172 reconDesc->col = col;
173 reconDesc->spareDiskPtr = spareDiskPtr;
174 reconDesc->numDisksDone = numDisksDone;
175 reconDesc->scol = scol;
176 reconDesc->next = NULL;
177
178 return (reconDesc);
179 }
180
181 static void
182 FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
183 {
184 #if RF_RECON_STATS > 0
185 printf("raid%d: %lu recon event waits, %lu recon delays\n",
186 reconDesc->raidPtr->raidid,
187 (long) reconDesc->numReconEventWaits,
188 (long) reconDesc->numReconExecDelays);
189 #endif /* RF_RECON_STATS > 0 */
190 printf("raid%d: %lu max exec ticks\n",
191 reconDesc->raidPtr->raidid,
192 (long) reconDesc->maxReconExecTicks);
193 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
194 }
195
196
197 /*****************************************************************************
198 *
199 * primary routine to reconstruct a failed disk. This should be called from
200 * within its own thread. It won't return until reconstruction completes,
201 * fails, or is aborted.
202 *****************************************************************************/
203 int
204 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
205 {
206 const RF_LayoutSW_t *lp;
207 int rc;
208
209 lp = raidPtr->Layout.map;
210 if (lp->SubmitReconBuffer) {
211 /*
212 * The current infrastructure only supports reconstructing one
213 * disk at a time for each array.
214 */
215 RF_LOCK_MUTEX(raidPtr->mutex);
216 while (raidPtr->reconInProgress) {
217 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
218 }
219 raidPtr->reconInProgress++;
220 RF_UNLOCK_MUTEX(raidPtr->mutex);
221 rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
222 RF_LOCK_MUTEX(raidPtr->mutex);
223 raidPtr->reconInProgress--;
224 RF_UNLOCK_MUTEX(raidPtr->mutex);
225 } else {
226 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
227 lp->parityConfig);
228 rc = EIO;
229 }
230 RF_SIGNAL_COND(raidPtr->waitForReconCond);
231 return (rc);
232 }
233
234 int
235 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
236 {
237 RF_ComponentLabel_t *c_label;
238 RF_RaidDisk_t *spareDiskPtr = NULL;
239 RF_RaidReconDesc_t *reconDesc;
240 RF_RowCol_t scol;
241 int numDisksDone = 0, rc;
242
243 /* first look for a spare drive onto which to reconstruct the data */
244 /* spare disk descriptors are stored in row 0. This may have to
245 * change eventually */
246
247 RF_LOCK_MUTEX(raidPtr->mutex);
248 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
249 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
250 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
251 if (raidPtr->status != rf_rs_degraded) {
252 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
253 RF_UNLOCK_MUTEX(raidPtr->mutex);
254 return (EINVAL);
255 }
256 scol = (-1);
257 } else {
258 #endif
259 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
260 if (raidPtr->Disks[scol].status == rf_ds_spare) {
261 spareDiskPtr = &raidPtr->Disks[scol];
262 spareDiskPtr->status = rf_ds_used_spare;
263 break;
264 }
265 }
266 if (!spareDiskPtr) {
267 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
268 RF_UNLOCK_MUTEX(raidPtr->mutex);
269 return (ENOSPC);
270 }
271 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
272 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
273 }
274 #endif
275 RF_UNLOCK_MUTEX(raidPtr->mutex);
276
277 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
278 raidPtr->reconDesc = (void *) reconDesc;
279 #if RF_RECON_STATS > 0
280 reconDesc->hsStallCount = 0;
281 reconDesc->numReconExecDelays = 0;
282 reconDesc->numReconEventWaits = 0;
283 #endif /* RF_RECON_STATS > 0 */
284 reconDesc->reconExecTimerRunning = 0;
285 reconDesc->reconExecTicks = 0;
286 reconDesc->maxReconExecTicks = 0;
287 rc = rf_ContinueReconstructFailedDisk(reconDesc);
288
289 if (!rc) {
290 /* fix up the component label */
291 /* Don't actually need the read here.. */
292 c_label = raidget_component_label(raidPtr, scol);
293
294 raid_init_component_label(raidPtr, c_label);
295 c_label->row = 0;
296 c_label->column = col;
297 c_label->clean = RF_RAID_DIRTY;
298 c_label->status = rf_ds_optimal;
299 c_label->partitionSize = raidPtr->Disks[scol].partitionSize;
300 c_label->partitionSizeHi =
301 raidPtr->Disks[scol].partitionSize >> 32;
302
303 /* We've just done a rebuild based on all the other
304 disks, so at this point the parity is known to be
305 clean, even if it wasn't before. */
306
307 /* XXX doesn't hold for RAID 6!!*/
308
309 RF_LOCK_MUTEX(raidPtr->mutex);
310 raidPtr->parity_good = RF_RAID_CLEAN;
311 RF_UNLOCK_MUTEX(raidPtr->mutex);
312
313 /* XXXX MORE NEEDED HERE */
314
315 raidflush_component_label(raidPtr, scol);
316 } else {
317 /* Reconstruct failed. */
318
319 RF_LOCK_MUTEX(raidPtr->mutex);
320 /* Failed disk goes back to "failed" status */
321 raidPtr->Disks[col].status = rf_ds_failed;
322
323 /* Spare disk goes back to "spare" status. */
324 spareDiskPtr->status = rf_ds_spare;
325 RF_UNLOCK_MUTEX(raidPtr->mutex);
326
327 }
328 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
329 return (rc);
330 }
331
332 /*
333
334 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
335 and you don't get a spare until the next Monday. With this function
336 (and hot-swappable drives) you can now put your new disk containing
337 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
338 rebuild the data "on the spot".
339
340 */
341
342 int
343 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
344 {
345 RF_RaidDisk_t *spareDiskPtr = NULL;
346 RF_RaidReconDesc_t *reconDesc;
347 const RF_LayoutSW_t *lp;
348 RF_ComponentLabel_t *c_label;
349 int numDisksDone = 0, rc;
350 struct partinfo dpart;
351 struct vnode *vp;
352 struct vattr va;
353 int retcode;
354 int ac;
355
356 lp = raidPtr->Layout.map;
357 if (!lp->SubmitReconBuffer) {
358 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
359 lp->parityConfig);
360 /* wakeup anyone who might be waiting to do a reconstruct */
361 RF_SIGNAL_COND(raidPtr->waitForReconCond);
362 return(EIO);
363 }
364
365 /*
366 * The current infrastructure only supports reconstructing one
367 * disk at a time for each array.
368 */
369 RF_LOCK_MUTEX(raidPtr->mutex);
370
371 if (raidPtr->Disks[col].status != rf_ds_failed) {
372 /* "It's gone..." */
373 raidPtr->numFailures++;
374 raidPtr->Disks[col].status = rf_ds_failed;
375 raidPtr->status = rf_rs_degraded;
376 RF_UNLOCK_MUTEX(raidPtr->mutex);
377 rf_update_component_labels(raidPtr,
378 RF_NORMAL_COMPONENT_UPDATE);
379 RF_LOCK_MUTEX(raidPtr->mutex);
380 }
381
382 while (raidPtr->reconInProgress) {
383 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
384 }
385
386 raidPtr->reconInProgress++;
387
388 /* first look for a spare drive onto which to reconstruct the
389 data. spare disk descriptors are stored in row 0. This
390 may have to change eventually */
391
392 /* Actually, we don't care if it's failed or not... On a RAID
393 set with correct parity, this function should be callable
394 on any component without ill effects. */
395 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */
396
397 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
398 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
399 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);
400
401 raidPtr->reconInProgress--;
402 RF_UNLOCK_MUTEX(raidPtr->mutex);
403 RF_SIGNAL_COND(raidPtr->waitForReconCond);
404 return (EINVAL);
405 }
406 #endif
407
408 /* This device may have been opened successfully the
409 first time. Close it before trying to open it again.. */
410
411 if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
412 #if 0
413 printf("Closed the open device: %s\n",
414 raidPtr->Disks[col].devname);
415 #endif
416 vp = raidPtr->raid_cinfo[col].ci_vp;
417 ac = raidPtr->Disks[col].auto_configured;
418 RF_UNLOCK_MUTEX(raidPtr->mutex);
419 rf_close_component(raidPtr, vp, ac);
420 RF_LOCK_MUTEX(raidPtr->mutex);
421 raidPtr->raid_cinfo[col].ci_vp = NULL;
422 }
423 /* note that this disk was *not* auto_configured (any longer)*/
424 raidPtr->Disks[col].auto_configured = 0;
425
426 #if 0
427 printf("About to (re-)open the device for rebuilding: %s\n",
428 raidPtr->Disks[col].devname);
429 #endif
430 RF_UNLOCK_MUTEX(raidPtr->mutex);
431 retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE);
432
433 if (retcode) {
434 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
435 raidPtr->Disks[col].devname, retcode);
436
437 /* the component isn't responding properly...
438 must be still dead :-( */
439 RF_LOCK_MUTEX(raidPtr->mutex);
440 raidPtr->reconInProgress--;
441 RF_UNLOCK_MUTEX(raidPtr->mutex);
442 RF_SIGNAL_COND(raidPtr->waitForReconCond);
443 return(retcode);
444 }
445
446 /* Ok, so we can at least do a lookup...
447 How about actually getting a vp for it? */
448
449 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) {
450 RF_LOCK_MUTEX(raidPtr->mutex);
451 raidPtr->reconInProgress--;
452 RF_UNLOCK_MUTEX(raidPtr->mutex);
453 RF_SIGNAL_COND(raidPtr->waitForReconCond);
454 return(retcode);
455 }
456
457 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred);
458 if (retcode) {
459 RF_LOCK_MUTEX(raidPtr->mutex);
460 raidPtr->reconInProgress--;
461 RF_UNLOCK_MUTEX(raidPtr->mutex);
462 RF_SIGNAL_COND(raidPtr->waitForReconCond);
463 return(retcode);
464 }
465 RF_LOCK_MUTEX(raidPtr->mutex);
466 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize;
467
468 raidPtr->Disks[col].numBlocks = dpart.part->p_size -
469 rf_protectedSectors;
470
471 raidPtr->raid_cinfo[col].ci_vp = vp;
472 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;
473
474 raidPtr->Disks[col].dev = va.va_rdev;
475
476 /* we allow the user to specify that only a fraction
477 of the disks should be used this is just for debug:
478 it speeds up * the parity scan */
479 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
480 rf_sizePercentage / 100;
481 RF_UNLOCK_MUTEX(raidPtr->mutex);
482
483 spareDiskPtr = &raidPtr->Disks[col];
484 spareDiskPtr->status = rf_ds_used_spare;
485
486 printf("raid%d: initiating in-place reconstruction on column %d\n",
487 raidPtr->raidid, col);
488
489 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
490 numDisksDone, col);
491 raidPtr->reconDesc = (void *) reconDesc;
492 #if RF_RECON_STATS > 0
493 reconDesc->hsStallCount = 0;
494 reconDesc->numReconExecDelays = 0;
495 reconDesc->numReconEventWaits = 0;
496 #endif /* RF_RECON_STATS > 0 */
497 reconDesc->reconExecTimerRunning = 0;
498 reconDesc->reconExecTicks = 0;
499 reconDesc->maxReconExecTicks = 0;
500 rc = rf_ContinueReconstructFailedDisk(reconDesc);
501
502 if (!rc) {
503 RF_LOCK_MUTEX(raidPtr->mutex);
504 /* Need to set these here, as at this point it'll be claiming
505 that the disk is in rf_ds_spared! But we know better :-) */
506
507 raidPtr->Disks[col].status = rf_ds_optimal;
508 raidPtr->status = rf_rs_optimal;
509 RF_UNLOCK_MUTEX(raidPtr->mutex);
510
511 /* fix up the component label */
512 /* Don't actually need the read here.. */
513 c_label = raidget_component_label(raidPtr, col);
514
515 RF_LOCK_MUTEX(raidPtr->mutex);
516 raid_init_component_label(raidPtr, c_label);
517
518 c_label->row = 0;
519 c_label->column = col;
520
521 /* We've just done a rebuild based on all the other
522 disks, so at this point the parity is known to be
523 clean, even if it wasn't before. */
524
525 /* XXX doesn't hold for RAID 6!!*/
526
527 raidPtr->parity_good = RF_RAID_CLEAN;
528 RF_UNLOCK_MUTEX(raidPtr->mutex);
529
530 raidflush_component_label(raidPtr, col);
531 } else {
532 /* Reconstruct-in-place failed. Disk goes back to
533 "failed" status, regardless of what it was before. */
534 RF_LOCK_MUTEX(raidPtr->mutex);
535 raidPtr->Disks[col].status = rf_ds_failed;
536 RF_UNLOCK_MUTEX(raidPtr->mutex);
537 }
538
539 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
540
541 RF_LOCK_MUTEX(raidPtr->mutex);
542 raidPtr->reconInProgress--;
543 RF_UNLOCK_MUTEX(raidPtr->mutex);
544
545 RF_SIGNAL_COND(raidPtr->waitForReconCond);
546 return (rc);
547 }
548
549
550 int
551 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
552 {
553 RF_Raid_t *raidPtr = reconDesc->raidPtr;
554 RF_RowCol_t col = reconDesc->col;
555 RF_RowCol_t scol = reconDesc->scol;
556 RF_ReconMap_t *mapPtr;
557 RF_ReconCtrl_t *tmp_reconctrl;
558 RF_ReconEvent_t *event;
559 RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
560 #if RF_INCLUDE_RAID5_RS > 0
561 RF_StripeCount_t startPSID,endPSID,aPSID,bPSID,offPSID;
562 #endif
563 RF_ReconUnitCount_t RUsPerPU;
564 struct timeval etime, elpsd;
565 unsigned long xor_s, xor_resid_us;
566 int i, ds;
567 int status, done;
568 int recon_error, write_error;
569
570 raidPtr->accumXorTimeUs = 0;
571 #if RF_ACC_TRACE > 0
572 /* create one trace record per physical disk */
573 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
574 #endif
575
576 /* quiesce the array prior to starting recon. this is needed
577 * to assure no nasty interactions with pending user writes.
578 * We need to do this before we change the disk or row status. */
579
580 Dprintf("RECON: begin request suspend\n");
581 rf_SuspendNewRequestsAndWait(raidPtr);
582 Dprintf("RECON: end request suspend\n");
583
584 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
585 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
586
587 RF_LOCK_MUTEX(raidPtr->mutex);
588
589 /* create the reconstruction control pointer and install it in
590 * the right slot */
591 raidPtr->reconControl = tmp_reconctrl;
592 mapPtr = raidPtr->reconControl->reconMap;
593 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
594 raidPtr->reconControl->numRUsComplete = 0;
595 raidPtr->status = rf_rs_reconstructing;
596 raidPtr->Disks[col].status = rf_ds_reconstructing;
597 raidPtr->Disks[col].spareCol = scol;
598
599 RF_UNLOCK_MUTEX(raidPtr->mutex);
600
601 RF_GETTIME(raidPtr->reconControl->starttime);
602
603 Dprintf("RECON: resume requests\n");
604 rf_ResumeNewRequests(raidPtr);
605
606
607 mapPtr = raidPtr->reconControl->reconMap;
608
609 incPSID = RF_RECONMAP_SIZE;
610 lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
611 RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
612 recon_error = 0;
613 write_error = 0;
614 pending_writes = incPSID;
615 raidPtr->reconControl->lastPSID = incPSID - 1;
616
617 /* bounds check raidPtr->reconControl->lastPSID and
618 pending_writes so that we don't attempt to wait for more IO
619 than can possibly happen */
620
621 if (raidPtr->reconControl->lastPSID > lastPSID)
622 raidPtr->reconControl->lastPSID = lastPSID;
623
624 if (pending_writes > lastPSID)
625 pending_writes = lastPSID;
626
627 /* start the actual reconstruction */
628
629 done = 0;
630 while (!done) {
631
632 if (raidPtr->waitShutdown) {
633 /* someone is unconfiguring this array... bail on the reconstruct.. */
634 recon_error = 1;
635 break;
636 }
637
638 num_writes = 0;
639
640 #if RF_INCLUDE_RAID5_RS > 0
641 /* For RAID5 with Rotated Spares we will be 'short'
642 some number of writes since no writes will get
643 issued for stripes where the spare is on the
644 component being rebuilt. Account for the shortage
645 here so that we don't hang indefinitely below
646 waiting for writes to complete that were never
647 scheduled.
648
649 XXX: Should be fixed for PARITY_DECLUSTERING and
650 others too!
651
652 */
653
654 if (raidPtr->Layout.numDataCol <
655 raidPtr->numCol - raidPtr->Layout.numParityCol) {
656 /* numDataCol is at least 2 less than numCol, so
657 should be RAID 5 with Rotated Spares */
658
659 /* XXX need to update for RAID 6 */
660
661 startPSID = raidPtr->reconControl->lastPSID - pending_writes + 1;
662 endPSID = raidPtr->reconControl->lastPSID;
663
664 offPSID = raidPtr->numCol - col - 1;
665
666 aPSID = startPSID - startPSID % raidPtr->numCol + offPSID;
667 if (aPSID < startPSID) {
668 aPSID += raidPtr->numCol;
669 }
670
671 bPSID = endPSID - ((endPSID - offPSID) % raidPtr->numCol);
672
673 if (aPSID < endPSID) {
674 num_writes = ((bPSID - aPSID) / raidPtr->numCol) + 1;
675 }
676
677 if ((aPSID == endPSID) && (bPSID == endPSID)) {
678 num_writes++;
679 }
680 }
681 #endif
682
683 /* issue a read for each surviving disk */
684
685 reconDesc->numDisksDone = 0;
686 for (i = 0; i < raidPtr->numCol; i++) {
687 if (i != col) {
688 /* find and issue the next I/O on the
689 * indicated disk */
690 if (IssueNextReadRequest(raidPtr, i)) {
691 Dprintf1("RECON: done issuing for c%d\n", i);
692 reconDesc->numDisksDone++;
693 }
694 }
695 }
696
697 /* process reconstruction events until all disks report that
698 * they've completed all work */
699
700 while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
701
702 event = rf_GetNextReconEvent(reconDesc);
703 status = ProcessReconEvent(raidPtr, event);
704
705 /* the normal case is that a read completes, and all is well. */
706 if (status == RF_RECON_DONE_READS) {
707 reconDesc->numDisksDone++;
708 } else if ((status == RF_RECON_READ_ERROR) ||
709 (status == RF_RECON_WRITE_ERROR)) {
710 /* an error was encountered while reconstructing...
711 Pretend we've finished this disk.
712 */
713 recon_error = 1;
714 raidPtr->reconControl->error = 1;
715
716 /* bump the numDisksDone count for reads,
717 but not for writes */
718 if (status == RF_RECON_READ_ERROR)
719 reconDesc->numDisksDone++;
720
721 /* write errors are special -- when we are
722 done dealing with the reads that are
723 finished, we don't want to wait for any
724 writes */
725 if (status == RF_RECON_WRITE_ERROR) {
726 write_error = 1;
727 num_writes++;
728 }
729
730 } else if (status == RF_RECON_READ_STOPPED) {
731 /* count this component as being "done" */
732 reconDesc->numDisksDone++;
733 } else if (status == RF_RECON_WRITE_DONE) {
734 num_writes++;
735 }
736
737 if (recon_error) {
738 /* make sure any stragglers are woken up so that
739 their theads will complete, and we can get out
740 of here with all IO processed */
741
742 rf_WakeupHeadSepCBWaiters(raidPtr);
743 }
744
745 raidPtr->reconControl->numRUsTotal =
746 mapPtr->totalRUs;
747 raidPtr->reconControl->numRUsComplete =
748 mapPtr->totalRUs -
749 rf_UnitsLeftToReconstruct(mapPtr);
750
751 #if RF_DEBUG_RECON
752 raidPtr->reconControl->percentComplete =
753 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
754 if (rf_prReconSched) {
755 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
756 }
757 #endif
758 }
759
760 /* reads done, wakeup any waiters, and then wait for writes */
761
762 rf_WakeupHeadSepCBWaiters(raidPtr);
763
764 while (!recon_error && (num_writes < pending_writes)) {
765 event = rf_GetNextReconEvent(reconDesc);
766 status = ProcessReconEvent(raidPtr, event);
767
768 if (status == RF_RECON_WRITE_ERROR) {
769 num_writes++;
770 recon_error = 1;
771 raidPtr->reconControl->error = 1;
772 /* an error was encountered at the very end... bail */
773 } else if (status == RF_RECON_WRITE_DONE) {
774 num_writes++;
775 } /* else it's something else, and we don't care */
776 }
777 if (recon_error ||
778 (raidPtr->reconControl->lastPSID == lastPSID)) {
779 done = 1;
780 break;
781 }
782
783 prev = raidPtr->reconControl->lastPSID;
784 raidPtr->reconControl->lastPSID += incPSID;
785
786 if (raidPtr->reconControl->lastPSID > lastPSID) {
787 pending_writes = lastPSID - prev;
788 raidPtr->reconControl->lastPSID = lastPSID;
789 }
790
791 /* back down curPSID to get ready for the next round... */
792 for (i = 0; i < raidPtr->numCol; i++) {
793 if (i != col) {
794 raidPtr->reconControl->perDiskInfo[i].curPSID--;
795 raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
796 }
797 }
798 }
799
800 mapPtr = raidPtr->reconControl->reconMap;
801 if (rf_reconDebug) {
802 printf("RECON: all reads completed\n");
803 }
804 /* at this point all the reads have completed. We now wait
805 * for any pending writes to complete, and then we're done */
806
807 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
808
809 event = rf_GetNextReconEvent(reconDesc);
810 status = ProcessReconEvent(raidPtr, event);
811
812 if (status == RF_RECON_WRITE_ERROR) {
813 recon_error = 1;
814 raidPtr->reconControl->error = 1;
815 /* an error was encountered at the very end... bail */
816 } else {
817 #if RF_DEBUG_RECON
818 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
819 if (rf_prReconSched) {
820 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
821 }
822 #endif
823 }
824 }
825
826 if (recon_error) {
827 /* we've encountered an error in reconstructing. */
828 printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
829
830 /* we start by blocking IO to the RAID set. */
831 rf_SuspendNewRequestsAndWait(raidPtr);
832
833 RF_LOCK_MUTEX(raidPtr->mutex);
834 /* mark set as being degraded, rather than
835 rf_rs_reconstructing as we were before the problem.
836 After this is done we can update status of the
837 component disks without worrying about someone
838 trying to read from a failed component.
839 */
840 raidPtr->status = rf_rs_degraded;
841 RF_UNLOCK_MUTEX(raidPtr->mutex);
842
843 /* resume IO */
844 rf_ResumeNewRequests(raidPtr);
845
846 /* At this point there are two cases:
847 1) If we've experienced a read error, then we've
848 already waited for all the reads we're going to get,
849 and we just need to wait for the writes.
850
851 2) If we've experienced a write error, we've also
852 already waited for all the reads to complete,
853 but there is little point in waiting for the writes --
854 when they do complete, they will just be ignored.
855
856 So we just wait for writes to complete if we didn't have a
857 write error.
858 */
859
860 if (!write_error) {
861 /* wait for writes to complete */
862 while (raidPtr->reconControl->pending_writes > 0) {
863
864 event = rf_GetNextReconEvent(reconDesc);
865 status = ProcessReconEvent(raidPtr, event);
866
867 if (status == RF_RECON_WRITE_ERROR) {
868 raidPtr->reconControl->error = 1;
869 /* an error was encountered at the very end... bail.
870 This will be very bad news for the user, since
871 at this point there will have been a read error
872 on one component, and a write error on another!
873 */
874 break;
875 }
876 }
877 }
878
879
880 /* cleanup */
881
882 /* drain the event queue - after waiting for the writes above,
883 there shouldn't be much (if anything!) left in the queue. */
884
885 rf_DrainReconEventQueue(reconDesc);
886
887 /* XXX As much as we'd like to free the recon control structure
888 and the reconDesc, we have no way of knowing if/when those will
889 be touched by IO that has yet to occur. It is rather poor to be
890 basically causing a 'memory leak' here, but there doesn't seem to be
891 a cleaner alternative at this time. Perhaps when the reconstruct code
892 gets a makeover this problem will go away.
893 */
894 #if 0
895 rf_FreeReconControl(raidPtr);
896 #endif
897
898 #if RF_ACC_TRACE > 0
899 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
900 #endif
901 /* XXX see comment above */
902 #if 0
903 FreeReconDesc(reconDesc);
904 #endif
905
906 return (1);
907 }
908
909 /* Success: mark the dead disk as reconstructed. We quiesce
910 * the array here to assure no nasty interactions with pending
911 * user accesses when we free up the psstatus structure as
912 * part of FreeReconControl() */
913
914 rf_SuspendNewRequestsAndWait(raidPtr);
915
916 RF_LOCK_MUTEX(raidPtr->mutex);
917 raidPtr->numFailures--;
918 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
919 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
920 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
921 RF_UNLOCK_MUTEX(raidPtr->mutex);
922 RF_GETTIME(etime);
923 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
924
925 rf_ResumeNewRequests(raidPtr);
926
927 printf("raid%d: Reconstruction of disk at col %d completed\n",
928 raidPtr->raidid, col);
929 xor_s = raidPtr->accumXorTimeUs / 1000000;
930 xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
931 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
932 raidPtr->raidid,
933 (int) elpsd.tv_sec, (int) elpsd.tv_usec,
934 raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
935 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
936 raidPtr->raidid,
937 (int) raidPtr->reconControl->starttime.tv_sec,
938 (int) raidPtr->reconControl->starttime.tv_usec,
939 (int) etime.tv_sec, (int) etime.tv_usec);
940 #if RF_RECON_STATS > 0
941 printf("raid%d: Total head-sep stall count was %d\n",
942 raidPtr->raidid, (int) reconDesc->hsStallCount);
943 #endif /* RF_RECON_STATS > 0 */
944 rf_FreeReconControl(raidPtr);
945 #if RF_ACC_TRACE > 0
946 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
947 #endif
948 FreeReconDesc(reconDesc);
949
950 return (0);
951
952 }
953 /*****************************************************************************
954 * do the right thing upon each reconstruction event.
955 *****************************************************************************/
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
{
	/*
	 * Dispatch a single reconstruction event according to event->type.
	 *
	 * raidPtr - raid descriptor the event belongs to
	 * event   - the event to handle; its descriptor is always released
	 *           with rf_FreeReconEventDesc() before we return
	 *
	 * The return value is either one of the RF_RECON_* codes set
	 * below, or whatever IssueNextReadRequest(), TryToRead() or
	 * IssueNextWriteRequest() returned for the follow-up work.  For the
	 * handlers that would start new work, nothing is started once
	 * reconControl->error is set, and the default
	 * RF_RECON_READ_STOPPED is returned instead.
	 */
	int     retcode = 0, submitblocked;
	RF_ReconBuffer_t *rbuf;
	RF_SectorCount_t sectorsPerRU;

	/* default result for every handler that bails out on error */
	retcode = RF_RECON_READ_STOPPED;

	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);

	switch (event->type) {

		/* a read I/O has completed */
	case RF_REVENT_READDONE:
		/* the per-disk rbuf of the column that raised the event */
		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
		    event->col, rbuf->parityStripeID);
		Dprintf7("RECON: done read psid %ld buf %lx  %02x %02x %02x %02x %02x\n",
		    rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
		    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
		/* the disk queue request is released whether or not we go on */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
			/* only move on to the next read if the submit did
			 * not block; otherwise report 0 and issue nothing */
			if (!submitblocked)
				retcode = IssueNextReadRequest(raidPtr, event->col);
			else
				retcode = 0;
		}
		break;

		/* a write I/O has completed */
	case RF_REVENT_WRITEDONE:
#if RF_DEBUG_RECON
		if (rf_floatingRbufDebug) {
			rf_CheckFloatingRbufCount(raidPtr, 1);
		}
#endif
		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
		/* for write events the rbuf travels in event->arg */
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
		    rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
		/* record the sector range of this RU in the recon map, then
		 * drop the RU from the active recon table */
		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
		    rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);

		/* this write is no longer outstanding */
		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
			/* take the rb_lock flag: sleep on it under rb_mutex
			 * until it is clear, then set it and drop the mutex */
			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
			while(raidPtr->reconControl->rb_lock) {
				ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
					&raidPtr->reconControl->rb_mutex);
			}
			raidPtr->reconControl->rb_lock = 1;
			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

			/* floating buffers are released for reuse */
			raidPtr->numFullReconBuffers--;
			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);

			/* clear rb_lock again and wake anyone sleeping on it */
			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
			raidPtr->reconControl->rb_lock = 0;
			wakeup(&raidPtr->reconControl->rb_lock);
			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		} else
			if (rbuf->type == RF_RBUF_TYPE_FORCED)
				/* forced buffers are simply freed */
				rf_FreeReconBuffer(rbuf);
			else
				/* no other buffer type is expected here */
				RF_ASSERT(0);
		retcode = RF_RECON_WRITE_DONE;
		break;

	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
					 * cleared */
		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			/* resubmit the column's rbuf; event->arg is passed
			 * through as the third argument */
			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
							     0, (int) (long) event->arg);
			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
							 * BUFCLEAR event if we
							 * couldn't submit */
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
					 * blockage has been cleared */
		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			/* retry the read that was held back */
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
					 * reconstruction blockage has been
					 * cleared */
		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			/* retry the read that was held back */
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

		/* a buffer has become ready to write */
	case RF_REVENT_BUFREADY:
		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextWriteRequest(raidPtr);
#if RF_DEBUG_RECON
			if (rf_floatingRbufDebug) {
				rf_CheckFloatingRbufCount(raidPtr, 1);
			}
#endif
		}
		break;

		/* we need to skip the current RU entirely because it got
		 * recon'd while we were waiting for something else to happen */
	case RF_REVENT_SKIP:
		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

		/* a forced-reconstruction read access has completed.  Just
		 * submit the buffer */
	case RF_REVENT_FORCEDREADDONE:
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
			RF_ASSERT(!submitblocked);
			retcode = 0;
		}
		break;

		/* A read I/O failed to complete */
	case RF_REVENT_READ_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

		/* A write I/O failed to complete */
	case RF_REVENT_WRITE_FAILED:
		retcode = RF_RECON_WRITE_ERROR;

		/* This is an error, but it was a pending write.
		   Account for it. */
		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

		rbuf = (RF_ReconBuffer_t *) event->arg;

		/* cleanup the disk queue data */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);

		/* At this point we're erroring out, badly, and floatingRbufs
		   may not even be valid.  Rather than putting this back onto
		   the floatingRbufs list, just arrange for its immediate
		   destruction.
		*/
		rf_FreeReconBuffer(rbuf);
		break;

		/* a forced read I/O failed to complete */
	case RF_REVENT_FORCEDREAD_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

	default:
		/* unknown event type */
		RF_PANIC();
	}
	/* the event descriptor is consumed here on every path */
	rf_FreeReconEventDesc(event);
	return (retcode);
}
1136 /*****************************************************************************
1137 *
1138 * find the next thing that's needed on the indicated disk, and issue
1139 * a read request for it. We assume that the reconstruction buffer
1140 * associated with this process is free to receive the data. If
1141 * reconstruction is blocked on the indicated RU, we issue a
1142 * blockage-release request instead of a physical disk read request.
1143 * If the current disk gets too far ahead of the others, we issue a
1144 * head-separation wait request and return.
1145 *
1146 * ctrl->{ru_count, curPSID, diskOffset} and
1147 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1148 * we're currently accessing. Note that this deviates from the
1149 * standard C idiom of having counters point to the next thing to be
1150 * accessed. This allows us to easily retry when we're blocked by
1151 * head separation or reconstruction-blockage events.
1152 *
1153 *****************************************************************************/
1154 static int
1155 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
1156 {
1157 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1158 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1159 RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1160 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1161 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1162 int do_new_check = 0, retcode = 0, status;
1163
1164 /* if we are currently the slowest disk, mark that we have to do a new
1165 * check */
1166 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
1167 do_new_check = 1;
1168
1169 while (1) {
1170
1171 ctrl->ru_count++;
1172 if (ctrl->ru_count < RUsPerPU) {
1173 ctrl->diskOffset += sectorsPerRU;
1174 rbuf->failedDiskSectorOffset += sectorsPerRU;
1175 } else {
1176 ctrl->curPSID++;
1177 ctrl->ru_count = 0;
1178 /* code left over from when head-sep was based on
1179 * parity stripe id */
1180 if (ctrl->curPSID > raidPtr->reconControl->lastPSID) {
1181 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
1182 return (RF_RECON_DONE_READS); /* finito! */
1183 }
1184 /* find the disk offsets of the start of the parity
1185 * stripe on both the current disk and the failed
1186 * disk. skip this entire parity stripe if either disk
1187 * does not appear in the indicated PS */
1188 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
1189 &rbuf->spCol, &rbuf->spOffset);
1190 if (status) {
1191 ctrl->ru_count = RUsPerPU - 1;
1192 continue;
1193 }
1194 }
1195 rbuf->which_ru = ctrl->ru_count;
1196
1197 /* skip this RU if it's already been reconstructed */
1198 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
1199 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
1200 continue;
1201 }
1202 break;
1203 }
1204 ctrl->headSepCounter++;
1205 if (do_new_check)
1206 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */
1207
1208
1209 /* at this point, we have definitely decided what to do, and we have
1210 * only to see if we can actually do it now */
1211 rbuf->parityStripeID = ctrl->curPSID;
1212 rbuf->which_ru = ctrl->ru_count;
1213 #if RF_ACC_TRACE > 0
1214 memset((char *) &raidPtr->recon_tracerecs[col], 0,
1215 sizeof(raidPtr->recon_tracerecs[col]));
1216 raidPtr->recon_tracerecs[col].reconacc = 1;
1217 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1218 #endif
1219 retcode = TryToRead(raidPtr, col);
1220 return (retcode);
1221 }
1222
1223 /*
1224 * tries to issue the next read on the indicated disk. We may be
1225 * blocked by (a) the heads being too far apart, or (b) recon on the
1226 * indicated RU being blocked due to a write by a user thread. In
1227 * this case, we issue a head-sep or blockage wait request, which will
1228 * cause this same routine to be invoked again later when the blockage
1229 * has cleared.
1230 */
1231
1232 static int
1233 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
1234 {
1235 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1236 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1237 RF_StripeNum_t psid = ctrl->curPSID;
1238 RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1239 RF_DiskQueueData_t *req;
1240 int status;
1241 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
1242
1243 /* if the current disk is too far ahead of the others, issue a
1244 * head-separation wait and return */
1245 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
1246 return (0);
1247
1248 /* allocate a new PSS in case we need it */
1249 newpssPtr = rf_AllocPSStatus(raidPtr);
1250
1251 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1252 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
1253
1254 if (pssPtr != newpssPtr) {
1255 rf_FreePSStatus(raidPtr, newpssPtr);
1256 }
1257
1258 /* if recon is blocked on the indicated parity stripe, issue a
1259 * block-wait request and return. this also must mark the indicated RU
1260 * in the stripe as under reconstruction if not blocked. */
1261 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
1262 if (status == RF_PSS_RECON_BLOCKED) {
1263 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
1264 goto out;
1265 } else
1266 if (status == RF_PSS_FORCED_ON_WRITE) {
1267 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1268 goto out;
1269 }
1270 /* make one last check to be sure that the indicated RU didn't get
1271 * reconstructed while we were waiting for something else to happen.
1272 * This is unfortunate in that it causes us to make this check twice
1273 * in the normal case. Might want to make some attempt to re-work
1274 * this so that we only do this check if we've definitely blocked on
1275 * one of the above checks. When this condition is detected, we may
1276 * have just created a bogus status entry, which we need to delete. */
1277 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
1278 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
1279 if (pssPtr == newpssPtr)
1280 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1281 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1282 goto out;
1283 }
1284 /* found something to read. issue the I/O */
1285 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
1286 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
1287 #if RF_ACC_TRACE > 0
1288 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1289 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1290 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1291 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1292 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1293 #endif
1294 /* should be ok to use a NULL proc pointer here, all the bufs we use
1295 * should be in kernel space */
1296 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1297 ReconReadDoneProc, (void *) ctrl,
1298 #if RF_ACC_TRACE > 0
1299 &raidPtr->recon_tracerecs[col],
1300 #else
1301 NULL,
1302 #endif
1303 (void *) raidPtr, 0, NULL, PR_WAITOK);
1304
1305 ctrl->rbuf->arg = (void *) req;
1306 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
1307 pssPtr->issued[col] = 1;
1308
1309 out:
1310 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1311 return (0);
1312 }
1313
1314
1315 /*
1316 * given a parity stripe ID, we want to find out whether both the
1317 * current disk and the failed disk exist in that parity stripe. If
1318 * not, we want to skip this whole PS. If so, we want to find the
1319 * disk offset of the start of the PS on both the current disk and the
1320 * failed disk.
1321 *
1322 * this works by getting a list of disks comprising the indicated
1323 * parity stripe, and searching the list for the current and failed
1324 * disks. Once we've decided they both exist in the parity stripe, we
1325 * need to decide whether each is data or parity, so that we'll know
1326 * which mapping function to call to get the corresponding disk
1327 * offsets.
1328 *
1329 * this is kind of unpleasant, but doing it this way allows the
 * reconstruction code to use parity stripe IDs rather than physical
 * disk addresses to march through the failed disk, which greatly
1332 * simplifies a lot of code, as well as eliminating the need for a
1333 * reverse-mapping function. I also think it will execute faster,
1334 * since the calls to the mapping module are kept to a minimum.
1335 *
1336 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1337 * THE STRIPE IN THE CORRECT ORDER
1338 *
1339 * raidPtr - raid descriptor
1340 * psid - parity stripe identifier
1341 * col - column of disk to find the offsets for
1342 * spCol - out: col of spare unit for failed unit
1343 * spOffset - out: offset into disk containing spare unit
1344 *
1345 */
1346
1347
1348 static int
1349 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
1350 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
1351 RF_SectorNum_t *outFailedDiskSectorOffset,
1352 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
1353 {
1354 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1355 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1356 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
1357 RF_RowCol_t *diskids;
1358 u_int i, j, k, i_offset, j_offset;
1359 RF_RowCol_t pcol;
1360 int testcol;
1361 RF_SectorNum_t poffset;
1362 char i_is_parity = 0, j_is_parity = 0;
1363 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
1364
1365 /* get a listing of the disks comprising that stripe */
1366 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1367 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
1368 RF_ASSERT(diskids);
1369
1370 /* reject this entire parity stripe if it does not contain the
1371 * indicated disk or it does not contain the failed disk */
1372
1373 for (i = 0; i < stripeWidth; i++) {
1374 if (col == diskids[i])
1375 break;
1376 }
1377 if (i == stripeWidth)
1378 goto skipit;
1379 for (j = 0; j < stripeWidth; j++) {
1380 if (fcol == diskids[j])
1381 break;
1382 }
1383 if (j == stripeWidth) {
1384 goto skipit;
1385 }
1386 /* find out which disk the parity is on */
1387 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
1388
1389 /* find out if either the current RU or the failed RU is parity */
1390 /* also, if the parity occurs in this stripe prior to the data and/or
1391 * failed col, we need to decrement i and/or j */
1392 for (k = 0; k < stripeWidth; k++)
1393 if (diskids[k] == pcol)
1394 break;
1395 RF_ASSERT(k < stripeWidth);
1396 i_offset = i;
1397 j_offset = j;
1398 if (k < i)
1399 i_offset--;
1400 else
1401 if (k == i) {
1402 i_is_parity = 1;
1403 i_offset = 0;
1404 } /* set offsets to zero to disable multiply
1405 * below */
1406 if (k < j)
1407 j_offset--;
1408 else
1409 if (k == j) {
1410 j_is_parity = 1;
1411 j_offset = 0;
1412 }
1413 /* at this point, [ij]_is_parity tells us whether the [current,failed]
1414 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1415 * tells us how far into the stripe the [current,failed] disk is. */
1416
1417 /* call the mapping routine to get the offset into the current disk,
1418 * repeat for failed disk. */
1419 if (i_is_parity)
1420 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1421 else
1422 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1423
1424 RF_ASSERT(col == testcol);
1425
1426 if (j_is_parity)
1427 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1428 else
1429 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1430 RF_ASSERT(fcol == testcol);
1431
1432 /* now locate the spare unit for the failed unit */
1433 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1434 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1435 if (j_is_parity)
1436 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1437 else
1438 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1439 } else {
1440 #endif
1441 *spCol = raidPtr->reconControl->spareCol;
1442 *spOffset = *outFailedDiskSectorOffset;
1443 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1444 }
1445 #endif
1446 return (0);
1447
1448 skipit:
1449 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
1450 psid, col);
1451 return (1);
1452 }
1453 /* this is called when a buffer has become ready to write to the replacement disk */
1454 static int
1455 IssueNextWriteRequest(RF_Raid_t *raidPtr)
1456 {
1457 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1458 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1459 #if RF_ACC_TRACE > 0
1460 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1461 #endif
1462 RF_ReconBuffer_t *rbuf;
1463 RF_DiskQueueData_t *req;
1464
1465 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
1466 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't
1467 * have gotten the event that sent us here */
1468 RF_ASSERT(rbuf->pssPtr);
1469
1470 rbuf->pssPtr->writeRbuf = rbuf;
1471 rbuf->pssPtr = NULL;
1472
1473 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
1474 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1475 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1476 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
1477 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
1478 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1479
1480 /* should be ok to use a NULL b_proc here b/c all addrs should be in
1481 * kernel space */
1482 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1483 sectorsPerRU, rbuf->buffer,
1484 rbuf->parityStripeID, rbuf->which_ru,
1485 ReconWriteDoneProc, (void *) rbuf,
1486 #if RF_ACC_TRACE > 0
1487 &raidPtr->recon_tracerecs[fcol],
1488 #else
1489 NULL,
1490 #endif
1491 (void *) raidPtr, 0, NULL, PR_WAITOK);
1492
1493 rbuf->arg = (void *) req;
1494 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1495 raidPtr->reconControl->pending_writes++;
1496 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1497 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
1498
1499 return (0);
1500 }
1501
1502 /*
 * this gets called upon the completion of a reconstruction read
 * operation.  The arg is a pointer to the per-disk reconstruction
 * control structure for the process that just finished a read.
1506 *
1507 * called at interrupt context in the kernel, so don't do anything
1508 * illegal here.
1509 */
1510 static int
1511 ReconReadDoneProc(void *arg, int status)
1512 {
1513 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1514 RF_Raid_t *raidPtr;
1515
1516 /* Detect that reconCtrl is no longer valid, and if that
1517 is the case, bail without calling rf_CauseReconEvent().
1518 There won't be anyone listening for this event anyway */
1519
1520 if (ctrl->reconCtrl == NULL)
1521 return(0);
1522
1523 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1524
1525 if (status) {
1526 printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
1527 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1528 return(0);
1529 }
1530 #if RF_ACC_TRACE > 0
1531 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1532 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1533 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1534 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1535 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1536 #endif
1537 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1538 return (0);
1539 }
1540 /* this gets called upon the completion of a reconstruction write operation.
1541 * the arg is a pointer to the rbuf that was just written
1542 *
1543 * called at interrupt context in the kernel, so don't do anything illegal here.
1544 */
1545 static int
1546 ReconWriteDoneProc(void *arg, int status)
1547 {
1548 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1549
1550 /* Detect that reconControl is no longer valid, and if that
1551 is the case, bail without calling rf_CauseReconEvent().
1552 There won't be anyone listening for this event anyway */
1553
1554 if (rbuf->raidPtr->reconControl == NULL)
1555 return(0);
1556
1557 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1558 if (status) {
1559 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1560 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1561 return(0);
1562 }
1563 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1564 return (0);
1565 }
1566
1567
1568 /*
1569 * computes a new minimum head sep, and wakes up anyone who needs to
1570 * be woken as a result
1571 */
1572 static void
1573 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
1574 {
1575 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1576 RF_HeadSepLimit_t new_min;
1577 RF_RowCol_t i;
1578 RF_CallbackDesc_t *p;
1579 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition
1580 * of a minimum */
1581
1582
1583 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1584 while(reconCtrlPtr->rb_lock) {
1585 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
1586 }
1587 reconCtrlPtr->rb_lock = 1;
1588 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1589
1590 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
1591 for (i = 0; i < raidPtr->numCol; i++)
1592 if (i != reconCtrlPtr->fcol) {
1593 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
1594 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
1595 }
1596 /* set the new minimum and wake up anyone who can now run again */
1597 if (new_min != reconCtrlPtr->minHeadSepCounter) {
1598 reconCtrlPtr->minHeadSepCounter = new_min;
1599 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
1600 while (reconCtrlPtr->headSepCBList) {
1601 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
1602 break;
1603 p = reconCtrlPtr->headSepCBList;
1604 reconCtrlPtr->headSepCBList = p->next;
1605 p->next = NULL;
1606 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1607 rf_FreeCallbackDesc(p);
1608 }
1609
1610 }
1611 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1612 reconCtrlPtr->rb_lock = 0;
1613 wakeup(&reconCtrlPtr->rb_lock);
1614 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1615 }
1616
1617 /*
1618 * checks to see that the maximum head separation will not be violated
1619 * if we initiate a reconstruction I/O on the indicated disk.
1620 * Limiting the maximum head separation between two disks eliminates
1621 * the nasty buffer-stall conditions that occur when one disk races
1622 * ahead of the others and consumes all of the floating recon buffers.
1623 * This code is complex and unpleasant but it's necessary to avoid
1624 * some very nasty, albeit fairly rare, reconstruction behavior.
1625 *
1626 * returns non-zero if and only if we have to stop working on the
1627 * indicated disk due to a head-separation delay.
1628 */
1629 static int
1630 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
1631 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
1632 RF_ReconUnitNum_t which_ru)
1633 {
1634 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1635 RF_CallbackDesc_t *cb, *p, *pt;
1636 int retval = 0;
1637
1638 /* if we're too far ahead of the slowest disk, stop working on this
1639 * disk until the slower ones catch up. We do this by scheduling a
1640 * wakeup callback for the time when the slowest disk has caught up.
1641 * We define "caught up" with 20% hysteresis, i.e. the head separation
1642 * must have fallen to at most 80% of the max allowable head
1643 * separation before we'll wake up.
1644 *
1645 */
1646 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1647 while(reconCtrlPtr->rb_lock) {
1648 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
1649 }
1650 reconCtrlPtr->rb_lock = 1;
1651 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1652 if ((raidPtr->headSepLimit >= 0) &&
1653 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
1654 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
1655 raidPtr->raidid, col, ctrl->headSepCounter,
1656 reconCtrlPtr->minHeadSepCounter,
1657 raidPtr->headSepLimit);
1658 cb = rf_AllocCallbackDesc();
1659 /* the minHeadSepCounter value we have to get to before we'll
1660 * wake up. build in 20% hysteresis. */
1661 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1662 cb->col = col;
1663 cb->next = NULL;
1664
1665 /* insert this callback descriptor into the sorted list of
1666 * pending head-sep callbacks */
1667 p = reconCtrlPtr->headSepCBList;
1668 if (!p)
1669 reconCtrlPtr->headSepCBList = cb;
1670 else
1671 if (cb->callbackArg.v < p->callbackArg.v) {
1672 cb->next = reconCtrlPtr->headSepCBList;
1673 reconCtrlPtr->headSepCBList = cb;
1674 } else {
1675 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
1676 cb->next = p;
1677 pt->next = cb;
1678 }
1679 retval = 1;
1680 #if RF_RECON_STATS > 0
1681 ctrl->reconCtrl->reconDesc->hsStallCount++;
1682 #endif /* RF_RECON_STATS > 0 */
1683 }
1684 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1685 reconCtrlPtr->rb_lock = 0;
1686 wakeup(&reconCtrlPtr->rb_lock);
1687 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1688
1689 return (retval);
1690 }
1691 /*
1692 * checks to see if reconstruction has been either forced or blocked
1693 * by a user operation. if forced, we skip this RU entirely. else if
1694 * blocked, put ourselves on the wait list. else return 0.
1695 *
1696 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1697 */
1698 static int
1699 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1700 RF_ReconParityStripeStatus_t *pssPtr,
1701 RF_PerDiskReconCtrl_t *ctrl,
1702 RF_RowCol_t col,
1703 RF_StripeNum_t psid,
1704 RF_ReconUnitNum_t which_ru)
1705 {
1706 RF_CallbackDesc_t *cb;
1707 int retcode = 0;
1708
1709 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1710 retcode = RF_PSS_FORCED_ON_WRITE;
1711 else
1712 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1713 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1714 cb = rf_AllocCallbackDesc(); /* append ourselves to
1715 * the blockage-wait
1716 * list */
1717 cb->col = col;
1718 cb->next = pssPtr->blockWaitList;
1719 pssPtr->blockWaitList = cb;
1720 retcode = RF_PSS_RECON_BLOCKED;
1721 }
1722 if (!retcode)
1723 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under
1724 * reconstruction */
1725
1726 return (retcode);
1727 }
1728 /*
1729 * if reconstruction is currently ongoing for the indicated stripeID,
1730 * reconstruction is forced to completion and we return non-zero to
1731 * indicate that the caller must wait. If not, then reconstruction is
1732 * blocked on the indicated stripe and the routine returns zero. If
1733 * and only if we return non-zero, we'll cause the cbFunc to get
1734 * invoked with the cbArg when the reconstruction has completed.
1735 */
1736 int
1737 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1738 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
1739 {
1740 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're
1741 * forcing recon on */
1742 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */
1743 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity
1744 * stripe status structure */
1745 RF_StripeNum_t psid; /* parity stripe id */
1746 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk
1747 * offset */
1748 RF_RowCol_t *diskids;
1749 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */
1750 RF_RowCol_t fcol, diskno, i;
1751 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */
1752 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
1753 RF_CallbackDesc_t *cb;
1754 int nPromoted;
1755
1756 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1757
1758 /* allocate a new PSS in case we need it */
1759 newpssPtr = rf_AllocPSStatus(raidPtr);
1760
1761 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1762
1763 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);
1764
1765 if (pssPtr != newpssPtr) {
1766 rf_FreePSStatus(raidPtr, newpssPtr);
1767 }
1768
1769 /* if recon is not ongoing on this PS, just return */
1770 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1771 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1772 return (0);
1773 }
1774 /* otherwise, we have to wait for reconstruction to complete on this
1775 * RU. */
1776 /* In order to avoid waiting for a potentially large number of
1777 * low-priority accesses to complete, we force a normal-priority (i.e.
1778 * not low-priority) reconstruction on this RU. */
1779 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1780 DDprintf1("Forcing recon on psid %ld\n", psid);
1781 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under
1782 * forced recon */
1783 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage
1784 * that we just set */
1785 fcol = raidPtr->reconControl->fcol;
1786
1787 /* get a listing of the disks comprising the indicated stripe */
1788 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);
1789
1790 /* For previously issued reads, elevate them to normal
1791 * priority. If the I/O has already completed, it won't be
1792 * found in the queue, and hence this will be a no-op. For
1793 * unissued reads, allocate buffers and issue new reads. The
1794 * fact that we've set the FORCED bit means that the regular
1795 * recon procs will not re-issue these reqs */
1796 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
1797 if ((diskno = diskids[i]) != fcol) {
1798 if (pssPtr->issued[diskno]) {
1799 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
1800 if (rf_reconDebug && nPromoted)
1801 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
1802 } else {
1803 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */
1804 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
1805 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare
1806 * location */
1807 new_rbuf->parityStripeID = psid; /* fill in the buffer */
1808 new_rbuf->which_ru = which_ru;
1809 new_rbuf->failedDiskSectorOffset = fd_offset;
1810 new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
1811
1812 /* use NULL b_proc b/c all addrs
1813 * should be in kernel space */
1814 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
1815 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
1816 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);
1817
1818 new_rbuf->arg = req;
1819 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */
1820 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
1821 }
1822 }
1823 /* if the write is sitting in the disk queue, elevate its
1824 * priority */
1825 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
1826 if (rf_reconDebug)
1827 printf("raid%d: promoted write to col %d\n",
1828 raidPtr->raidid, fcol);
1829 }
1830 /* install a callback descriptor to be invoked when recon completes on
1831 * this parity stripe. */
1832 cb = rf_AllocCallbackDesc();
1833 /* XXX the following is bogus.. These functions don't really match!!
1834 * GO */
1835 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1836 cb->callbackArg.p = (void *) cbArg;
1837 cb->next = pssPtr->procWaitList;
1838 pssPtr->procWaitList = cb;
1839 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
1840 raidPtr->raidid, psid);
1841
1842 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1843 return (1);
1844 }
1845 /* called upon the completion of a forced reconstruction read.
1846 * all we do is schedule the FORCEDREADONE event.
1847 * called at interrupt context in the kernel, so don't do anything illegal here.
1848 */
1849 static void
1850 ForceReconReadDoneProc(void *arg, int status)
1851 {
1852 RF_ReconBuffer_t *rbuf = arg;
1853
1854 /* Detect that reconControl is no longer valid, and if that
1855 is the case, bail without calling rf_CauseReconEvent().
1856 There won't be anyone listening for this event anyway */
1857
1858 if (rbuf->raidPtr->reconControl == NULL)
1859 return;
1860
1861 if (status) {
1862 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1863 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1864 return;
1865 }
1866 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1867 }
1868 /* releases a block on the reconstruction of the indicated stripe */
1869 int
1870 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1871 {
1872 RF_StripeNum_t stripeID = asmap->stripeID;
1873 RF_ReconParityStripeStatus_t *pssPtr;
1874 RF_ReconUnitNum_t which_ru;
1875 RF_StripeNum_t psid;
1876 RF_CallbackDesc_t *cb;
1877
1878 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1879 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1880 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);
1881
1882 /* When recon is forced, the pss desc can get deleted before we get
1883 * back to unblock recon. But, this can _only_ happen when recon is
1884 * forced. It would be good to put some kind of sanity check here, but
1885 * how to decide if recon was just forced or not? */
1886 if (!pssPtr) {
1887 /* printf("Warning: no pss descriptor upon unblock on psid %ld
1888 * RU %d\n",psid,which_ru); */
1889 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
1890 if (rf_reconDebug || rf_pssDebug)
1891 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
1892 #endif
1893 goto out;
1894 }
1895 pssPtr->blockCount--;
1896 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
1897 raidPtr->raidid, psid, pssPtr->blockCount);
1898 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */
1899
1900 /* unblock recon before calling CauseReconEvent in case
1901 * CauseReconEvent causes us to try to issue a new read before
1902 * returning here. */
1903 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1904
1905
1906 while (pssPtr->blockWaitList) {
1907 /* spin through the block-wait list and
1908 release all the waiters */
1909 cb = pssPtr->blockWaitList;
1910 pssPtr->blockWaitList = cb->next;
1911 cb->next = NULL;
1912 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
1913 rf_FreeCallbackDesc(cb);
1914 }
1915 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1916 /* if no recon was requested while recon was blocked */
1917 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1918 }
1919 }
1920 out:
1921 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1922 return (0);
1923 }
1924
1925 void
1926 rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
1927 {
1928 RF_CallbackDesc_t *p;
1929
1930 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1931 while(raidPtr->reconControl->rb_lock) {
1932 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO,
1933 "rf_wakeuphscbw", 0, &raidPtr->reconControl->rb_mutex);
1934 }
1935
1936 raidPtr->reconControl->rb_lock = 1;
1937 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1938
1939 while (raidPtr->reconControl->headSepCBList) {
1940 p = raidPtr->reconControl->headSepCBList;
1941 raidPtr->reconControl->headSepCBList = p->next;
1942 p->next = NULL;
1943 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1944 rf_FreeCallbackDesc(p);
1945 }
1946 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1947 raidPtr->reconControl->rb_lock = 0;
1948 wakeup(&raidPtr->reconControl->rb_lock);
1949 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1950
1951 }
1952
1953