rf_reconstruct.c revision 1.105.4.6.2.1 1 /* $NetBSD: rf_reconstruct.c,v 1.105.4.6.2.1 2014/11/20 12:25:10 sborrill Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.105.4.6.2.1 2014/11/20 12:25:10 sborrill Exp $");
37
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/errno.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <dev/raidframe/raidframevar.h>
48
49 #include "rf_raid.h"
50 #include "rf_reconutil.h"
51 #include "rf_revent.h"
52 #include "rf_reconbuffer.h"
53 #include "rf_acctrace.h"
54 #include "rf_etimer.h"
55 #include "rf_dag.h"
56 #include "rf_desc.h"
57 #include "rf_debugprint.h"
58 #include "rf_general.h"
59 #include "rf_driver.h"
60 #include "rf_utils.h"
61 #include "rf_shutdown.h"
62
63 #include "rf_kintf.h"
64
65 /* setting these to -1 causes them to be set to their default values if not set by debug options */
66
#if RF_DEBUG_RECON
/*
 * Debug printf wrappers: Dprintf .. Dprintf7 and DDprintf1/DDprintf2
 * emit trace output only when the rf_reconDebug knob is set.  All
 * arguments are funneled through rf_debug_printf() as pointer-sized
 * values, hence the (void *)(unsigned long) casts.
 */
#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)

#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)

#else /* RF_DEBUG_RECON */

/* Non-debug build: all trace macros compile to empty blocks. */
#define Dprintf(s) {}
#define Dprintf1(s,a) {}
#define Dprintf2(s,a,b) {}
#define Dprintf3(s,a,b,c) {}
#define Dprintf4(s,a,b,c,d) {}
#define Dprintf5(s,a,b,c,d,e) {}
#define Dprintf6(s,a,b,c,d,e,f) {}
#define Dprintf7(s,a,b,c,d,e,f,g) {}

#define DDprintf1(s,a) {}
#define DDprintf2(s,a,b) {}

#endif /* RF_DEBUG_RECON */
95
/*
 * Status codes returned by ProcessReconEvent() to the main recon loop
 * in rf_ContinueReconstructFailedDisk().
 */
#define RF_RECON_DONE_READS 1	/* a surviving component finished all its reads */
#define RF_RECON_READ_ERROR 2	/* a reconstruction read failed */
#define RF_RECON_WRITE_ERROR 3	/* a reconstruction write failed */
#define RF_RECON_READ_STOPPED 4	/* reads on a component stopped; count it as done */
#define RF_RECON_WRITE_DONE 5	/* one reconstruction write completed */

/* high/low water marks for the rf_pools.reconbuffer pool */
#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16
104
105 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
106 RF_RaidDisk_t *, int, RF_RowCol_t);
107 static void FreeReconDesc(RF_RaidReconDesc_t *);
108 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
109 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
110 static int TryToRead(RF_Raid_t *, RF_RowCol_t);
111 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
112 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
113 RF_SectorNum_t *);
114 static int IssueNextWriteRequest(RF_Raid_t *);
115 static int ReconReadDoneProc(void *, int);
116 static int ReconWriteDoneProc(void *, int);
117 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
118 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
119 RF_RowCol_t, RF_HeadSepLimit_t,
120 RF_ReconUnitNum_t);
121 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
122 RF_ReconParityStripeStatus_t *,
123 RF_PerDiskReconCtrl_t *,
124 RF_RowCol_t, RF_StripeNum_t,
125 RF_ReconUnitNum_t);
126 static void ForceReconReadDoneProc(void *, int);
127 static void rf_ShutdownReconstruction(void *);
128
/*
 * One entry on the list of callbacks to invoke when a reconstruction
 * completes ("recon done procs").
 */
struct RF_ReconDoneProc_s {
	void (*proc) (RF_Raid_t *, void *);	/* callback to invoke */
	void *arg;				/* opaque argument passed to proc */
	RF_ReconDoneProc_t *next;		/* next callback on the list */
};
134
135 /**************************************************************************
136 *
137 * sets up the parameters that will be used by the reconstruction process
138 * currently there are none, except for those that the layout-specific
139 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
140 *
141 * in the kernel, we fire off the recon thread.
142 *
143 **************************************************************************/
/*
 * Shutdown hook registered by rf_ConfigureReconstruction(): tear down
 * the reconstruction-buffer pool created at configure time.
 */
static void
rf_ShutdownReconstruction(void *ignored)
{
	pool_destroy(&rf_pools.reconbuffer);
}
149
150 int
151 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
152 {
153
154 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
155 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
156 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
157
158 return (0);
159 }
160
161 static RF_RaidReconDesc_t *
162 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
163 RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
164 RF_RowCol_t scol)
165 {
166
167 RF_RaidReconDesc_t *reconDesc;
168
169 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
170 (RF_RaidReconDesc_t *));
171 reconDesc->raidPtr = raidPtr;
172 reconDesc->col = col;
173 reconDesc->spareDiskPtr = spareDiskPtr;
174 reconDesc->numDisksDone = numDisksDone;
175 reconDesc->scol = scol;
176 reconDesc->next = NULL;
177
178 return (reconDesc);
179 }
180
/*
 * Print the per-run reconstruction statistics accumulated in the
 * descriptor, then free the descriptor itself.
 */
static void
FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	printf("raid%d: %lu recon event waits, %lu recon delays\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->numReconEventWaits,
	       (long) reconDesc->numReconExecDelays);
#endif				/* RF_RECON_STATS > 0 */
	printf("raid%d: %lu max exec ticks\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->maxReconExecTicks);
	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
}
195
196
197 /*****************************************************************************
198 *
199 * primary routine to reconstruct a failed disk. This should be called from
200 * within its own thread. It won't return until reconstruction completes,
201 * fails, or is aborted.
202 *****************************************************************************/
203 int
204 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
205 {
206 const RF_LayoutSW_t *lp;
207 int rc;
208
209 lp = raidPtr->Layout.map;
210 if (lp->SubmitReconBuffer) {
211 /*
212 * The current infrastructure only supports reconstructing one
213 * disk at a time for each array.
214 */
215 RF_LOCK_MUTEX(raidPtr->mutex);
216 while (raidPtr->reconInProgress) {
217 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
218 }
219 raidPtr->reconInProgress++;
220 RF_UNLOCK_MUTEX(raidPtr->mutex);
221 rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
222 RF_LOCK_MUTEX(raidPtr->mutex);
223 raidPtr->reconInProgress--;
224 RF_UNLOCK_MUTEX(raidPtr->mutex);
225 } else {
226 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
227 lp->parityConfig);
228 rc = EIO;
229 }
230 RF_SIGNAL_COND(raidPtr->waitForReconCond);
231 return (rc);
232 }
233
/*
 * Reconstruct the failed disk at `col' onto a spare.  For distributed
 * sparing layouts (RF_DISTRIBUTE_SPARE) no physical spare is chosen and
 * scol is -1; otherwise the first available disk with status
 * rf_ds_spare is claimed.  On success the spare's component label is
 * rewritten and the spare is marked rf_ds_used_spare; on failure both
 * disks are returned to their prior failed/spare states.
 *
 * Returns 0 on success, EINVAL/ENOSPC if no reconstruction target is
 * available, or the error from rf_ContinueReconstructFailedDisk().
 */
int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t scol;
	int numDisksDone = 0, rc;

	/* first look for a spare drive onto which to reconstruct the data */
	/* spare disk descriptors are stored in row 0. This may have to
	 * change eventually */

	RF_LOCK_MUTEX(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		/* distributed sparing: the "spare" lives on the surviving
		 * disks, so the set must already be degraded */
		if (raidPtr->status != rf_rs_degraded) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		scol = (-1);
	} else {
#endif
		/* spare slots live after the data/parity columns:
		 * [numCol .. numCol + numSpare) */
		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[scol];
				spareDiskPtr->status = rf_ds_rebuilding_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, scol);

		raid_init_component_label(raidPtr, c_label);
		/* the spare now masquerades as the failed component */
		c_label->row = 0;
		c_label->column = col;
		c_label->clean = RF_RAID_DIRTY;
		c_label->status = rf_ds_optimal;
		rf_component_label_set_partitionsize(c_label,
		    raidPtr->Disks[scol].partitionSize);

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* The failed disk has already been marked as rf_ds_spared
		   (or rf_ds_dist_spared) in
		   rf_ContinueReconstructFailedDisk()
		   so we just update the spare disk as being a used spare
		*/

		spareDiskPtr->status = rf_ds_used_spare;
		raidPtr->parity_good = RF_RAID_CLEAN;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* XXXX MORE NEEDED HERE */

		raidflush_component_label(raidPtr, scol);
	} else {
		/* Reconstruct failed. */

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* Failed disk goes back to "failed" status */
		raidPtr->Disks[col].status = rf_ds_failed;

		/* Spare disk goes back to "spare" status. */
		spareDiskPtr->status = rf_ds_spare;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	return (rc);
}
337
338 /*
339
340 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
341 and you don't get a spare until the next Monday. With this function
342 (and hot-swappable drives) you can now put your new disk containing
343 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
344 rebuild the data "on the spot".
345
346 */
347
/*
 * Rebuild the component at `col' in place: the device named in the
 * disk descriptor is closed (if open), re-opened via dk_lookup(), its
 * geometry re-read with VOP_GETATTR/VOP_IOCTL(DIOCGPART), and then a
 * normal reconstruction is run with the component itself acting as the
 * "spare".  Serialized against other reconstructions through
 * raidPtr->reconInProgress.
 *
 * Returns 0 on success, EIO/EINVAL for unsupported layouts, an errno
 * from the open/attr/ioctl path, or the error from
 * rf_ContinueReconstructFailedDisk().
 */
int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	const RF_LayoutSW_t *lp;
	RF_ComponentLabel_t *c_label;
	int numDisksDone = 0, rc;
	struct partinfo dpart;
	struct vnode *vp;
	struct vattr va;
	int retcode;
	int ac;

	lp = raidPtr->Layout.map;
	if (!lp->SubmitReconBuffer) {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		/* wakeup anyone who might be waiting to do a reconstruct */
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(EIO);
	}

	/*
	 * The current infrastructure only supports reconstructing one
	 * disk at a time for each array.
	 */
	RF_LOCK_MUTEX(raidPtr->mutex);

	if (raidPtr->Disks[col].status != rf_ds_failed) {
		/* "It's gone..." -- force the component to failed so the
		 * rebuild path below treats it uniformly */
		raidPtr->numFailures++;
		raidPtr->Disks[col].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
	}

	while (raidPtr->reconInProgress) {
		RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
	}

	raidPtr->reconInProgress++;

	/* first look for a spare drive onto which to reconstruct the
	   data.  spare disk descriptors are stored in row 0.  This
	   may have to change eventually */

	/* Actually, we don't care if it's failed or not...  On a RAID
	   set with correct parity, this function should be callable
	   on any component without ill effects. */
	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */

#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);

		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return (EINVAL);
	}
#endif

	/* This device may have been opened successfully the
	   first time. Close it before trying to open it again.. */

	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
#if 0
		printf("Closed the open device: %s\n",
		       raidPtr->Disks[col].devname);
#endif
		vp = raidPtr->raid_cinfo[col].ci_vp;
		ac = raidPtr->Disks[col].auto_configured;
		/* drop the mutex around the close -- rf_close_component
		 * may sleep */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_close_component(raidPtr, vp, ac);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->raid_cinfo[col].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer)*/
	raidPtr->Disks[col].auto_configured = 0;

#if 0
	printf("About to (re-)open the device for rebuilding: %s\n",
	       raidPtr->Disks[col].devname);
#endif
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE);

	if (retcode) {
		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
		       raidPtr->Disks[col].devname, retcode);

		/* the component isn't responding properly...
		   must be still dead :-( */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}

	/* Ok, so we can at least do a lookup...
	   How about actually getting a vp for it? */

	if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}

	/* fetch the partition geometry so blockSize/numBlocks below are
	 * current for the (possibly swapped) physical disk */
	retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred);
	if (retcode) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}
	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize;

	/* reserve the label area at the front of the component */
	raidPtr->Disks[col].numBlocks = dpart.part->p_size -
		rf_protectedSectors;

	raidPtr->raid_cinfo[col].ci_vp = vp;
	raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;

	raidPtr->Disks[col].dev = va.va_rdev;

	/* we allow the user to specify that only a fraction
	   of the disks should be used this is just for debug:
	   it speeds up * the parity scan */
	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
		rf_sizePercentage / 100;
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/* the component itself plays the role of the spare */
	spareDiskPtr = &raidPtr->Disks[col];
	spareDiskPtr->status = rf_ds_rebuilding_spare;

	printf("raid%d: initiating in-place reconstruction on column %d\n",
	       raidPtr->raidid, col);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
				       numDisksDone, col);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		/* Need to set these here, as at this point it'll be claiming
		   that the disk is in rf_ds_spared!  But we know better :-) */

		raidPtr->Disks[col].status = rf_ds_optimal;
		raidPtr->status = rf_rs_optimal;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, col);

		RF_LOCK_MUTEX(raidPtr->mutex);
		raid_init_component_label(raidPtr, c_label);

		c_label->row = 0;
		c_label->column = col;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		raidPtr->parity_good = RF_RAID_CLEAN;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		raidflush_component_label(raidPtr, col);
	} else {
		/* Reconstruct-in-place failed.  Disk goes back to
		   "failed" status, regardless of what it was before.  */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->Disks[col].status = rf_ds_failed;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
	}

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->reconInProgress--;
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	RF_SIGNAL_COND(raidPtr->waitForReconCond);
	return (rc);
}
554
555
/*
 * The core reconstruction engine.  Installs the recon control
 * structure (quiescing the array while doing so), then loops over the
 * parity stripes in windows of RF_RECONMAP_SIZE PSIDs: for each window
 * it issues reads on every surviving column, processes recon events
 * until all reads are in, then waits for the corresponding writes.
 * On any read/write error the set is dropped back to degraded and 1 is
 * returned; on success the failed column is marked spared (or
 * dist-spared), statistics are printed, and 0 is returned.
 */
int
rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_RowCol_t col = reconDesc->col;
	RF_RowCol_t scol = reconDesc->scol;
	RF_ReconMap_t *mapPtr;
	RF_ReconCtrl_t *tmp_reconctrl;
	RF_ReconEvent_t *event;
	RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
#if RF_INCLUDE_RAID5_RS > 0
	RF_StripeCount_t startPSID,endPSID,aPSID,bPSID,offPSID;
#endif
	RF_ReconUnitCount_t RUsPerPU;
	struct timeval etime, elpsd;
	unsigned long xor_s, xor_resid_us;
	int i, ds;
	int status, done;
	int recon_error, write_error;

	raidPtr->accumXorTimeUs = 0;
#if RF_ACC_TRACE > 0
	/* create one trace record per physical disk */
	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
#endif

	/* quiesce the array prior to starting recon.  this is needed
	 * to assure no nasty interactions with pending user writes.
	 * We need to do this before we change the disk or row status. */

	Dprintf("RECON: begin request suspend\n");
	rf_SuspendNewRequestsAndWait(raidPtr);
	Dprintf("RECON: end request suspend\n");

	/* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);

	RF_LOCK_MUTEX(raidPtr->mutex);

	/* create the reconstruction control pointer and install it in
	 * the right slot */
	raidPtr->reconControl = tmp_reconctrl;
	mapPtr = raidPtr->reconControl->reconMap;
	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
	raidPtr->reconControl->numRUsComplete =	0;
	raidPtr->status = rf_rs_reconstructing;
	raidPtr->Disks[col].status = rf_ds_reconstructing;
	raidPtr->Disks[col].spareCol = scol;

	RF_UNLOCK_MUTEX(raidPtr->mutex);

	RF_GETTIME(raidPtr->reconControl->starttime);

	Dprintf("RECON: resume requests\n");
	rf_ResumeNewRequests(raidPtr);


	mapPtr = raidPtr->reconControl->reconMap;

	/* the recon map covers incPSID parity stripes at a time; lastPSID
	 * is the total number of parity stripe IDs in the array */
	incPSID = RF_RECONMAP_SIZE;
	lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
	RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
	recon_error = 0;
	write_error = 0;
	pending_writes = incPSID;
	raidPtr->reconControl->lastPSID = incPSID - 1;

	/* bounds check raidPtr->reconControl->lastPSID and
	   pending_writes so that we don't attempt to wait for more IO
	   than can possibly happen */

	if (raidPtr->reconControl->lastPSID > lastPSID)
		raidPtr->reconControl->lastPSID = lastPSID;

	if (pending_writes > lastPSID)
		pending_writes = lastPSID;

	/* start the actual reconstruction */

	done = 0;
	while (!done) {

		if (raidPtr->waitShutdown) {
			/* someone is unconfiguring this array... bail on the reconstruct.. */
			recon_error = 1;
			break;
		}

		num_writes = 0;

#if RF_INCLUDE_RAID5_RS > 0
		/* For RAID5 with Rotated Spares we will be 'short'
		   some number of writes since no writes will get
		   issued for stripes where the spare is on the
		   component being rebuilt.  Account for the shortage
		   here so that we don't hang indefinitely below
		   waiting for writes to complete that were never
		   scheduled.

		   XXX: Should be fixed for PARITY_DECLUSTERING and
		   others too!

		*/

		if (raidPtr->Layout.numDataCol <
		    raidPtr->numCol - raidPtr->Layout.numParityCol) {
			/* numDataCol is at least 2 less than numCol, so
			   should be RAID 5 with Rotated Spares */

			/* XXX need to update for RAID 6 */

			startPSID = raidPtr->reconControl->lastPSID - pending_writes + 1;
			endPSID = raidPtr->reconControl->lastPSID;

			offPSID = raidPtr->numCol - col - 1;

			/* aPSID/bPSID: first and last PSID in this window
			   whose spare unit falls on the rebuilt column */
			aPSID = startPSID - startPSID % raidPtr->numCol + offPSID;
			if (aPSID < startPSID) {
				aPSID += raidPtr->numCol;
			}

			bPSID = endPSID - ((endPSID - offPSID) % raidPtr->numCol);

			if (aPSID < endPSID) {
				num_writes = ((bPSID - aPSID) / raidPtr->numCol) + 1;
			}

			if ((aPSID == endPSID) && (bPSID == endPSID)) {
				num_writes++;
			}
		}
#endif

		/* issue a read for each surviving disk */

		reconDesc->numDisksDone = 0;
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				/* find and issue the next I/O on the
				 * indicated disk */
				if (IssueNextReadRequest(raidPtr, i)) {
					Dprintf1("RECON: done issuing for c%d\n", i);
					reconDesc->numDisksDone++;
				}
			}
		}

		/* process reconstruction events until all disks report that
		 * they've completed all work */

		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {

			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			/* the normal case is that a read completes, and all is well. */
			if (status == RF_RECON_DONE_READS) {
				reconDesc->numDisksDone++;
			} else if ((status == RF_RECON_READ_ERROR) ||
				   (status == RF_RECON_WRITE_ERROR)) {
				/* an error was encountered while reconstructing...
				   Pretend we've finished this disk.
				*/
				recon_error = 1;
				raidPtr->reconControl->error = 1;

				/* bump the numDisksDone count for reads,
				   but not for writes */
				if (status == RF_RECON_READ_ERROR)
					reconDesc->numDisksDone++;

				/* write errors are special -- when we are
				   done dealing with the reads that are
				   finished, we don't want to wait for any
				   writes */
				if (status == RF_RECON_WRITE_ERROR) {
					write_error = 1;
					num_writes++;
				}

			} else if (status == RF_RECON_READ_STOPPED) {
				/* count this component as being "done" */
				reconDesc->numDisksDone++;
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			}

			if (recon_error) {
				/* make sure any stragglers are woken up so that
				   their theads will complete, and we can get out
				   of here with all IO processed */

				rf_WakeupHeadSepCBWaiters(raidPtr);
			}

			raidPtr->reconControl->numRUsTotal =
				mapPtr->totalRUs;
			raidPtr->reconControl->numRUsComplete =
				mapPtr->totalRUs -
				rf_UnitsLeftToReconstruct(mapPtr);

#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete =
				(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}

		/* reads done, wakeup any waiters, and then wait for writes */

		rf_WakeupHeadSepCBWaiters(raidPtr);

		while (!recon_error && (num_writes < pending_writes)) {
			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			if (status == RF_RECON_WRITE_ERROR) {
				num_writes++;
				recon_error = 1;
				raidPtr->reconControl->error = 1;
				/* an error was encountered at the very end... bail */
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			} /* else it's something else, and we don't care */
		}
		if (recon_error ||
		    (raidPtr->reconControl->lastPSID == lastPSID)) {
			done = 1;
			break;
		}

		/* advance the window to the next incPSID parity stripes,
		   clamping the final (possibly partial) window */
		prev = raidPtr->reconControl->lastPSID;
		raidPtr->reconControl->lastPSID += incPSID;

		if (raidPtr->reconControl->lastPSID > lastPSID) {
			pending_writes = lastPSID - prev;
			raidPtr->reconControl->lastPSID = lastPSID;
		}

		/* back down curPSID to get ready for the next round... */
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				raidPtr->reconControl->perDiskInfo[i].curPSID--;
				raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
			}
		}
	}

	mapPtr = raidPtr->reconControl->reconMap;
	if (rf_reconDebug) {
		printf("RECON: all reads completed\n");
	}
	/* at this point all the reads have completed.  We now wait
	 * for any pending writes to complete, and then we're done */

	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {

		event = rf_GetNextReconEvent(reconDesc);
		status = ProcessReconEvent(raidPtr, event);

		if (status == RF_RECON_WRITE_ERROR) {
			recon_error = 1;
			raidPtr->reconControl->error = 1;
			/* an error was encountered at the very end... bail */
		} else {
#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}
	}

	if (recon_error) {
		/* we've encountered an error in reconstructing. */
		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);

		/* we start by blocking IO to the RAID set. */
		rf_SuspendNewRequestsAndWait(raidPtr);

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* mark set as being degraded, rather than
		   rf_rs_reconstructing as we were before the problem.
		   After this is done we can update status of the
		   component disks without worrying about someone
		   trying to read from a failed component.
		*/
		raidPtr->status = rf_rs_degraded;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* resume IO */
		rf_ResumeNewRequests(raidPtr);

		/* At this point there are two cases:
		   1) If we've experienced a read error, then we've
		   already waited for all the reads we're going to get,
		   and we just need to wait for the writes.

		   2) If we've experienced a write error, we've also
		   already waited for all the reads to complete,
		   but there is little point in waiting for the writes --
		   when they do complete, they will just be ignored.

		   So we just wait for writes to complete if we didn't have a
		   write error.
		*/

		if (!write_error) {
			/* wait for writes to complete */
			while (raidPtr->reconControl->pending_writes > 0) {

				event = rf_GetNextReconEvent(reconDesc);
				status = ProcessReconEvent(raidPtr, event);

				if (status == RF_RECON_WRITE_ERROR) {
					raidPtr->reconControl->error = 1;
					/* an error was encountered at the very end... bail.
					   This will be very bad news for the user, since
					   at this point there will have been a read error
					   on one component, and a write error on another!
					*/
					break;
				}
			}
		}


		/* cleanup */

		/* drain the event queue - after waiting for the writes above,
		   there shouldn't be much (if anything!) left in the queue. */

		rf_DrainReconEventQueue(reconDesc);

		/* XXX As much as we'd like to free the recon control structure
		   and the reconDesc, we have no way of knowing if/when those will
		   be touched by IO that has yet to occur.  It is rather poor to be
		   basically causing a 'memory leak' here, but there doesn't seem to be
		   a cleaner alternative at this time.  Perhaps when the reconstruct code
		   gets a makeover this problem will go away.
		*/
#if 0
		rf_FreeReconControl(raidPtr);
#endif

#if RF_ACC_TRACE > 0
		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
		/* XXX see comment above */
#if 0
		FreeReconDesc(reconDesc);
#endif

		return (1);
	}

	/* Success:  mark the dead disk as reconstructed.  We quiesce
	 * the array here to assure no nasty interactions with pending
	 * user accesses when we free up the psstatus structure as
	 * part of FreeReconControl() */

	rf_SuspendNewRequestsAndWait(raidPtr);

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->numFailures--;
	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	RF_GETTIME(etime);
	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);

	rf_ResumeNewRequests(raidPtr);

	printf("raid%d: Reconstruction of disk at col %d completed\n",
	       raidPtr->raidid, col);
	xor_s = raidPtr->accumXorTimeUs / 1000000;
	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
	       raidPtr->raidid,
	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
	printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
	       raidPtr->raidid,
	       (int) raidPtr->reconControl->starttime.tv_sec,
	       (int) raidPtr->reconControl->starttime.tv_usec,
	       (int) etime.tv_sec, (int) etime.tv_usec);
#if RF_RECON_STATS > 0
	printf("raid%d: Total head-sep stall count was %d\n",
	       raidPtr->raidid, (int) reconDesc->hsStallCount);
#endif				/* RF_RECON_STATS > 0 */
	rf_FreeReconControl(raidPtr);
#if RF_ACC_TRACE > 0
	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
	FreeReconDesc(reconDesc);

	return (0);

}
959 /*****************************************************************************
960 * do the right thing upon each reconstruction event.
961 *****************************************************************************/
/*
 * Dispatch a single reconstruction event and return a status code for
 * the caller's main loop.  The default return is RF_RECON_READ_STOPPED;
 * individual cases override it with 0 (keep going), RF_RECON_WRITE_DONE,
 * RF_RECON_DONE_READS (via IssueNextReadRequest), or one of the
 * RF_RECON_*_ERROR codes.  The event descriptor is always freed before
 * returning, regardless of which case ran.
 */
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
{
	int retcode = 0, submitblocked;
	RF_ReconBuffer_t *rbuf;
	RF_SectorCount_t sectorsPerRU;

	/* default: tell the caller that reads have stopped */
	retcode = RF_RECON_READ_STOPPED;

	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);

	switch (event->type) {

		/* a read I/O has completed */
	case RF_REVENT_READDONE:
		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
		    event->col, rbuf->parityStripeID);
		Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
		    rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
		    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
		/* the disk-queue request is done with; release it before
		 * (possibly) submitting the buffer for XOR */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
			if (!submitblocked)
				retcode = IssueNextReadRequest(raidPtr, event->col);
			else
				retcode = 0;
		}
		break;

		/* a write I/O has completed */
	case RF_REVENT_WRITEDONE:
#if RF_DEBUG_RECON
		if (rf_floatingRbufDebug) {
			rf_CheckFloatingRbufCount(raidPtr, 1);
		}
#endif
		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
		    rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
		/* record the whole RU as reconstructed and drop its
		 * parity-stripe-status entry */
		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
		    rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);

		/* one fewer write outstanding */
		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
			/* take the rb_lock "handshake" before touching the
			 * floating-buffer pool; sleep while someone else
			 * holds it */
			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
			while(raidPtr->reconControl->rb_lock) {
				ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
					&raidPtr->reconControl->rb_mutex);
			}
			raidPtr->reconControl->rb_lock = 1;
			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

			raidPtr->numFullReconBuffers--;
			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);

			/* release the rb_lock and wake any waiters */
			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
			raidPtr->reconControl->rb_lock = 0;
			wakeup(&raidPtr->reconControl->rb_lock);
			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		} else
			if (rbuf->type == RF_RBUF_TYPE_FORCED)
				rf_FreeReconBuffer(rbuf);
			else
				RF_ASSERT(0);	/* per-disk buffers never reach here */
		retcode = RF_RECON_WRITE_DONE;
		break;

	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
					 * cleared */
		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
			    0, (int) (long) event->arg);
			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
							 * BUFCLEAR event if we
							 * couldn't submit */
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
					 * blockage has been cleared */
		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
					 * reconstruction blockage has been
					 * cleared */
		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

		/* a buffer has become ready to write */
	case RF_REVENT_BUFREADY:
		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextWriteRequest(raidPtr);
#if RF_DEBUG_RECON
			if (rf_floatingRbufDebug) {
				rf_CheckFloatingRbufCount(raidPtr, 1);
			}
#endif
		}
		break;

		/* we need to skip the current RU entirely because it got
		 * recon'd while we were waiting for something else to happen */
	case RF_REVENT_SKIP:
		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

		/* a forced-reconstruction read access has completed. Just
		 * submit the buffer */
	case RF_REVENT_FORCEDREADDONE:
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
			RF_ASSERT(!submitblocked);
			retcode = 0;
		}
		break;

		/* A read I/O failed to complete */
	case RF_REVENT_READ_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

		/* A write I/O failed to complete */
	case RF_REVENT_WRITE_FAILED:
		retcode = RF_RECON_WRITE_ERROR;

		/* This is an error, but it was a pending write.
		   Account for it. */
		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);

		rbuf = (RF_ReconBuffer_t *) event->arg;

		/* cleanup the disk queue data */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);

		/* At this point we're erroring out, badly, and floatingRbufs
		   may not even be valid. Rather than putting this back onto
		   the floatingRbufs list, just arrange for its immediate
		   destruction.
		*/
		rf_FreeReconBuffer(rbuf);
		break;

		/* a forced read I/O failed to complete */
	case RF_REVENT_FORCEDREAD_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

	default:
		RF_PANIC();
	}
	/* the event is consumed no matter which case ran */
	rf_FreeReconEventDesc(event);
	return (retcode);
}
1142 /*****************************************************************************
1143 *
1144 * find the next thing that's needed on the indicated disk, and issue
1145 * a read request for it. We assume that the reconstruction buffer
1146 * associated with this process is free to receive the data. If
1147 * reconstruction is blocked on the indicated RU, we issue a
1148 * blockage-release request instead of a physical disk read request.
1149 * If the current disk gets too far ahead of the others, we issue a
1150 * head-separation wait request and return.
1151 *
1152 * ctrl->{ru_count, curPSID, diskOffset} and
1153 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1154 * we're currently accessing. Note that this deviates from the
1155 * standard C idiom of having counters point to the next thing to be
1156 * accessed. This allows us to easily retry when we're blocked by
1157 * head separation or reconstruction-blockage events.
1158 *
1159 *****************************************************************************/
/*
 * Advance the per-disk counters to the next reconstruction unit that
 * still needs work on column `col`, then hand off to TryToRead() to
 * actually issue (or defer) the I/O.  Returns whatever TryToRead()
 * returns, or RF_RECON_DONE_READS once the last parity stripe has been
 * passed.  Note the counters point at the unit *currently* being
 * accessed (not the next one) so a blocked attempt can be retried.
 */
static int
IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	int do_new_check = 0, retcode = 0, status;

	/* if we are currently the slowest disk, mark that we have to do a new
	 * check */
	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
		do_new_check = 1;

	while (1) {

		ctrl->ru_count++;
		if (ctrl->ru_count < RUsPerPU) {
			/* still inside the current parity stripe: just step
			 * both offsets forward by one RU */
			ctrl->diskOffset += sectorsPerRU;
			rbuf->failedDiskSectorOffset += sectorsPerRU;
		} else {
			ctrl->curPSID++;
			ctrl->ru_count = 0;
			/* code left over from when head-sep was based on
			 * parity stripe id */
			if (ctrl->curPSID > raidPtr->reconControl->lastPSID) {
				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
				return (RF_RECON_DONE_READS);	/* finito! */
			}
			/* find the disk offsets of the start of the parity
			 * stripe on both the current disk and the failed
			 * disk. skip this entire parity stripe if either disk
			 * does not appear in the indicated PS */
			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
			    &rbuf->spCol, &rbuf->spOffset);
			if (status) {
				/* force the next iteration to roll over to
				 * the following parity stripe */
				ctrl->ru_count = RUsPerPU - 1;
				continue;
			}
		}
		rbuf->which_ru = ctrl->ru_count;

		/* skip this RU if it's already been reconstructed */
		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
			continue;
		}
		break;
	}
	ctrl->headSepCounter++;
	if (do_new_check)
		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */


	/* at this point, we have definitely decided what to do, and we have
	 * only to see if we can actually do it now */
	rbuf->parityStripeID = ctrl->curPSID;
	rbuf->which_ru = ctrl->ru_count;
#if RF_ACC_TRACE > 0
	memset((char *) &raidPtr->recon_tracerecs[col], 0,
	    sizeof(raidPtr->recon_tracerecs[col]));
	raidPtr->recon_tracerecs[col].reconacc = 1;
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	retcode = TryToRead(raidPtr, col);
	return (retcode);
}
1228
1229 /*
1230 * tries to issue the next read on the indicated disk. We may be
1231 * blocked by (a) the heads being too far apart, or (b) recon on the
1232 * indicated RU being blocked due to a write by a user thread. In
1233 * this case, we issue a head-sep or blockage wait request, which will
1234 * cause this same routine to be invoked again later when the blockage
1235 * has cleared.
1236 */
1237
static int
TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
	RF_StripeNum_t psid = ctrl->curPSID;
	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
	RF_DiskQueueData_t *req;
	int status;
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;

	/* if the current disk is too far ahead of the others, issue a
	 * head-separation wait and return */
	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
		return (0);

	/* allocate a new PSS in case we need it */
	/* NOTE(review): allocated before taking the PSS mutex, presumably
	 * because rf_AllocPSStatus may sleep -- confirm */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);

	/* lookup used an existing entry; release the spare we allocated */
	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is blocked on the indicated parity stripe, issue a
	 * block-wait request and return. this also must mark the indicated RU
	 * in the stripe as under reconstruction if not blocked. */
	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
	if (status == RF_PSS_RECON_BLOCKED) {
		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
		goto out;
	} else
		if (status == RF_PSS_FORCED_ON_WRITE) {
			/* a forced recon beat us here; skip this RU */
			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
			goto out;
		}
	/* make one last check to be sure that the indicated RU didn't get
	 * reconstructed while we were waiting for something else to happen.
	 * This is unfortunate in that it causes us to make this check twice
	 * in the normal case.  Might want to make some attempt to re-work
	 * this so that we only do this check if we've definitely blocked on
	 * one of the above checks.  When this condition is detected, we may
	 * have just created a bogus status entry, which we need to delete. */
	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
		if (pssPtr == newpssPtr)
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
		goto out;
	}
	/* found something to read. issue the I/O */
	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
	    psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	/* should be ok to use a NULL proc pointer here, all the bufs we use
	 * should be in kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
	    ReconReadDoneProc, (void *) ctrl,
#if RF_ACC_TRACE > 0
	    &raidPtr->recon_tracerecs[col],
#else
	    NULL,
#endif
	    (void *) raidPtr, 0, NULL, PR_WAITOK);

	/* stash the request on the rbuf so the completion path can free it */
	ctrl->rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
	pssPtr->issued[col] = 1;

out:
	/* all exit paths release the PSS mutex taken above */
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}
1319
1320
1321 /*
1322 * given a parity stripe ID, we want to find out whether both the
1323 * current disk and the failed disk exist in that parity stripe. If
1324 * not, we want to skip this whole PS. If so, we want to find the
1325 * disk offset of the start of the PS on both the current disk and the
1326 * failed disk.
1327 *
1328 * this works by getting a list of disks comprising the indicated
1329 * parity stripe, and searching the list for the current and failed
1330 * disks. Once we've decided they both exist in the parity stripe, we
1331 * need to decide whether each is data or parity, so that we'll know
1332 * which mapping function to call to get the corresponding disk
1333 * offsets.
1334 *
1335 * this is kind of unpleasant, but doing it this way allows the
1336 * reconstruction code to use parity stripe IDs rather than physical
1337 * disks address to march through the failed disk, which greatly
1338 * simplifies a lot of code, as well as eliminating the need for a
1339 * reverse-mapping function. I also think it will execute faster,
1340 * since the calls to the mapping module are kept to a minimum.
1341 *
1342 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1343 * THE STRIPE IN THE CORRECT ORDER
1344 *
1345 * raidPtr - raid descriptor
1346 * psid - parity stripe identifier
1347 * col - column of disk to find the offsets for
1348 * spCol - out: col of spare unit for failed unit
1349 * spOffset - out: offset into disk containing spare unit
1350 *
1351 */
1352
1353
static int
ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
		     RF_SectorNum_t *outFailedDiskSectorOffset,
		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
	RF_RowCol_t *diskids;
	u_int i, j, k, i_offset, j_offset;
	RF_RowCol_t pcol;
	int testcol;
	RF_SectorNum_t poffset;
	char i_is_parity = 0, j_is_parity = 0;
	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;

	/* get a listing of the disks comprising that stripe */
	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
	RF_ASSERT(diskids);

	/* reject this entire parity stripe if it does not contain the
	 * indicated disk or it does not contain the failed disk */

	/* i = position of the current disk within the stripe */
	for (i = 0; i < stripeWidth; i++) {
		if (col == diskids[i])
			break;
	}
	if (i == stripeWidth)
		goto skipit;
	/* j = position of the failed disk within the stripe */
	for (j = 0; j < stripeWidth; j++) {
		if (fcol == diskids[j])
			break;
	}
	if (j == stripeWidth) {
		goto skipit;
	}
	/* find out which disk the parity is on */
	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);

	/* find out if either the current RU or the failed RU is parity */
	/* also, if the parity occurs in this stripe prior to the data and/or
	 * failed col, we need to decrement i and/or j */
	for (k = 0; k < stripeWidth; k++)
		if (diskids[k] == pcol)
			break;
	RF_ASSERT(k < stripeWidth);
	i_offset = i;
	j_offset = j;
	/* i_offset/j_offset become the *data* stripe-unit index, i.e. the
	 * raw position minus one if the parity column comes first */
	if (k < i)
		i_offset--;
	else
		if (k == i) {
			i_is_parity = 1;
			i_offset = 0;
		}		/* set offsets to zero to disable multiply
				 * below */
	if (k < j)
		j_offset--;
	else
		if (k == j) {
			j_is_parity = 1;
			j_offset = 0;
		}
	/* at this point, [ij]_is_parity tells us whether the [current,failed]
	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
	 * tells us how far into the stripe the [current,failed] disk is. */

	/* call the mapping routine to get the offset into the current disk,
	 * repeat for failed disk. */
	if (i_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);

	/* sanity: the mapping must land on the column we started from */
	RF_ASSERT(col == testcol);

	if (j_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
	RF_ASSERT(fcol == testcol);

	/* now locate the spare unit for the failed unit */
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		/* distributed sparing: remap the failed unit's address */
		if (j_is_parity)
			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
		else
			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
	} else {
#endif
		/* dedicated spare: same offset, fixed spare column */
		*spCol = raidPtr->reconControl->spareCol;
		*spOffset = *outFailedDiskSectorOffset;
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	return (0);

skipit:
	Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
	    psid, col);
	return (1);
}
1459 /* this is called when a buffer has become ready to write to the replacement disk */
/* this is called when a buffer has become ready to write to the replacement disk */
static int
IssueNextWriteRequest(RF_Raid_t *raidPtr)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
#if RF_ACC_TRACE > 0
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
#endif
	RF_ReconBuffer_t *rbuf;
	RF_DiskQueueData_t *req;

	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
				 * have gotten the event that sent us here */
	RF_ASSERT(rbuf->pssPtr);

	/* hand the buffer to the PSS entry and sever our back-pointer */
	rbuf->pssPtr->writeRbuf = rbuf;
	rbuf->pssPtr = NULL;

	Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
	    rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
	    rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
	Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
	    rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
	    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);

	/* should be ok to use a NULL b_proc here b/c all addrs should be in
	 * kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
	    sectorsPerRU, rbuf->buffer,
	    rbuf->parityStripeID, rbuf->which_ru,
	    ReconWriteDoneProc, (void *) rbuf,
#if RF_ACC_TRACE > 0
	    &raidPtr->recon_tracerecs[fcol],
#else
	    NULL,
#endif
	    (void *) raidPtr, 0, NULL, PR_WAITOK);

	/* account for the pending write *before* enqueueing, so the
	 * completion path can never see an unaccounted write */
	rbuf->arg = (void *) req;
	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
	raidPtr->reconControl->pending_writes++;
	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);

	return (0);
}
1507
1508 /*
1509 * this gets called upon the completion of a reconstruction read
1510 * operation the arg is a pointer to the per-disk reconstruction
1511 * control structure for the process that just finished a read.
1512 *
1513 * called at interrupt context in the kernel, so don't do anything
1514 * illegal here.
1515 */
1516 static int
1517 ReconReadDoneProc(void *arg, int status)
1518 {
1519 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1520 RF_Raid_t *raidPtr;
1521
1522 /* Detect that reconCtrl is no longer valid, and if that
1523 is the case, bail without calling rf_CauseReconEvent().
1524 There won't be anyone listening for this event anyway */
1525
1526 if (ctrl->reconCtrl == NULL)
1527 return(0);
1528
1529 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1530
1531 if (status) {
1532 printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
1533 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1534 return(0);
1535 }
1536 #if RF_ACC_TRACE > 0
1537 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1538 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1539 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1540 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1541 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1542 #endif
1543 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1544 return (0);
1545 }
1546 /* this gets called upon the completion of a reconstruction write operation.
1547 * the arg is a pointer to the rbuf that was just written
1548 *
1549 * called at interrupt context in the kernel, so don't do anything illegal here.
1550 */
1551 static int
1552 ReconWriteDoneProc(void *arg, int status)
1553 {
1554 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1555
1556 /* Detect that reconControl is no longer valid, and if that
1557 is the case, bail without calling rf_CauseReconEvent().
1558 There won't be anyone listening for this event anyway */
1559
1560 if (rbuf->raidPtr->reconControl == NULL)
1561 return(0);
1562
1563 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1564 if (status) {
1565 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1566 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1567 return(0);
1568 }
1569 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1570 return (0);
1571 }
1572
1573
1574 /*
1575 * computes a new minimum head sep, and wakes up anyone who needs to
1576 * be woken as a result
1577 */
/*
 * computes a new minimum head sep, and wakes up anyone who needs to
 * be woken as a result
 *
 * hsCtr is the head-separation counter value just reached by the
 * calling disk; it can only be >= the current minimum.  The scan and
 * callback-list drain are serialized by the rb_lock handshake.
 */
static void
CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_HeadSepLimit_t new_min;
	RF_RowCol_t i;
	RF_CallbackDesc_t *p;
	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
								 * of a minimum */


	/* acquire the rb_lock: sleep while another thread holds it */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);

	/* recompute the minimum over all surviving columns */
	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
	for (i = 0; i < raidPtr->numCol; i++)
		if (i != reconCtrlPtr->fcol) {
			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
		}
	/* set the new minimum and wake up anyone who can now run again */
	if (new_min != reconCtrlPtr->minHeadSepCounter) {
		reconCtrlPtr->minHeadSepCounter = new_min;
		Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
		/* the list is sorted by wakeup threshold, so we can stop at
		 * the first entry still above the new minimum */
		while (reconCtrlPtr->headSepCBList) {
			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
				break;
			p = reconCtrlPtr->headSepCBList;
			reconCtrlPtr->headSepCBList = p->next;
			p->next = NULL;
			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
			rf_FreeCallbackDesc(p);
		}

	}
	/* release the rb_lock and wake any waiters */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	wakeup(&reconCtrlPtr->rb_lock);
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
}
1622
1623 /*
1624 * checks to see that the maximum head separation will not be violated
1625 * if we initiate a reconstruction I/O on the indicated disk.
1626 * Limiting the maximum head separation between two disks eliminates
1627 * the nasty buffer-stall conditions that occur when one disk races
1628 * ahead of the others and consumes all of the floating recon buffers.
1629 * This code is complex and unpleasant but it's necessary to avoid
1630 * some very nasty, albeit fairly rare, reconstruction behavior.
1631 *
1632 * returns non-zero if and only if we have to stop working on the
1633 * indicated disk due to a head-separation delay.
1634 */
static int
CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
		    RF_ReconUnitNum_t which_ru)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_CallbackDesc_t *cb, *p, *pt;
	int retval = 0;

	/* if we're too far ahead of the slowest disk, stop working on this
	 * disk until the slower ones catch up.  We do this by scheduling a
	 * wakeup callback for the time when the slowest disk has caught up.
	 * We define "caught up" with 20% hysteresis, i.e. the head separation
	 * must have fallen to at most 80% of the max allowable head
	 * separation before we'll wake up.
	 *
	 */
	/* acquire the rb_lock: sleep while another thread holds it */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
	/* headSepLimit < 0 disables the head-separation check entirely */
	if ((raidPtr->headSepLimit >= 0) &&
	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
			 raidPtr->raidid, col, ctrl->headSepCounter,
			 reconCtrlPtr->minHeadSepCounter,
			 raidPtr->headSepLimit);
		cb = rf_AllocCallbackDesc();
		/* the minHeadSepCounter value we have to get to before we'll
		 * wake up.  build in 20% hysteresis. */
		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
		cb->col = col;
		cb->next = NULL;

		/* insert this callback descriptor into the sorted list of
		 * pending head-sep callbacks */
		p = reconCtrlPtr->headSepCBList;
		if (!p)
			reconCtrlPtr->headSepCBList = cb;
		else
			if (cb->callbackArg.v < p->callbackArg.v) {
				/* new entry sorts before the current head */
				cb->next = reconCtrlPtr->headSepCBList;
				reconCtrlPtr->headSepCBList = cb;
			} else {
				/* walk to the insertion point; pt trails p */
				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
				cb->next = p;
				pt->next = cb;
			}
		retval = 1;
#if RF_RECON_STATS > 0
		ctrl->reconCtrl->reconDesc->hsStallCount++;
#endif				/* RF_RECON_STATS > 0 */
	}
	/* release the rb_lock and wake any waiters */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	wakeup(&reconCtrlPtr->rb_lock);
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);

	return (retval);
}
1697 /*
1698 * checks to see if reconstruction has been either forced or blocked
1699 * by a user operation. if forced, we skip this RU entirely. else if
1700 * blocked, put ourselves on the wait list. else return 0.
1701 *
1702 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1703 */
1704 static int
1705 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1706 RF_ReconParityStripeStatus_t *pssPtr,
1707 RF_PerDiskReconCtrl_t *ctrl,
1708 RF_RowCol_t col,
1709 RF_StripeNum_t psid,
1710 RF_ReconUnitNum_t which_ru)
1711 {
1712 RF_CallbackDesc_t *cb;
1713 int retcode = 0;
1714
1715 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1716 retcode = RF_PSS_FORCED_ON_WRITE;
1717 else
1718 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1719 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1720 cb = rf_AllocCallbackDesc(); /* append ourselves to
1721 * the blockage-wait
1722 * list */
1723 cb->col = col;
1724 cb->next = pssPtr->blockWaitList;
1725 pssPtr->blockWaitList = cb;
1726 retcode = RF_PSS_RECON_BLOCKED;
1727 }
1728 if (!retcode)
1729 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under
1730 * reconstruction */
1731
1732 return (retcode);
1733 }
1734 /*
1735 * if reconstruction is currently ongoing for the indicated stripeID,
1736 * reconstruction is forced to completion and we return non-zero to
1737 * indicate that the caller must wait. If not, then reconstruction is
1738 * blocked on the indicated stripe and the routine returns zero. If
1739 * and only if we return non-zero, we'll cause the cbFunc to get
1740 * invoked with the cbArg when the reconstruction has completed.
1741 */
int
rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
{
	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
							 * forcing recon on */
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
						 * stripe status structure */
	RF_StripeNum_t psid;	/* parity stripe id */
	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
						 * offset */
	RF_RowCol_t *diskids;
	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
	RF_RowCol_t fcol, diskno, i;
	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
	RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
	RF_CallbackDesc_t *cb;
	int nPromoted;

	/* translate the stripe ID into a parity stripe ID and the RU
	 * index within that parity stripe */
	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);

	/* look up (or create, blocked) the status descriptor for this RU;
	 * RF_PSS_RECON_BLOCKED is set on creation so the regular recon
	 * procs stay away while we decide what to do */
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);

	if (pssPtr != newpssPtr) {
		/* an existing descriptor was found; the preallocated one
		 * is not needed */
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is not ongoing on this PS, just return */
	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
		return (0);
	}
	/* otherwise, we have to wait for reconstruction to complete on this
	 * RU. */
	/* In order to avoid waiting for a potentially large number of
	 * low-priority accesses to complete, we force a normal-priority (i.e.
	 * not low-priority) reconstruction on this RU. */
	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
		DDprintf1("Forcing recon on psid %ld\n", psid);
		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
								 * forced recon */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
							 * that we just set */
		fcol = raidPtr->reconControl->fcol;

		/* get a listing of the disks comprising the indicated stripe */
		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);

		/* For previously issued reads, elevate them to normal
		 * priority.  If the I/O has already completed, it won't be
		 * found in the queue, and hence this will be a no-op. For
		 * unissued reads, allocate buffers and issue new reads.  The
		 * fact that we've set the FORCED bit means that the regular
		 * recon procs will not re-issue these reqs */
		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
			if ((diskno = diskids[i]) != fcol) {
				if (pssPtr->issued[diskno]) {
					/* read already in the disk queue (or
					 * done): just bump its priority */
					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
					if (rf_reconDebug && nPromoted)
						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
				} else {
					/* read not yet issued: build a forced
					 * recon buffer and issue it ourselves
					 * at normal priority */
					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
											 * location */
					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
					new_rbuf->which_ru = which_ru;
					new_rbuf->failedDiskSectorOffset = fd_offset;
					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;

					/* use NULL b_proc b/c all addrs
					 * should be in kernel space */
					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);

					new_rbuf->arg = req;
					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
				}
			}
		/* if the write is sitting in the disk queue, elevate its
		 * priority */
		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
			if (rf_reconDebug)
				printf("raid%d: promoted write to col %d\n",
				       raidPtr->raidid, fcol);
	}
	/* install a callback descriptor to be invoked when recon completes on
	 * this parity stripe. */
	cb = rf_AllocCallbackDesc();
	/* XXX the following is bogus.. These functions don't really match!!
	 * GO */
	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
	cb->callbackArg.p = (void *) cbArg;
	cb->next = pssPtr->procWaitList;
	pssPtr->procWaitList = cb;
	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
		  raidPtr->raidid, psid);

	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (1);
}
1851 /* called upon the completion of a forced reconstruction read.
1852 * all we do is schedule the FORCEDREADONE event.
1853 * called at interrupt context in the kernel, so don't do anything illegal here.
1854 */
1855 static void
1856 ForceReconReadDoneProc(void *arg, int status)
1857 {
1858 RF_ReconBuffer_t *rbuf = arg;
1859
1860 /* Detect that reconControl is no longer valid, and if that
1861 is the case, bail without calling rf_CauseReconEvent().
1862 There won't be anyone listening for this event anyway */
1863
1864 if (rbuf->raidPtr->reconControl == NULL)
1865 return;
1866
1867 if (status) {
1868 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1869 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1870 return;
1871 }
1872 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1873 }
1874 /* releases a block on the reconstruction of the indicated stripe */
int
rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
{
	RF_StripeNum_t stripeID = asmap->stripeID;
	RF_ReconParityStripeStatus_t *pssPtr;
	RF_ReconUnitNum_t which_ru;
	RF_StripeNum_t psid;
	RF_CallbackDesc_t *cb;

	/* find the parity stripe status descriptor for this access; do
	 * not create one if it is missing (RF_PSS_NONE) */
	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);

	/* When recon is forced, the pss desc can get deleted before we get
	 * back to unblock recon. But, this can _only_ happen when recon is
	 * forced. It would be good to put some kind of sanity check here, but
	 * how to decide if recon was just forced or not? */
	if (!pssPtr) {
		/* printf("Warning: no pss descriptor upon unblock on psid %ld
		 * RU %d\n",psid,which_ru); */
#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
		if (rf_reconDebug || rf_pssDebug)
			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
#endif
		goto out;
	}
	/* drop one blockage reference; only the last release actually
	 * unblocks the stripe */
	pssPtr->blockCount--;
	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
		 raidPtr->raidid, psid, pssPtr->blockCount);
	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */

		/* unblock recon before calling CauseReconEvent in case
		 * CauseReconEvent causes us to try to issue a new read before
		 * returning here. */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;


		while (pssPtr->blockWaitList) {
			/* spin through the block-wait list and
			   release all the waiters */
			cb = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb->next;
			cb->next = NULL;
			rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
			rf_FreeCallbackDesc(cb);
		}
		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
			/* if no recon was requested while recon was blocked */
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		}
	}
out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}
1930
1931 void
1932 rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
1933 {
1934 RF_CallbackDesc_t *p;
1935
1936 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1937 while(raidPtr->reconControl->rb_lock) {
1938 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO,
1939 "rf_wakeuphscbw", 0, &raidPtr->reconControl->rb_mutex);
1940 }
1941
1942 raidPtr->reconControl->rb_lock = 1;
1943 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1944
1945 while (raidPtr->reconControl->headSepCBList) {
1946 p = raidPtr->reconControl->headSepCBList;
1947 raidPtr->reconControl->headSepCBList = p->next;
1948 p->next = NULL;
1949 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1950 rf_FreeCallbackDesc(p);
1951 }
1952 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1953 raidPtr->reconControl->rb_lock = 0;
1954 wakeup(&raidPtr->reconControl->rb_lock);
1955 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1956
1957 }
1958
1959